In [119]:
import pandas as pd
from datetime import datetime, timedelta
import re

# Load Raw Data
inputDF = pd.read_excel('loan_default_data.xlsx')

# Drop irrelevent columns
# 1) Unnamed Column: there are no details provided for these columns
unnamed_col = [i for i in inputDF.columns if 'unnamed' in i.lower()]
inputDF.drop(unnamed_col, axis=1, inplace=True)

# 2) ID Column: IDs are randomly (or increasing continuously) assigned numbers
# it has no correlation to the decision
id_col = ['id', 'member_id']
inputDF.drop(id_col, axis=1, inplace=True)

# 3) zip_code and address_state
# we might be able to include these columns if we can find a suitable way to group the states into fewer groups
# as for now, I will drop this
id_col = ['zip_code', 'address_state']
inputDF.drop(id_col, axis=1, inplace=True)

# 4) loan_status
# i am not quite understand this column, but seems like repay_fail depends on this column
# e.g.: Fully Paid  -> repay_fail False
#       Charged Off -> repay_fail True
# since this columns seems highly correlated to the target variable, I will drop this
inputDF.drop('loan_status', axis=1, inplace=True)

# 5) loan_amount and funded_amount_investors
# we only consider funded_amount which is the amount the client received
amt_list = ['loan_amount', 'funded_amount_investors']
inputDF.drop(amt_list, axis=1, inplace=True)









# Modify Data
# 1) change term to int column
inputDF['term'] = inputDF['term'].apply(lambda x: re.search('\d+',x).group()).astype(int)

# 2) change home_ownership value 'NONE' to None
inputDF['home_ownership'] = inputDF['home_ownership'].replace('NONE', None)

# 3) split verification_status to verified and source_verified (bool columns instead of category)
# Not Verified:     verified: False     source_verified: False
# Verified:         verified: True      source_verified: False
# Source Verified:  verified: True      source_verified: True
inputDF['verified'] = inputDF['verification_status'].isin(['Verified', 'Source Verified'])
inputDF['source_verified'] = inputDF['verification_status'].isin(['Source Verified'])
# drop verification_status column
inputDF.drop('verification_status', axis=1, inplace=True)

# 4) Group employment_length into quantiles
# because "more than 10 years" has been grouped into 10+, this is no longer a regression
emp_year = inputDF['employment_length']\
    .dropna()\
    .replace('< 1 year', '0 year')\
    .replace('10+ years', '10 years')\
    .apply(lambda x: int(re.search('\d+', x).group()))
emp_q = emp_year.quantile([0.25,0.50,0.75])
def getEmpLengthGroup(year):
    if year==None:
        return None
    else:
        if year <= emp_q[0.25]:
            return f'0 - {int(emp_q[0.25])}'
        elif year <= emp_q[0.50]:
            return f'{int(emp_q[0.25])+1} - {int(emp_q[0.50])}'
        elif year <= emp_q[0.75]:
            return f'{int(emp_q[0.50])+1} - {int(emp_q[0.75])}'
        else:
            return f'{int(emp_q[0.75])+1}+'
inputDF['employment_length_group'] = emp_year.apply(getEmpLengthGroup)
# drop employment_length column
inputDF.drop('employment_length', axis=1, inplace=True)

# 5) Change repay_fail column dtype to bool
inputDF['repay_fail'] = inputDF['repay_fail'].astype(bool)


In [120]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# TEST: drop datetime columns
inputDF.drop(['issue_date','earliest_credit_line','last_payment_date','next_payment_date','last_credit_pull_date'], axis=1, inplace=True)

# Drop rows with missing values
inputDF.dropna(inplace=True)

# One-hot encode categorical columns
inputDF = pd.get_dummies(inputDF, columns=['home_ownership', 'purpose', 'employment_length_group'], drop_first=True)

# Split the data into features (X) and target variable (y)
X = inputDF.drop('repay_fail', axis=1)
y = inputDF['repay_fail']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Test
y_pred = model.predict(X_test)

# NOTE: Accuracy seems too good to be true
# Need to check for other columns that are directly affects the repay_fail
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[2283    3]
 [   8  462]]
0.9960087082728593


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
