In [124]:
# Import dependencies
%matplotlib inline
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.feature_selection import SelectFromModel

In [82]:
# Create dataframes
train_df = pd.read_csv(Path('Resources/Generator/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/Generator/2020Q1loans.csv'))


In [83]:
# Convert categorical data to numeric and separate target feature for testing data
train_x = train_df.drop(columns='target')
train_x = pd.get_dummies(train_x)
train_y = train_df['target']

label_enc = LabelEncoder()
train_y = label_enc.fit_transform(train_y)

# add missing dummy variables to testing set
test_x = test_df.drop(columns='target')
test_x = pd.get_dummies(test_x)
test_y = test_df['target']
test_y = label_enc.fit_transform(test_y)

In [84]:
train_x.shape

(12180, 92)

In [85]:
test_x.shape

(4702, 91)

In [86]:
matching_columns = []
for item in train_x.columns:
    if item in test_x.columns:
        pass
    else:
        matching_columns.append(item)
print(matching_columns)

train_x = train_x.drop(columns='debt_settlement_flag_Y')

['debt_settlement_flag_Y']


In [87]:
# Train the Logistic Regression model on the unscaled data and print the model score
clf = LogisticRegression(solver='lbfgs', max_iter=200000)
clf.fit(train_x, train_y)
print(f'Training Score: {clf.score(train_x, train_y)}')
print(f'Testing Socre: {clf.score(test_x, test_y)}')

Training Score: 0.7017241379310345
Testing Socre: 0.5650786899191833


In [72]:
# Train a Random Forest Classifier model and print the model score
clf2 = RandomForestClassifier()
clf2.fit(train_x, train_y)
print(f'Training Score: {clf2.score(train_x, train_y)}')
print(f'Testing Socre: {clf2.score(test_x, test_y)}')


Training Score: 1.0
Testing Socre: 0.6393024245002127


In [102]:
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_x)
X_test_scaled = scaler.fit_transform(test_x)

In [103]:
# Train the Logistic Regression model on the scaled data and print the model score
clf = LogisticRegression(solver='lbfgs', max_iter=200000)
clf.fit(X_train_scaled, train_y)
print(f'Training Score: {clf.score(X_train_scaled, train_y)}')
print(f'Testing Socre: {clf.score(X_test_scaled, test_y)}')

Training Score: 0.7108374384236453
Testing Socre: 0.6586558911101659


In [105]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf3 = RandomForestClassifier()
clf3.fit(X_train_scaled, train_y)
print(f'Training Score: {clf.score(X_train_scaled, train_y)}')
print(f'Testing Score: {clf.score(X_test_scaled, test_y)}')

Training Score: 0.7108374384236453
Testing Score: 0.6586558911101659


In [123]:
# use the feature importance function to identify importance features
# sort them from highest to lowest
features = clf3.feature_importances_
indices = np.argsort(features)[::-1]
for i, feature_idx in enumerate(indices):
    feature_name = train_x.columns[feature_idx]  # Replace with appropriate feature names if using pandas DataFrame
    importance = features[feature_idx]
    print(f"Feature {i+1}: {feature_name} - Importance: {importance:.4f}")

Feature 1: last_pymnt_amnt - Importance: 0.1028
Feature 2: total_rec_prncp - Importance: 0.0548
Feature 3: total_pymnt - Importance: 0.0468
Feature 4: total_rec_int - Importance: 0.0466
Feature 5: total_pymnt_inv - Importance: 0.0454
Feature 6: int_rate - Importance: 0.0342
Feature 7: installment - Importance: 0.0308
Feature 8: out_prncp_inv - Importance: 0.0307
Feature 9: out_prncp - Importance: 0.0285
Feature 10: loan_amnt - Importance: 0.0167
Feature 11: mo_sin_old_rev_tl_op - Importance: 0.0166
Feature 12: total_rec_late_fee - Importance: 0.0166
Feature 13: max_bal_bc - Importance: 0.0158
Feature 14: dti - Importance: 0.0155
Feature 15: total_rev_hi_lim - Importance: 0.0154
Feature 16: total_bc_limit - Importance: 0.0153
Feature 17: bc_open_to_buy - Importance: 0.0153
Feature 18: revol_bal - Importance: 0.0151
Feature 19: mo_sin_old_il_acct - Importance: 0.0149
Feature 20: tot_hi_cred_lim - Importance: 0.0145
Feature 21: annual_inc - Importance: 0.0144
Feature 22: avg_cur_bal - Imp

In [138]:
# select threshold parameter
threshold = .04

# Select the important features
sel = SelectFromModel(clf3, threshold=threshold)
sel.fit(X_train_scaled, train_y)

# prints the values that are important
# sel.get_support()

# train with selected features
train_selected = sel.transform(X_train_scaled)
test_selected = sel.transform(X_test_scaled)

# Logistic Regression
clf = LogisticRegression(solver='lbfgs', max_iter=200000)
clf.fit(train_selected, train_y)
print(f'Log Training Score: {clf.score(train_selected, train_y)}')
print(f'Log Testing Score: {clf.score(test_selected, test_y)}')

print("-"*30)
# Random Forest Classifer
clf2 = RandomForestClassifier()
clf2.fit(train_selected, train_y)
print(f'RFC Training Score: {clf.score(train_selected, train_y)}')
print(f'RFC Testing Score: {clf.score(test_selected, test_y)}')

Log Training Score: 0.6547619047619048
Log Testing Score: 0.5284985112717993
----------
RFC Training Score: 0.6547619047619048
RFC Testing Score: 0.5284985112717993
