# Load Libraries and Data for Machine Learning models

# This notebook runs our two best models: Logistic Regression and Random Forest, both on binary classification

In [1]:
#Import warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# General imports
import numpy as np
import pandas as pd
import os
from pathlib import Path
from collections import Counter

# For preprocessing ahead of running ML Models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import sklearn as skl 
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN

#For ML models
from sklearn.datasets import make_blobs, make_classification
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# For model evaluation
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, classification_report  
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Load the data
file_path = Path('C:/Users/esobieski/Documents/Berkeley/TeamPySpark/bc_df.csv')
bc_df = pd.read_csv(file_path)  
bc_df.head(5)

Unnamed: 0.1,Unnamed: 0,ORIGINAL_LANGUAGE,FUNDED_AMOUNT,ACTIVITY_NAME,SECTOR_NAME,COUNTRY_NAME,PARTNER_ID,NUM_LENDERS_TOTAL,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL,MALE,FEMALE,SUCCESS
0,102097,English,1100.0,Grocery Store,Food,Afghanistan,34.0,38,monthly,field_partner,1.0,0.0,1
1,1541219,English,150.0,Retail,Retail,Philippines,125.0,6,monthly,field_partner,0.0,1.0,1
2,549381,English,125.0,Vehicle Repairs,Services,Philippines,125.0,5,monthly,field_partner,0.0,1.0,1
3,1604539,English,1000.0,Tailoring,Services,Philippines,123.0,2,monthly,field_partner,1.0,0.0,1
4,117448,Spanish,300.0,General Store,Retail,El Salvador,167.0,11,monthly,field_partner,0.0,1.0,0


# Prepare data for machine learning models that will use binary classification

For logistic regression, our binary classification is that a successful borrowing event results in full funding within 12 days.  An unsucessful event would be funding taking longer than 12 days, as a reflection of less lender enthusiasm to fund the loan.  This removes the issue in the data that 99%+ of loans get funded and thus the data is very unbalanced if you just look at funding vs didn't fund.  

In [4]:
# Create our features  

X = bc_df.copy()
X = X.drop('SUCCESS', axis=1)

# Create our target
y = bc_df[['SUCCESS']]

In [5]:
# Describe X  -- MOST BORROWERS ARE SOLO FEMALES

X.describe()

Unnamed: 0.1,Unnamed: 0,FUNDED_AMOUNT,PARTNER_ID,NUM_LENDERS_TOTAL,MALE,FEMALE
count,89188.0,89188.0,89188.0,89188.0,89188.0,89188.0
mean,970162.2,781.656165,165.761179,21.653754,0.374019,1.601325
std,561101.8,2013.42264,104.358173,59.716169,1.010964,3.024326
min,1.0,25.0,4.0,1.0,0.0,0.0
25%,484137.0,275.0,109.0,8.0,0.0,1.0
50%,971660.0,500.0,145.0,14.0,0.0,1.0
75%,1457676.0,925.0,185.0,26.0,1.0,1.0
max,1941971.0,499975.0,607.0,15265.0,34.0,50.0


In [6]:
# List out y

y

Unnamed: 0,SUCCESS
0,1
1,1
2,1
3,1
4,0
...,...
89183,1
89184,1
89185,1
89186,0


In [7]:
# Check the balance of our target values  
# Used a calculated value of TIME TO FULL FUNDING using date stamps in prep for LOGISTIC REGRESSION
# SUCCESS is funding in 12 days or less

y['SUCCESS'].value_counts()

1    57166
0    32022
Name: SUCCESS, dtype: int64

In [8]:
# Encode Labels using Get Dummies

X = pd.get_dummies(X)
X.head()

Unnamed: 0.1,Unnamed: 0,FUNDED_AMOUNT,PARTNER_ID,NUM_LENDERS_TOTAL,MALE,FEMALE,ORIGINAL_LANGUAGE_Arabic,ORIGINAL_LANGUAGE_English,ORIGINAL_LANGUAGE_French,ORIGINAL_LANGUAGE_Indonesian,...,COUNTRY_NAME_Ukraine,COUNTRY_NAME_United States,COUNTRY_NAME_Vietnam,COUNTRY_NAME_Yemen,COUNTRY_NAME_Zambia,COUNTRY_NAME_Zimbabwe,REPAYMENT_INTERVAL_bullet,REPAYMENT_INTERVAL_irregular,REPAYMENT_INTERVAL_monthly,DISTRIBUTION_MODEL_field_partner
0,102097,1100.0,34.0,38,1.0,0.0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
1,1541219,150.0,125.0,6,0.0,1.0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
2,549381,125.0,125.0,5,0.0,1.0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
3,1604539,1000.0,123.0,2,1.0,0.0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
4,117448,300.0,167.0,11,0.0,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


# Train-Test split for Binary Classification (BC)

In [9]:
# ADD TRAIN-TEST SPLIT AFTER GETTING DUMMIES AND BEFORE SCALING
# Dataset is split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)

# Scaling X_train and X_test for Binary Classification (BC)

In [10]:
# SCALING X_TRAIN STEP 
X_train_scaled = StandardScaler().fit_transform(X_train)
print(X_train_scaled[0:5])

[[-1.65051091  0.06289566 -0.58352357 ... -0.20975552 -2.55576252
   0.        ]
 [ 1.22049967  0.30837014 -0.19029816 ... -0.20975552  0.39127266
   0.        ]
 [ 0.4352047  -0.2830002   1.39219435 ... -0.20975552  0.39127266
   0.        ]
 [ 1.16626083 -0.24952641 -0.39170629 ... -0.20975552 -2.55576252
   0.        ]
 [ 0.92943056 -0.26068434 -0.26702507 ... -0.20975552  0.39127266
   0.        ]]


In [11]:
# SCALING X_TEST STEP HERE 
X_test_scaled = StandardScaler().fit_transform(X_test)
print(X_test_scaled[0:5])

[[-0.95602246 -0.53447836 -0.09098467 ... -0.21245991  0.3845136
   0.        ]
 [-0.04463413 -0.13900886  2.78576956 ... -0.21245991 -2.60068825
   0.        ]
 [ 0.61417544 -0.20879759 -1.49591115 ... -0.21245991  0.3845136
   0.        ]
 [-1.08757661  0.11688317 -0.62619476 ... -0.21245991  0.3845136
   0.        ]
 [ 0.3441734  -0.60426709 -0.28213113 ... -0.21245991  0.3845136
   0.        ]]


# SAMPLING TECHNIQUES (inputting scaled X and y)

Oversampling: because "y" is very right skewed leading to 50% of the data in one of 6 one-week buckets, thus model performance is 50%.  When it is 2 buckets, set at the mean of 12 days, then model performance is 69%, equal to the amount of "y" data in the 1 position, so again model is not effective, same prediction as the distribution of "y" data.

# Random Oversampling 

In [12]:
# Check imbalance in y
Counter(y_train)

Counter({'SUCCESS': 1})

In [13]:
# Random oversampling to rebalance
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

# Check on how much Y was rebalanced for Random oversampling
Counter(y_resampled)

Counter({'SUCCESS': 1})

In [14]:
y_resampled

Unnamed: 0,SUCCESS
0,1
1,1
2,1
3,1
4,1
...,...
85743,0
85744,0
85745,0
85746,0


# Resampling: combination oversampling and undersampling with SMOTEEN.  Commented out because not enough performance boost for multiples of compute time.

In [15]:
# # Resampling: combination oversampling and undersampling with SMOTEEN on binary classification model
# smote_enn = SMOTEENN(random_state=0)
# X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)

# # See results of SMOTEEN Resampling
# Counter(y_resampled)

# Run PCA on Binary Classification model

In [16]:
# PCA model intialization 277 columns, reducing complexity
pca = PCA(n_components=32)

In [17]:
# PCA fit and transform for training
train_loans_pca = pca.fit_transform(X_train_scaled)

In [18]:
# Transform testing data using PCA to a DataFrame 
test_loans_pca = pca.transform(X_test_scaled)
X_test_pca_df = pd.DataFrame(data=test_loans_pca)
X_test_pca_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,1.653393,-1.828051,-0.321642,-0.903934,0.758961,-0.470528,-0.400863,0.157932,-0.296038,-0.41623,...,1.394063,-1.074494,-0.077777,0.488513,-0.166559,0.09713,-0.116913,-0.544173,0.539636,-0.844476
1,4.157745,1.232101,-0.387643,-1.497846,-0.839095,-0.059351,-1.374155,0.189478,0.09175,-0.460244,...,-1.022818,1.129464,0.088889,-0.403046,0.916579,0.534934,1.051205,0.978721,-0.041353,-0.063715
2,1.505645,-1.735546,0.00521,0.054408,-0.261666,-0.13749,-0.460864,-0.443596,0.199964,-0.35056,...,1.434525,0.409403,-0.745082,0.144515,-0.109409,-0.132328,-0.51528,-0.26444,0.325695,-0.586698
3,1.711824,-1.052999,1.122125,-0.515712,1.450792,-0.26846,0.279,-0.828542,-0.858724,0.531137,...,1.022701,0.105597,0.089688,0.790038,0.759182,0.858358,-0.138548,-0.478965,0.150287,-1.108563
4,0.907031,-1.897766,-0.252942,-1.489939,0.590424,-0.319982,-0.424242,-0.060309,-0.193652,-0.285662,...,0.286219,-0.166439,0.010761,0.323917,0.101242,0.032686,0.088891,-0.096498,0.172166,-0.268599


In [19]:
# transform PCA data to a DataFrame 
X_train_pca_df = pd.DataFrame(data=train_loans_pca)
X_train_pca_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,2.093313,3.015578,-4.875729,-0.720803,-0.330164,3.53143,2.777607,0.585727,0.476394,1.420963,...,-2.530509,1.651795,-0.953039,0.080486,0.015037,-0.68884,-0.570678,-0.290485,-1.570569,-0.644882
1,-0.182816,-1.015818,2.512122,2.374156,2.550709,0.91126,1.876432,-0.293834,2.455743,-2.559733,...,0.353918,-0.060283,-2.008904,2.294666,-1.265357,-0.986274,-1.215606,0.421507,-0.236215,-1.112425
2,0.210223,-3.136989,-0.477267,4.761312,-4.678685,1.587249,-1.66407,-2.397286,0.500933,0.561777,...,1.186723,1.064479,-0.361524,-0.365923,-0.063348,-0.40927,-0.774554,0.20208,0.03289,-0.408957
3,3.488715,-1.880091,-0.790174,-1.668218,0.43265,-0.094493,-0.11933,0.279598,0.280553,-0.34556,...,-0.751256,0.427059,0.299095,0.216486,0.243104,-0.281438,0.091415,-0.02267,0.04133,-0.071348
4,-0.668356,-0.194415,-0.069559,1.06868,0.473101,0.26158,0.437259,-0.431339,-3.476768,-3.640262,...,-0.298781,-0.271568,0.072951,0.231726,0.271688,-0.415495,-0.501856,-0.361927,0.144321,-0.359491


In [20]:
# See explained variance ratio sum - Optimized to explain as much as possible - 225 components is ideal at 95%, no one feature explains 
# All features are equally important  - 10 features = 9%  - 100 features 50%  -
# Mostly linear relationship number of features and explainabilty
# PCA DID NOT HELP, WHEN IT REDUCES THE NUMBER OF FEATURES IT ALSO REDUCES EXPLAINABILITY IN A NEARLY LINEAR RELATIONSHIP
# THIS ALSO SHOWED UP IN THE ML MODELS, WHERE USING X-SCALED WAS BETTER THAN THE PCA VERSION.
sum(pca.explained_variance_ratio_)

0.21150654015045633

# Run Logistic Regression on SUCCESS (binary classification of y) and Analyze Results

In [21]:
# Instantiate a Logistic Regression Model, Step 1 of 2
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

In [22]:
# Instantiate a Logistic Regression Model, Step 2 of 2
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
   intercept_scaling=1, l1_ratio=None, max_iter=100,
   multi_class='warn', n_jobs=None, penalty='12',
   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
   warm_start=False)

LogisticRegression(multi_class='warn', penalty='12', random_state=1)

In [23]:
# Train the Logistic Regression Model  - Using scaled data takes prediction accuracy from 66% to 69%
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [24]:
# Validate the Logistic Regression Model
y_pred = classifier.predict(X_test_scaled)

In [25]:
# Step 2 run predictions
pd.DataFrame({"Prediction": y_pred, "Actual": y_test["SUCCESS"]})

Unnamed: 0,Prediction,Actual
9357,1,1
38816,1,1
54453,1,1
24503,0,1
3118,1,1
...,...,...
73819,0,1
7558,1,0
84568,1,0
72945,1,1


In [26]:
# ANALYZE RESULTS
# Assess the Accuracy Score
balanced_accuracy_score(y_test, y_pred)
print(balanced_accuracy_score)

# Run the Confusion Matrix and print imbalanced classification report
confusion_matrix(y_test, y_pred)
print(classification_report_imbalanced(y_test, y_pred))

<function balanced_accuracy_score at 0x000000000A0F6558>
                   pre       rec       spe        f1       geo       iba       sup

          0       0.56      0.60      0.73      0.58      0.67      0.44      8005
          1       0.77      0.73      0.60      0.75      0.67      0.45     14292

avg / total       0.69      0.69      0.65      0.69      0.67      0.44     22297



# Random Forest Model on Binary Classification after tuning steps taken delineated in first code block

In [27]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=50, random_state=78, criterion='entropy', max_features='log2') 

# Fitting the model
rf_model.fit(X_resampled, y_resampled)

# TUNING: Do each line once, i.e. only change n-estimators and run 3 times, go with best result. Then next line.
# (1) n_estimators: Start at 100, go down to 50, up to 200 *** 50 works best***
# (2) random state: leave as-is
# (3) max-depth: start at 5, then remove to see how far it goes. * removing max_depth adds 6.8%age points to accuracy ***
# (4) criterion - change between 'gini' and 'entropy'  *** entropy works best ***
# (5) max-features: try 'auto', 'sqrt', 'log2'  ***log2 works best***

RandomForestClassifier(criterion='entropy', max_features='log2',
                       n_estimators=50, random_state=78)

In [28]:
# Count y_resampled
Counter(y_resampled)

Counter({'SUCCESS': 1})

In [29]:
# Making predictions using the testing data.
y_pred = rf_model.predict(X_test_scaled)
y_pred[5]

1

In [30]:
# Look at Importance of Each Parameter 
rf_model.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 78,
 'verbose': 0,
 'warm_start': False}

In [31]:
# RETURN MEAN ACCURACY
rf_model.score(X_test_scaled, y_test)

0.6912140646723774

In [32]:
# ANALYZE RESULTS
# Assess the Accuracy Score
balanced_accuracy_score(y_test, y_pred)
print(balanced_accuracy_score)

# Run the Confusion Matrix and print imbalanced classification report
confusion_matrix(y_test, y_pred)
print(classification_report_imbalanced(y_test, y_pred))

<function balanced_accuracy_score at 0x000000000A0F6558>
                   pre       rec       spe        f1       geo       iba       sup

          0       0.60      0.43      0.83      0.50      0.60      0.35      8005
          1       0.73      0.83      0.43      0.78      0.60      0.38     14292

avg / total       0.68      0.69      0.58      0.68      0.60      0.37     22297

