# Loan Fullfilment Prediction

Use logistic regression to predict loan fullfillment probability. The definition of loan been NOT fullfilled is that it is not fully funded 60 days after original loand start date.

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
from sqlalchemy import create_engine, Column, Integer, String, Table, MetaData
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

import pymysql
pymysql.install_as_MySQLdb()

from config import MYSQL_URL

source_engine = create_engine(MYSQL_URL, encoding='utf-8')
source_session = sessionmaker(source_engine)

target_engine = create_engine("sqlite:///db/kiva.sqlite")
target_session = sessionmaker(target_engine)


sqlite_conn = target_engine.connect()
mysql_conn = source_engine.connect()

metadata = MetaData()

In [5]:
sql = """
SELECT 
    *
FROM kiva.loan 
where
    posted_time between '2017-06-01' and '2017-09-01'

"""

In [6]:
df = pd.read_sql(sql, mysql_conn)

In [7]:
df.columns

Index(['loan_id', 'loan_name', 'original_language', 'description',
       'description_translated', 'funded_amount', 'loan_amount', 'status',
       'image_id', 'video_id', 'activity_name', 'sector_name', 'loan_use',
       'country_code', 'country_name', 'town_name', 'currency_policy',
       'currency_exchange_coverage_rate', 'currency', 'partner_id',
       'posted_time', 'planned_expiration_time', 'disburse_time',
       'raised_time', 'lender_term', 'num_lenders_total',
       'num_journal_entries', 'num_bulk_entries', 'tags', 'borrower_names',
       'borrower_genders', 'borrower_pictured', 'repayment_interval',
       'distribution_model', 'gender'],
      dtype='object')

## Predictor Selection
Observe the data and select fields which are either quantative or has limit amount of unique values. This model might be expanded to include more features later on.

In [8]:
df.original_language.unique()

array(['English', 'Spanish', 'French', 'Russian', 'Portuguese', None],
      dtype=object)

In [9]:
df.sector_name.unique()

array(['Agriculture', 'Food', 'Arts', 'Clothing', 'Education', 'Housing',
       'Personal Use', 'Services', 'Transportation', 'Retail',
       'Manufacturing', 'Construction', 'Health', 'Wholesale',
       'Entertainment'], dtype=object)

In [10]:
df.country_code.unique()

array(['PH', 'KE', 'IN', 'JO', 'KH', 'TJ', 'PY', 'BO', 'SV', 'VN', 'GT',
       'MG', 'XK', 'TR', 'NI', 'MX', 'ID', 'EC', 'SN', 'PK', 'RW', 'LB',
       'AM', 'CO', 'PE', 'LA', 'UG', 'HT', 'LR', 'MZ', 'TG', 'CD', 'HN',
       'MM', 'ZW', 'PS', 'NG', 'WS', 'GE', 'SB', 'MD', 'EG', 'TL', 'BF',
       'SL', 'US', 'AL', 'TZ', 'CR', 'MW', 'ZM', 'CM', 'GH', 'UA', 'ML',
       'NP', 'BR', 'LS', 'KG', 'YE', 'TH', 'PR', 'DO', 'IL', 'BT', 'SS',
       'ZA'], dtype=object)

In [11]:
df.country_name.unique()

array(['Philippines', 'Kenya', 'India', 'Jordan', 'Cambodia',
       'Tajikistan', 'Paraguay', 'Bolivia', 'El Salvador', 'Vietnam',
       'Guatemala', 'Madagascar', 'Kosovo', 'Turkey', 'Nicaragua',
       'Mexico', 'Indonesia', 'Ecuador', 'Senegal', 'Pakistan', 'Rwanda',
       'Lebanon', 'Armenia', 'Colombia', 'Peru',
       "Lao People's Democratic Republic", 'Uganda', 'Haiti', 'Liberia',
       'Mozambique', 'Togo', 'The Democratic Republic of the Congo',
       'Honduras', 'Myanmar (Burma)', 'Zimbabwe', 'Palestine', 'Nigeria',
       'Samoa', 'Georgia', 'Solomon Islands', 'Moldova', 'Egypt',
       'Timor-Leste', 'Burkina Faso', 'Sierra Leone', 'United States',
       'Albania', 'Tanzania', 'Costa Rica', 'Malawi', 'Zambia',
       'Cameroon', 'Ghana', 'Ukraine', 'Mali', 'Nepal', 'Brazil',
       'Lesotho', 'Kyrgyzstan', 'Yemen', 'Thailand', 'Puerto Rico',
       'Dominican Republic', 'Israel', 'Bhutan', 'South Sudan',
       'South Africa'], dtype=object)

In [12]:
df.lender_term.unique()

array([ 14.,   8.,  16.,  18.,  11.,  20.,  13.,  10.,  15.,  26.,  21.,
        62.,  25.,  12.,  19.,   7.,   9.,  22.,  17.,   5.,  27.,   6.,
        30.,   4.,  38.,  33.,  24.,  32.,  23.,  36.,  37.,  28.,  39.,
        31.,  34.,  42.,  51.,  29.,  53.,  35.,  60.,   3.,  50.,  45.,
        88., 115.,  48.,  40.,  41.,  95.,  98.,  59.,  56.,  43., 142.,
        63.,  74.,  76.,  61., 134., 121., 113.,  75.,  86.,  44.,  57.,
         2.,  67., 143., 109.,  83.,  49.])

In [13]:
df.gender.unique()

array(['F', 'M', 'U'], dtype=object)

In [14]:
df.repayment_interval.unique()

array(['irregular', 'monthly', 'bullet'], dtype=object)

In [15]:
df.distribution_model.unique()

array(['field_partner', 'direct'], dtype=object)

## Data Pre-processing

In [16]:
feature_names = [
    'funded_amount',
    'original_language',
    'country_name',
    'sector_name',
    'lender_term',
    'gender',
    'repayment_interval',
    'distribution_model',
]

X = df[feature_names]
y = (df.loan_amount == df.funded_amount)

X = pd.get_dummies(X)
print(np.bincount(y))

[ 2807 53970]


In [17]:
X.columns

Index(['funded_amount', 'lender_term', 'original_language_English',
       'original_language_French', 'original_language_Portuguese',
       'original_language_Russian', 'original_language_Spanish',
       'country_name_Albania', 'country_name_Armenia', 'country_name_Bhutan',
       'country_name_Bolivia', 'country_name_Brazil',
       'country_name_Burkina Faso', 'country_name_Cambodia',
       'country_name_Cameroon', 'country_name_Colombia',
       'country_name_Costa Rica', 'country_name_Dominican Republic',
       'country_name_Ecuador', 'country_name_Egypt',
       'country_name_El Salvador', 'country_name_Georgia',
       'country_name_Ghana', 'country_name_Guatemala', 'country_name_Haiti',
       'country_name_Honduras', 'country_name_India', 'country_name_Indonesia',
       'country_name_Israel', 'country_name_Jordan', 'country_name_Kenya',
       'country_name_Kosovo', 'country_name_Kyrgyzstan',
       'country_name_Lao People's Democratic Republic', 'country_name_Lebanon',


In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

No need to scale Y as it is boolean

In [19]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Logistic Regression Model Training and Verification

In [54]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
from sklearn.metrics import mean_squared_error

In [55]:
def print_test_matrix(y, predictions):
    df = pd.DataFrame({
        'y': y,
        'predictions': predictions,
        'value': [1] * len(y)
    })
    print(pd.pivot_table(df, index=['y'], columns=['predictions'], aggfunc=np.sum))
    

def run_classifier(classifier, X_train, y_train, X_test, y_test):
    '''
    Show Percentage of identifying funded and NOT funded correctly
    '''
    classifier.fit(X_train, y_train)
    print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
    
    predictions = classifier.predict(X_train)
    print('Evaluate Train Data Prediction Result: ')
    print_test_matrix(y_train, predictions)
   
    print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")
    predictions = classifier.predict(X_test)
    print('Evaluate Test Data Prediction Result: ')
    print_test_matrix(y_test, predictions)
    
    
    return classifier

In [56]:
run_classifier(classifier, X_train_scaled, y_train, X_test_scaled, y_test)

Training Data Score: 0.9592008412197687
Evaluate Train Data Prediction Result: 
            value       
predictions False  True 
y                       
False         407   1426
True          126  36081
Testing Data Score: 0.956663286545338
Evaluate Test Data Prediction Result: 
            value       
predictions False  True 
y                       
False         234    740
True           72  17691


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

It looks like the result from logistic regression is not very precise as both type 1 error and type 2 error ratio is significant. This is likely contribute by all features are categorical.

## Alternative Models
We will try decision tree and randon forest, nural network, then compare the model accuracy and adopt the one with best prediction accuracy. 

### Decision Tree

In [41]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf = run_classifier(clf, X_train_scaled, y_train, X_test_scaled, y_test)

Training Data Score: 0.9936908517350158
Evaluate Train Data Prediction Result: 
            value       
predictions False  True 
y                       
False        1674    159
True           81  36126
Testing Data Score: 0.9616267278646529
Evaluate Test Data Prediction Result: 
            value       
predictions False  True 
y                       
False         587    387
True          332  17431


Result from decision tree model looks much better. However the model looks slighted overfitted.

### Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier
rf = tree.DecisionTreeClassifier()
rf = clf.fit(X_train, y_train)
run_classifier(rf, X_train_scaled, y_train, X_test_scaled, y_test)

Training Data Score: 0.9936908517350158
Evaluate Train Data Prediction Result: 
            value       
predictions False  True 
y                       
False        1674    159
True           81  36126
Testing Data Score: 0.961893579548487
Evaluate Test Data Prediction Result: 
            value       
predictions False  True 
y                       
False         576    398
True          316  17447


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## Nural Networks


In [33]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

model = Sequential()
model.add(Dense(units=6, activation='relu', input_dim=2))
model.add(Dense(units=2, activation='softmax'))

In [34]:
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [38]:
X_train_scaled

array([[-0.45546311, -0.7963329 ,  0.58257142, ...,  0.98808361,
        -0.10190879,  0.10190879],
       [ 0.07110527,  0.14305206,  0.58257142, ..., -1.0120601 ,
        -0.10190879,  0.10190879],
       [-0.5699345 , -0.95289706,  0.58257142, ...,  0.98808361,
        -0.10190879,  0.10190879],
       ...,
       [-0.45546311, -0.7963329 ,  0.58257142, ..., -1.0120601 ,
        -0.10190879,  0.10190879],
       [ 0.59767364,  0.14305206,  0.58257142, ...,  0.98808361,
        -0.10190879,  0.10190879],
       [-0.43256883, -0.7963329 ,  0.58257142, ..., -1.0120601 ,
        -0.10190879,  0.10190879]])

In [35]:
# Fit the model to the training data
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

ValueError: Error when checking input: expected dense_7_input to have shape (2,) but got array with shape (97,)