## Load the Lending club data into SFrame

In [1]:
import sframe as sf
loans = sf.SFrame('../data/lending-club-data.gl/')

[INFO] sframe.cython.cy_server: SFrame v2.1 started. Logging /tmp/sframe_server_1474316371.log


## Add column 'safe-loans'

In [2]:
# safe_loans =  1 => safe
# safe_loans = -1 => risky
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')

In [3]:
target = 'safe_loans'
features = ['grade',                     # grade of the loan (categorical)
            'sub_grade_num',             # sub-grade of the loan as a number from 0 to 1
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'payment_inc_ratio',         # ratio of the monthly payment to income
            'delinq_2yrs',               # number of delinquincies
             'delinq_2yrs_zero',          # no delinquincies in last 2 years
            'inq_last_6mths',            # number of creditor inquiries in last 6 months
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'open_acc',                  # number of open credit accounts
            'pub_rec',                   # number of derogatory public records
            'pub_rec_zero',              # no derogatory public records
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
            'int_rate',                  # interest rate of the loan
            'total_rec_int',             # interest received to date
            'annual_inc',                # annual income of borrower
            'funded_amnt',               # amount committed to the loan
            'funded_amnt_inv',           # amount committed by investors for the loan
            'installment',               # monthly payment owed by the borrower
           ]

## Skip observations with missing data

In [4]:
loans, loans_with_na = loans[[target] + features].dropna_split()

# Count the number of rows with missing data
num_rows_with_na = loans_with_na.num_rows()
num_rows = loans.num_rows()
print 'Dropping %s observations; keeping %s ' % (num_rows_with_na, num_rows)

Dropping 29 observations; keeping 122578 


## Balance the dataset

This dataset is imbalance. Have fair sampling of positive and negative experiences.
> There are different ways to handle the imbalance data. Please look at this [paper](http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=5128907&url=http%3A%2F%2Fieeexplore.ieee.org%2Fiel5%2F69%2F5173046%2F05128907.pdf%3Farnumber%3D5128907)

In [5]:
safe_loans_raw = loans[loans[target] == 1]
risky_loans_raw = loans[loans[target] == -1]

# Undersample the safe loans.
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(percentage, seed = 1)
risky_loans = risky_loans_raw
loans_data = risky_loans.append(safe_loans)

print "Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data))
print "Percentage of risky loans                :", len(risky_loans) / float(len(loans_data))
print "Total number of loans in our new dataset :", len(loans_data)

Percentage of safe loans                 : 0.502247166849
Percentage of risky loans                : 0.497752833151
Total number of loans in our new dataset : 46503


## Convert Categorical Variables into Binary features using <span style="color:red">One-Hot Encoding</span>

In [6]:
loans_data = risky_loans.append(safe_loans)

categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)

for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x: 1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)

    # Change None's to 0's
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)

    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)

loans_data.column_names()

['safe_loans',
 'sub_grade_num',
 'short_emp',
 'emp_length_num',
 'dti',
 'payment_inc_ratio',
 'delinq_2yrs',
 'delinq_2yrs_zero',
 'inq_last_6mths',
 'last_delinq_none',
 'last_major_derog_none',
 'open_acc',
 'pub_rec',
 'pub_rec_zero',
 'revol_util',
 'total_rec_late_fee',
 'int_rate',
 'total_rec_int',
 'annual_inc',
 'funded_amnt',
 'funded_amnt_inv',
 'installment',
 'grade.A',
 'grade.B',
 'grade.C',
 'grade.D',
 'grade.E',
 'grade.F',
 'grade.G',
 'home_ownership.MORTGAGE',
 'home_ownership.OTHER',
 'home_ownership.OWN',
 'home_ownership.RENT',
 'purpose.car',
 'purpose.credit_card',
 'purpose.debt_consolidation',
 'purpose.home_improvement',
 'purpose.house',
 'purpose.major_purchase',
 'purpose.medical',
 'purpose.moving',
 'purpose.other',
 'purpose.small_business',
 'purpose.vacation',
 'purpose.wedding']

## Split the data - training, and validation set

In [87]:
train_data, validation_data = loans_data.random_split(.8, seed=1)

### Train the Gradient Boost machine classifier

> Ref: See this [paper](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf) for more details on GBM.

In [88]:
import sklearn
import sklearn.ensemble as en
import numpy as np

In [89]:
clf = en.GradientBoostingClassifier(max_depth=6, n_estimators=5)

In [90]:
def trainClassifier(clf, data, target):
    feature_names = data.column_names()
    feature_names.remove(target)
    feature_data = data.select_columns(feature_names)
        
    clf.fit(X = feature_data.to_numpy(), y = data[target].to_numpy())
    return clf

In [91]:
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)

In [92]:
model_5 = trainClassifier(clf, train_data, target = target)

## Run on validation set

In [93]:
def predict(model, data, target):
    feature_names = data.column_names()
    feature_names.remove(target)
    feature_data = data.select_columns(feature_names).to_numpy()
    predictions = model.predict(feature_data)
    probabilities = model.predict_proba(feature_data)
    scores = model.score(feature_data, data[target].to_numpy())
    return predictions, probabilities, scores

def accuracy(predictions, target_sarray):
    target_data = target_sarray.to_numpy()
    correct = len(target_data[target_data == predictions])
    return correct / float(len(target_data))

In [44]:
sample_predictions, sample_probabilities, sample_scores = predict(clf, sample_validation_data, target)
print "Predictions:", sample_predictions
print "Probabilities:", sample_probabilities
print "Scores:", sample_scores

Predictions: [ 1  1 -1  1]
Probabilities: [[ 0.41642331  0.58357669]
 [ 0.46949689  0.53050311]
 [ 0.53807792  0.46192208]
 [ 0.39591639  0.60408361]]
Scores: 0.75


In [45]:
predictions_vd, probabilities_vd, scores_vd = predict(clf, validation_data, target)

In [47]:
target_vd = validation_data[target].to_numpy()
predictions_vd == +1 & target_vd == -1
    

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [59]:
len(predictions_vd[predictions_vd[(predictions_vd == +1) & (target_vd == -1)] == True])

  if __name__ == '__main__':


1653

In [61]:
from sklearn.metrics import confusion_matrix
confusion_matrix(target_vd, predictions_vd)

array([[3019, 1653],
       [1491, 3121]])

In [64]:
false_positives = 1653
false_negatives = 1491
print "Q5: False Positives:", 1653

print "Q6: Cost:", (10000 * false_negatives  + 20000 * false_positives)

Q5: False Positives: 1653
Q6: Cost: 47970000


## Finding Most Positive/Negative Loans

In [73]:
validation_data['probabilities'] = sf.SArray(probabilities_vd[:,1])
most_negatives = validation_data.sort('probabilities')
most_positives = validation_data.sort('probabilities', ascending=False)

In [74]:
most_positives.head()

safe_loans,sub_grade_num,short_emp,emp_length_num,dti,payment_inc_ratio,delinq_2yrs,delinq_2yrs_zero
-1,0.4,0,4,12.73,12.167,0,1
1,0.2,0,11,15.74,8.95065,0,1
1,0.6,0,11,0.52,2.59067,0,1
1,0.8,0,2,13.65,1.75065,0,1
1,0.8,0,5,9.24,4.7496,0,1
1,0.2,0,3,10.59,9.13075,0,1
-1,0.2,0,6,11.11,4.38276,0,1
1,0.4,0,4,13.75,2.03093,0,1
1,0.2,0,8,10.02,3.49357,0,1
1,0.8,0,3,8.05,1.94841,0,1

inq_last_6mths,last_delinq_none,last_major_derog_none,open_acc,pub_rec,pub_rec_zero,revol_util,total_rec_late_fee
1,1,1,6,0,1,30.5,18.2281
1,1,1,6,0,1,11.8,0.0
0,1,1,4,0,1,13.2,0.0
1,1,1,6,0,1,22.3,0.0
0,1,1,15,0,1,2.2,0.0
0,1,1,19,0,1,8.1,0.0
0,1,1,12,0,1,15.7,0.0
1,1,1,19,0,1,12.5,0.0
0,1,1,14,0,1,7.9,0.0
3,0,1,8,0,1,16.3,0.0

int_rate,total_rec_int,annual_inc,funded_amnt,funded_amnt_inv,installment,grade.A,grade.B,grade.C,grade.D
5.99,1202.3,36000,12000,12000,365.01,1,0,0,0
6.03,566.24,71000,17400,17400,529.58,1,0,0,0
6.99,1111.86,143004,10000,9625,308.73,1,0,0,0
7.29,21.55,74400,3500,3500,108.54,1,0,0,0
6.54,871.28,80000,10325,10300,316.64,1,0,0,0
6.03,1751.58,96000,24000,24000,730.46,1,0,0,0
6.03,118.96,100000,12000,12000,365.23,1,0,0,0
6.49,130.1,72428,4000,4000,122.58,1,0,0,0
6.03,161.9,115000,11000,11000,334.8,1,0,0,0
6.54,240.89,68000,3600,3600,110.41,1,0,0,0

grade.E,grade.F,grade.G,home_ownership.MORTGAGE,home_ownership.OTHER,home_ownership.OWN,home_ownership.RENT
0,0,0,1,0,0,0
0,0,0,0,0,0,1
0,0,0,1,0,0,0
0,0,0,0,0,0,1
0,0,0,1,0,0,0
0,0,0,0,0,0,1
0,0,0,1,0,0,0
0,0,0,0,0,0,1
0,0,0,1,0,0,0
0,0,0,1,0,0,0

purpose.car,purpose.credit_card,purpose.debt_consolidatio n ...,purpose.home_improvement,purpose.house,purpose.major_purchase
0,0,1,0,0,0
0,0,1,0,0,0
0,0,0,0,0,0
1,0,0,0,0,0
0,0,1,0,0,0
0,0,1,0,0,0
0,0,1,0,0,0
1,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,1

purpose.medical,...
0,...
0,...
0,...
0,...
0,...
0,...
0,...
0,...
0,...
0,...


## Evaluating the affect of number of trees

In [94]:
models = {}
for n_trees in [10, 50, 100, 200, 500]:
    models['model_' + str(n_trees)] = trainClassifier(en.GradientBoostingClassifier(max_depth=6, n_estimators=n_trees), data=train_data, target = target) 

In [95]:
for model_name, model in models.iteritems():
    predictions, probabilities, scores = predict(model, validation_data, target)
    print "Accuracy:", model_name, " is", scores 

Accuracy: model_50  is 0.68472641103
Accuracy: model_500  is 0.690004308488
Accuracy: model_10  is 0.665445928479
Accuracy: model_100  is 0.690219732874
Accuracy: model_200  is 0.686449806118


In [79]:
models

{'model_10': GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
               max_depth=6, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=10,
               presort='auto', random_state=None, subsample=1.0, verbose=0,
               warm_start=False),
 'model_100': GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
               max_depth=6, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=100,
               presort='auto', random_state=None, subsample=1.0, verbose=0,
               warm_start=False),
 'model_20': GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
               max_depth=6, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2,
               min_we