In [1]:
### All packages and utility modules

In [81]:
import pandas as pd
import numpy as np
import os
import json
from dtree_utils import categorical_2_binary, visualize_decision_tree
from sklearn.tree import DecisionTreeClassifier

In [4]:
### Load the LendingClub dataset and report its size.
### low_memory=False makes pandas infer each column's dtype from the whole file
### rather than chunk by chunk, which avoids the mixed-type DtypeWarning that a
### plain read_csv can emit on wide files like this one.
loans = pd.read_csv("lending-club-data.csv", low_memory=False)
print("No of observations in loans dataset %d" %(loans.shape[0]))
print("No of columns in loans dataset %d" %(loans.shape[1]))

  interactivity=interactivity, compiler=compiler, result=result)


No of observations in loans dataset 122607
No of columns in loans dataset 68


In [5]:
### Exploring some features

In [6]:
### Keep the column names of the raw frame as a plain Python list.
loans_cols = list(loans.columns)

In [7]:
### The target column (label column) of the dataset that we are interested in is called bad_loans.
### In this column, 1 means a risky (bad) loan and 0 means a safe loan.

In [8]:
### In order to make this more intuitive and consistent with the lectures, we reassign the target to be:
### +1 as a safe loan
### -1 as a risky (bad) loan

In [9]:
### Store the relabelled target in a new column called safe_loans:
### bad_loans == 0 becomes +1 (safe), every other value becomes -1 (risky).
loans["safe_loans"] = (loans["bad_loans"] == 0).map({True: 1, False: -1})

In [10]:
### Store text files with the labels before and after the relabelling.
### The snippet is wrapped in a string literal, so nothing below is executed;
### the cell only echoes the string. If enabled, it would write the safe_loans
### values to loan_label_modified.txt and the bad_loans values to
### loan_label_original.txt, each as one comma-separated line.
'''
z = loans["safe_loans"].values.tolist()
z = [str(i) for i in z]
z = str(",".join(z))
with open("loan_label_modified.txt","w") as file:
    file.write(z)
k = loans["bad_loans"].values.tolist()
k = [str(i) for i in k]
k = str(",".join(k))
with open("loan_label_original.txt","w") as file:
    file.write(k)
'''

'\nz = loans["safe_loans"].values.tolist()\nz = [str(i) for i in z]\nz = str(",".join(z))\nwith open("loan_label_modified.txt","w") as file:\n    file.write(z)\nk = loans["bad_loans"].values.tolist()\nk = [str(i) for i in k]\nk = str(",".join(k))\nwith open("loan_label_original.txt","w") as file:\n    file.write(k)\n'

In [11]:
### drop the old label from loans dataframe
### loans = loans.drop(["bad_loans"],axis = 1)

In [12]:
### Labels of all loans marked safe (+1); only the length of this list is used afterwards.
safe = loans.loc[loans["safe_loans"] == 1, "safe_loans"].tolist()

In [13]:
### Labels of all loans marked risky (-1); only the length of this list is used afterwards.
unsafe = loans.loc[loans["safe_loans"] == -1, "safe_loans"].tolist()

In [14]:
### Class balance of the relabelled target, as a percentage of all loans.
print ("percentage of safe loan %f" %((len(safe) / loans.shape[0])*100))
### This line reports the risky (-1) share, so it must not be labelled "safe".
print ("percentage of risky loan %f" %((len(unsafe) / loans.shape[0])*100))

percentage of safe loan 81.118533
percentage of safe loan 18.881467


In [15]:
### we will be using a subset of features (categorical and numeric).
### The features we will be using are described in the code comments below.
### If you are a finance geek, the LendingClub website has a lot more details about these features.
### Extract these feature columns and target column from the dataset. We will only use these features.

In [16]:
### Columns used as model inputs (a mix of categorical and numeric features).
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquency
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee']        # total late fees received to date

### Kept as a one-element list so it can be concatenated with `features`.
target = ['safe_loans']                  # prediction target (y) (+1 means safe, -1 is risky)

In [17]:
### Restrict the frame to the chosen feature columns followed by the target column.
feat_loans = loans.loc[:, features + target]

In [19]:
### Lists of indices for the training and validation sets, read from JSON files.
def _load_index_list(path):
    """Return the parsed JSON content of the file at `path`."""
    with open(path,'r') as index_file:
        return json.load(index_file)

train_index = _load_index_list("module-5-assignment-1-train-idx.json")
validation_index = _load_index_list("module-5-assignment-1-validation-idx.json")

In [20]:
### Apply one-hot encoding to loans


In [21]:
### Column dtypes and non-null counts of the selected features (before encoding).
feat_loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122607 entries, 0 to 122606
Data columns (total 13 columns):
grade                    122607 non-null object
sub_grade                122607 non-null object
short_emp                122607 non-null int64
emp_length_num           122607 non-null int64
home_ownership           122607 non-null object
dti                      122607 non-null float64
purpose                  122607 non-null object
term                     122607 non-null object
last_delinq_none         122607 non-null int64
last_major_derog_none    122607 non-null int64
revol_util               122607 non-null float64
total_rec_late_fee       122607 non-null float64
safe_loans               122607 non-null int64
dtypes: float64(3), int64(5), object(5)
memory usage: 12.2+ MB


In [22]:
### Columns with object dtype are the candidate categorical features of the dataset.
cat_feat_loans = (
    feat_loans
    .select_dtypes(include=["object"])
    .copy()
)

In [23]:
### Preview the first rows of the object-dtype (categorical) columns.
cat_feat_loans.head()

Unnamed: 0,grade,sub_grade,home_ownership,purpose,term
0,B,B2,RENT,credit_card,36 months
1,C,C4,RENT,car,60 months
2,C,C5,RENT,small_business,36 months
3,C,C1,RENT,other,36 months
4,A,A4,RENT,wedding,36 months


In [24]:
### Encode the categorical columns with the project helper from dtree_utils.
### The preview of the result shows indicator columns such as grade_A or
### purpose_wedding in place of the original object columns.
### NOTE(review): the meaning of cat_feature=None and flag=False is defined in
### dtree_utils - confirm there.
### This cell rebinds feat_loans, so re-running it feeds the already-encoded
### frame back into the helper; re-run the cell that builds feat_loans first.
feat_loans = categorical_2_binary(feat_loans, cat_feature = None, flag = False)

In [25]:
### Preview the encoded frame returned by categorical_2_binary.
feat_loans.head()

Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans,grade_A,grade_B,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
0,0,11,27.65,1,1,83.7,0.0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
1,1,1,1.0,1,1,9.4,0.0,-1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,11,8.72,1,1,98.5,0.0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0,11,20.0,0,1,21.0,16.97,1,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0,4,11.2,1,1,28.3,0.0,1,1,0,...,0,0,0,0,0,0,0,1,1,0


In [35]:
### Split the encoded frame by the pre-computed positional row indices.
validation_data = feat_loans.iloc[validation_index]
train_data = feat_loans.iloc[train_index]
### Training rows without the target column (passed to the tree visualisation).
train_data_X = train_data.drop(columns=["safe_loans"])

In [27]:
### Count of safe (+1) and risky (-1) loans in the training split.
train_data["safe_loans"].value_counts()

 1    18748
-1    18476
Name: safe_loans, dtype: int64

In [28]:
### Convert the training split into NumPy arrays for scikit-learn.
### .values is used instead of the deprecated as_matrix(), which emits a
### FutureWarning and is no longer available in newer pandas releases.
train_Y = train_data["safe_loans"].values
train_X = train_data.drop("safe_loans", axis = 1).values
print(train_X.shape)
print(train_Y.shape)

(37224, 67)
(37224,)


  """Entry point for launching an IPython kernel.
  


In [29]:
### Build a decision tree classifier

In [30]:
### Decision tree limited to depth 6.
### random_state is pinned because scikit-learn permutes the candidate features
### at each split, so an unseeded fit is not guaranteed to reproduce exactly
### after a kernel restart.
decision_tree_model = DecisionTreeClassifier(max_depth = 6, random_state = 0)
decision_tree_model = decision_tree_model.fit(train_X,train_Y)

In [31]:
### Also train a tree with max_depth=2. Call this model small_model.

In [56]:
### Shallow tree limited to depth 2.
### random_state is pinned because scikit-learn permutes the candidate features
### at each split, so an unseeded fit is not guaranteed to reproduce exactly
### after a kernel restart.
small_model = DecisionTreeClassifier(max_depth = 2, random_state = 0)
small_model = small_model.fit(train_X,train_Y)

In [65]:
### Record the scikit-learn version this notebook was run with.
import sklearn
version_message = 'The scikit-learn version is {}.'.format(sklearn.__version__)
print(version_message)

The scikit-learn version is 0.19.2.


In [73]:
### decision tree visualization

In [85]:
### Render the fitted depth-6 tree with the project helper from dtree_utils.
### NOTE(review): presumably train_data_X is passed to supply the feature
### names - confirm in dtree_utils.
visualize_decision_tree(decision_tree_model,train_data_X)

In [86]:
### Separate the validation rows by their true label: +1 (safe) and -1 (risky).
validation_data_safe = validation_data.loc[validation_data["safe_loans"] == 1]
validation_data_risky = validation_data.loc[validation_data["safe_loans"] == -1]

In [88]:
### Take the first two safe and the first two risky validation loans and stack
### them (safe rows first) into one small sample frame.
sample_validation_data_safe = validation_data_safe.iloc[0:2]
sample_validation_data_risky = validation_data_risky.iloc[0:2]
### pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
sample_validation_data = pd.concat([sample_validation_data_safe, sample_validation_data_risky])

In [95]:
### Show the four sampled validation loans (two safe rows followed by two risky rows).
sample_validation_data

Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans,grade_A,grade_B,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
19,0,11,11.18,1,1,82.4,0.0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
79,0,10,16.85,1,1,96.4,0.0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
24,0,3,13.97,0,1,59.5,0.0,-1,0,0,...,0,0,0,0,1,0,0,0,0,1
41,0,11,16.33,1,1,62.1,0.0,-1,1,0,...,0,0,0,0,0,0,0,0,1,0


In [102]:
### Print the true labels of the sampled loans, then separate features from target.
sample_validation_data_Y = sample_validation_data["safe_loans"]
print(sample_validation_data_Y)
sample_validation_data_X = sample_validation_data.drop(columns=["safe_loans"])

19    1
79    1
24   -1
41   -1
Name: safe_loans, dtype: int64


In [91]:
### Predicted labels (+1 safe / -1 risky) of the depth-6 tree for the four sampled loans.
decision_tree_model.predict(sample_validation_data_X)

array([ 1, -1, -1,  1], dtype=int64)

In [93]:
### Quiz Question: What percentage of the predictions on sample_validation_data did decision_tree_model get correct?
### ans: 50

In [94]:
### Class probabilities of the depth-6 tree for the four sampled loans.
### Columns follow the order of the model's classes_ attribute; with the labels
### used here that is -1 (risky) first and +1 (safe) second.
decision_tree_model.predict_proba(sample_validation_data_X)

array([[0.34156543, 0.65843457],
       [0.53630646, 0.46369354],
       [0.64750958, 0.35249042],
       [0.20789474, 0.79210526]])

In [96]:
### Quiz Question: Which loan has the highest probability of being classified as a safe loan?
### ans. 41

In [97]:
### Checkpoint: Can you verify that for all the predictions with probability >= 0.5, the model predicted the label +1?
### True

In [98]:
### 14. Now, we will explore something pretty interesting. For each row in the sample_validation_data,
### what is the probability (according to small_model) of a loan being classified as safe?

In [99]:
### Class probabilities of the depth-2 tree for the four sampled loans.
### Columns follow the order of the model's classes_ attribute; with the labels
### used here that is -1 (risky) first and +1 (safe) second.
small_model.predict_proba(sample_validation_data_X)

array([[0.41896585, 0.58103415],
       [0.59255339, 0.40744661],
       [0.59255339, 0.40744661],
       [0.23120112, 0.76879888]])

In [None]:
### Quiz Question: Notice that the probability predictions are exactly the same for the 2nd and 3rd loans. Why would this happen?

In [100]:
### Quiz Question: Based on the visualized tree, what prediction would you make for this data point (according to small_model)?
### (If you don't have Graphviz, you can answer this quiz question by executing the next part.)

In [101]:
### 15. Now, verify your prediction by examining the prediction made using small_model.
### ans .50

In [None]:
### accuracy in sample validation data

In [103]:
### Mean accuracy of small_model on the four sampled validation loans.
small_model.score(sample_validation_data_X, sample_validation_data_Y)

0.5

In [104]:
### Mean accuracy of decision_tree_model on the four sampled validation loans.
decision_tree_model.score(sample_validation_data_X, sample_validation_data_Y)

0.5

In [None]:
### accuracy in training data

In [105]:
### Mean accuracy of small_model on the training data.
small_model.score(train_X,train_Y)

0.6135020416935311

In [106]:
### Mean accuracy of decision_tree_model on the training data.
decision_tree_model.score(train_X,train_Y)

0.6405276165914464

In [107]:
### 17. Now, evaluate the accuracy of the small_model and decision_tree_model on the entire validation_data,
### not just the subsample considered above.

In [120]:
### Convert the full validation split into NumPy arrays for scoring.
### .values is used instead of the deprecated as_matrix(), which emits a
### FutureWarning and is no longer available in newer pandas releases.
validation_data_Y = validation_data["safe_loans"].values
validation_data_X = validation_data.drop("safe_loans", axis = 1).values

  """Entry point for launching an IPython kernel.
  


In [121]:
### Mean accuracy of decision_tree_model on the entire validation set.
decision_tree_model.score(validation_data_X, validation_data_Y)

0.6361482119775959

In [122]:
### Mean accuracy of small_model on the entire validation set.
small_model.score(validation_data_X, validation_data_Y)

0.6193451098664369

In [123]:
### Quiz Question: What is the accuracy of decision_tree_model on the validation set, rounded to the nearest .01?
### ans: .64

In [124]:
### Evaluating accuracy of a complex decision tree model
### Here, we will train a large decision tree with max_depth=10. This will allow the learned tree to become very deep,
### and result in a very complex model. Recall that in lecture, we prefer simpler models with similar predictive power.
### This will be an example of a more complicated model which has similar predictive power, i.e. something we don't want.

In [125]:
### Deliberately complex tree with depth up to 10.
### random_state is pinned because scikit-learn permutes the candidate features
### at each split, so an unseeded fit is not guaranteed to reproduce exactly
### after a kernel restart.
big_model = DecisionTreeClassifier(max_depth = 10, random_state = 0)
big_model = big_model.fit(train_X,train_Y)

In [126]:
### Render the fitted depth-10 tree with the project helper from dtree_utils.
### NOTE(review): presumably train_data_X is passed to supply the feature
### names - confirm in dtree_utils.
visualize_decision_tree(big_model,train_data_X)

In [127]:
### 19. Evaluate the accuracy of big_model on the training set and validation set

In [128]:
### Mean accuracy of big_model on the training data.
big_model.score(train_X,train_Y)

0.6637921770900495

In [129]:
### Mean accuracy of big_model on the entire validation set.
big_model.score(validation_data_X, validation_data_Y)

0.6263464024127531

In [130]:
### Quiz Question: How does the performance of big_model on the validation set compare to decision_tree_model on the validation set?
### Is this a sign of overfitting?

In [131]:
### Predicted labels of decision_tree_model for every row of the validation set;
### used in the following cells to count false positives and false negatives.
prediction = decision_tree_model.predict(validation_data_X)

In [141]:
### False positives: loans predicted safe (+1) whose true label is risky (-1).
false_positives = ((prediction == 1) & (validation_data_Y == -1)).sum()
false_positives

1661

In [152]:
### False negatives: loans predicted risky (-1) whose true label is safe (+1).
false_negatives = ((prediction == -1) & (validation_data_Y == 1)).sum()
false_negatives

1717

In [153]:
### Number of validation loans whose predicted label matches the true label.
correct_prediction = np.equal(prediction, validation_data_Y).sum()
correct_prediction

5906

In [155]:
### Total cost of the validation mistakes, pricing each false negative at
### 10000 and each false positive at 20000.
total_mistake_cost = 10000 * false_negatives + 20000 * false_positives
print(total_mistake_cost)

50390000
