In [1]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import tree
from sklearn import metrics

## Read Data

In [2]:
# Load the loan records from a CSV file resolved relative to the notebook's working directory.
data = pd.read_csv("loan_sub.csv", sep=',')

  data = pd.read_csv("loan_sub.csv", sep=',')


## Print Useful Features

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122607 entries, 0 to 122606
Data columns (total 68 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           122607 non-null  int64  
 1   member_id                    122607 non-null  int64  
 2   loan_amnt                    122607 non-null  int64  
 3   funded_amnt                  122607 non-null  int64  
 4   funded_amnt_inv              122607 non-null  int64  
 5   term                         122607 non-null  object 
 6   int_rate                     122607 non-null  float64
 7   installment                  122607 non-null  float64
 8   grade                        122607 non-null  object 
 9   sub_grade                    122607 non-null  object 
 10  emp_title                    115767 non-null  object 
 11  emp_length                   118516 non-null  object 
 12  home_ownership               122607 non-null  object 
 13 

In [4]:
data.head(10)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,0.4,1.0,1.0,1.0,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,0.8,1.0,1.0,1.0,1,2.3932,20161201T000000,1,1,1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1.0,1.0,1.0,0,8.25955,20141201T000000,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,0.2,1.0,1.0,1.0,0,8.27585,20141201T000000,0,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.9,156.46,A,A4,...,0.8,1.0,1.0,1.0,0,5.21533,20141201T000000,1,1,1
5,1072053,1288686,3000,3000,3000,36 months,18.64,109.43,E,E1,...,0.2,1.0,1.0,1.0,0,2.73575,20141201T000000,1,1,1
6,1071795,1306957,5600,5600,5600,60 months,21.28,152.39,F,F2,...,0.4,1.0,1.0,1.0,0,4.5717,20161201T000000,1,1,1
7,1071570,1306721,5375,5375,5350,60 months,12.69,121.45,B,B5,...,1.0,1.0,1.0,1.0,1,9.716,20161201T000000,1,1,1
8,1070078,1305201,6500,6500,6500,60 months,14.65,153.45,C,C3,...,0.6,1.0,1.0,1.0,0,2.5575,20161201T000000,1,1,1
9,1069908,1305008,12000,12000,12000,36 months,12.69,402.54,B,B5,...,1.0,1.0,1.0,1.0,0,6.44064,20141201T000000,1,1,1


## Target

In [5]:
# Derive the label: bad_loans == 0 becomes +1 (safe), bad_loans == 1 becomes -1 (risky).
# NOTE(review): this cell is not idempotent -- once 'bad_loans' has been dropped,
# re-running it raises a KeyError; reload the data first.
data['safe_loans'] = 1 - data['bad_loans'] * 2
data = data.drop('bad_loans', axis=1)

## Print safe loan rate

In [6]:
data['safe_loans'].value_counts(normalize=True)
# The classes are imbalanced: most loans are safe loans.

 1    0.811185
-1    0.188815
Name: safe_loans, dtype: float64

## Select features to use for prediction

In [7]:
# Categorical columns used as model inputs, and the name of the label column.
cols = ['grade', 'term', 'home_ownership', 'emp_length']
target = 'safe_loans'

# Keep only the selected features plus the label.
# Note that `data` is overwritten here with the narrowed frame.
data = data[cols + [target]]
data.head()

Unnamed: 0,grade,term,home_ownership,emp_length,safe_loans
0,B,36 months,RENT,10+ years,1
1,C,60 months,RENT,< 1 year,-1
2,C,36 months,RENT,10+ years,1
3,C,36 months,RENT,10+ years,1
4,A,36 months,RENT,3 years,1


## Downsampling to create balanced data

In [9]:
data[target].value_counts()

 1    99457
-1    23150
Name: safe_loans, dtype: int64

In [15]:
# Undersample the safe loans so that both classes end up with the same number of rows.
risky_loans = data[data[target] == -1]
all_safe_loans = data[data[target] == 1]

# Share of the safe loans that is kept. It is computed from row counts rather than
# from sums of the labels, so it does not depend on the +1/-1 encoding and avoids
# iterating over the Series in pure Python. Reported for information only.
percentage = len(risky_loans) / len(all_safe_loans)
print(f"{percentage=}")

# Sample by exact count instead of by fraction: the balance is then guaranteed by
# construction rather than by rounding `frac * len(...)` back to an integer.
safe_loans = all_safe_loans.sample(n=len(risky_loans), random_state=33)
data_set = pd.concat([risky_loans, safe_loans], axis = 0)
data_set.head()

percentage=0.2327639080205516


Unnamed: 0,grade,term,home_ownership,emp_length,safe_loans
1,C,60 months,RENT,< 1 year,-1
6,F,60 months,OWN,4 years,-1
7,B,60 months,RENT,< 1 year,-1
10,C,36 months,RENT,< 1 year,-1
12,B,36 months,RENT,3 years,-1


In [17]:
data_set[target].value_counts()

-1    23150
 1    23150
Name: safe_loans, dtype: int64

## Preprocessing

In [18]:
def label_encode(data, columns=['pclass','name_title','embarked', 'sex']):
    """
    One-hot encode the given columns and return the result as a new DataFrame.

    Every listed column is converted to strings (so a missing value becomes the
    category 'nan'), replaced by one indicator column per distinct value named
    '<column>_<value>', and removed. Indicator columns are appended after the
    remaining columns, in order of first appearance of each value.

    The input DataFrame is never modified.

    Input: (Pandas DataFrame) data
           (List of String) columns  names of the columns to encode
           NOTE(review): the default names do not match the loan columns used
           in this notebook; callers are expected to pass `columns` explicitly.

    Output: (Pandas DataFrame) the encoded data
    """
    encoded = data
    for col in columns:
        # Work on a detached string version of the column instead of assigning
        # it back into the frame, which would mutate the caller's object.
        as_text = encoded[col].map(str)
        new_cols = [col + '_' + value for value in as_text.unique()]
        # get_dummies sorts its columns; re-index to first-appearance order.
        dummies = pd.get_dummies(as_text, prefix=col)[new_cols]
        encoded = pd.concat([encoded.drop(columns=col), dummies], axis=1)
    return encoded

In [19]:
# One-hot encode the categorical feature columns; the target column is not listed and is left as is.
cols = ['grade', 'term','home_ownership', 'emp_length']
data_set = label_encode(data_set, columns=cols)
data_set.head()

Unnamed: 0,safe_loans,grade_C,grade_F,grade_B,grade_D,grade_A,grade_E,grade_G,term_ 60 months,term_ 36 months,...,emp_length_3 years,emp_length_10+ years,emp_length_1 year,emp_length_9 years,emp_length_2 years,emp_length_8 years,emp_length_7 years,emp_length_5 years,emp_length_nan,emp_length_6 years
1,-1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,-1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,-1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10,-1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
12,-1,0,0,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0


## Split train and test dataset

In [20]:
train_data, test_data = train_test_split(data_set, test_size=0.2, random_state=33)
# Select the features by dropping the target by name instead of slicing columns[1:],
# which silently breaks (and leaks the label) if the target is not the first column.
trainX, trainY = train_data.drop(columns=[target]), pd.DataFrame(train_data[target])
testX, testY = test_data.drop(columns=[target]), pd.DataFrame(test_data[target])

## Build your own decision tree

#### Tasks
- implement best_split()
- implement best_split_entropy()
- implement TreeNode class
- implement MyDecisionTree class

In [21]:
def count_errors(labels_in_node):
    """
    Count the mistakes made by majority-class voting on a node.

    Input: (Numpy Array/Pandas series/list) labels in node, eg: [-1,-1,1,-1,1]
    Output: (Int) the size of the minority class, i.e. how many labels a
            majority vote gets wrong; 0 for an empty node
    """
    # np.asarray makes the vectorised comparisons below work for Series,
    # arrays and plain lists alike.
    labels = np.asarray(labels_in_node)
    if labels.size == 0:
        return 0

    positive_ones = int((labels == 1).sum())
    negative_ones = int((labels == -1).sum())

    return min(positive_ones, negative_ones)

def best_split(data, features, target):
    """
    Select the feature whose 0/1 split gives the lowest classification error.

    For every candidate feature the rows are divided into feature == 0 and
    feature == 1, each side is labelled by majority vote, and the resulting
    misclassification rate over all rows of `data` is computed.

    Input: (Pandas DataFrame)data
           (List of String) features  candidates we can choose feature from
           (String) target  the target name we shoot for. eg: 'safe_loan'

    Output: (String) the best feature (the first one on ties), or None when
            there are no candidate features or no data
    """

    best_feature = None
    best_error = 2.0  # larger than any achievable error rate
    num_data_points = float(len(data))

    # Nothing to evaluate on; also avoids a division by zero below.
    if num_data_points == 0:
        return None

    for feature in features:
        left_split = data[data[feature] == 0]
        right_split = data[data[feature] == 1]
        left_misses = count_errors(left_split[target])
        right_misses = count_errors(right_split[target])
        error = (left_misses + right_misses) / num_data_points

        if error < best_error:
            best_error = error
            best_feature = feature
    return best_feature

In [22]:
def entropy(labels_in_node):
    """
    Binary entropy (in bits) of the labels in a node.

    Input: labels_in_node  a Numpy array / Pandas series of labels; entries
           equal to 1 form one class and every other value the other class,
           eg [1,-1,1,1] or [0,0,1,0]
    Output: the entropy of the array; 0 for a pure (or empty) node
    """

    total = len(labels_in_node)
    ones = (labels_in_node == 1).sum()

    # A node holding a single class carries no uncertainty; returning early
    # also keeps log2(0) out of the formula below.
    is_pure = ones == 0 or ones == total
    if is_pure:
        return 0

    frac_one = float(ones) / total
    frac_other = 1 - frac_one
    return -frac_other * np.log2(frac_other) - frac_one * np.log2(frac_one)

def best_split_entropy(data, features, target):
    """
    Pick the feature whose 0/1 split yields the largest information gain.

    The gain of a feature is the entropy of the target over all rows minus the
    size-weighted entropy of the two groups feature == 0 and feature == 1.

    Input: (Pandas DataFrame)data
           (List of String) features  candidates we can choose feature from
           (String) target  the target name we shoot for. eg: 'safe_loan'

    Output: (String) the best feature; the earliest candidate wins ties and
            None is returned when `features` is empty
    """

    total_rows = float(len(data))
    parent_entropy = entropy(data[target])

    chosen_feature = None
    top_gain = float('-inf')

    for candidate in features:
        zeros = data[data[candidate] == 0]
        ones = data[data[candidate] == 1]

        # Entropy after the split, each side weighted by its share of the rows.
        weighted_entropy = (
            len(zeros) / total_rows * entropy(zeros[target])
            + len(ones) / total_rows * entropy(ones[target])
        )
        gain = parent_entropy - weighted_entropy

        # Strict comparison: a later candidate must beat, not match, the best.
        if gain > top_gain:
            chosen_feature, top_gain = candidate, gain

    return chosen_feature

In [23]:
class TreeNode:
    """
    A single node of a binary decision tree.

    is_leaf        True when the node carries a prediction instead of a split
    prediction     the value stored for a leaf
    split_feature  the feature an internal node splits on
    left / right   child nodes; both start as None and are attached by the
                   code that builds the tree
    """

    def __init__(self, is_leaf, prediction, split_feature):
        # Children are wired up after construction.
        self.left = self.right = None
        self.split_feature = split_feature
        self.prediction = prediction
        self.is_leaf = is_leaf

In [24]:
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
class MyDecisionTree(BaseEstimator):
    """
    Binary decision tree grown greedily on 0/1 indicator features.

    Every internal node tests one feature: while training, rows where the
    feature equals 0 go to the left child and rows where it equals 1 go to the
    right child. Leaves predict +1 or -1 by majority vote. The split feature is
    chosen with best_split_entropy() (largest information gain).

    Parameters:
        max_depth: (Int) nodes at this depth are turned into leaves
        min_error: (Float) a node is turned into a leaf as soon as
                   count_errors() on its labels is <= this value
    """

    def __init__(self, max_depth, min_error):
        self.max_depth = max_depth
        self.min_error = min_error

    def fit(self, X, Y, data_weights = None):
        """
        Grow the tree from the training data and store it in self.root_node.

        Input:  (Pandas DataFrame) X  0/1 indicator features, one column each
                (Pandas DataFrame) Y  labels; only the first column is used
                data_weights  accepted but currently ignored
        Output: self, following the scikit-learn convention so that calls can
                be chained (eg: MyDecisionTree(...).fit(X, Y).score(X, Y))
        """
        data_set = pd.concat([X, Y], axis=1)
        features = X.columns
        target = Y.columns[0]
        self.root_node = self.create_tree(data_set, features,
                               target, current_depth = 0, max_depth = self.max_depth, min_error=self.min_error)
        return self

    def predict(self, X):
        """
        Input:  (Pandas DataFrame, size: m * n) a matrix and each row indicates a data point
        Output: (Pandas Series, size: m) the predicted result for every row

        Tips: each row is predicted by the function predict_single_data()
        """
        prediction = X.apply(lambda row: self.predict_single_data(self.root_node, row), axis=1)
        return prediction

    def score(self, testX, testY):
        """
        Input:  (Pandas DataFrame) testX  features
                (Pandas DataFrame) testY  true labels; only the first column is used
        Output: (Float) accuracy of the predictions on testX
        """
        target = testY.columns[0]
        result = self.predict(testX)
        return accuracy_score(testY[target], result)

    def create_tree(self, data, features, target, current_depth = 0, max_depth = 10, min_error=0):
        """
        Recursively build the (sub)tree for `data`.

        Input
            data: (pandas data frame) the input data, features and target together
            features: (pandas Index/list/numpy array of String) names of the features still available
            target: (String) name of the target column
            current_depth: (Int)  current depth of the tree
            max_depth: (Int)  the maximum depth of the tree
            min_error: (Float) stop splitting once count_errors() on the node is <= this value

        Output:
            (TreeNode)  root of the (sub)tree
        """

        # Normalise to a pandas Index: an independent sequence that supports
        # .drop() below regardless of the sequence type the caller passed.
        remaining_features = pd.Index(features)
        target_values = data[target]

        # Three different stopping conditions are explored:
        #   termination 1, the number of majority-vote errors is at most min_error
        #   termination 2, all features have been used
        #   termination 3, the depth limit max_depth has been reached

        # termination 1
        if count_errors(target_values) <= min_error:
            print("Termination 1 reached.")
            return self.create_leaf(target_values)

        # termination 2
        if len(remaining_features) == 0:
            print("Termination 2 reached.")
            return self.create_leaf(target_values)

        # termination 3
        if current_depth >= max_depth:
            print("Termination 3 reached.")
            return self.create_leaf(target_values)

        split_feature = best_split_entropy(data, remaining_features, target)
        left_split = data[data[split_feature] == 0]
        right_split = data[data[split_feature] == 1]
        # A feature is used at most once along any root-to-leaf path.
        remaining_features = remaining_features.drop(split_feature)
        print("Split on feature %s. (%s, %s)" % (split_feature, str(len(left_split)), str(len(right_split))))

        # All rows fell on one side, so the split separates nothing: stop here
        # and make this node a leaf.
        if len(left_split) == len(data):
            print("Perfect split!")
            return self.create_leaf(left_split[target])
        if len(right_split) == len(data):
            print("Perfect split!")
            return self.create_leaf(right_split[target])

        left_tree = self.create_tree(left_split, remaining_features, target, current_depth + 1, max_depth, min_error)
        right_tree = self.create_tree(right_split, remaining_features, target, current_depth + 1, max_depth, min_error)

        result_node = TreeNode(False, None, split_feature)
        result_node.left = left_tree
        result_node.right = right_tree

        return result_node

    def create_leaf(self, target_values):
        """
        Input: (Pandas Series/Numpy Array)  target_values  eg: [-1,1,-1,-1,1]
        Output: (TreeNode) a leaf predicting the majority label; ties (and
                empty input) predict -1
        """
        leaf = TreeNode(True, None, None)

        num_positive_ones = len(target_values[target_values == +1])
        num_negative_ones = len(target_values[target_values == -1])

        if num_positive_ones > num_negative_ones:
            leaf.prediction = 1
        else:
            leaf.prediction = -1
        return leaf

    def predict_single_data(self, tree, x, annotate = False):
        """
        Walk the tree from `tree` down to a leaf for one data point.

        Input:  (TreeNode)  tree
                (Pandas Series) x  one row from a pandas dataframe (one data point),
                                   indexable by feature name
                (Bool)  annotate  if intermediate result is displayed
        Output:  (Int)  -1 or 1 in our case
        """
        if tree.is_leaf:
            if annotate:
                print("leaf node, predicting %s" % tree.prediction)
            return tree.prediction
        else:
            split_feature_value = x[tree.split_feature]

            if annotate:
                print("Split on %s = %s" % (tree.split_feature, split_feature_value))
            # 0 goes left; any other value follows the right branch.
            if split_feature_value == 0:
                return self.predict_single_data(tree.left, x, annotate)
            else:
                return self.predict_single_data(tree.right, x, annotate)

    def count_leaves(self):
        """
        Output: (Int) number of leaves of the fitted tree
        """
        return self.count_leaves_helper(self.root_node)

    def count_leaves_helper(self, tree):
        """
        Input:  (TreeNode) tree
        Output: (Int) number of leaves in the subtree rooted at `tree`
        """
        if tree.is_leaf:
            return 1
        return self.count_leaves_helper(tree.left) + self.count_leaves_helper(tree.right)

In [27]:
m = MyDecisionTree(max_depth = 10, min_error = 1e-15)

In [28]:
m.fit(trainX, trainY)

Split on feature grade_A. (31776, 5264)
Split on feature grade_B. (21587, 10189)
Split on feature grade_C. (12308, 9279)
Split on feature grade_D. (5553, 6755)
Split on feature term_ 60 months. (1743, 3810)
Split on feature grade_E. (459, 1284)
Split on feature emp_length_10+ years. (358, 101)
Split on feature emp_length_6 years. (328, 30)
Split on feature home_ownership_OTHER. (325, 3)
Split on feature emp_length_4 years. (297, 28)
Termination 3 reached.
Termination 3 reached.
Termination 1 reached.
Split on feature home_ownership_MORTGAGE. (23, 7)
Split on feature home_ownership_OWN. (21, 2)
Termination 3 reached.
Termination 3 reached.
Termination 1 reached.
Split on feature grade_F. (25, 76)
Split on feature home_ownership_OWN. (21, 4)
Split on feature home_ownership_RENT. (13, 8)
Termination 3 reached.
Termination 3 reached.
Split on feature grade_G. (0, 4)
Perfect split!
Split on feature home_ownership_RENT. (47, 29)
Split on feature home_ownership_OWN. (39, 8)
Termination 3 reac

Split on feature home_ownership_MORTGAGE. (4700, 4039)
Split on feature emp_length_2 years. (4091, 609)
Split on feature emp_length_6 years. (3795, 296)
Split on feature emp_length_4 years. (3405, 390)
Split on feature home_ownership_OTHER. (3392, 13)
Split on feature emp_length_8 years. (3207, 185)
Termination 3 reached.
Termination 3 reached.
Split on feature emp_length_10+ years. (7, 6)
Termination 3 reached.
Termination 1 reached.
Split on feature home_ownership_RENT. (42, 348)
Split on feature grade_C. (42, 0)
Perfect split!
Split on feature grade_C. (348, 0)
Perfect split!
Split on feature home_ownership_OTHER. (294, 2)
Split on feature home_ownership_RENT. (44, 250)
Split on feature grade_C. (44, 0)
Perfect split!
Split on feature grade_C. (250, 0)
Perfect split!
Termination 1 reached.
Split on feature home_ownership_OWN. (547, 62)
Split on feature home_ownership_RENT. (1, 546)
Termination 1 reached.
Split on feature grade_C. (546, 0)
Perfect split!
Split on feature grade_C. (62

In [29]:
m.score(testX, testY)

0.6193304535637149

## Discussion on decision tree complexity

In [30]:
m.count_leaves()

185

In [31]:
# Impact of max depth: three trees with depth limits 3, 7 and 15 and the same min_error.
model_1 = MyDecisionTree(max_depth = 3, min_error = 1e-15)
model_2 = MyDecisionTree(max_depth = 7, min_error = 1e-15)
model_3 = MyDecisionTree(max_depth = 15, min_error = 1e-15)

In [32]:
model_1.fit(trainX, trainY)
model_2.fit(trainX, trainY)
model_3.fit(trainX, trainY)

Split on feature grade_A. (31776, 5264)
Split on feature grade_B. (21587, 10189)
Split on feature grade_C. (12308, 9279)
Termination 3 reached.
Termination 3 reached.
Split on feature term_ 60 months. (9134, 1055)
Termination 3 reached.
Termination 3 reached.
Split on feature emp_length_nan. (5037, 227)
Split on feature home_ownership_RENT. (3142, 1895)
Termination 3 reached.
Termination 3 reached.
Split on feature term_ 60 months. (220, 7)
Termination 3 reached.
Termination 3 reached.
Split on feature grade_A. (31776, 5264)
Split on feature grade_B. (21587, 10189)
Split on feature grade_C. (12308, 9279)
Split on feature grade_D. (5553, 6755)
Split on feature term_ 60 months. (1743, 3810)
Split on feature grade_E. (459, 1284)
Split on feature emp_length_10+ years. (358, 101)
Termination 3 reached.
Termination 3 reached.
Split on feature emp_length_nan. (1223, 61)
Termination 3 reached.
Termination 3 reached.
Split on feature home_ownership_MORTGAGE. (1919, 1891)
Split on feature emp_le

Split on feature home_ownership_OWN. (21, 4)
Split on feature home_ownership_RENT. (13, 8)
Split on feature grade_G. (0, 13)
Perfect split!
Split on feature grade_G. (0, 8)
Perfect split!
Split on feature grade_G. (0, 4)
Perfect split!
Split on feature home_ownership_RENT. (47, 29)
Split on feature home_ownership_OWN. (39, 8)
Split on feature grade_G. (39, 0)
Perfect split!
Split on feature grade_G. (8, 0)
Perfect split!
Split on feature grade_G. (29, 0)
Perfect split!
Split on feature emp_length_nan. (1223, 61)
Split on feature emp_length_1 year. (1117, 106)
Split on feature home_ownership_OTHER. (1113, 4)
Split on feature emp_length_2 years. (963, 150)
Split on feature home_ownership_MORTGAGE. (630, 333)
Split on feature emp_length_10+ years. (475, 155)
Split on feature emp_length_9 years. (446, 29)
Split on feature emp_length_7 years. (402, 44)
Split on feature emp_length_8 years. (367, 35)
Termination 3 reached.
Termination 3 reached.
Split on feature home_ownership_RENT. (6, 38)
T

Split on feature home_ownership_RENT. (42, 263)
Split on feature home_ownership_OWN. (2, 40)
Termination 1 reached.
Split on feature grade_F. (40, 0)
Perfect split!
Split on feature grade_F. (263, 0)
Perfect split!
Split on feature home_ownership_OTHER. (370, 1)
Split on feature home_ownership_RENT. (32, 338)
Split on feature grade_F. (32, 0)
Perfect split!
Split on feature grade_F. (338, 0)
Perfect split!
Termination 1 reached.
Split on feature home_ownership_RENT. (42, 102)
Split on feature grade_F. (42, 0)
Perfect split!
Split on feature grade_F. (102, 0)
Perfect split!
Split on feature emp_length_nan. (894, 25)
Split on feature emp_length_4 years. (831, 63)
Split on feature emp_length_2 years. (750, 81)
Split on feature emp_length_9 years. (716, 34)
Split on feature emp_length_< 1 year. (632, 84)
Split on feature home_ownership_RENT. (121, 511)
Split on feature emp_length_5 years. (107, 14)
Split on feature emp_length_7 years. (94, 13)
Split on feature emp_length_8 years. (89, 5)
T

Split on feature emp_length_7 years. (1131, 79)
Split on feature emp_length_3 years. (1047, 84)
Split on feature emp_length_9 years. (970, 77)
Split on feature emp_length_4 years. (893, 77)
Split on feature emp_length_8 years. (811, 82)
Split on feature emp_length_10+ years. (246, 565)
Split on feature emp_length_< 1 year. (171, 75)
Termination 3 reached.
Termination 3 reached.
Split on feature grade_F. (565, 0)
Perfect split!
Split on feature grade_F. (82, 0)
Perfect split!
Split on feature grade_F. (77, 0)
Perfect split!
Split on feature grade_F. (77, 0)
Perfect split!
Split on feature grade_F. (84, 0)
Perfect split!
Split on feature grade_F. (79, 0)
Perfect split!
Split on feature grade_F. (53, 0)
Perfect split!
Split on feature grade_F. (73, 0)
Perfect split!
Split on feature term_ 60 months. (321, 62)
Split on feature home_ownership_RENT. (176, 145)
Split on feature home_ownership_OWN. (131, 45)
Split on feature grade_F. (131, 0)
Perfect split!
Split on feature grade_F. (45, 0)
Pe

Split on feature grade_C. (116, 0)
Perfect split!
Split on feature home_ownership_MORTGAGE. (4, 172)
Termination 1 reached.
Split on feature grade_C. (172, 0)
Perfect split!
Split on feature grade_C. (170, 0)
Perfect split!
Split on feature home_ownership_MORTGAGE. (1, 192)
Termination 1 reached.
Split on feature grade_C. (192, 0)
Perfect split!
Split on feature home_ownership_MORTGAGE. (1, 166)
Termination 1 reached.
Split on feature grade_C. (166, 0)
Perfect split!
Split on feature emp_length_9 years. (338, 13)
Split on feature emp_length_4 years. (317, 21)
Split on feature emp_length_5 years. (284, 33)
Split on feature emp_length_6 years. (269, 15)
Split on feature emp_length_7 years. (245, 24)
Split on feature emp_length_8 years. (225, 20)
Split on feature emp_length_< 1 year. (182, 43)
Split on feature emp_length_1 year. (162, 20)
Split on feature emp_length_3 years. (129, 33)
Termination 3 reached.
Termination 3 reached.
Split on feature grade_C. (20, 0)
Perfect split!
Split on f

In [33]:
# Accuracy of each model on the training split.
print("model_1 training accuracy :", model_1.score(trainX, trainY))
print("model_2 training accuracy :", model_2.score(trainX, trainY))
print("model_3 training accuracy :", model_3.score(trainX, trainY))

model_1 training accuracy : 0.6173326133909287
model_2 training accuracy : 0.6229481641468683
model_3 training accuracy : 0.6266198704103672


In [34]:
# Accuracy of each model on the held-out test split.
print("model_1 testing accuracy :", model_1.score(testX, testY))
print("model_2 testing accuracy :", model_2.score(testX, testY))
print("model_3 testing accuracy :", model_3.score(testX, testY))

model_1 testing accuracy : 0.6173866090712743
model_2 testing accuracy : 0.6206263498920086
model_3 testing accuracy : 0.6187904967602592


In [35]:
# Number of leaves of each fitted tree, reported as the model's complexity.
print("model_1 complexity is: ", model_1.count_leaves())
print("model_2 complexity is: ", model_2.count_leaves())
print("model_3 complexity is: ", model_3.count_leaves())

model_1 complexity is:  8
model_2 complexity is:  74
model_3 complexity is:  384
