In [1]:
import pandas as pd
import numpy as np
import json

# Implementing binary decision trees from scratch

In [2]:
# Load the lending-club loans CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell only
# re-runs on a machine where the file exists at exactly this location.
loans = pd.read_csv(r"D:\Classification\Project 5\data\lending-club-data.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Build the +1 / -1 label column from the 'bad_loans' flag, then drop the flag.
#   safe_loans = +1  =>  safe   (bad_loans == 0)
#   safe_loans = -1  =>  risky  (any other bad_loans value)
loans['safe_loans'] = (loans['bad_loans'] == 0).map({True: +1, False: -1})
loans = loans.drop(columns='bad_loans')

In [4]:
# Input columns used to build the trees, and the name of the label column.
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home_ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
target = 'safe_loans'

In [5]:
# Keep only the selected feature columns plus the target column.
# `loans` is rebound to this narrower frame, so the other columns are no longer
# reachable through this name.
loans = loans[features + [target]]

In [6]:
# Preview the first rows of the reduced frame (features + target).
loans.head()

Unnamed: 0,grade,term,home_ownership,emp_length,safe_loans
0,B,36 months,RENT,10+ years,1
1,C,60 months,RENT,< 1 year,-1
2,C,36 months,RENT,10+ years,1
3,C,36 months,RENT,10+ years,1
4,A,36 months,RENT,3 years,1


#### One-hot encoding

In [7]:
# Names of the columns whose dtype compares equal to 'object'.
categorical_var = [m for m in loans.columns if loans[m].dtypes == 'object']
# Bare expression so the notebook displays the list.
categorical_var

['grade', 'term', 'home_ownership', 'emp_length']

In [8]:
# One-hot encode every categorical column into 0/1 indicator columns.
# The dtype is pinned explicitly: newer pandas releases default the indicator
# columns to bool, whereas the tree code and the displayed tables in this
# notebook are written in terms of feature values 0 and 1.
loans = pd.get_dummies(loans, dtype=np.uint8)

In [9]:
# Preview the first rows after one-hot encoding.
loans.head()

Unnamed: 0,safe_loans,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,term_ 36 months,term_ 60 months,...,emp_length_10+ years,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year
0,1,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1,-1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [10]:
# Load the train / validation index lists from JSON files.
# NOTE(review): both paths are absolute and machine-specific.
with open(r"D:\Classification\Project 5\data\train-idx.json") as f:
    train_index=json.load(f)
with open(r"D:\Classification\Project 5\data\validation-idx.json") as f:
    validation_index=json.load(f)

In [11]:
# Select the training and validation rows by integer position (iloc),
# using the index lists loaded from JSON.
train_data = loans.iloc[train_index]
validation_data = loans.iloc[validation_index]

#### Decision tree implementation

In [12]:
def reached_minimum_node_size(data, min_node_size):
    """Tell whether a node holds too few data points to be split further.

    Returns True when the number of data points in `data` is less than or
    equal to `min_node_size`, and False otherwise.
    """
    num_points = len(data)
    return num_points <= min_node_size

In [13]:
def error_reduction(error_before_split, error_after_split):
    """Return how much a split lowers the error.

    The result is `error_before_split - error_after_split`; it is negative
    when the error after the split is larger than before.
    """
    reduction = error_before_split - error_after_split
    return reduction

In [14]:
def intermediate_node_num_mistakes(labels_in_node):
    '''
    Count the examples a majority-class prediction would misclassify in a node.

    Parameters
    ----------
    labels_in_node : array-like
        The +1 / -1 labels (y values) of the data points contained in the
        node. A pandas Series, NumPy array, list or tuple is accepted.

    Returns
    -------
    int
        The size of the minority class, i.e. min(#(+1 labels), #(-1 labels)).
        An empty node yields 0.
    '''
    # Normalise to an ndarray so the comparisons below are element-wise for
    # every supported container (a plain list compared with == is a single bool).
    labels = np.asarray(labels_in_node)

    # Corner case: an empty node has nothing to misclassify.
    if labels.size == 0:
        return 0

    # Vectorised counts instead of a Python-level loop over the labels.
    num_safe_loans = int(np.count_nonzero(labels == 1))
    num_risky_loans = int(np.count_nonzero(labels == -1))

    # The majority classifier gets exactly the minority class wrong.
    return min(num_safe_loans, num_risky_loans)

In [15]:
# Sanity checks for intermediate_node_num_mistakes.
# Each entry is (labels in the node, expected number of majority-class mistakes).
mistake_test_cases = [
    ([-1, -1, 1, 1, 1], 2),           # Test case 1
    ([-1, -1, 1, 1, 1, 1, 1], 2),     # Test case 2
    ([-1, -1, -1, -1, -1, 1, 1], 2),  # Test case 3
]

for case_number, (case_labels, expected_mistakes) in enumerate(mistake_test_cases, start=1):
    example_labels = pd.Series(case_labels)
    if intermediate_node_num_mistakes(example_labels) == expected_mistakes:
        print('Test passed!')
    else:
        # Report the number of the case that actually failed.
        print('Test %d failed... try again!' % case_number)

Test passed!
Test passed!
Test passed!


In [16]:
def best_splitting_feature(data, features, target):
    """Pick the binary feature whose split misclassifies the fewest data points.

    For every candidate in `features`, `data` is partitioned into the rows
    where the feature equals 0 (left) and the rows where it equals 1 (right).
    Each side is scored with `intermediate_node_num_mistakes`, i.e. the number
    of mistakes a majority-class prediction would make on that side.

    Parameters
    ----------
    data : table indexable by column name and by boolean mask
        Presumably a pandas DataFrame of 0/1 feature columns -- confirm with callers.
    features : iterable
        Candidate feature (column) names.
    target : str
        Name of the label column in `data`.

    Returns
    -------
    The feature with the fewest total mistakes (the first one wins ties), or
    None when `features` is empty.
    """
    best_feature = None

    # Every candidate is scored on the same rows, so ranking by the raw mistake
    # count is equivalent to ranking by classification error
    # (mistakes / len(data)). Skipping the division also means an empty `data`
    # no longer raises ZeroDivisionError.
    fewest_mistakes = float('inf')

    for feature in features:
        # Left split: feature value 0. Right split: feature value 1.
        left_split = data[data[feature] == 0]
        right_split = data[data[feature] == 1]

        total_mistakes = (intermediate_node_num_mistakes(left_split[target])
                          + intermediate_node_num_mistakes(right_split[target]))

        # Strict comparison keeps the earliest feature on ties.
        if total_mistakes < fewest_mistakes:
            fewest_mistakes = total_mistakes
            best_feature = feature

    return best_feature

In [17]:
def create_leaf(target_values):
    """Build a leaf node that predicts the majority class of `target_values`.

    The returned dict has the keys 'splitting_feature', 'left' and 'right'
    (all None), 'is_leaf' (True) and 'prediction' (+1 or -1).
    """
    # Tally each class among the labels that reached this node.
    positives = target_values[target_values == +1]
    negatives = target_values[target_values == -1]

    # Majority vote; +1 must strictly outnumber -1, so ties predict -1.
    majority_class = 1 if len(positives) > len(negatives) else -1

    return {'splitting_feature': None,
            'left': None,
            'right': None,
            'is_leaf': True,
            'prediction': majority_class}

In [18]:
def decision_tree_create(data, features, target, current_depth = 0, max_depth = 10, min_node_size=1,
                         min_error_reduction=0.0, verbose=True):
    """Recursively build a binary decision tree with early stopping.

    Parameters
    ----------
    data : table indexable by column name and by boolean mask
        Rows that reached this node (presumably a pandas DataFrame of 0/1
        feature columns plus the label column -- confirm with callers).
    features : list
        Candidate feature names still available for splitting. The list is
        copied, so the caller's list is never modified.
    target : str
        Name of the label column.
    current_depth : int
        Depth of this node; the recursion passes current_depth + 1 to children.
    max_depth : int
        A node at this depth or deeper becomes a leaf.
    min_node_size : int
        A node with at most this many rows becomes a leaf.
    min_error_reduction : float
        A node becomes a leaf when the best split reduces the classification
        error by this amount or less.
    verbose : bool
        When True (the default) progress messages are printed; pass False to
        build the tree silently.

    Returns
    -------
    dict
        A node with the keys 'is_leaf', 'prediction', 'splitting_feature',
        'left' and 'right'. Leaves come from `create_leaf`; internal nodes
        hold their subtrees in 'left' (feature == 0) and 'right' (feature == 1).
    """
    # Route every progress message through one switchable sink.
    log = print if verbose else (lambda *args, **kwargs: None)

    # Work on a copy: .remove() below must not touch the caller's list.
    remaining_features = features[:]

    target_values = data[target]
    log("--------------------------------------------------------------------")
    log("Subtree, depth = %s (%s data points)." % (current_depth, len(target_values)))

    # Stopping condition 1: the node is pure (this also covers an empty node).
    if intermediate_node_num_mistakes(target_values) == 0:
        log("Stopping condition 1 reached.")
        return create_leaf(target_values)

    # Stopping condition 2: no more features to split on.
    if len(remaining_features) == 0:
        log("Stopping condition 2 reached.")
        return create_leaf(target_values)

    # Early stopping: limit the tree depth.
    if current_depth >= max_depth:
        log("Reached maximum depth. Stopping for now.")
        return create_leaf(target_values)

    # Early stopping: the node is already at or below the minimum size.
    if reached_minimum_node_size(data, min_node_size):
        log("Early stopping condition 2 reached. Reached minimum node size.")
        return create_leaf(target_values)

    # Choose the split among the features still available at this node.
    splitting_feature = best_splitting_feature(data, remaining_features, target)

    # Split on the best feature: 0 goes left, 1 goes right.
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]

    # Early stopping: require a minimum error reduction.
    # Errors are fractions of the rows in THIS node, before and after the split.
    num_points = float(len(data))
    error_before_split = intermediate_node_num_mistakes(target_values) / num_points
    left_mistakes = intermediate_node_num_mistakes(left_split[target])
    right_mistakes = intermediate_node_num_mistakes(right_split[target])
    error_after_split = (left_mistakes + right_mistakes) / num_points

    if error_reduction(error_before_split, error_after_split) <= min_error_reduction:
        log("Early stopping condition 3 reached. Minimum error reduction.")
        return create_leaf(target_values)

    # The chosen feature is not offered to the subtrees again.
    remaining_features.remove(splitting_feature)
    log("Split on feature %s. (%s, %s)" % (
        splitting_feature, len(left_split), len(right_split)))

    # Recurse on the left and right subtrees.
    left_tree = decision_tree_create(left_split, remaining_features, target, current_depth + 1, max_depth,
                                     min_node_size, min_error_reduction, verbose=verbose)
    right_tree = decision_tree_create(right_split, remaining_features, target, current_depth + 1, max_depth,
                                      min_node_size, min_error_reduction, verbose=verbose)

    return {'is_leaf'          : False,
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree,
            'right'            : right_tree}

#### Build a tree

In [19]:
# Every column except the label is a candidate splitting feature.
# `columns=` is used instead of passing the axis positionally to DataFrame.drop,
# because newer pandas releases no longer accept the positional form.
features_new = train_data.drop(columns='safe_loans').columns.tolist()

In [20]:
# Tree with early stopping: depth limit 6, nodes of at most 100 rows become
# leaves, and a split must reduce the error by more than 0.0.
my_decision_tree_new = decision_tree_create(train_data, features_new, 'safe_loans', max_depth = 6, 
                                            min_node_size = 100, min_error_reduction=0.0)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Early stopping condition 3 reached. Minimum error reduction.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Split on feature emp_length_< 1 year. (90, 11)
--------------------------------------------------------------------
Subtree, depth = 3 (90 data points).
Early stopping condition 2 reached. Reached minimum node size.
--------------------------------------------------------------------
Subtree, depth = 3 (11 data points).
Early stopping condition 2 reached. Reached minimum node size.
------------------------------------

In [21]:
# Reference tree with the same depth limit of 6. min_node_size=0 and
# min_error_reduction=-1 are chosen so that the node-size and error-reduction
# checks effectively never stop the recursion.
my_decision_tree_old = decision_tree_create(train_data, features_new, 'safe_loans', max_depth = 6,
                                            min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 5 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 4 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 2 (4701 data points).
Split on feature grade_A. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 3 (4701 data points).
Split on feature grade_B. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 4 (4701 data points).
Split on featu

#### Making predictions with a decision tree

In [22]:
def classify(tree, x, annotate = False):
    """Predict the class of one data point by walking down the tree.

    `tree` is a node dict as produced by the tree-building code; `x` maps
    feature names to values. At every internal node the value of the node's
    splitting feature selects the branch: 0 goes left, anything else goes
    right. With `annotate` set, each visited split and the final leaf
    prediction are printed. Returns the 'prediction' stored in the leaf.
    """
    node = tree

    # Descend from the root until a leaf is reached.
    while not node['is_leaf']:
        feature_name = node['splitting_feature']
        split_feature_value = x[feature_name]
        if annotate:
            print("Split on %s = %s" % (feature_name, split_feature_value))
        node = node['left'] if split_feature_value == 0 else node['right']

    if annotate:
        print("At leaf, predicting %s" % node['prediction'])
    return node['prediction']

In [23]:
# Show the first validation row, then the class the early-stopping tree predicts for it.
print(validation_data.iloc[0])
print('Predicted class: %s ' % classify(my_decision_tree_new, validation_data.iloc[0]))

safe_loans                -1
grade_A                    0
grade_B                    0
grade_C                    0
grade_D                    1
grade_E                    0
grade_F                    0
grade_G                    0
term_ 36 months            0
term_ 60 months            1
home_ownership_MORTGAGE    0
home_ownership_OTHER       0
home_ownership_OWN         0
home_ownership_RENT        1
emp_length_1 year          0
emp_length_10+ years       0
emp_length_2 years         1
emp_length_3 years         0
emp_length_4 years         0
emp_length_5 years         0
emp_length_6 years         0
emp_length_7 years         0
emp_length_8 years         0
emp_length_9 years         0
emp_length_< 1 year        0
Name: 24, dtype: int64
Predicted class: -1 


In [24]:
# Trace the path the first validation row takes through the early-stopping tree.
classify(my_decision_tree_new, validation_data.iloc[0],annotate=True)

Split on term_ 36 months = 0
Split on grade_A = 0
At leaf, predicting -1


-1

In [25]:
# Trace the same row through the reference tree for comparison.
classify(my_decision_tree_old, validation_data.iloc[0], annotate = True)

Split on term_ 36 months = 0
Split on grade_A = 0
Split on grade_B = 0
Split on grade_C = 0
Split on grade_D = 1
Split on grade_E = 0
At leaf, predicting -1


-1

#### Evaluating the model

In [26]:
def evaluate_classification_error(tree, data, target='safe_loans'):
    """Return the fraction of rows in `data` that `tree` misclassifies.

    Parameters
    ----------
    tree : dict
        Root node of a decision tree, as consumed by `classify`.
    data : pandas.DataFrame
        Rows to evaluate; must contain the feature columns the tree splits on
        and the label column `target`.
    target : str
        Name of the label column. Defaults to 'safe_loans', the column this
        function previously hard-coded.

    Returns
    -------
    Number of rows whose prediction differs from the label, divided by
    len(data).
    """
    # Classify every row (axis=1 hands each row to the lambda as a Series).
    prediction = data.apply(lambda x: classify(tree, x), axis=1)

    # Vectorised count of the disagreements between prediction and label.
    num_mistakes = (prediction != data[target]).sum()
    return num_mistakes / len(data)

In [27]:
# Validation error of the tree built with early stopping.
evaluate_classification_error(my_decision_tree_new, validation_data)

0.37774666092201636

In [28]:
# Validation error of the reference tree.
evaluate_classification_error(my_decision_tree_old, validation_data)

0.37774666092201636

Note: The validation error of the new decision tree (with the additional stopping conditions) is the same as that of the old decision tree from the previous project.

#### Exploring the effect of max_depth

In [29]:
# Shallow tree: depth limit 2, with the node-size and error-reduction thresholds
# set to 0 and -1 as for the reference tree.
model_1 = decision_tree_create(train_data, features_new, 'safe_loans', max_depth = 2,
                                            min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 1 (28001 data points).
Split on feature grade_D. (23300, 4701)
--------------------------------------------------------------------
Subtree, depth = 2 (23300 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (4701 data po

In [30]:
# Medium tree: depth limit 6.
# NOTE(review): these are the same arguments used to build my_decision_tree_old.
model_2 = decision_tree_create(train_data, features_new, 'safe_loans', max_depth = 6,
                                            min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 5 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 4 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 2 (4701 data points).
Split on feature grade_A. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 3 (4701 data points).
Split on feature grade_B. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 4 (4701 data points).
Split on featu

In [31]:
# Deep tree: depth limit 14, other thresholds unchanged.
model_3 = decision_tree_create(train_data, features_new, 'safe_loans', max_depth = 14,
                                            min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
S

Split on feature home_ownership_RENT. (643, 0)
--------------------------------------------------------------------
Subtree, depth = 13 (643 data points).
Split on feature emp_length_1 year. (602, 41)
--------------------------------------------------------------------
Subtree, depth = 14 (602 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 14 (41 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 12 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 9 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 7 (1 data

Split on feature home_ownership_OTHER. (1135, 0)
--------------------------------------------------------------------
Subtree, depth = 11 (1135 data points).
Split on feature home_ownership_OWN. (1135, 0)
--------------------------------------------------------------------
Subtree, depth = 12 (1135 data points).
Split on feature home_ownership_RENT. (1135, 0)
--------------------------------------------------------------------
Subtree, depth = 13 (1135 data points).
Split on feature emp_length_1 year. (1096, 39)
--------------------------------------------------------------------
Subtree, depth = 14 (1096 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 14 (39 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached.
------------------------------------------------

Split on feature term_ 60 months. (0, 969)
--------------------------------------------------------------------
Subtree, depth = 10 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 10 (969 data points).
Split on feature home_ownership_MORTGAGE. (367, 602)
--------------------------------------------------------------------
Subtree, depth = 11 (367 data points).
Split on feature home_ownership_OTHER. (367, 0)
--------------------------------------------------------------------
Subtree, depth = 12 (367 data points).
Split on feature home_ownership_OWN. (291, 76)
--------------------------------------------------------------------
Subtree, depth = 13 (291 data points).
Split on feature home_ownership_RENT. (0, 291)
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached.
----------------------------------------------------------

Split on feature home_ownership_OTHER. (45, 0)
--------------------------------------------------------------------
Subtree, depth = 12 (45 data points).
Split on feature home_ownership_OWN. (45, 0)
--------------------------------------------------------------------
Subtree, depth = 13 (45 data points).
Split on feature home_ownership_RENT. (45, 0)
--------------------------------------------------------------------
Subtree, depth = 14 (45 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 12 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 10 (0 

Split on feature grade_D. (23300, 4701)
--------------------------------------------------------------------
Subtree, depth = 2 (23300 data points).
Split on feature grade_E. (22024, 1276)
--------------------------------------------------------------------
Subtree, depth = 3 (22024 data points).
Split on feature grade_F. (21666, 358)
--------------------------------------------------------------------
Subtree, depth = 4 (21666 data points).
Split on feature grade_C. (14444, 7222)
--------------------------------------------------------------------
Subtree, depth = 5 (14444 data points).
Split on feature grade_G. (14347, 97)
--------------------------------------------------------------------
Subtree, depth = 6 (14347 data points).
Split on feature grade_A. (9318, 5029)
--------------------------------------------------------------------
Subtree, depth = 7 (9318 data points).
Split on feature home_ownership_OTHER. (9301, 17)
-------------------------------------------------------------

--------------------------------------------------------------------
Subtree, depth = 8 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 6 (97 data points).
Split on feature grade_A. (97, 0)
--------------------------------------------------------------------
Subtree, depth = 7 (97 data points).
Split on feature grade_B. (97, 0)
--------------------------------------------------------------------
Subtree, depth = 8 (97 data points).
Split on feature term_ 60 months. (97, 0)
--------------------------------------------------------------------
Subtree, depth = 9 (97 data points).
Split on feature home_ownership_MORTGAGE. (45, 52)
--------------------------------------------------------------------
Subtree, depth = 10 (45 data points).
Split on feature emp_length_3 years. (44, 1)
--------------------------------------------------------------------
Subtree, depth = 11 (44 data points).
Split on feature emp_

Split on feature grade_G. (58, 0)
--------------------------------------------------------------------
Subtree, depth = 14 (58 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 12 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 9 (129 data points).
Split on feature home_ownership_OWN. (113, 16)
--------------------------------------------------------------------
Subtree, depth = 10 (113 data points).
Split on feature grade_A. (113, 0)
--------------------------------------------------------------------
Subtree, depth = 11 (113 data points).
Split

Split on feature grade_B. (227, 0)
--------------------------------------------------------------------
Subtree, depth = 10 (227 data points).
Split on feature grade_G. (227, 0)
--------------------------------------------------------------------
Subtree, depth = 11 (227 data points).
Split on feature term_ 60 months. (227, 0)
--------------------------------------------------------------------
Subtree, depth = 12 (227 data points).
Split on feature home_ownership_OTHER. (227, 0)
--------------------------------------------------------------------
Subtree, depth = 13 (227 data points).
Split on feature home_ownership_OWN. (227, 0)
--------------------------------------------------------------------
Subtree, depth = 14 (227 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree

Split on feature grade_B. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 5 (1276 data points).
Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Split on feature grade_F. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 7 (1276 data points).
Split on feature grade_G. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 8 (1276 data points).
Split on feature term_ 60 months. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 9 (1276 data points).
Split on feature home_ownership_MORTGAGE. (855, 421)
--------------------------------------------------------------------
Subtree, depth = 10 (855 data points).
Split on feature home_ownership_OTHER. (849, 6)
---------------------------------------------------------------

Split on feature home_ownership_RENT. (404, 0)
--------------------------------------------------------------------
Subtree, depth = 13 (404 data points).
Split on feature emp_length_1 year. (374, 30)
--------------------------------------------------------------------
Subtree, depth = 14 (374 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 14 (30 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 11 (10 data points).
Split on feature home_ownership_OWN. (10, 0)
--------------------------------------------------------------------
Subtree, depth = 12 (10 data points).
Split on feature home_ownership_RENT. (10, 0)
-------------------------------------------------------------

In [32]:
# Report the training-set classification error of the three depth-limited models.
training_reports = (
    ("Training data, classification error (model 1):", model_1),
    ("Training data, classification error (model 2):", model_2),
    ("Training data, classification error (model 3):", model_3),
)
for report_label, fitted_tree in training_reports:
    print(report_label, evaluate_classification_error(fitted_tree, train_data))

Training data, classification error (model 1): 0.40003761014399314
Training data, classification error (model 2): 0.3804266064904363
Training data, classification error (model 3): 0.3772566086395874


In [33]:
# Report the validation-set classification error of the three depth-limited models.
validation_reports = (
    ("Validation data, classification error (model 1):", model_1),
    ("Validation data, classification error (model 2):", model_2),
    ("Validation data, classification error (model 3):", model_3),
)
for report_label, fitted_tree in validation_reports:
    print(report_label, evaluate_classification_error(fitted_tree, validation_data))

Validation data, classification error (model 1): 0.3981042654028436
Validation data, classification error (model 2): 0.37774666092201636
Validation data, classification error (model 3): 0.38140887548470487


Model 2 has the lowest classification error on the validation data, and model 3 is overfit.

#### Measuring the complexity of the tree
complexity(T) = number of leaves in the tree T

In [34]:
def count_leaves(tree):
    """Return the number of leaves in a decision tree.

    `tree` is a nested dict node. A node whose 'is_leaf' entry is truthy
    counts as one leaf; any other node contributes the leaves found under
    its 'left' and 'right' children.
    """
    leaf_total = 0
    pending = [tree]  # explicit stack of nodes still to be visited
    while pending:
        node = pending.pop()
        if node['is_leaf']:
            leaf_total += 1
            continue
        left_child = node['left']
        right_child = node['right']
        # Push the right child first so the left subtree is visited first.
        pending.append(right_child)
        pending.append(left_child)
    return leaf_total

In [35]:
# Complexity (number of leaves) of each of the three trees.
complexity_report = (
    ("Complexity of Model 1:", model_1),
    ("Complexity of Model 2:", model_2),
    ("Complexity of Model 3:", model_3),
)
for report_label, fitted_tree in complexity_report:
    print(report_label, count_leaves(fitted_tree))

Complexity of Model 1: 4
Complexity of Model 2: 39
Complexity of Model 3: 341


Model 3 is complex and overfit.

#### Exploring the effect of min_error_reduction

In [36]:
# Baseline for the min_error_reduction comparison: negative threshold.
# NOTE(review): presumably -1 means early stopping condition 3 never fires;
# confirm against decision_tree_create.
model_4 = decision_tree_create(
    train_data,
    features_new,
    'safe_loans',
    max_depth=6,
    min_node_size=0,
    min_error_reduction=-1,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

Split on feature grade_B. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 4 (4701 data points).
Split on feature grade_C. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 5 (4701 data points).
Split on feature grade_E. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (4701 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 5 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 4 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 3 (0 data points).
Stopping conditio

In [37]:
# Same depth and node-size settings as model_4, but with
# min_error_reduction raised to 0.
model_5 = decision_tree_create(
    train_data,
    features_new,
    'safe_loans',
    max_depth=6,
    min_node_size=0,
    min_error_reduction=0,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Early stopping condition 3 reached. Minimum error reduction.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Split on feature emp_length_< 1 year. (90, 11)
--------------------------------------------------------------------
Subtree, depth = 3 (90 data points).
Early stopping condition 3 reached. Minimum error reduction.
--------------------------------------------------------------------
Subtree, depth = 3 (11 data points).
Early stopping condition 3 reached. Minimum error reduction.
----------------------------------------

In [38]:
# Very large min_error_reduction: the recorded output shows early stopping
# condition 3 firing at the root, so the tree stays a single leaf.
model_6 = decision_tree_create(
    train_data,
    features_new,
    'safe_loans',
    max_depth=6,
    min_node_size=0,
    min_error_reduction=5,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Early stopping condition 3 reached. Minimum error reduction.


In [39]:
# Validation-set classification error for the min_error_reduction models.
validation_report = (
    ("Validation data, classification error (model 4):", model_4),
    ("Validation data, classification error (model 5):", model_5),
    ("Validation data, classification error (model 6):", model_6),
)
for report_label, fitted_tree in validation_report:
    print(report_label, evaluate_classification_error(fitted_tree, validation_data))

Validation data, classification error (model 4): 0.37774666092201636
Validation data, classification error (model 5): 0.37774666092201636
Validation data, classification error (model 6): 0.503446790176648


In [40]:
# Complexity (number of leaves) of the min_error_reduction models.
complexity_report = (
    ("Complexity of Model 4:", model_4),
    ("Complexity of Model 5:", model_5),
    ("Complexity of Model 6:", model_6),
)
for report_label, fitted_tree in complexity_report:
    print(report_label, count_leaves(fitted_tree))

Complexity of Model 4: 39
Complexity of Model 5: 12
Complexity of Model 6: 1


Model 6: high validation error (as expected, since the tree is a single leaf)

Models 4 and 5 have the same validation error, but model 5 is simpler (12 leaves versus 39). So, we choose model 5 out of these three models.

#### Exploring the effect of min_node_size

In [41]:
# Baseline for the min_node_size comparison: min_node_size = 0.
# These are the same hyperparameters used for model_4.
model_7 = decision_tree_create(
    train_data,
    features_new,
    'safe_loans',
    max_depth=6,
    min_node_size=0,
    min_error_reduction=-1,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 5 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 4 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 2 (4701 data points).
Split on feature grade_A. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 3 (4701 data points).
Split on feature grade_B. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 4 (4701 data points).
Split on featu

In [42]:
# Same settings as model_7, but with min_node_size raised to 2000.
model_8 = decision_tree_create(
    train_data,
    features_new,
    'safe_loans',
    max_depth=6,
    min_node_size=2000,
    min_error_reduction=-1,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

In [43]:
# min_node_size = 50000 is larger than the 37224 training points reported in
# the recorded output, so early stopping condition 2 fires at the root.
model_9 = decision_tree_create(
    train_data,
    features_new,
    'safe_loans',
    max_depth=6,
    min_node_size=50000,
    min_error_reduction=-1,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Early stopping condition 2 reached. Reached minimum node size.


In [44]:
# Validation-set classification error for the min_node_size models.
validation_report = (
    ("Validation data, classification error (model 7):", model_7),
    ("Validation data, classification error (model 8):", model_8),
    ("Validation data, classification error (model 9):", model_9),
)
for report_label, fitted_tree in validation_report:
    print(report_label, evaluate_classification_error(fitted_tree, validation_data))

Validation data, classification error (model 7): 0.37774666092201636
Validation data, classification error (model 8): 0.3774235243429556
Validation data, classification error (model 9): 0.503446790176648


In [45]:
# Complexity (number of leaves) of the min_node_size models.
complexity_report = (
    ("Complexity of Model 7:", model_7),
    ("Complexity of Model 8:", model_8),
    ("Complexity of Model 9:", model_9),
)
for report_label, fitted_tree in complexity_report:
    print(report_label, count_leaves(fitted_tree))

Complexity of Model 7: 39
Complexity of Model 8: 20
Complexity of Model 9: 1


Model 7: overfit and complex model

Model 9: underfit model with very high error

Model 8: a good model compared to the other two. It has the lowest validation error of the three, with about half as many leaves as model 7 (20 versus 39).