In [1]:
import pandas as pd
import numpy as np
import json, graphviz, sys
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [2]:
# Load the raw loan table and recode the label: `safe_loans` is the
# complement of the file's `bad_loans` flag (safe_loans = 1 - bad_loans).
loans = pd.read_csv('lending-club-data.csv', low_memory=False)

# Vectorised recoding; the source column is dropped once the label is derived.
loans['safe_loans'] = 1 - loans['bad_loans']
del loans['bad_loans']

target = 'safe_loans'
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
# Keep only the model inputs plus the label.
loans = loans[features + [target]]
print(loans.shape)

(122607, 5)


In [3]:
# Discard every row that has at least one missing value, then report the size.
loans = loans.dropna(axis=0, how='any')
print(loans.shape)

(122607, 5)


In [4]:
# One-hot encode every object-typed column. Each indicator column is named
# "<source column>_<category>", with the category stripped of surrounding
# whitespace and inner spaces turned into underscores. Indicator columns are
# appended on the right and the source column is removed.
string_columns = [column
                  for column, column_dtype in zip(loans.dtypes.index, loans.dtypes.values)
                  if column_dtype == 'object']

for column in string_columns:
    indicators = pd.get_dummies(loans[column])
    indicators = indicators.rename(
        columns=lambda level: column + '_' + level.strip().replace(' ', '_'))
    loans = loans.join(indicators)
    del loans[column]

In [5]:
# Every column except the label is a model input.
feature_onehot = list(loans.columns[loans.columns != target])

In [6]:
# Read the predefined row positions of each split from disk.
with open('module-8-assignment-2-train-idx.json', 'r') as train_file:
    train_idx = json.load(train_file)
with open('module-8-assignment-2-test-idx.json', 'r') as test_file:
    test_idx = json.load(test_file)

# Select the rows positionally and renumber each split from 0.
train_data = loans.iloc[train_idx].reset_index(drop=True)
test_data = loans.iloc[test_idx].reset_index(drop=True)

In [7]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,safe_loans,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,term_36_months,term_60_months,...,emp_length_2_years,emp_length_3_years,emp_length_4_years,emp_length_5_years,emp_length_6_years,emp_length_7_years,emp_length_8_years,emp_length_9_years,emp_length_<_1_year,emp_length_n/a
0,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [127]:
# (rows, columns) of the training split.
train_data.shape

(37224, 26)

In [8]:
# Count grade_A indicator values within each label class of the training split.
train_data.groupby(target)["grade_A"].value_counts()

safe_loans  grade_A
0           0          17218
            1           1258
1           0          14876
            1           3872
Name: grade_A, dtype: int64

In [9]:
def intermediate_node_weighted_mistakes(labels_in_node, data_weights):
    """Return the cheaper constant prediction for a node and its weighted cost.

    Labels are compared against 1 and 0; entries with any other label add to
    neither total. Weights and labels are paired positionally.

    Parameters
    ----------
    labels_in_node : iterable of labels (1 or 0).
    data_weights : iterable of weights, one per label.

    Returns
    -------
    tuple (weight_of_mistakes, class_label)
        (total weight of the 1-labelled points, 0) when the 0-labelled points
        carry strictly more weight; otherwise
        (total weight of the 0-labelled points, 1). Ties therefore predict 1.
    """
    weight_of_ones = 0
    weight_of_zeros = 0

    # Single pass over the (weight, label) pairs, accumulating in input order.
    for weight, label in zip(data_weights, labels_in_node):
        if label == +1:
            weight_of_ones += weight
        if label == 0:
            weight_of_zeros += weight

    # Predicting all 0 misclassifies exactly the 1-labelled points, and vice versa.
    if weight_of_zeros > weight_of_ones:
        return (weight_of_ones, 0)
    return (weight_of_zeros, 1)
    

In [109]:
# Sanity check with uniform weights: four points are labelled 1 and two are
# labelled 0, so predicting 1 is cheaper and costs the two 0-labelled weights.
labels_in_node = [1,1,1,0,0,1]
data_weights = np.ones(len(labels_in_node)) / len(labels_in_node)
intermediate_node_weighted_mistakes(labels_in_node, data_weights)

(0.33333333333333331, 1)

In [137]:
def best_splitting_feature(data, features, target, data_weights):
    """Pick the feature whose 0/1 split has the lowest weighted error.

    For each candidate feature the rows are partitioned into feature == 0
    (left) and feature == 1 (right). Each side is scored with
    intermediate_node_weighted_mistakes on the matching labels and weights,
    and the error is (left cost + right cost) / total weight.

    Parameters
    ----------
    data : DataFrame containing the feature columns and the target column.
    features : iterable of candidate column names.
    target : name of the label column.
    data_weights : one weight per row of `data`, in row order.

    Returns
    -------
    The first feature that achieves the strictly lowest error. None is
    returned only when `features` is empty or no feature yields an error
    below +inf. A feature is still returned when every row has identical
    feature values; the caller has to detect such a one-sided split itself.
    """
    weights = np.asarray(data_weights)
    labels = np.asarray(data[target])

    # Loop-invariant, so computed once. The builtin sum is kept so the error
    # values are numerically identical to summing inside the loop.
    total_weight = sum(data_weights)

    best_feature = None
    best_error = float('+inf')

    for feature in features:
        # Boolean masks select labels and weights directly, avoiding
        # intermediate DataFrame copies and Python-level filtering loops.
        goes_left = np.asarray(data[feature] == 0)
        goes_right = np.asarray(data[feature] == 1)

        left_weighted_mistakes, _ = intermediate_node_weighted_mistakes(
            labels[goes_left], weights[goes_left])
        right_weighted_mistakes, _ = intermediate_node_weighted_mistakes(
            labels[goes_right], weights[goes_right])

        error = (left_weighted_mistakes + right_weighted_mistakes) / total_weight

        # Strict comparison: on ties the earliest feature wins.
        if error < best_error:
            best_feature = feature
            best_error = error

    return best_feature

In [129]:
def create_leaf(target_values, data_weights):
    """Build a leaf node predicting the cheaper weighted class.

    The class comes from intermediate_node_weighted_mistakes(target_values,
    data_weights); the weighted cost it also returns is not stored.

    Returns
    -------
    dict with keys 'splitting_feature' (None), 'is_leaf' (True) and
    'prediction' (the chosen class label).
    """
    _, leaf_class = intermediate_node_weighted_mistakes(target_values, data_weights)
    return {'splitting_feature': None,
            'is_leaf': True,
            'prediction': leaf_class}

In [122]:
def weighted_decision_tree_create(data, features, target, data_weights,
                                  current_depth=1, max_depth=10, verbose=True):
    """Recursively grow a binary decision tree on weighted data.

    Parameters
    ----------
    data : DataFrame with 0/1 feature columns and the target column.
    features : list of feature names still available for splitting
        (not modified; a copy is made).
    target : name of the label column.
    data_weights : one weight per row of `data`, in row order.
    current_depth : depth of this node; the root call uses 1.
    max_depth : a node becomes a leaf once current_depth > max_depth.
    verbose : print progress messages when True.

    Returns
    -------
    dict
        A leaf as built by create_leaf, or an internal node with keys
        'is_leaf' (False), 'prediction' (None), 'splitting_feature',
        'left' (rows where the feature == 0) and 'right' (feature == 1).
    """
    remaining_features = features[:] # Make a copy of the features.
    target_values = data[target]
    if verbose:
        print("--------------------------------------------------------------------")
        print("Subtree, depth = %s (%s data points)." % (current_depth, len(target_values)))

    # Stopping condition 1. Weighted error is (numerically) zero.
    if intermediate_node_weighted_mistakes(target_values, data_weights)[0] <= 1e-15:
        if verbose: print("Stopping condition 1 reached.")
        return create_leaf(target_values, data_weights)

    # Stopping condition 2. No more features.
    if remaining_features == []:
        if verbose: print("Stopping condition 2 reached.")
        return create_leaf(target_values, data_weights)

    # Additional stopping condition (limit tree depth)
    if current_depth > max_depth:
        if verbose: print("Reached maximum depth. Stopping for now.")
        return create_leaf(target_values, data_weights)

    splitting_feature = best_splitting_feature(data, features, target, data_weights)

    # best_splitting_feature can return None (no feature with a usable
    # error). Fall back to a leaf instead of failing in list.remove(None).
    if splitting_feature is None:
        if verbose: print("No splitting feature found. Creating leaf node.")
        return create_leaf(target_values, data_weights)
    remaining_features.remove(splitting_feature)

    # One boolean mask per side, applied to both the rows and their weights
    # so the two stay aligned.
    goes_left = np.asarray(data[splitting_feature] == 0)
    goes_right = np.asarray(data[splitting_feature] == 1)
    left_split = data[goes_left]
    right_split = data[goes_right]

    weights = np.asarray(data_weights)
    left_data_weights = weights[goes_left]
    right_data_weights = weights[goes_right]

    if verbose:
        print("Split on feature %s. (%s, %s)" % (
              splitting_feature, len(left_split), len(right_split)))

    # Create a leaf node if every row falls on one side of the split.
    if len(left_split) == len(data):
        if verbose: print("Creating leaf node.")
        return create_leaf(left_split[target], data_weights)
    if len(right_split) == len(data):
        if verbose: print("Creating leaf node.")
        return create_leaf(right_split[target], data_weights)

    # Repeat (recurse) on left and right subtrees
    left_tree = weighted_decision_tree_create(
        left_split, remaining_features, target,
        left_data_weights, current_depth + 1, max_depth, verbose)
    right_tree = weighted_decision_tree_create(
        right_split, remaining_features, target,
        right_data_weights, current_depth + 1, max_depth, verbose)

    return {'is_leaf'          : False,
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree,
            'right'            : right_tree}

In [123]:
def count_nodes(tree):
    """Return the number of nodes (internal nodes plus leaves) in `tree`."""
    total = 0
    pending = [tree]
    # Explicit stack instead of recursion; each node is visited once.
    while pending:
        node = pending.pop()
        total += 1
        if not node['is_leaf']:
            pending.append(node['left'])
            pending.append(node['right'])
    return total

In [124]:
def classify(tree, x, annotate=False):
    """Route one data point down the tree and return the leaf's prediction.

    Parameters
    ----------
    tree : node dict; internal nodes carry 'splitting_feature', 'left' and
        'right', leaves carry 'prediction'.
    x : mapping from feature name to value (anything indexable by the
        splitting feature names).
    annotate : when True, print every split taken and the final prediction.

    A feature value equal to 0 goes left; any other value goes right.
    """
    node = tree
    while not node['is_leaf']:
        split_feature_value = x[node['splitting_feature']]
        if annotate:
            print("Split on %s = %s" % (node['splitting_feature'], split_feature_value))
        node = node['left'] if split_feature_value == 0 else node['right']

    if annotate:
        print("At leaf, predicting %s" % node['prediction'])
    return node['prediction']

In [125]:
def evaluate_classification_error(tree, data, target_column=None):
    """Return the fraction of rows in `data` that `tree` misclassifies.

    Parameters
    ----------
    tree : decision tree dict accepted by classify.
    data : DataFrame holding the feature columns and the label column.
    target_column : name of the label column. When omitted, the notebook-level
        variable `target` is used, which keeps existing two-argument calls
        working.

    Returns
    -------
    float
        Unweighted misclassification rate: mismatches / number of rows.
    """
    label_column = target if target_column is None else target_column

    # Apply classify(tree, x) to each row of the data
    prediction = data.apply(lambda x: classify(tree, x), axis=1)

    # Fraction of rows whose prediction differs from the label
    return (prediction != data[label_column]).sum() / float(len(data))

In [138]:
# Assign weights
# Uniform weights: every training row counts equally.
# NOTE: despite the variable name below, the tree is fitted on all rows.
example_data_weights = np.ones(len(train_data))
# Train a weighted decision tree model.
small_data_decision_tree_subset_20 = weighted_decision_tree_create(train_data, 
                                            feature_onehot, target,
                                            example_data_weights, max_depth=2)

--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature term_36_months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 2 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 3 (9122 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 3 (101 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (28001 data points).
Split on feature grade_D. (23300, 4701)
--------------------------------------------------------------------
Subtree, depth = 3 (23300 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 3 (4701 data poi

In [23]:
# Draw N training rows (fixed sampling seed) and give each a random weight
# taken from NumPy's global random state; the weights are then normalised
# to sum to 1.
N = 30
sanity_data = train_data.sample(N, random_state=42)
example_data_weights = np.random.random(N)
example_data_weights /= sum(example_data_weights)
# Label counts among the first 20 sampled rows.
sanity_data[target][:20].value_counts()

0    12
1     8
Name: safe_loans, dtype: int64

In [24]:
# Fit a tree on the randomly weighted sample, then report its unweighted
# misclassification rate on that same sample.
test_tree = weighted_decision_tree_create(sanity_data, 
                                    feature_onehot, target,
                                    example_data_weights, max_depth=3, verbose=False)
evaluate_classification_error(test_tree, sanity_data)

0.33333333333333331

In [25]:
# Misclassification rate of the max_depth=2 tree on the full training split.
evaluate_classification_error(small_data_decision_tree_subset_20, train_data)

0.40003761014399314

In [26]:
# Misclassification rate of the same tree on the last 10 training rows only.
evaluate_classification_error(small_data_decision_tree_subset_20, train_data.iloc[-10:, :])

0.5

In [118]:
from math import log
from math import exp

def adaboost_with_tree_stumps(data, features, target, num_tree_stumps):
    """Train an AdaBoost ensemble of weighted decision stumps.

    Each round fits a stump (weighted_decision_tree_create with max_depth=1)
    on the current data-point weights, derives the stump's coefficient from
    its weighted error, then re-weights the data: correctly classified points
    are scaled by exp(-coefficient), misclassified ones by exp(+coefficient),
    and the weights are normalised to sum to 1.

    Parameters
    ----------
    data : DataFrame with the feature columns and the target column.
    features : list of feature names the stumps may split on.
    target : name of the label column.
    num_tree_stumps : number of boosting rounds.

    Returns
    -------
    (weights, tree_stumps)
        weights : list of stump coefficients, one per round.
        tree_stumps : list of the fitted stump dicts, in training order.
    """
    # The weighted error is kept inside [error_floor, 1 - error_floor] before
    # taking the log-odds, so a stump that is perfectly right (error 0) or
    # perfectly wrong (error 1) gets a large finite coefficient instead of
    # raising ZeroDivisionError / a math domain error.
    error_floor = 1e-12

    # start with unweighted data
    alpha = np.ones(len(data)) / len(data)
    weights = []
    tree_stumps = []
    target_values = data[target]

    for t in range(num_tree_stumps):
        print('\n=====================================================')
        print('Adaboost Iteration %d' % t)
        print('=====================================================')
        # Learn a weighted decision tree stump. Use max_depth=1
        tree_stump = weighted_decision_tree_create(data, features, target,
                                                   data_weights=alpha, max_depth=1,
                                                   verbose=False)
        tree_stumps.append(tree_stump)

        # Make predictions
        predictions = data.apply(lambda x: classify(tree_stump, x), axis=1)

        # Boolean indicators of which data points were classified correctly
        is_correct = predictions == target_values
        is_wrong   = predictions != target_values

        # Weighted error: share of the total weight carried by the mistakes
        weighted_error = alpha[np.asarray(is_wrong)].sum() / alpha.sum()

        # Model coefficient: half the log-odds of the (bounded) weighted error
        bounded_error = min(max(weighted_error, error_floor), 1.0 - error_floor)
        weight = 0.5*log((1 - bounded_error)/bounded_error)
        weights.append(weight)

        # Per-point multiplier: shrink correct points, grow misclassified ones
        adjustment = np.where(np.asarray(is_correct), exp(-weight), exp(weight))

        # Scale alpha by the adjustment, then normalise. An explicit copy is
        # kept for the "did the weights change" report below.
        alpha_old = alpha.copy()
        alpha = alpha * adjustment
        alpha /= alpha.sum()

        print("\n***** stump summary *****")
        print("is_correct count:   ", is_correct.sum())
        print("is_wrong count:     ", is_wrong.sum())
        print("w-error is:         ", weighted_error)
        print("tree weight is:     ", weight)
        print("min, max of adjustment: ", adjustment.min(), adjustment.max())
        print("min, max alpha:     ", alpha.min(), alpha.max())
        print("is alpha the same?  ", np.allclose(alpha, alpha_old))

    return weights, tree_stumps

In [143]:
# Train a 30-stump AdaBoost ensemble on the training split.
stump_weights, tree_stumps = adaboost_with_tree_stumps(
    train_data, feature_onehot, target, 30)


Adaboost Iteration 0
--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature term_36_months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 2 (9223 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (28001 data points).
Reached maximum depth. Stopping for now.

***** stump summary *****
is_correct count:    21529
is_wrong count:      15695
w-error is:          0.421636578552
tree weight is:      0.15802933659181698
min, max of adjustment:  0.853824733293 1.17120055324
min, max alpha:      2.32244879001e-05 3.18572793883e-05
is alpha the same?   False

Adaboost Iteration 1
--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature grade_A. (32094, 5130)
--------------------------------------------------------


***** stump summary *****
is_correct count:    19075
is_wrong count:      18149
w-error is:          0.487021661048
tree weight is:      0.025962509691607592
min, max of adjustment:  0.974371618416 1.02630247136
min, max alpha:      1.46128924202e-05 5.37682553048e-05
is alpha the same?   False

Adaboost Iteration 10
--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature grade_D. (30465, 6759)
--------------------------------------------------------------------
Subtree, depth = 2 (30465 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (6759 data points).
Reached maximum depth. Stopping for now.

***** stump summary *****
is_correct count:    17083
is_wrong count:      20141
w-error is:          0.484617644977
tree weight is:      0.030774421492271706
min, max of adjustment:  0.969694290597 1.03125284917
min, max alpha:    

Split on feature grade_B. (26858, 10366)
--------------------------------------------------------------------
Subtree, depth = 2 (26858 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (10366 data points).
Reached maximum depth. Stopping for now.

***** stump summary *****
is_correct count:    20156
is_wrong count:      17068
w-error is:          0.492808481162
tree weight is:      0.014384029614075425
min, max of adjustment:  0.985718926308 1.01448797757
min, max alpha:      1.47722471825e-05 5.34394875695e-05
is alpha the same?   False

Adaboost Iteration 20
--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature emp_length_n/a. (35781, 1443)
--------------------------------------------------------------------
Subtree, depth = 2 (35781 data points).
Reached maximum depth. Stopping for now.
---------------------------------


***** stump summary *****
is_correct count:    19075
is_wrong count:      18149
w-error is:          0.494252681313
tree weight is:      0.01149514366369003
min, max of adjustment:  0.988570673068 1.01156146671
min, max alpha:      1.30467712754e-05 6.06217145073e-05
is alpha the same?   False

Adaboost Iteration 29
--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature grade_C. (27812, 9412)
--------------------------------------------------------------------
Subtree, depth = 2 (27812 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (9412 data points).
Reached maximum depth. Stopping for now.

***** stump summary *****
is_correct count:    18060
is_wrong count:      19164
w-error is:          0.494348454041
tree weight is:      0.011303573315145135
min, max of adjustment:  0.988760072037 1.01136770009
min, max alpha:     

In [148]:
def predict_adaboost(stump_weights, tree_stumps, data):
    """Score `data` with a weighted vote of the stumps and threshold at zero.

    Each stump's prediction is mapped to a vote of +1 (prediction equal to 1)
    or -1 (anything else), multiplied by the stump's coefficient, and added
    to a running per-row score.

    Parameters
    ----------
    stump_weights : sequence of coefficients, indexed like `tree_stumps`.
    tree_stumps : iterable of stump dicts accepted by classify.
    data : DataFrame of rows to score.

    Returns
    -------
    (labels, scores)
        labels : list with 1 where the score is > 0 and 0 otherwise.
        scores : the accumulated per-row scores.
    """
    scores = np.zeros(len(data))

    for i, tree_stump in enumerate(tree_stumps):
        raw_predictions = data.apply(lambda row: classify(tree_stump, row), axis=1).values

        # Convert the 0/1 class labels into -1/+1 votes.
        votes = [1 if label == 1 else -1 for label in raw_predictions]

        # Add this stump's weighted vote to every row's running score.
        updated_scores = []
        for vote, running_score in zip(votes, scores):
            updated_scores.append(stump_weights[i]*vote + running_score)
        scores = updated_scores

    predicted_labels = [1 if score > 0 else 0 for score in scores]
    return predicted_labels, scores

In [150]:
# Raw ensemble scores (second element of the returned tuple) for the training rows.
# NOTE(review): `scroes` looks like a typo for `scores`; the name is kept
# because the following cell reads it.
scroes = predict_adaboost(stump_weights, tree_stumps, train_data)[1]

In [152]:
# Mean ensemble score over the training rows.
np.array(scroes).mean()

0.011734907782874704

In [147]:
# Training error of the ensemble truncated to its first n stumps, for every n.
# The upper bound follows the number of trained stumps instead of a literal,
# so the curve stays in step with the training call.
# Note: each n re-scores the first n stumps from scratch.
error_all = []
for n in range(1, len(tree_stumps) + 1):
    predictions = predict_adaboost(stump_weights[:n], tree_stumps[:n], train_data)[0]
    error = 1.0 - sum(train_data[target] == predictions) / len(train_data)
    error_all.append(error)
    print("Iteration %s, training error = %s" % (n, error_all[n-1]))

Iteration 1, training error = 0.421636578551
Iteration 2, training error = 0.433430045132
Iteration 3, training error = 0.400037610144
Iteration 4, training error = 0.400037610144
Iteration 5, training error = 0.384724908661
Iteration 6, training error = 0.384617451107
Iteration 7, training error = 0.382763808296
Iteration 8, training error = 0.384617451107
Iteration 9, training error = 0.382763808296
Iteration 10, training error = 0.384483129164
Iteration 11, training error = 0.382736943907
Iteration 12, training error = 0.381447453256
Iteration 13, training error = 0.381528046422
Iteration 14, training error = 0.380560928433
Iteration 15, training error = 0.380507199656
Iteration 16, training error = 0.378223726628
Iteration 17, training error = 0.378277455405
Iteration 18, training error = 0.378411777348
Iteration 19, training error = 0.378062540297
Iteration 20, training error = 0.378761014399
Iteration 21, training error = 0.379566946056
Iteration 22, training error = 0.3788953363