In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.datasets import load_iris
from sklearn.metrics import roc_auc_score

In [2]:
income = pd.read_pickle('income_df')

In [3]:
columns = ["age", "workclass", "education_num", 
           "marital_status", "occupation", 
           "relationship", "race", "sex", 
           "hours_per_week", "native_country"]

In [4]:
income.sample(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
19980,22,4,216867,4,3,2,3,0,4,1,0,0,48,26,0
23950,37,2,327120,15,10,2,14,0,4,1,0,0,30,39,0
31448,28,4,207513,9,13,2,12,0,4,1,7298,0,42,39,1
5356,38,4,204756,11,9,4,12,3,4,0,0,0,40,39,0
29785,28,4,227840,11,9,4,14,1,4,0,0,0,40,39,0
29451,32,6,62272,11,9,0,4,4,4,0,0,0,40,39,0
14171,34,4,162814,11,9,0,11,1,2,1,0,0,45,39,0
13228,43,4,145441,9,13,2,4,0,4,1,0,0,50,39,1
5637,19,4,63363,15,10,4,12,3,4,0,0,0,30,39,0
31964,24,4,175586,11,9,4,7,4,2,0,0,0,40,39,0


In [5]:
clf = DecisionTreeClassifier(random_state=1)

In [6]:
np.random.seed(1)

# Shuffle the rows.  This first permutes the index randomly using 
#numpy.random.permutation.
# Then, it reindexes the dataframe with this.
# The net effect is to put the rows into random order.
income = income.reindex(np.random.permutation(income.index))
train_max_row = np.int(income.shape[0] * .8)

In [7]:
train_max_row

26048

In [8]:
train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]


In [9]:
clf.fit(train[columns],train['high_income'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1, splitter='best')

In [10]:
predictions = clf.predict(test[columns])

In [11]:
error = roc_auc_score(test['high_income'],predictions)
print('Test AUC: {0:.3f}'.format(error))

Test AUC: 0.693


In [12]:

train_predictions = clf.predict(train[columns])
error = roc_auc_score(train['high_income'],train_predictions)
print('Train AUC: {0:.3f}'.format(error))

Train AUC: 0.947


In [13]:
'''Changing min_samples_split. This means that a node remain a leaf till
the number of rows assigned to it are less than 13. Once the number of rows
exceed 13 it will be split.'''

clf = DecisionTreeClassifier(random_state=1,min_samples_split=13)
clf.fit(train[columns],train['high_income'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=13, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1, splitter='best')

In [14]:
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test['high_income'],predictions)
print('Test AUC: {0:.3f}'.format(error))

Test AUC: 0.947


In [15]:
train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train['high_income'],train_predictions)
print('Train AUC: {0:.3f}'.format(train_auc))

Train AUC: 0.842


In [16]:
clf = DecisionTreeClassifier(random_state=1,min_samples_split=13,
                             max_depth=7)

clf.fit(train[columns],train['high_income'])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test['high_income'],predictions)
print('Test AUC: {0:.3f}'.format(test_auc))

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train['high_income'],train_predictions)
print('Train AUC: {0:.3f}'.format(train_auc))

Test AUC: 0.744
Train AUC: 0.748


## Bias:

- As the min_sample_split = 100 this reduces the depth of the tree.
- This causes underfitting as the tree model is too simple for the dataset.
- This cause the model to have wrong 'beliefs' about the nature of the dataset.

In [17]:
clf = DecisionTreeClassifier(random_state=1,min_samples_split=100,
                             max_depth=2)

clf.fit(train[columns],train['high_income'])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test['high_income'],predictions)
print('Test AUC: {0:.3f}'.format(test_auc))

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train['high_income'],train_predictions)
print('Train AUC: {0:.3f}'.format(train_auc))

Test AUC: 0.655
Train AUC: 0.662


## Tree Variance:

- Decision Trees tend overfit data.
- Their predictions are influenced by variance in input data.
- Small changes in input data cause huge changes in the predictions.
- Adding some noise to the input data illustrates this point.

In [18]:
np.random.seed(1)

# Generate a column with random numbers from 0 to 4.
income["noise"] = np.random.randint(4, size=income.shape[0])

# Adjust columns to include the noise column.
columns = ["noise", "age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

# Make new train and test sets.
train_max_row = np.int(income.shape[0] * .8)
train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]

# Initialize the classifier.
clf = DecisionTreeClassifier(random_state=1)

clf.fit(train[columns],train['high_income'])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test['high_income'],predictions)
print('Test Error: {0:.3f}'.format(test_auc))

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train['high_income'],train_predictions)
print('Train Error: {0:.3f}'.format(train_auc))

Test Error: 0.691
Train Error: 0.975


## Ensemble Models:

In [19]:
columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)
clf.fit(train[columns], train["high_income"])

clf2 = DecisionTreeClassifier(random_state=1, max_depth=5)
clf2.fit(train[columns], train["high_income"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1, splitter='best')

In [20]:
clf.fit(train[columns],train['high_income'])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test['high_income'],predictions)
print('Test Error: {0:.5f}'.format(test_auc))

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train['high_income'],train_predictions)
print('Train Error: {0:.5f}'.format(train_auc))

Test Error: 0.68790
Train Error: 0.85595


In [21]:

clf2.fit(train[columns],train['high_income'])
predictions = clf2.predict(test[columns])
test_auc = roc_auc_score(test['high_income'],predictions)
print('Test Error: {0:.5f}'.format(test_auc))

train_predictions = clf2.predict(train[columns])
train_auc = roc_auc_score(train['high_income'],train_predictions)
print('Train Error: {0:.5f}'.format(train_auc))
predictions

Test Error: 0.67599
Train Error: 0.68334


array([0, 0, 0, ..., 0, 0, 1], dtype=int8)

In [22]:
predictions = clf.predict_proba(test[columns])[:,1]
predictions2 = clf2.predict_proba(test[columns])[:,1]

In [23]:
avg_predictions = np.round(0.5*(predictions+predictions2))

In [29]:
roc_auc_score(test['high_income'],avg_predictions)

0.71508468040388817

## Bagging and random feature subsets:

In [25]:
# Building 10 trees
# Each "bag" will have 60% of the number of original rows.
tree_count = 10
bag_proportion = .6

predictions = []
for i in range(tree_count):
    # We select 60% of the rows from train, sampling with replacement.
    # We set a random state to ensure we'll be able to replicate our results.
    # We set it to i instead of a fixed value so we don't get the same sample every loop.
    # That would make all of our trees the same.
    bag = train.sample(frac=bag_proportion,
                       replace=True, 
                       random_state=i)
    
    # Fit a decision tree model to the "bag".
    clf = DecisionTreeClassifier(random_state=1,min_samples_leaf=2,
                                 splitter='random',
                                 max_features="auto")
    
    clf.fit(bag[columns], bag["high_income"])
    
    # Using the model, make predictions on the test data.
    predictions.append(clf.predict_proba(test[columns])[:,1])
    
#Taking the mean and rounding to 0 or 1
combined_pred = np.round(np.mean(np.array(predictions),axis=0))
print(roc_auc_score(test['high_income'],combined_pred))

0.7345958638


## Random Forest Classifier:

In [30]:
clf = RandomForestClassifier(n_estimators=5,
                             random_state=1,
                             min_samples_leaf=2)

In [31]:
clf.fit(train[columns],train['high_income'])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test['high_income'],predictions)
print('Test AUC: {0:.5f}'.format(test_auc))

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train['high_income'],train_predictions)
print('Train AUC: {0:.5f}'.format(train_auc))

Test AUC: 0.73475
Train AUC: 0.84084
