In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and newer releases no longer accept the old names, so use the renamed sheet
# when this installation provides it and fall back to the legacy name otherwise.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

### Data pre-processing

Only alter the data pre-processing code if you have completed the challenge for that section.

In [None]:
# Remote location of the CSV file that is loaded below.
csv_file_uri = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Column labels are supplied explicitly to read_csv (one label per CSV field);
# the last one, "target", is the value the models try to predict.
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "target",
]

# index_col=False stops pandas from using the first field as the row index.
data_original = pd.read_csv(
    csv_file_uri,
    names=column_names,
    index_col=False,
)

# Switch between the two encodings of the text columns:
#   False -> one-hot (dummy) columns via pd.get_dummies
#   True  -> one integer code per category via sklearn's LabelEncoder
USE_LABEL_ENCODER = False


if not USE_LABEL_ENCODER:

    # Work on a deep copy so data_original stays available for reference,
    # and leave out the "fnlwgt" weights column (not used by this notebook).
    data_pre_dummies = data_original.copy(deep=True).drop(columns=["fnlwgt"])

    # Expand the text columns into dummy columns, discard the dummy for the
    # " <=50K" label and keep the " >50K" dummy as the single "target" column.
    data = (
        pd.get_dummies(data_pre_dummies)
        .drop(columns=["target_ <=50K"])
        .rename(columns={'target_ >50K': 'target'})
    )

else:

    def convert_target_variable(text):
        """Return 0 when the label is exactly " <=50K", otherwise 1."""
        return 0 if text == " <=50K" else 1

    # Work on a deep copy so data_original stays available for reference,
    # and leave out the "fnlwgt" weights column (not used by this notebook).
    data = data_original.copy(deep=True).drop(columns=["fnlwgt"])

    # Turn the text label into a binary value.
    data["target"] = data["target"].apply(convert_target_variable)

    # Add a "<name>_encoded" integer column for every object-typed column and
    # remember which source columns were encoded so they can be dropped after.
    encoded_columns = []
    for column in list(data.columns):
        if data[column].dtype != "object":
            continue

        encoded_name = "{}_encoded".format(column)
        if encoded_name in data.columns:
            print("{}_encoded already exists".format(column))
            continue

        data[encoded_name] = preprocessing.LabelEncoder().fit_transform(data[column].values)
        encoded_columns.append(column)

    print("Dropping the encoded columns {}".format(encoded_columns))
    data = data.drop(columns=encoded_columns)

In [None]:
# Model inputs are all columns of the prepared frame except the "target" label.
feature_columns = data.columns.tolist()
del feature_columns[feature_columns.index("target")]

---
### Model: Decision Tree

Scikit-learn [docs](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html).

In [None]:
from sklearn import tree

In [None]:
# Depth-limited decision tree. random_state is fixed so that re-running the
# notebook builds the same tree (scikit-learn shuffles candidate features at
# each split, which can change the result when several splits tie).
clf = tree.DecisionTreeClassifier(max_depth=5, random_state=0)

In [None]:
# Feature matrix: the selected feature columns as a 2-D NumPy array.
X = data.loc[:, feature_columns].to_numpy()
# Label vector: the "target" column as a NumPy array.
y = data["target"].to_numpy()

In [None]:
# Fit the classifier on the full feature matrix X and labels y (no hold-out
# split is made here). As the cell's last expression, the value returned by
# fit() is what the notebook displays.
clf.fit(X, y)

In [None]:
# Accuracy of clf measured on X and y, expressed as a percentage.
accuracy_pct = clf.score(X, y) * 100.0
print("The accuracy is {:6.2f}%".format(accuracy_pct))

In [None]:
# Which columns does the fitted model rely on?
# The raw array can be inspected with print(clf.feature_importances_); here
# every non-zero entry is paired with its column name to make it readable.
importances = clf.feature_importances_
for column, importance in zip(feature_columns, importances):
    if importance > 0.0:
        print("Column {} is {}".format(column, importance))

---
### Visualize the Tree

In [None]:
# Install dependency
!apt install graphviz

In [None]:
# Install dependency
!pip install graphviz

In [None]:
import graphviz

# With out_file=None export_graphviz returns the DOT source as a string, so no
# intermediate file (and no hard-coded /tmp path) is needed.
# feature_names labels every split with its column name instead of X[i];
# class_names are given in ascending class order (0 -> "<=50K", 1 -> ">50K").
dot_graph = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_columns,
    class_names=["<=50K", ">50K"],
    filled=True,
)

# Last expression of the cell: the notebook renders the graph.
graphviz.Source(dot_graph)

---
### Challenge: Add Cross Validation

In [None]:
# Add cross-validation to a deep Decision Tree (a large max_depth) and see
# how the training and testing accuracies diverge.
# Why does this happen?

In [None]:
# Hold out 30% of the rows for testing. random_state is fixed so the split,
# and therefore the reported numbers, are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Train a fresh model on the training rows only.
clf = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
clf.fit(X_train, y_train)

# Score on both parts: a test accuracy well below the training accuracy
# (ratio clearly under 1) indicates over-fitting.
accuracy_train = clf.score(X_train, y_train)
accuracy_test = clf.score(X_test, y_test)
print("- This model training accuracy of {}".format(accuracy_train))
print("- This model testing accuracy of {}".format(accuracy_test))
print("- The ratio is {}".format(accuracy_test/accuracy_train))

In [None]:
accuracies_train = []
accuracies_test = []

# Repeated random hold-out validation: 25 different train/test splits.
# The loop index seeds both the split and the model, so every iteration uses a
# different split while the whole experiment stays reproducible across runs.
for i in range(25):

    # Create a training and a testing group (33% held out).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=i
    )

    # Train a fresh model on the training set.
    clf = tree.DecisionTreeClassifier(max_depth=5, random_state=i)
    clf.fit(X_train, y_train)

    # Record the accuracy on both the training and the testing set.
    accuracies_train.append(clf.score(X_train, y_train))
    accuracies_test.append(clf.score(X_test, y_test))

# One point per split for each curve.
assert len(accuracies_train) == len(accuracies_test)
plt.figure(figsize=(16,5))
plt.plot(range(len(accuracies_train)), accuracies_train, label='Train')
plt.plot(range(len(accuracies_test)), accuracies_test, label='Test')
plt.ylim(0.8,1.0)
plt.xlabel('Split number')
plt.ylabel('Accuracy')
# Trailing semicolon keeps the Legend repr out of the cell output.
plt.legend(loc='upper left');

---
### Challenge: Implement a Random Forest

See notes [here](https://jakevdp.github.io/PythonDataScienceHandbook/05.08-random-forests.html#Ensembles-of-Estimators:-Random-Forests)

In [1]:
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Hold out 30% of the rows for testing. random_state is fixed so the split,
# and therefore the reported numbers, are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Train a fresh forest on the training rows only; the forest draws random
# samples and features for each tree, so it is seeded as well.
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
clf.fit(X_train, y_train)

# Score on both parts: a test accuracy well below the training accuracy
# (ratio clearly under 1) indicates over-fitting.
accuracy_train = clf.score(X_train, y_train)
accuracy_test = clf.score(X_test, y_test)
print("- This model training accuracy of {}".format(accuracy_train))
print("- This model testing accuracy of {}".format(accuracy_test))
print("- The ratio is {}".format(accuracy_test/accuracy_train))

NameError: name 'train_test_split' is not defined

---
### Challenges: Test different Hyper-parameters for a Random Forest

In [None]:
# Change some of the hyper-parameters and
# see how it impacts the tree and performance
# Hyper-parameters:
# - max_depth
# - n_estimators

In [None]:
plt.figure(figsize=(16,10))

# Hyper-parameter values to compare; one subplot per (n_estimators, max_depth) pair.
n_estimators = [25,100,500]
max_depths = [5, 9]

# Position of the next subplot (matplotlib numbers them from 1, row by row).
ctr = 1
for n_param in n_estimators:
    for depth in max_depths:

        accuracies_train = []
        accuracies_test = []

        # Repeated random hold-out validation. The loop index seeds the split
        # and the forest, so every configuration is scored on the same five
        # splits and the figure is reproducible across runs.
        for i in range(5):

            # Create a training and a testing group (33% held out).
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.33, random_state=i
            )

            # Train a fresh model on the training set.
            clf = RandomForestClassifier(
                n_estimators=n_param, max_depth=depth, random_state=i
            )
            clf.fit(X_train, y_train)

            # Record the accuracy on both the training and the testing set.
            accuracies_train.append(clf.score(X_train, y_train))
            accuracies_test.append(clf.score(X_test, y_test))

        plt.subplot(len(n_estimators),len(max_depths), ctr)
        ctr += 1
        assert len(accuracies_train) == len(accuracies_test)
        plt.plot(range(len(accuracies_train)), accuracies_train, label='Train')
        plt.plot(range(len(accuracies_test)), accuracies_test, label='Test')
        plt.ylim(0.8,1.0)
        plt.legend(loc='upper left')
        plt.title("n_estimators({}), depth({})".format(n_param, depth))

# Keep the subplot titles from overlapping the plots above them.
plt.tight_layout()

---
### Challenges: Cross-validation for a Random Forest

In [None]:
# Implement cross-validation

In [None]:
accuracies_train = []
accuracies_test = []

# Repeated random hold-out validation: 5 different train/test splits.
# The loop index seeds both the split and the forest, so every iteration uses
# a different split while the whole experiment stays reproducible across runs.
for i in range(5):

    # Create a training and a testing group (33% held out).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=i
    )

    # Train a fresh model on the training set.
    clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=i)
    clf.fit(X_train, y_train)

    # Record the accuracy on both the training and the testing set.
    accuracies_train.append(clf.score(X_train, y_train))
    accuracies_test.append(clf.score(X_test, y_test))

# One point per split for each curve.
assert len(accuracies_train) == len(accuracies_test)
plt.figure(figsize=(16,5))
plt.plot(range(len(accuracies_train)), accuracies_train, label='Train')
plt.plot(range(len(accuracies_test)), accuracies_test, label='Test')
plt.ylim(0.8,1.0)
plt.xlabel('Split number')
plt.ylabel('Accuracy')
# Trailing semicolon keeps the Legend repr out of the cell output.
plt.legend(loc='upper left');

---
### More reading

* [BUILDING DECISION TREE ALGORITHM IN PYTHON WITH SCIKIT LEARN](http://dataaspirant.com/2017/02/01/decision-tree-algorithm-python-with-scikit-learn/)
* [How To Implement The Decision Tree Algorithm From Scratch In Python](https://machinelearningmastery.com/implement-decision-tree-algorithm-scratch-python/)