# Assignment is below at the end

- https://scikit-learn.org/stable/modules/tree.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)
plt.rcParams['font.size'] = 14
import pandas as pd

In [2]:
df = pd.read_csv('../data/adult.data', index_col=False)

In [3]:
golden = pd.read_csv('../data/adult.test', index_col=False)

In [4]:
golden.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [5]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [7]:
from sklearn import preprocessing

In [8]:
enc = preprocessing.OrdinalEncoder()

In [9]:
transform_columns = ['sex']
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

In [10]:
pd.get_dummies(df[transform_columns]).head()

Unnamed: 0,sex_ Female,sex_ Male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


In [11]:
x = df.copy()

x = pd.concat([x.drop(non_num_columns, axis=1), 
               pd.get_dummies(df[transform_columns])], axis=1,)

x["salary"] = enc.fit_transform(df[["salary"]])

In [12]:
x.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,sex_ Female,sex_ Male
0,39,77516,13,2174,0,40,0.0,0,1
1,50,83311,13,0,0,13,0.0,0,1
2,38,215646,9,0,0,40,0.0,0,1
3,53,234721,7,0,0,40,0.0,0,1
4,28,338409,13,0,0,40,0.0,1,0


In [13]:
xt = golden.copy()

xt = pd.concat([xt.drop(non_num_columns, axis=1), 
               pd.get_dummies(golden[transform_columns])], axis=1,)

xt["salary"] = enc.fit_transform(golden[["salary"]])

In [14]:
xt.salary.value_counts()

0.0    12435
1.0     3846
Name: salary, dtype: int64

In [15]:
enc.categories_

[array([' <=50K.', ' >50K.'], dtype=object)]

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

#### Choose the model of your preference: DecisionTree or RandomForest

In [17]:
model = RandomForestClassifier(criterion='entropy')

In [18]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=None)

In [19]:
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

DecisionTreeClassifier(criterion='entropy')

In [20]:
model.tree_.node_count

8317

In [21]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.32367448192304),
 ('education-num', 0.16039957769771152),
 ('capital-gain', 0.22830127920880136),
 ('capital-loss', 0.07800246105037775),
 ('hours-per-week', 0.15405831471575454),
 ('sex_ Female', 0.033582997454121355),
 ('sex_ Male', 0.021980887950193533)]

In [22]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.32367448192304),
 ('education-num', 0.16039957769771152),
 ('capital-gain', 0.22830127920880136),
 ('capital-loss', 0.07800246105037775),
 ('hours-per-week', 0.15405831471575454),
 ('sex_ Female', 0.033582997454121355),
 ('sex_ Male', 0.021980887950193533)]

In [23]:
x.drop(['fnlwgt','salary'], axis=1).head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,sex_ Female,sex_ Male
0,39,13,2174,0,40,0,1
1,50,13,0,0,13,0,1
2,38,9,0,0,40,0,1
3,53,7,0,0,40,0,1
4,28,13,0,0,40,1,0


In [24]:
set(x.columns) - set(xt.columns)

set()

In [25]:
list(x.drop('salary', axis=1).columns)

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'sex_ Female',
 'sex_ Male']

In [26]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [27]:
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, auc, roc_curve
)

In [28]:
accuracy_score(xt.salary, predictions)

0.8205269946563479

In [29]:
accuracy_score(xt.salary, predictions)

0.8205269946563479

In [30]:
confusion_matrix(xt.salary, predictions)

array([[11455,   980],
       [ 1942,  1904]], dtype=int64)

In [31]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89     12435
         1.0       0.66      0.50      0.57      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.71      0.73     16281
weighted avg       0.81      0.82      0.81     16281



In [32]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89     12435
         1.0       0.66      0.50      0.57      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.71      0.73     16281
weighted avg       0.81      0.82      0.81     16281



In [33]:
accuracy_score(x.salary, predictionsx)

0.8955806025613464

In [34]:
confusion_matrix(x.salary, predictionsx)

array([[24097,   623],
       [ 2777,  5064]], dtype=int64)

In [35]:
print(classification_report(x.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93     24720
         1.0       0.89      0.65      0.75      7841

    accuracy                           0.90     32561
   macro avg       0.89      0.81      0.84     32561
weighted avg       0.90      0.90      0.89     32561



In [36]:
print(classification_report(x.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93     24720
         1.0       0.89      0.65      0.75      7841

    accuracy                           0.90     32561
   macro avg       0.89      0.81      0.84     32561
weighted avg       0.90      0.90      0.89     32561



# For the following use the above `adult` dataset. Start with only numerical features/columns.  

In [37]:
df_train = df.copy()
df_test = golden.copy()

In [38]:
non_num_cols = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country'
]
df_train = df.drop(non_num_cols, axis = 1).copy()
x_train = df_train.drop('salary', axis = 1)
y_train = df_train[['salary']]

df_test = golden.drop(non_num_cols, axis = 1).copy()
x_test = df_test.drop('salary', axis = 1)
y_test = df_test[['salary']]

# 1. Show the RandomForest outperforms the DecisionTree for a fixed `max_depth` by training using the train set and `precision`, `recall`, `f1` on golden-test set.

In [39]:
y_train = enc.fit_transform(y_train).ravel()
y_test = enc.fit_transform(y_test).ravel()

In [40]:
rf = RandomForestClassifier(max_depth = 15)
dt = DecisionTreeClassifier(max_depth = 15)

In [41]:
rf.fit(x_train, y_train)
dt.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=15)

In [42]:
rf_pred = rf.predict(x_test)
dt_pred = dt.predict(x_test)

In [43]:
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, auc, roc_curve
)

In [44]:
def get_accuracies(y_true, preds):
    acc = accuracy_score(y_true, preds)
    confus = confusion_matrix(y_true, preds)
    classif = classification_report(y_true, preds)
    
    return [acc, confus, classif]

In [45]:
acc_dict = {"Random Forest": rf_pred, "Decision Tree": dt_pred}

In [46]:
for k, v in acc_dict.items():
    accs, confus, classif = get_accuracies(y_test, v)
    print(f"\033[1m{k}\033[0m: \n\nAccuracy: {accs:.3f}%\n\nConfusion Matrix:\n {confus}\n\nClassification Report:\n\n {classif}")

[1mRandom Forest[0m: 

Accuracy: 0.838%

Confusion Matrix:
 [[11882   553]
 [ 2081  1765]]

Classification Report:

               precision    recall  f1-score   support

         0.0       0.85      0.96      0.90     12435
         1.0       0.76      0.46      0.57      3846

    accuracy                           0.84     16281
   macro avg       0.81      0.71      0.74     16281
weighted avg       0.83      0.84      0.82     16281

[1mDecision Tree[0m: 

Accuracy: 0.820%

Confusion Matrix:
 [[11554   881]
 [ 2053  1793]]

Classification Report:

               precision    recall  f1-score   support

         0.0       0.85      0.93      0.89     12435
         1.0       0.67      0.47      0.55      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.70      0.72     16281
weighted avg       0.81      0.82      0.81     16281



# 2. For RandomForest or DecisionTree and using the `adult` dataset, systematically add new columns, one by one, that are non-numerical but converted using the feature-extraction techniques we learned. Show [`precision`, `recall`, `f1`] for each additional feature added.

In [47]:
import numpy as np

df_train = df.copy()
df_test = golden.copy()

In [48]:
def run_model(df_train,df_test,non_num_cols, target_var, col_list = ['sex'], model_type = 'Random Forest', verbose = True):
    """
    
    Function that runs classification models and outputs all accuracy metrics
    
    Variables
    --------------
    df_train: dataframe
        dataframe used to train the model
    df_test: dataframe
        dataframe used to test the model's accuracy
    non_num_cols: list
        list of columns that are non-numeric
    target_var: str
        name of predictor column
    col_list: list
        list of columns that are intended to be encoded and concatenated to the training dataframe
    model_type: str
        name of model used as well as report
            opts: 'Random Forest', 'Decision Tree'
    verbose: bool
        boolean to either print or not print the output of the accuracy report    
    """
    
    dftrain = df_train.drop(non_num_cols, axis = 1).copy()
    dftest = df_test.drop(non_num_cols, axis = 1).copy()

    xtrain = pd.concat([dftrain, pd.get_dummies(df_train[col_list])], axis = 1)
    xtest = pd.concat([dftest, pd.get_dummies(df_test[col_list])], axis = 1)
    
    diff_cols = set(xtrain.columns) - set(xtest.columns)
    
    # Create zero column for columns that differ between the test and training sets
    if diff_cols:
        difference_df = pd.DataFrame(data=np.zeros((xtest.shape[0], len(diff_cols))),
                                     columns=list(diff_cols))
        xtest = pd.concat([xtest, difference_df], axis = 1)
        xtest = xtest[xtrain.columns]
    

    ytrain = enc.fit_transform(df_train[[target_var]]).ravel()
    ytest = enc.fit_transform(df_test[[target_var]]).ravel()

    if model_type == 'Random Forest':
        model = RandomForestClassifier()
    elif model_type == 'Decision Tree':
        model = DecisionTreeClassifier()

    model.fit(xtrain, ytrain)
    preds = model.predict(xtest)

    acc_list = get_accuracies(ytest, preds)

    if verbose:
        print(f"\033[1m{model_type}\033[0m: \n\nNon-Numerical Columns: {col_list} \n\nAccuracy: {acc_list[0] * 100:.2f}%\n\nConfusion Matrix:\n {acc_list[1]}\n\nClassification Report:\n\n {acc_list[2]}\n\n")
        
    return acc_list

In [49]:
def run_model_addcat(df_train, df_test, target_var = 'salary'):
    """
    Function that systematically adds encoded categorical variables, trains a model, and then reports that accuracy in a formatted report
    
    Variables
    ----------
    df_train: dataframe
        dataframe used to train the model
    df_test: dataframe
        dataframe used to test the model
    target_var: str
        name of predictor column    
    
    """
    
    cat_vars = list(df_train.select_dtypes(include=['object']).columns)
    cat_vars.remove(target_var)
    cat_vars_uniquedict = {i:len(df[i].unique()) for i in cat_vars}

    non_num_cols = [i[0] for i in sorted(cat_vars_uniquedict.items(), key = lambda x: x[1])]
    
    cat_vars = list(df_train.select_dtypes(include=['object']).columns)
    non_num_cols = cat_vars.copy()
    cat_vars.remove(target_var)
    cat_vars_uniquedict = {i:len(df[i].unique()) for i in cat_vars}
    cat_vars = [i[0] for i in sorted(cat_vars_uniquedict.items(), key = lambda x: x[1])]
    print("---Starting Report---\n\n\n")
    for i in range(1,len(cat_vars) + 1):
        run_model(df_train, df_test, non_num_cols, 'salary', cat_vars[0:i], 'Random Forest', True)
        run_model(df_train, df_test, non_num_cols, 'salary', cat_vars[0:i], 'Decision Tree', True)
    print("\n\n\n---Finishing Report---")

    

In [50]:
run_model_addcat(df, golden)

---Starting Report---



[1mRandom Forest[0m: 

Non-Numerical Columns: ['sex'] 

Accuracy: 81.98%

Confusion Matrix:
 [[11287  1148]
 [ 1786  2060]]

Classification Report:

               precision    recall  f1-score   support

         0.0       0.86      0.91      0.88     12435
         1.0       0.64      0.54      0.58      3846

    accuracy                           0.82     16281
   macro avg       0.75      0.72      0.73     16281
weighted avg       0.81      0.82      0.81     16281



[1mDecision Tree[0m: 

Non-Numerical Columns: ['sex'] 

Accuracy: 78.48%

Confusion Matrix:
 [[10652  1783]
 [ 1720  2126]]

Classification Report:

               precision    recall  f1-score   support

         0.0       0.86      0.86      0.86     12435
         1.0       0.54      0.55      0.55      3846

    accuracy                           0.78     16281
   macro avg       0.70      0.70      0.70     16281
weighted avg       0.79      0.78      0.79     16281



[1mRandom Fo

# 3. Optional: Using gridSearch find the most optimal parameters for your model
Warning: this can be computationally intensive and may take some time.
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
- https://scikit-learn.org/stable/modules/grid_search.html

In [395]:
from sklearn.model_selection import GridSearchCV
parameters_rf = {
    'n_estimators' : [10, 50, 100, 200, 300, 500],
    'criterion' : ['gini', 'entropy'],
    'max_depth': [None, 10, 15, 25, 50]
}
rf = RandomForestClassifier()
clf = GridSearchCV(rf, parameters_rf)
clf.fit(x_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [None, 10, 15, 25, 50],
                         'n_estimators': [10, 50, 100, 200, 300, 500]})

In [397]:
clf.cv_results_

{'mean_fit_time': array([ 0.26607652,  1.49843245,  3.07443242,  5.42180214,  8.64762654,
        14.51657195,  0.14425912,  0.71538653,  1.42572818,  2.84674644,
         4.27355132,  7.39134669,  0.19362202,  1.06073928,  2.05134807,
         3.65817304,  5.48727622,  9.12941923,  0.24207854,  1.15445013,
         2.29742756,  5.3503171 ,  8.94251018, 15.09877682,  0.37479377,
         1.72970343,  3.06913753,  6.01469517,  9.01518703, 15.24403338,
         0.38510656,  2.0672318 ,  4.09541273,  8.10080194, 13.41103654,
        23.85574636,  0.21171923,  0.96369548,  2.60740027,  5.95415359,
         8.08663545, 11.94553466,  0.33672209,  1.4965764 ,  3.04027114,
         6.44706473,  8.87330565, 15.21995511,  0.34397197,  1.73710818,
         3.44171743,  7.7625483 , 11.62421522, 20.41408758,  0.40847578,
         2.03247933,  4.49909558,  7.94361548, 13.54872761, 22.33676162]),
 'std_fit_time': array([6.75443030e-02, 1.23954929e-01, 2.61542535e-01, 4.46269218e-01,
        3.4364756

In [399]:
clf.best_params_

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 100}

In [51]:
cat_vars = list(df.select_dtypes(include = ['object']).columns)
non_num_cols = cat_vars.copy()
cat_vars.remove('salary')

In [52]:
rf = RandomForestClassifier(criterion = 'gini', max_depth = 10, n_estimators = 100)

xtrain = pd.concat([df_train.drop(non_num_cols, axis = 1), pd.get_dummies(df[cat_vars])], axis = 1)
# xtrain = xtrain.drop('native-country_ Holand-Netherlands', axis = 1)
xtest = pd.concat([df_test.drop(non_num_cols, axis = 1), pd.get_dummies(golden[cat_vars])], axis = 1)
diff_cols = set(xtrain.columns) - set(xtest.columns)
diff_df = pd.DataFrame(np.zeros((xtest.shape[0], 1)), columns = diff_cols)
xtest = pd.concat([xtest, diff_df], axis = 1)
xtest = xtest[xtrain.columns]

ytrain = df[['salary']]
ytrain = enc.fit_transform(ytrain).ravel()
ytest = golden[['salary']]
ytest = enc.fit_transform(ytest).ravel()

rf.fit(xtrain, ytrain)
pred = rf.predict(xtest)

accs = get_accuracies(ytest, pred)

In [54]:
print(f"\033[1m{'Random Forest'}\033[0m: \n\nAccuracy: {accs[0] * 100:.2f}%\n\nConfusion Matrix:\n {accs[1]}\n\nClassification Report:\n\n {accs[2]}\n\n")

[1mRandom Forest[0m: 

Accuracy: 85.81%

Confusion Matrix:
 [[11888   547]
 [ 1763  2083]]

Classification Report:

               precision    recall  f1-score   support

         0.0       0.87      0.96      0.91     12435
         1.0       0.79      0.54      0.64      3846

    accuracy                           0.86     16281
   macro avg       0.83      0.75      0.78     16281
weighted avg       0.85      0.86      0.85     16281



