In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)
plt.rcParams['font.size'] = 14
import pandas as pd

In [2]:
df = pd.read_csv('../data/adult.data', index_col=False)

In [3]:
golden = pd.read_csv('../data/adult.test', index_col=False)

The `golden` dataframe is a "golden" test set that can be used to verify the performance of the model.

In [4]:
golden.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [5]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
df['salary'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [7]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [9]:
non_numeric_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country', 'fnlwgt']

In [10]:
# from sklearn import preprocessing

# enc = preprocessing.OrdinalEncoder()

# pd.get_dummies(df[transform_columns]).head()

In [11]:
x = df.copy().drop(non_numeric_columns, axis=1)

In [12]:
# x = pd.concat([x.drop(transform_columns, axis=1), 
#                pd.get_dummies(df[transform_columns])], axis=1,)

# x["salary"] = enc.fit_transform(df[["salary"]])
# enc.categories_

In [13]:
xt = golden.copy().drop(non_numeric_columns, axis=1)

In [14]:
# xt = pd.concat([xt.drop(transform_columns, axis=1), 
#                pd.get_dummies(golden[transform_columns])], axis=1,)

# xt["salary"] = enc.fit_transform(golden[["salary"]])

In [15]:
x['salary'] = x.salary.str.contains('>').astype(int)

In [16]:
xt['salary'] = xt.salary.str.contains('>').astype(int)

In [17]:
xt.salary.value_counts()

0    12435
1     3846
Name: salary, dtype: int64

In [18]:
# from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import RandomForestClassifier as Model
# model = Model()

#### For now we choose the DecisionTree as our classifier. More details next class about Decision Trees

In [19]:
model = DecisionTreeClassifier(criterion='entropy')

In [20]:
model.fit(x.drop(['salary'], axis=1), x.salary)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [21]:
list(zip(x.drop(['salary'], axis=1).columns, model.feature_importances_))

[('age', 0.3277892586574141),
 ('education-num', 0.16588018810033892),
 ('capital-gain', 0.2485558364304248),
 ('capital-loss', 0.097973625506434),
 ('hours-per-week', 0.15980109130538828)]

In [22]:
x.drop(['salary'] , axis=1).head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
0,39,13,2174,0,40
1,50,13,0,0,13
2,38,9,0,0,40
3,53,7,0,0,40
4,28,13,0,0,40


In [23]:
set(x.columns) - set(xt.columns)

set()

In [64]:
predictions = model.predict(xt.drop(['salary'], axis=1))

In [25]:
predictions_train = model.predict(x.drop(['salary'], axis=1))

In [26]:
xt.drop(['salary'], axis=1).head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
0,25,7,0,0,40
1,38,9,0,0,50
2,28,12,0,0,40
3,44,10,7688,0,40
4,18,10,0,0,30


### Here we use the metrics to calculate how well our model is performing.
- accuracy = number correct / total number
- precision = number of correct 1 (or 0) / number of 1s (or 0s) the model predicted
- recall = number of correct 1s (or 0s) / number of 1s (or 0s) that exist in the dataset
- f1 = 2 * prec * recall / (prec+recall)

In [27]:
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix, auc, roc_curve
                            )

In [55]:
import numpy as np

In [56]:
predictions = np.zeros(16281)

In [57]:
accuracy_score(xt.salary, predictions)

0.7637737239727289

In [28]:
accuracy_score(xt.salary, predictions)

0.8169031386278484

In [58]:
predictions.shape, xt.salary.values.shape

((16281,), (16281,))

In [54]:
(xt.salary.values == predictions).sum()/len(predictions)

0.8169031386278484

In [69]:
np.logical_and(xt.salary.values, predictions).sum()

1769

In [67]:
confusion_matrix(xt.salary, predictions)

array([[11531,   904],
       [ 2077,  1769]])

In [74]:
11531/(11531+2077), 1769/(1769+904), 1769/(1769+2077), 11531/(11531+904)

(0.8473691945914168,
 0.6618032173587729,
 0.45995839833593344,
 0.9273019702452754)

In [71]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89     12435
           1       0.66      0.46      0.54      3846

   micro avg       0.82      0.82      0.82     16281
   macro avg       0.75      0.69      0.71     16281
weighted avg       0.80      0.82      0.80     16281



In [75]:
2* (.80 *.10)/(.8+.1)

0.1777777777777778

### Can we run this on the training data? What type of scores should we expect?

In [31]:
accuracy_score(x.salary, predictions_train)

0.8841558920180584

In [32]:
confusion_matrix(x.salary, predictions_train)

array([[24136,   584],
       [ 3188,  4653]])

In [33]:
print(classification_report(x.salary, predictions_train))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93     24720
           1       0.89      0.59      0.71      7841

   micro avg       0.88      0.88      0.88     32561
   macro avg       0.89      0.78      0.82     32561
weighted avg       0.88      0.88      0.88     32561



In [34]:
len(x)

32561

# Now we use test_train_split instead of using the golden test set provided to us

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
x_train, x_test, y_train, y_test = train_test_split(x.drop(['salary'], axis=1),x.salary, test_size=.20)

In [37]:
x.shape, x_train.shape, x_test.shape

((32561, 6), (26048, 5), (6513, 5))

In [38]:
model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [39]:
test_predictions = model.predict(x_test)

In [40]:
accuracy_score(y_test, test_predictions)

0.8174420389989252

In [41]:
confusion_matrix(y_test, test_predictions)

array([[4634,  336],
       [ 853,  690]])

In [79]:
confusion_matrix(xt.salary, predictions)

array([[11531,   904],
       [ 2077,  1769]])

In [43]:
print(classification_report(y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.84      0.93      0.89      4970
           1       0.67      0.45      0.54      1543

   micro avg       0.82      0.82      0.82      6513
   macro avg       0.76      0.69      0.71      6513
weighted avg       0.80      0.82      0.80      6513



In [44]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89     12435
           1       0.66      0.47      0.55      3846

   micro avg       0.82      0.82      0.82     16281
   macro avg       0.75      0.70      0.72     16281
weighted avg       0.80      0.82      0.81     16281



# Using cross_validate to perform n-fold validations

In [45]:
from sklearn.model_selection import cross_validate

In [81]:
cross_validate(model, x.drop(['salary'], axis=1), x.salary, cv=20, 
               scoring=['accuracy', 'recall', 'precision', 'f1'], return_train_score=True)

{'fit_time': array([0.05218482, 0.05036306, 0.04355931, 0.04408693, 0.04560494,
        0.04542613, 0.04380012, 0.04379725, 0.04658771, 0.04353285,
        0.04618597, 0.04922199, 0.06221199, 0.05606103, 0.053514  ,
        0.04534197, 0.05001402, 0.04959989, 0.04892015, 0.04610419]),
 'score_time': array([0.0079689 , 0.00689578, 0.00608683, 0.00628901, 0.00596976,
        0.00682402, 0.0060699 , 0.00584984, 0.0061121 , 0.00645018,
        0.00679612, 0.00734806, 0.00680113, 0.00774193, 0.00900197,
        0.00783587, 0.00897002, 0.00692701, 0.00788999, 0.00763011]),
 'test_accuracy': array([0.81276857, 0.82555283, 0.81203931, 0.82125307, 0.81081081,
        0.82432432, 0.80773956, 0.81756757, 0.81449631, 0.81756757,
        0.82493857, 0.81818182, 0.80773956, 0.81081081, 0.82555283,
        0.81265356, 0.82002457, 0.81818182, 0.82063882, 0.80773956]),
 'train_accuracy': array([0.88526445, 0.88410435, 0.88549446, 0.88530049, 0.88504186,
        0.88449229, 0.88507419, 0.88504186, 0.884