In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)
plt.rcParams['font.size'] = 14
import pandas as pd

In [3]:
df = pd.read_csv('../data/adult.data', index_col=False)

In [4]:
golden = pd.read_csv('../data/adult.test', index_col=False)

The `golden` dataframe is a "golden" test set that can be used to verify the performance of the model.

In [5]:
golden.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [6]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
df['salary'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [8]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [10]:
non_numeric_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country', 'fnlwgt']

In [11]:
# from sklearn import preprocessing

# enc = preprocessing.OrdinalEncoder()

# pd.get_dummies(df[transform_columns]).head()

In [12]:
x = df.copy().drop(non_numeric_columns, axis=1)

In [13]:
# x = pd.concat([x.drop(transform_columns, axis=1), 
#                pd.get_dummies(df[transform_columns])], axis=1,)

# x["salary"] = enc.fit_transform(df[["salary"]])
# enc.categories_

In [14]:
xt = golden.copy().drop(non_numeric_columns, axis=1)

In [15]:
# xt = pd.concat([xt.drop(transform_columns, axis=1), 
#                pd.get_dummies(golden[transform_columns])], axis=1,)

# xt["salary"] = enc.fit_transform(golden[["salary"]])

In [16]:
x['salary'] = x.salary.str.contains('>').astype(int)

In [17]:
xt['salary'] = xt.salary.str.contains('>').astype(int)

In [18]:
x.salary.value_counts()

0    24720
1     7841
Name: salary, dtype: int64

In [19]:
xt.salary.value_counts()

0    12435
1     3846
Name: salary, dtype: int64

In [20]:
# from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import RandomForestClassifier as Model
# model = Model()

#### For now we choose the DecisionTree as our classifier. More details next class about Decision Trees

In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
model = DecisionTreeClassifier(criterion='entropy')

In [25]:
model.fit(x.drop(['salary'], axis=1), x.salary)
x_train, x_test, y_train, y_test = train_test_split(x.drop(['salary'], axis=1),x.salary, test_size=.20)

In [26]:
list(zip(x.drop(['salary'], axis=1).columns, model.feature_importances_))

[('age', 0.3277472031065908),
 ('education-num', 0.16669390396474873),
 ('capital-gain', 0.24758558299408112),
 ('capital-loss', 0.09775111857802014),
 ('hours-per-week', 0.1602221913565592)]

In [27]:
x.drop(['salary'] , axis=1).head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
0,39,13,2174,0,40
1,50,13,0,0,13
2,38,9,0,0,40
3,53,7,0,0,40
4,28,13,0,0,40


In [28]:
set(x.columns) - set(xt.columns)

set()

In [38]:
predictions = model.predict(xt.drop(['salary'], axis=1))

In [39]:
predictions_train = model.predict(x.drop(['salary'], axis=1))

In [41]:
xt.drop(['salary'], axis=1).head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
0,25,7,0,0,40
1,38,9,0,0,50
2,28,12,0,0,40
3,44,10,7688,0,40
4,18,10,0,0,30


### Here we use the metrics to calculate how well our model is performing.
- accuracy = number correct / total number
- precision = number of correct 1 (or 0) / number of 1s (or 0s) the model predicted
- recall = number of correct 1s (or 0s) / number of 1s (or 0s) that exist in the dataset
- f1 = 2 * prec * recall / (prec+recall)

In [42]:
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix, auc, roc_curve
                            )

In [43]:
import numpy as np

In [149]:
predictions = np.zeros(16281)

In [44]:
xt.salary.size

16281

In [150]:
accuracy_score(xt.salary, predictions)

0.7637737239727289

In [151]:
predictions.shape, xt.salary.values.shape

((16281,), (16281,))

In [152]:
(xt.salary.values == predictions).sum()/len(predictions)

0.7637737239727289

In [153]:
np.logical_and(xt.salary.values, predictions).sum()

0

In [33]:
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix
                            )

In [37]:
confusion_matrix(xt.salary, predictions)

array([[11510,   925],
       [ 2068,  1778]], dtype=int64)

In [155]:
11531/(11531+2077), 1769/(1769+904), 1769/(1769+2077), 11531/(11531+904)

(0.8473691945914168,
 0.6618032173587729,
 0.45995839833593344,
 0.9273019702452754)

In [118]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

           0       0.76      1.00      0.87     12435
           1       0.00      0.00      0.00      3846

    accuracy                           0.76     16281
   macro avg       0.38      0.50      0.43     16281
weighted avg       0.58      0.76      0.66     16281



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [119]:
2* (.80 *.10)/(.8+.1)

0.1777777777777778

### Can we run this on the training data? What type of scores should we expect?

In [156]:
accuracy_score(x.salary, predictions_train)

0.8841558920180584

In [157]:
confusion_matrix(x.salary, predictions_train)

array([[24136,   584],
       [ 3188,  4653]], dtype=int64)

In [158]:
print(classification_report(x.salary, predictions_train))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93     24720
           1       0.89      0.59      0.71      7841

    accuracy                           0.88     32561
   macro avg       0.89      0.78      0.82     32561
weighted avg       0.88      0.88      0.88     32561



In [159]:
len(x)

32561

# Now we use test_train_split instead of using the golden test set provided to us

In [160]:
from sklearn.model_selection import train_test_split

In [161]:
x_train, x_test, y_train, y_test = train_test_split(x.drop(['salary'], axis=1),x.salary, test_size=.20)

In [163]:
x.shape, x_train.shape, x_test.shape

((32561, 6), (26048, 5), (6513, 5))

In [164]:
model.fit(x_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [165]:
test_predictions = model.predict(x_test)

In [166]:
accuracy_score(y_test, test_predictions)

0.8232765238753262

In [171]:
confusion_matrix(y_test, test_predictions)

array([[4603,  381],
       [ 770,  759]], dtype=int64)

In [172]:
confusion_matrix(xt.salary, predictions)

array([[12435,     0],
       [ 3846,     0]], dtype=int64)

In [173]:
print(classification_report(y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.86      0.92      0.89      4984
           1       0.67      0.50      0.57      1529

    accuracy                           0.82      6513
   macro avg       0.76      0.71      0.73      6513
weighted avg       0.81      0.82      0.81      6513



In [170]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

           0       0.76      1.00      0.87     12435
           1       0.00      0.00      0.00      3846

    accuracy                           0.76     16281
   macro avg       0.38      0.50      0.43     16281
weighted avg       0.58      0.76      0.66     16281



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Using cross_validate to perform n-fold validations

In [174]:
from sklearn.model_selection import cross_validate

In [175]:
cross_validate(model, x.drop(['salary'], axis=1), x.salary, cv=20, 
               scoring=['accuracy', 'recall', 'precision', 'f1'], return_train_score=True)

{'fit_time': array([0.04188681, 0.04188728, 0.03992844, 0.04088354, 0.03986049,
        0.04091597, 0.04040217, 0.04089069, 0.04089022, 0.03989625,
        0.04085803, 0.04089522, 0.04085469, 0.04212236, 0.04118395,
        0.03992677, 0.04085779, 0.04091978, 0.04190731, 0.04188609]),
 'score_time': array([0.00498652, 0.0050137 , 0.00395489, 0.00498867, 0.00498652,
        0.00399256, 0.00402403, 0.00498629, 0.00498652, 0.00398922,
        0.00498748, 0.00398397, 0.00402331, 0.00701714, 0.00399184,
        0.00395918, 0.00399017, 0.00395465, 0.00397229, 0.00598431]),
 'test_accuracy': array([0.81399632, 0.82616708, 0.81326781, 0.81818182, 0.80896806,
        0.82432432, 0.80773956, 0.81756757, 0.81265356, 0.81941032,
        0.82248157, 0.81572482, 0.80773956, 0.81081081, 0.82616708,
        0.81388206, 0.81879607, 0.81756757, 0.81818182, 0.80712531]),
 'train_accuracy': array([0.88526445, 0.88410435, 0.88549446, 0.88530049, 0.88504186,
        0.88449229, 0.88507419, 0.88504186, 0.884

In [None]:
[('CBC/CRP', 0.09150688820803148),
 ('Traveling in past 3 months ago', 0.001108024675533264),
 ('Connection with a suspected (covid-19) person', 0.002780418994537029),
 ('Diabetes', 0.04152608671398356),
 ('blood pressure', 0.0693419819146794),
 ('Asthma', 0.0413161274777236),
 ('Heart disease', 0.052924078580527126),
 ('kidney disease', 0.09430407628238216),
 ('Respiratory disease', 0.05239074705358912),
 ('Cancer', 0.11804601451444832),
 ('Corticosteroids', 0.0),
 ('HIV', 0.0),
 (' transplant', 0.0),
 ('HEM', 0.0244129711043944),
 (' Immunodeficiency', 0.0),
 ('Liver disease', 0.040837063861037665),
 ('Rheumatological disease', 0.0),
 (' Chest pain', 0.0),
 ('Fever', 0.0011326675161627536),
 ('Trembling or Shakes', 0.002059458722032772),
 ('Weakness', 0.03687419051155895),
 ('Sweating', 0.0),
 ('Sore throat', 0.02959999924636446),
 ('dyspnea', 0.1280551818981637),
 ('Dry cough', 0.03629134661895331),
 ('Cough with sputum', 0.0),
 ('Fatigue, whole body hurts', 0.03770801554701942),
 ('Anosmia', 0.0),
 ('Ageusia', 0.01837735455557079),
 ('Anorexie', 0.0),
 ('Eczema', 0.0),
 ('Conjunctivitis (Pink eye)', 0.0),
 ('Blindness and Tunnel vision', 0.0),
 ('Vertigo', 0.0),
 ('Nausea/Diarrhea', 0.0),
 ('Tobacco', 0.0),
 ('Chest pain: Diagnosis of stroke or heart disease', 0.0),
 ('The Infected person ( covid-19)  in family', 0.07940730600330663)]