# Assignment is below at the end

- https://scikit-learn.org/stable/modules/tree.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html

In [139]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)
plt.rcParams['font.size'] = 14
import pandas as pd

In [140]:
df = pd.read_csv('C:/Users/erinp/Desktop/mlnn/data/adult.data', index_col=False)

In [141]:
golden = pd.read_csv('C:/Users/erinp/Desktop/mlnn/data/adult.test', index_col=False)

In [142]:
golden.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [143]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [144]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [145]:
from sklearn import preprocessing

In [146]:
# Columns we want to transform
transform_columns = ['sex']

#Columns we can't use because non-numerical
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

## First let's try using `pandas.get_dummies()` to transform columns

In [147]:
dummies = pd.get_dummies(df[transform_columns])
dummies

Unnamed: 0,sex_ Female,sex_ Male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0
...,...,...
32556,1,0
32557,0,1
32558,1,0
32559,0,1


In [148]:
dummies.shape

(32561, 2)

## sklearn has a similar process for OneHot Encoding features

In [149]:
onehot = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)
onehot.fit(df[transform_columns])

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [150]:
onehot.categories_

[array([' Female', ' Male'], dtype=object)]

In [151]:
sex = onehot.transform(df[transform_columns])
sex

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [152]:
sex.shape

(32561, 2)

## In addition to OneHot encoding there is Ordinal Encoding 

In [153]:
enc = preprocessing.OrdinalEncoder()
enc.fit(df[["salary"]])
salary = enc.transform(df[["salary"]])
salary

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]])

In [154]:
enc.categories_[0]

array([' <=50K', ' >50K'], dtype=object)

In [155]:
x = df.copy()

# transformed = pd.get_dummies(df[transform_columns])


onehot = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False).fit(df[transform_columns])

enc = preprocessing.OrdinalEncoder()

enc.fit(df[["salary"]])


transformed = onehot.transform(df[transform_columns])
new_cols = list(onehot.categories_[0].flatten())
df_trans = pd.DataFrame(transformed, columns=new_cols)


x = pd.concat(
    [
        x.drop(non_num_columns, axis=1), 
        df_trans
    ], 
    axis=1,)


x["salary"] = enc.transform(df[["salary"]])

In [156]:
x.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,Female,Male
0,39,77516,13,2174,0,40,0.0,0.0,1.0
1,50,83311,13,0,0,13,0.0,0.0,1.0
2,38,215646,9,0,0,40,0.0,0.0,1.0
3,53,234721,7,0,0,40,0.0,0.0,1.0
4,28,338409,13,0,0,40,0.0,1.0,0.0


In [157]:
xt = golden.copy()

transformed = onehot.transform(xt[transform_columns])
new_cols = list(onehot.categories_[0].flatten())
df_trans = pd.DataFrame(transformed, columns=new_cols)

xt = pd.concat(
    [
        xt.drop(non_num_columns, axis=1), 
        df_trans
    ], 
    axis=1,)

xt["salary"] = enc.fit_transform(golden[["salary"]])

In [158]:
xt.salary.value_counts()

0.0    12435
1.0     3846
Name: salary, dtype: int64

In [159]:
enc.categories_

[array([' <=50K.', ' >50K.'], dtype=object)]

In [160]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

#### Choose the model of your preference: DecisionTree or RandomForest

In [161]:
model = RandomForestClassifier(criterion='entropy')

In [162]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=None)

In [163]:
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

DecisionTreeClassifier(criterion='entropy')

In [164]:
model.tree_.node_count

8347

In [165]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.3229634879505442),
 ('education-num', 0.1611890294796697),
 ('capital-gain', 0.22693236736850153),
 ('capital-loss', 0.07830677192060598),
 ('hours-per-week', 0.15490294824776152),
 (' Female', 0.03469264091677785),
 (' Male', 0.021012754116139256)]

In [166]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))

[('age', 0.3229634879505442),
 ('education-num', 0.1611890294796697),
 ('capital-gain', 0.22693236736850153),
 ('capital-loss', 0.07830677192060598),
 ('hours-per-week', 0.15490294824776152),
 (' Female', 0.03469264091677785),
 (' Male', 0.021012754116139256)]

In [167]:
x.drop(['fnlwgt','salary'], axis=1).head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,Female,Male
0,39,13,2174,0,40,0.0,1.0
1,50,13,0,0,13,0.0,1.0
2,38,9,0,0,40,0.0,1.0
3,53,7,0,0,40,0.0,1.0
4,28,13,0,0,40,1.0,0.0


In [168]:
set(x.columns) - set(xt.columns)

set()

In [169]:
list(x.drop('salary', axis=1).columns)

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 ' Female',
 ' Male']

In [170]:
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [171]:
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, auc, roc_curve
)

In [172]:
accuracy_score(xt.salary, predictions)

0.8205269946563479

In [173]:
accuracy_score(xt.salary, predictions)

0.8205269946563479

In [174]:
confusion_matrix(xt.salary, predictions)

array([[11461,   974],
       [ 1948,  1898]], dtype=int64)

In [175]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.85      0.92      0.89     12435
         1.0       0.66      0.49      0.57      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.71      0.73     16281
weighted avg       0.81      0.82      0.81     16281



In [176]:
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.85      0.92      0.89     12435
         1.0       0.66      0.49      0.57      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.71      0.73     16281
weighted avg       0.81      0.82      0.81     16281



In [177]:
accuracy_score(x.salary, predictionsx)

0.8955806025613464

In [178]:
confusion_matrix(x.salary, predictionsx)

array([[24097,   623],
       [ 2777,  5064]], dtype=int64)

In [179]:
print(classification_report(x.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93     24720
         1.0       0.89      0.65      0.75      7841

    accuracy                           0.90     32561
   macro avg       0.89      0.81      0.84     32561
weighted avg       0.90      0.90      0.89     32561



In [180]:
print(classification_report(x.salary, predictionsx))

              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93     24720
         1.0       0.89      0.65      0.75      7841

    accuracy                           0.90     32561
   macro avg       0.89      0.81      0.84     32561
weighted avg       0.90      0.90      0.89     32561



# For the following use the above `adult` dataset. 

# 1. Show the RandomForest outperforms the DecisionTree for a fixed `max_depth` by training using the train set and calculate `precision`, `recall`, `f1`, `confusion matrix` on golden-test set. Start with only numerical features/columns. (age, education-num, capital-gain, capital-loss, hours-per-week) 

In [215]:
model_rf = RandomForestClassifier(criterion='entropy')
model_dt = DecisionTreeClassifier(criterion='entropy', max_depth=None)

In [216]:
# create df with only numerical features/columns (fnlwgt and salary will be dropped when building model)
numeric_df = df.copy().drop(non_num_columns, axis = 1)
numeric_df.columns

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'salary'],
      dtype='object')

In [217]:
# make salary in numeric_df numeric
enc.fit(numeric_df[["salary"]])
salary = enc.transform(numeric_df[["salary"]])
salary

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]])

In [218]:
numeric_df["salary"] = enc.transform(numeric_df[["salary"]])

In [219]:
numeric_df.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary
0,39,77516,13,2174,0,40,0.0
1,50,83311,13,0,0,13,0.0
2,38,215646,9,0,0,40,0.0
3,53,234721,7,0,0,40,0.0
4,28,338409,13,0,0,40,0.0


In [227]:
# create train set using only numeric columns
numeric_train = golden.copy().drop(non_num_columns, axis = 1)
numeric_train.columns

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'salary'],
      dtype='object')

In [230]:
# make salary in numeric_train numeric
enc.fit(numeric_train[["salary"]])
salary = enc.transform(numeric_train[["salary"]])
numeric_train["salary"] = enc.transform(numeric_train[["salary"]])

In [231]:
numeric_train.salary.value_counts()

0.0    12435
1.0     3846
Name: salary, dtype: int64

In [232]:
# fit random forest and decision tree models
model_rf.fit(numeric_df.drop(['fnlwgt','salary'], axis=1), numeric_df.salary)
model_dt.fit(numeric_df.drop(['fnlwgt','salary'], axis=1), numeric_df.salary)

DecisionTreeClassifier(criterion='entropy')

In [233]:
# predictions using numeric_train for random forest
predictions_rf_train = model_rf.predict(numeric_train.drop(['fnlwgt','salary'], axis=1))
predictions_rf = model_rf.predict(numeric_df.drop(['fnlwgt','salary'], axis=1))

In [234]:
# predictions using numeric_train for decision tree
predictions_dt_train = model_dt.predict(numeric_train.drop(['fnlwgt','salary'], axis=1))
predictions_dt = model_dt.predict(numeric_df.drop(['fnlwgt','salary'], axis=1))

In [235]:
# classification report for random forest
print(classification_report(numeric_train.salary, predictions_rf_train))

              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89     12435
         1.0       0.68      0.48      0.56      3846

    accuracy                           0.82     16281
   macro avg       0.77      0.70      0.73     16281
weighted avg       0.81      0.82      0.81     16281



In [236]:
# classification report for decision tree
print(classification_report(numeric_train.salary, predictions_dt_train))

              precision    recall  f1-score   support

         0.0       0.85      0.93      0.88     12435
         1.0       0.66      0.46      0.54      3846

    accuracy                           0.82     16281
   macro avg       0.75      0.69      0.71     16281
weighted avg       0.80      0.82      0.80     16281



There is lower precision, recall, and f1-scores for the decision tree when comparing classification reports using the training set featuring only numeric columns.

# 2. Use a RandomForest or DecisionTree and the `adult` dataset, systematically add new columns, one by one, that are non-numerical but converted using the feature-extraction techniques we learned. Using the golden-test set show [`precision`, `recall`, `f1`, `confusion matrix`] for each additional feature added.

In [184]:
# x and xt already have non-numeric columns removed and sex transformed
x.columns

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'salary', ' Female', ' Male'],
      dtype='object')

In [185]:
xt.columns

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'salary', ' Female', ' Male'],
      dtype='object')

In [237]:
non_num_columns

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [276]:
# predictions using sex feature (same as example)
predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))

In [239]:
# classification report using sex feature
print(classification_report(xt.salary, predictions))

              precision    recall  f1-score   support

         0.0       0.85      0.92      0.89     12435
         1.0       0.66      0.49      0.57      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.71      0.73     16281
weighted avg       0.81      0.82      0.81     16281



In [268]:
# transform another variable, race

# Columns we want to transform
transform_columns = ['sex', 'race']

dummies = pd.get_dummies(df[transform_columns],prefix='', prefix_sep='')
dummies

Unnamed: 0,Female,Male,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
0,0,1,0,0,0,0,1
1,0,1,0,0,0,0,1
2,0,1,0,0,0,0,1
3,0,1,0,0,1,0,0
4,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...
32556,1,0,0,0,0,0,1
32557,0,1,0,0,0,0,1
32558,1,0,0,0,0,0,1
32559,0,1,0,0,0,0,1


In [269]:
x_sex_race = df.copy()

enc.fit(df[["salary"]])

x_sex_race = pd.concat(
    [
        x_sex_race.drop(non_num_columns, axis=1), 
        dummies
    ], 
    axis=1,)


x_sex_race["salary"] = enc.transform(df[["salary"]])

In [270]:
x_sex_race.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,Female,Male,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
0,39,77516,13,2174,0,40,0.0,0,1,0,0,0,0,1
1,50,83311,13,0,0,13,0.0,0,1,0,0,0,0,1
2,38,215646,9,0,0,40,0.0,0,1,0,0,0,0,1
3,53,234721,7,0,0,40,0.0,0,1,0,0,1,0,0
4,28,338409,13,0,0,40,0.0,1,0,0,0,1,0,0


In [280]:
x_sex_race_train = golden.copy()

dummies = pd.get_dummies(golden[transform_columns],prefix='', prefix_sep='')

x_sex_race_train = pd.concat(
    [
        x_sex_race_train.drop(non_num_columns, axis=1), 
        dummies
    ], 
    axis=1,)

x_sex_race_train["salary"] = enc.fit_transform(golden[["salary"]])

In [281]:
# predictions using sex and race feature
model_rf = RandomForestClassifier(criterion='entropy')

model_rf.fit(x_sex_race.drop(['fnlwgt','salary'], axis=1), x_sex_race.salary)

predictions_sex_race = model_rf.predict(x_sex_race_train.drop(['fnlwgt','salary'], axis=1))
predictionsx_sex_race = model_rf.predict(x_sex_race.drop(['fnlwgt','salary'], axis=1))

In [282]:
# classification report using sex and race feature
print(classification_report(x_sex_race_train.salary, predictions_sex_race))

              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89     12435
         1.0       0.67      0.51      0.58      3846

    accuracy                           0.83     16281
   macro avg       0.77      0.72      0.74     16281
weighted avg       0.82      0.83      0.82     16281



Adding the race feature improves the metrics in the classification report.

In [284]:
# transform another variable, workclass

# Columns we want to transform
transform_columns = ['sex', 'race', 'workclass']

dummies = pd.get_dummies(df[transform_columns],prefix='', prefix_sep='')
dummies

Unnamed: 0,Female,Male,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,?,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay
0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0
3,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
32557,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0
32558,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
32559,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0


In [285]:
x_sex_race_class = df.copy()

enc.fit(df[["salary"]])

x_sex_race_class = pd.concat(
    [
        x_sex_race_class.drop(non_num_columns, axis=1), 
        dummies
    ], 
    axis=1,)


x_sex_race_class["salary"] = enc.transform(df[["salary"]])

In [286]:
x_sex_race_class_train = golden.copy()

dummies = pd.get_dummies(golden[transform_columns],prefix='', prefix_sep='')

x_sex_race_class_train = pd.concat(
    [
        x_sex_race_class_train.drop(non_num_columns, axis=1), 
        dummies
    ], 
    axis=1,)

x_sex_race_class_train["salary"] = enc.fit_transform(golden[["salary"]])

In [287]:
# predictions using sex, race and class feature
model_rf = RandomForestClassifier(criterion='entropy')

model_rf.fit(x_sex_race_class.drop(['fnlwgt','salary'], axis=1), x_sex_race_class.salary)

predictions_sex_race_class = model_rf.predict(x_sex_race_class_train.drop(['fnlwgt','salary'], axis=1))
predictionsx_sex_race_class = model_rf.predict(x_sex_race_class.drop(['fnlwgt','salary'], axis=1))

In [288]:
# classification report using sex, race and class feature
print(classification_report(x_sex_race_class_train.salary, predictions_sex_race_class))

              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89     12435
         1.0       0.67      0.53      0.59      3846

    accuracy                           0.83     16281
   macro avg       0.77      0.72      0.74     16281
weighted avg       0.82      0.83      0.82     16281



The recall and f1-score are higher after adding class.

In [289]:
# let's try adding one more, marital status

# Columns we want to transform
transform_columns = ['sex', 'race', 'workclass', 'marital-status']

dummies = pd.get_dummies(df[transform_columns],prefix='', prefix_sep='')
dummies

Unnamed: 0,Female,Male,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,?,Federal-gov,Local-gov,...,Self-emp-not-inc,State-gov,Without-pay,Divorced,Married-AF-spouse,Married-civ-spouse,Married-spouse-absent,Never-married,Separated,Widowed
0,0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
32557,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
32558,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
32559,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [290]:
x_sex_race_class_mar = df.copy()

enc.fit(df[["salary"]])

x_sex_race_class_mar = pd.concat(
    [
        x_sex_race_class_mar.drop(non_num_columns, axis=1), 
        dummies
    ], 
    axis=1,)


x_sex_race_class_mar["salary"] = enc.transform(df[["salary"]])

In [291]:
x_sex_race_class_mar_train = golden.copy()

dummies = pd.get_dummies(golden[transform_columns],prefix='', prefix_sep='')

x_sex_race_class_mar_train = pd.concat(
    [
        x_sex_race_class_mar_train.drop(non_num_columns, axis=1), 
        dummies
    ], 
    axis=1,)

x_sex_race_class_mar_train["salary"] = enc.fit_transform(golden[["salary"]])

In [295]:
# predictions using sex, race, class and marital status feature
model_rf = RandomForestClassifier(criterion='entropy')

model_rf.fit(x_sex_race_class_mar.drop(['fnlwgt','salary'], axis=1), x_sex_race_class_mar.salary)

predictions_sex_race_class_mar = model_rf.predict(x_sex_race_class_mar_train.drop(['fnlwgt','salary'], axis=1))
predictionsx_sex_race_class_mar = model_rf.predict(x_sex_race_class_mar.drop(['fnlwgt','salary'], axis=1))

In [296]:
# classification report using sex, race and class feature
print(classification_report(x_sex_race_class_mar_train.salary, predictions_sex_race_class_mar))

              precision    recall  f1-score   support

         0.0       0.88      0.92      0.90     12435
         1.0       0.69      0.61      0.65      3846

    accuracy                           0.84     16281
   macro avg       0.79      0.76      0.77     16281
weighted avg       0.84      0.84      0.84     16281



Again we see improvements after adding a feature.