In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_validate
import numpy as np
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
from sklearn.metrics import fbeta_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv('data/census.csv')
print(data.head(5))

   age         workclass   fnlgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country salary  
0          2174             0              40  United-States  <=50K  
1             0             0             

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlgt           32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
data.shape

(32561, 15)

In [5]:
train, test = train_test_split(data, test_size=0.20)

In [6]:
print(train.shape)
print(test.shape)

(26048, 15)
(6513, 15)


In [7]:
cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

In [8]:
def process_data(X, categorical_features=[], label=None, training=True, encoder=None, lb=None):
    if label is not None:
        y = X[label]
        X = X.drop([label], axis=1)
    else:
        y = np.array([])

    X_categorical = X[categorical_features].values
    X_continuous = X.drop(*[categorical_features], axis=1)

    if training is True:
        encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
        lb = LabelBinarizer()
        X_categorical = encoder.fit_transform(X_categorical)
        y = lb.fit_transform(y.values).ravel()
    else:
        X_categorical = encoder.transform(X_categorical)
        try:
            y = lb.transform(y.values).ravel()
        # Catch the case where y is None because we're doing inference.
        except AttributeError:
            pass

    X = np.concatenate([X_continuous, X_categorical], axis=1)
    return X, y, encoder, lb

In [9]:
X_train, y_train, encoder, lb = process_data(train, categorical_features=cat_features, label="salary")

In [10]:
print(X_train.shape)
print(y_train.shape)

(26048, 108)
(26048,)


In [11]:
X_test, y_test, _, _ = process_data(test, categorical_features=cat_features, label="salary", 
                                             training=False, encoder=encoder, lb=lb)

In [12]:
print(X_test.shape)
print(y_test.shape)

(6513, 108)
(6513,)


In [38]:
def train_model(X_train, y_train):
    
    cv = KFold(n_splits=10)
    
    scoring = {
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score), 
        'f1_score': make_scorer(f1_score)
    }
    
    model = RandomForestClassifier(n_estimators=300)
    model.fit(X_train, y_train)
    
    scores = cross_validate(model, X_train, y_train, scoring=scoring, cv=cv)
    
    print(f"precision Score: {np.mean(scores['test_precision']):.3f}")
    print(f"recall Score: {np.mean(scores['test_recall']):.3f}")
    print(f"f1 Score: {np.mean(scores['test_f1_score']):.3f}")
    
    return model

In [39]:
def inference(model, X):
    preds = model.predict(X)
    return preds

In [40]:
def compute_model_metrics(y, preds):
    fbeta = fbeta_score(y, preds, beta=1, zero_division=1)
    precision = precision_score(y, preds, zero_division=1)
    recall = recall_score(y, preds, zero_division=1)
    return precision, recall, fbeta

In [41]:
model = train_model(X_train, y_train)

precision Score: 0.735
recall Score: 0.629
f1 Score: 0.678


In [42]:
import pickle

pickle.dump(model, open('model/model_300.pkl', "wb"))

In [43]:
model = pickle.load(open('model/model_300.pkl', "rb"))

In [44]:
preds = inference(model, X_test)

In [45]:
precision, recall, fbeta = compute_model_metrics(y_test, preds)

In [46]:
print(f"precision: {precision:.3f}")
print(f"recall: {recall:.3f}")
print(f"fbeta: {fbeta:.3f}")

precision: 0.741
recall: 0.616
fbeta: 0.672


In [47]:
df = data

In [52]:
df['workclass'].value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [54]:
metric_per_slice = []

for category in cat_features:
    print(f'processing ------------ {category}')
    for category_cls in df[category].unique():
        df_slice = df[df[category] == category_cls]

        X_slice_test, y_slice_test, _, _ = process_data(df_slice, categorical_features=cat_features, label="salary", 
                                                 training=False, encoder=encoder, lb=lb)
        print(f'for class: {category_cls}')
        
        preds = inference(model, X_test)
        precision, recall, fbeta = compute_model_metrics(y_test, preds)
        
        category_cls_name = category_cls if category_cls != '?' else 'unknown'

        metric_per_slice.append([f'{category}-{category_cls_name}', f'{precision:.3f}', 
                                 f'{recall:.3f}', f'{fbeta:.3f}'])
        
with open('model/slice_output.txt', 'w') as fp:
    for single_slice in metric_per_slice:
        # write each item on a new line
        fp.write("%s\n" % single_slice)

processing ------------ workclass
for class: State-gov
for class: Self-emp-not-inc
for class: Private
for class: Federal-gov
for class: Local-gov
for class: ?
for class: Self-emp-inc
for class: Without-pay
for class: Never-worked
processing ------------ education
for class: Bachelors
for class: HS-grad
for class: 11th
for class: Masters
for class: 9th
for class: Some-college
for class: Assoc-acdm
for class: Assoc-voc
for class: 7th-8th
for class: Doctorate
for class: Prof-school
for class: 5th-6th
for class: 10th
for class: 1st-4th
for class: Preschool
for class: 12th
processing ------------ marital-status
for class: Never-married
for class: Married-civ-spouse
for class: Divorced
for class: Married-spouse-absent
for class: Separated
for class: Married-AF-spouse
for class: Widowed
processing ------------ occupation
for class: Adm-clerical
for class: Exec-managerial
for class: Handlers-cleaners
for class: Prof-specialty
for class: Other-service
for class: Sales
for class: Craft-repair
fo