In [11]:
%load_ext autoreload
%autoreload 2

import logging
import os.path
import pandas as pd
from sklearn.model_selection import train_test_split
from ml.data import process_data
from ml.model import train_model, model_performance_on_slices, generate_slices, save_model

logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
logger = logging.getLogger()

logger.info('Reading data')
data = pd.read_csv(os.path.join('..', 'data', 'census-cleaner.csv'))

logger.info('Splitting data')
train, test = train_test_split(data, test_size=0.20)

logger.info('Processing data')
cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]
target = "salary"
X_train, y_train, encoder, lb = process_data(
    train, categorical_features=cat_features, label=target, training=True
)

X_test, y_test, _, _ = process_data(
    test, categorical_features=cat_features, label=target, training=False, encoder=encoder, lb=lb
)

ImportError: cannot import name 'save_model' from 'ml.model' (/Users/eric/src/learning/udacity-mldevops-exercise3/model/ml/model.py)

In [2]:
logger.info('Training classifier')
fixed_params = {'use_label_encoder': False, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'subsample': 0.8, 'colsample_bytree': 0.8, 'seed': 42}
model = train_model(X_train, y_train, fixed_params)

2022-04-29 10:00:27,564 Training classifier
2022-04-29 10:00:27,566 Tuning hyperparameters for roc_auc
2022-04-29 10:00:49,951 Best parameters set found on development set:
2022-04-29 10:00:49,952 {'eta': 0.1, 'max_depth': 7, 'n_estimators': 150}


In [10]:
logger.info('Saving model')
from ml.model import train_model, model_performance_on_slices, generate_slices, save_model
save_model(model, os.path.join(os.path.dirname(__file__), 'xgboost-model.json'))

2022-04-29 10:10:39,245 Saving model


ImportError: cannot import name 'save_model' from 'ml.model' (/Users/eric/src/learning/udacity-mldevops-exercise3/model/ml/model.py)

In [4]:
test

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
10184,42,Self-emp-not-inc,115323,Doctorate,16,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,7,?,<=50K
28404,41,Private,157025,HS-grad,9,Never-married,Machine-op-inspct,Unmarried,Black,Male,0,0,40,United-States,<=50K
30200,36,Private,247936,HS-grad,9,Married-civ-spouse,Other-service,Wife,Asian-Pac-Islander,Female,0,0,2,Taiwan,<=50K
5905,60,Self-emp-not-inc,88055,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,24,United-States,<=50K
5764,54,Private,88278,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1977,50,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24027,86,Private,149912,Masters,14,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K
28976,23,Private,308498,Some-college,10,Never-married,Craft-repair,Own-child,White,Male,0,0,15,United-States,<=50K
13062,21,Private,182117,Bachelors,13,Never-married,Other-service,Other-relative,White,Male,0,0,20,United-States,<=50K
8072,44,Self-emp-not-inc,26669,Assoc-acdm,12,Married-civ-spouse,Other-service,Wife,White,Female,0,0,99,United-States,<=50K


In [5]:
encoder

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [6]:
X_test

array([[4.20000e+01, 1.15323e+05, 1.60000e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [4.10000e+01, 1.57025e+05, 9.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.60000e+01, 2.47936e+05, 9.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [2.10000e+01, 1.82117e+05, 1.30000e+01, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [4.40000e+01, 2.66690e+04, 1.20000e+01, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [4.30000e+01, 3.08240e+04, 1.00000e+01, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [7]:
logger.info('Generating data slices')
slices = {}
for feature in cat_features:
    logger.info(f'Generating slice for {feature}')
    for cls in test[feature].unique():
        data_slice = test[test[feature] == cls]
        slices[f"{feature} => {cls}"] = data_slice

2022-04-29 10:01:05,138 Generating data slices
2022-04-29 10:01:05,139 Generating slice for workclass
2022-04-29 10:01:05,147 Generating slice for education
2022-04-29 10:01:05,155 Generating slice for marital-status
2022-04-29 10:01:05,160 Generating slice for occupation
2022-04-29 10:01:05,168 Generating slice for relationship
2022-04-29 10:01:05,172 Generating slice for race
2022-04-29 10:01:05,176 Generating slice for sex
2022-04-29 10:01:05,179 Generating slice for native-country


In [8]:
slices

{'workclass => Self-emp-not-inc':        age         workclass   fnlgt     education  education-num  \
 10184   42  Self-emp-not-inc  115323     Doctorate             16   
 5905    60  Self-emp-not-inc   88055  Some-college             10   
 24378   44  Self-emp-not-inc  361280  Some-college             10   
 24150   35  Self-emp-not-inc  114366     Bachelors             13   
 15950   63  Self-emp-not-inc  181561  Some-college             10   
 ...    ...               ...     ...           ...            ...   
 30233   74  Self-emp-not-inc  109101  Some-college             10   
 6447    60  Self-emp-not-inc  235535       HS-grad              9   
 32438   66  Self-emp-not-inc  102686       Masters             14   
 6892    37  Self-emp-not-inc  255454  Some-college             10   
 8072    44  Self-emp-not-inc   26669    Assoc-acdm             12   
 
            marital-status       occupation   relationship                race  \
 10184       Never-married   Prof-specialty

In [9]:
from sklearn.metrics import fbeta_score, precision_score, recall_score

for key, data_slice in slices.items():
    X_slice, y_slice, _, _ = process_data(data_slice, categorical_features=cat_features, label=target, training=False, encoder=encoder, lb=lb)
    preds = model.predict(X_slice)
    fbeta = fbeta_score(y_slice, preds, beta=1, zero_division=1)
    precision = precision_score(y_slice, preds, zero_division=1)
    recall = recall_score(y_slice, preds, zero_division=1)
    print(key, precision, recall, fbeta)

workclass => Self-emp-not-inc 0.7623762376237624 0.5968992248062015 0.6695652173913043
workclass => Private 0.7942643391521197 0.6402010050251257 0.7089593767390096
workclass => Local-gov 0.768595041322314 0.7322834645669292 0.7500000000000001
workclass => State-gov 0.7735849056603774 0.6612903225806451 0.7130434782608696
workclass => ? 0.56 0.3783783783783784 0.4516129032258065
workclass => Federal-gov 0.7468354430379747 0.8939393939393939 0.8137931034482758
workclass => Self-emp-inc 0.8405797101449275 0.8111888111888111 0.8256227758007118
workclass => Without-pay 1.0 1.0 1.0
education => Doctorate 0.864406779661017 0.9272727272727272 0.8947368421052632
education => HS-grad 0.8023255813953488 0.40588235294117647 0.5390625
education => Some-college 0.7545454545454545 0.5684931506849316 0.6484375
education => 11th 1.0 0.4 0.5714285714285715
education => Bachelors 0.7556053811659192 0.8259803921568627 0.7892271662763466
education => Prof-school 0.8735632183908046 0.987012987012987 0.9268