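In this notebook we train a model to predict the `salary` label of the census data (`data/census.csv`) and save the fitted model, the encoder, the label binarizer and the list of categorical features to `model/latest`.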

In [6]:
import pandas as pd
import joblib
import os

from pathlib import Path
from sklearn.model_selection import train_test_split

In [7]:

# The notebook is expected to run from a sub-directory of the project, so the
# project root is taken to be the parent of the current working directory.
path = Path.cwd()
project_root = path.parent.absolute()

# Directory where the trained artifacts are written, and the input CSV.
# Both are kept as plain strings.
dirname = str(project_root / 'model' / 'latest')
data_path = str(project_root / 'data' / 'census.csv')

In [8]:
import sys

# Make the parent directory importable so the local `ml` package can be found.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate "../" entry to sys.path each time.
if "../" not in sys.path:
    sys.path.append("../")  # go to parent dir

In [9]:
from ml.data import process_data
from ml.model import train_model, compute_model_metrics, inference

In [11]:
# Load the raw census CSV located at `data_path` into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_path)

First we split the data into a training set (80%) and a test set (20%), stratified on the `salary` label.

In [12]:
# Hold out 20% of the rows for testing, stratified on the label so both splits
# keep the same `salary` class proportions. `random_state` pins the shuffle so
# that a Restart & Run All reproduces the same split (and therefore the same
# saved artifacts); without it every run produced a different split.
train, test = train_test_split(
    data,
    test_size=0.20,
    shuffle=True,
    stratify=data["salary"],
    random_state=42,
)

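Then we preprocess both splits with `process_data`, reusing the encoder and label binarizer returned for the training split when processing the test split.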

In [13]:
# Columns that are passed to `process_data` as categorical features.
cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# Training split: called with training=True; the returned encoder and label
# binarizer are kept so they can be reused below and saved later.
X_train, y_train, encoder, lb = process_data(
    train,
    categorical_features=cat_features,
    label="salary",
    training=True,
)

# Test split: called with training=False and handed the encoder / label
# binarizer obtained from the training split; the last two return values are
# not needed and are discarded.
X_test, y_test, _, _ = process_data(
    test,
    categorical_features=cat_features,
    label="salary",
    training=False,
    encoder=encoder,
    lb=lb,
)


Then we train the model on the preprocessed training split.

In [14]:
# Fit the model on the processed training features and labels.
# NOTE(review): the captured output reports "Found best params", so
# `train_model` presumably runs a hyper-parameter search internally -- confirm
# in ml/model.py.
model = train_model(X_train, y_train)

Found best params: {'scoring': 'loss', 'min_samples_leaf': 20, 'max_leaf_nodes': 40, 'max_iter': 1000, 'max_depth': 20, 'learning_rate': 0.1, 'l2_regularization': 1}


In [15]:
# Create the output directory if it does not exist yet.
os.makedirs(dirname, exist_ok=True)


def _save_artifact(obj, name):
    """Dump `obj` with joblib to `<dirname>/<name>` and return joblib's result."""
    return joblib.dump(obj, os.path.join(dirname, name))


# Persist the model, the encoder, the label binarizer and the categorical
# feature list. The last call is left as the cell's final expression so its
# return value is still shown as the cell output.
_save_artifact(model, 'model')
_save_artifact(encoder, 'encoder')
_save_artifact(lb, 'lb')
_save_artifact(cat_features, 'cat_features')


['/home/diego/projects/ud_mlops/census_heroku/model/latest/cat_features']