# Train

In [1]:
import os

# `abspath("")` resolves to the current working directory; the project root is
# taken to be its parent directory.
here = os.path.abspath("")
project = os.path.split(here)[0]
print(project)

c:\Users\dimar\OneDrive\Desktop\github.com\frndlytm\compensation-imputer


A simple training notebook that:

- Adapts an `sklearn.impute.SimpleImputer` to an `mlflow.pyfunc.PythonModel`
- Trains the model naively because it's a `SimpleImputer`, and
- Saves the model to the local file-system under `./data/models/`

This lets the cloud function stay generic: it always loads an
`mlflow.pyfunc.PythonModel` from the file-system, whatever model is behind it.

In [2]:

import io
import shutil

import mlflow.pyfunc
import numpy as np
import pandas as pd
import sklearn.impute

In [29]:
class SimpleImputer(mlflow.pyfunc.PythonModel):
    """
    A custom model that adapts the sklearn.impute.SimpleImputer from the
    `transform` interface into the `predict` interface.

    All positional and keyword arguments given to the constructor are
    forwarded unchanged to `sklearn.impute.SimpleImputer`.
    """

    def __init__(self, *args, **kwargs):
        self.imputer = sklearn.impute.SimpleImputer(*args, **kwargs)

    def fit(self, _: pd.DataFrame, y: pd.Series) -> "SimpleImputer":
        """
        Since sklearn.impute.SimpleImputer is a univariate method, I
        use only the data from the `y` series to learn a value for it.

        Parameters
        ----------
        _ : pd.DataFrame
            Ignored; accepted only so the call looks like `fit(X, y)`.
        y : pd.Series
            Target values the wrapped imputer is fitted on.

        Returns
        -------
        SimpleImputer
            This instance, so calls can be chained.
        """
        # `np.asarray` yields a plain ndarray for a Series, list, or ndarray; the
        # wrapped imputer expects a 2-D input, hence the single-column reshape.
        self.imputer.fit(np.asarray(y).reshape(-1, 1))
        return self

    def predict(self, context, model_input, params=None):
        """Adapt the `transform` from SimpleImputer to `predict` by creating
        an empty target series and filling it with the transform.

        Only the number of rows of `model_input` is used; `context` and
        `params` are accepted for the pyfunc signature and are not read.

        Returns a `pd.Series` with one imputed value per input row. When
        `model_input` has an `index` (e.g. a DataFrame) the result reuses it,
        so assigning the predictions back onto that frame aligns row by row.
        """
        n_rows = model_input.shape[0]
        # A single all-NaN column: every cell is "missing", so `transform`
        # fills each one with the value learned in `fit`.
        y_pred = np.full((n_rows, 1), np.nan)
        y_pred = self.imputer.transform(y_pred)
        # Inputs without an index (e.g. ndarrays) fall back to a default RangeIndex.
        return pd.Series(y_pred.ravel(), index=getattr(model_input, "index", None))


In [20]:
# Read the raw compensation records from `<project>/data/raw/`.
df = pd.read_csv(
    os.path.join(project, "data", "raw", "compensations.csv"),
)

# Every column other than the target is treated as a feature.
target = "compensation"
features = [column for column in df.columns if column != target]

X = df[features]
y = df[target]
X.head(5), y.head(5)

(   employee_id                     ssg  ...         city years_of_experience
 0         2977                      HR  ...  Los Angeles            4.000000
 1         2666                 Finance  ...      Seattle            5.760796
 2         5547            XYZ Division  ...       Austin           12.020000
 3         1841                  Buyers  ...      Chicago           41.513650
 4         1618  Administrative Support  ...       Austin           13.507989
 
 [5 rows x 5 columns],
 0     92313.79723
 1     91261.14162
 2     52424.19504
 3    110625.48450
 4     44926.42635
 Name: compensation, dtype: float64)

In [32]:
# Fit the imputer on the target column and persist it as an MLflow pyfunc model.
model_path = os.path.join(project, "data", "models", "SimpleImputer")

model = SimpleImputer().fit(X, y)

# `mlflow.pyfunc.save_model` refuses to write into a path that already exists and
# is non-empty, so remove the artifact from any previous run first. This keeps the
# cell re-runnable; `ignore_errors` covers the first run, when nothing exists yet.
shutil.rmtree(model_path, ignore_errors=True)
mlflow.pyfunc.save_model(path=model_path, python_model=model)

In [34]:
# `X` is a column subset of `df`, so writing a column into it in place can trigger
# pandas' SettingWithCopyWarning. `assign` builds a new frame with the predicted
# target column instead, and gives the same result when the cell is re-run.
X = X.assign(**{target: model.predict(None, X, None)})


In [37]:
# Round-trip the frame through the "table" JSON orientation. Passing a literal JSON
# string to `pd.read_json` is deprecated, so the text is wrapped in a file-like
# `StringIO` buffer.
pd.read_json(io.StringIO(X.to_json(orient="table")), orient="table")

  pd.read_json(X.to_json(orient="table"), orient="table")


Unnamed: 0,employee_id,ssg,gender,city,years_of_experience,compensation
0,2977,HR,female,Los Angeles,4.000000,91594.782902
1,2666,Finance,female,Seattle,5.760796,91594.782902
2,5547,XYZ Division,female,Austin,12.020000,91594.782902
3,1841,Buyers,female,Chicago,41.513650,91594.782902
4,1618,Administrative Support,female,Austin,13.507989,91594.782902
...,...,...,...,...,...,...
3887,2854,Finance,male,Seattle,25.599970,91594.782902
3888,1893,Buyers,male,Chicago,21.870848,91594.782902
3889,3836,LMNOP Division Operators,male,Seattle,16.310000,91594.782902
3890,4511,QRS Division,male,Austin,17.300000,91594.782902


## Bonus: Get Request Payload

In [41]:
# Keep only the rows whose compensation is null and save them as a
# table-oriented JSON payload under `<project>/data/processed/`.
missing_compensation = df.loc[df.compensation.isnull()]

payload_path = os.path.join(project, "data", "processed", "payload.json")
with open(payload_path, "w+") as fh:
    fh.write(missing_compensation.to_json(orient="table", indent=2))

Index(['employee_id', 'ssg', 'compensation', 'gender', 'city',
       'years_of_experience'],
      dtype='object')
