# Load Basic Dependencies

In [95]:
import pandas as pd
import numpy as np
import sklearn
import xgboost as xgb

# Report the library versions this notebook was run with
print(f'Pandas version {pd.__version__}')
print(f'Numpy version {np.__version__}')
print(f'Scikit Learn version {sklearn.__version__}')
print(f'XGBoost version {xgb.__version__}')

Pandas version 1.1.3
Numpy version 1.19.2
Scikit Learn version 0.23.2
XGBoost version 1.4.1


# Load Sample Data

In [96]:
# Load the census data and keep only the columns used to build requests.
df = pd.read_csv('../datasets/census.csv')
# Filter with a list comprehension instead of a set difference: a set yields
# the column names in arbitrary (hash-dependent) order, which made the column
# order of `df` -- and every dict/frame displayed below -- vary between runs.
final_columns = [column for column in df.columns if column != 'fnlwgt']
# 'income' is removed as well, so `df` carries no income column from here on
df = df[final_columns].drop(columns=['income'])

# Create Sample Request Datasets

Here we emulate how data would look when we build an API to serve model requests

Requests and responses are typically exchanged as JSON, so we will work with Python dictionaries (which map directly to JSON objects) as inputs and outputs.

In [97]:
# Single-record request: the row at position 25 as a {feature name: value} dict
request_data1 = df.iloc[25].to_dict()
request_data1

{'education': 'Assoc-voc',
 'workclass': 'Private',
 'native.country': 'United-States',
 'sex': 'Male',
 'education.num': 11,
 'race': 'White',
 'occupation': 'Craft-repair',
 'capital.gain': 0,
 'capital.loss': 2603,
 'marital.status': 'Married-civ-spouse',
 'relationship': 'Husband',
 'hours.per.week': 40,
 'age': 21}

Next we create another sample request containing multiple records, and introduce additional missing values and dropped fields to simulate real-world scenarios.

In [98]:
# Multi-record request: the first three rows, one dict per row
request_data2 = df.iloc[0:3].to_dict(orient='records')

# Third record: overwrite some values with missing-value tokens
request_data2[2].update({
    'workclass': '',
    'race': '?',
    'hours.per.week': '?',
})

# First record: remove several fields entirely
for dropped_field in ('native.country', 'sex', 'age', 'race',
                      'relationship', 'marital.status', 'education.num'):
    del request_data2[0][dropped_field]

request_data2

[{'education': 'HS-grad',
  'workclass': '?',
  'occupation': '?',
  'capital.gain': 0,
  'capital.loss': 4356,
  'hours.per.week': 40},
 {'education': 'HS-grad',
  'workclass': 'Private',
  'native.country': 'United-States',
  'sex': 'Female',
  'education.num': 9,
  'race': 'White',
  'occupation': 'Exec-managerial',
  'capital.gain': 0,
  'capital.loss': 4356,
  'marital.status': 'Widowed',
  'relationship': 'Not-in-family',
  'hours.per.week': 18,
  'age': 82},
 {'education': 'Some-college',
  'workclass': '',
  'native.country': 'United-States',
  'sex': 'Female',
  'education.num': 10,
  'race': '?',
  'occupation': '?',
  'capital.gain': 0,
  'capital.loss': 4356,
  'marital.status': 'Widowed',
  'relationship': 'Unmarried',
  'hours.per.week': '?',
  'age': 66}]

# Step 1: Create function to load model artifacts

In [99]:
import dill


def load_model_artifacts(path):
    """Open ``path`` in binary mode and return the object dill loads from it.

    NOTE(review): dill, like pickle, can execute arbitrary code while
    deserializing, so ``path`` should only ever point at a trusted file.
    """
    with open(path, "rb") as artifacts_file:
        return dill.load(artifacts_file)

In [100]:
# Relative path of the serialized model artifacts file
ML_ARTIFACTS_PATH = "../ml_app/saved_models/census_xgb_artifacts.pkl"

# Load the artifacts once so the individual steps below can be tried out
ml_artifacts = load_model_artifacts(path=ML_ARTIFACTS_PATH)
# Display the names of the stored artifacts
ml_artifacts.keys()

dict_keys(['dummy_encoder', 'cat_init_features', 'num_init_features', 'cat_ohe_features', 'cat_imputer', 'num_imputer', 'xgb_model', 'column_names_order'])

# Step 2: Create function to form a dataset from request data

In [101]:
def form_dataset(request_data, ml_model_artifacts,
                 na_values=('', '?')):
    """Build a DataFrame from raw request data, marking missing entries as NaN.

    Parameters
    ----------
    request_data : dict or iterable of dict
        A single record, or several records, each mapping a feature name to
        its value. The records are not modified.
    ml_model_artifacts : dict
        Must contain 'cat_init_features' and 'num_init_features': the names
        of the features every record is expected to provide.
    na_values : iterable, default ('', '?')
        Tokens that stand for a missing value; each one is replaced by NaN.

    Returns
    -------
    pandas.DataFrame
        One row per record. Columns are the record's own keys plus any
        expected feature the record lacked; absent features and
        ``na_values`` tokens both appear as NaN.
    """
    # a single record arrives as a plain dict: wrap it so the code below
    # always deals with a sequence of records
    if isinstance(request_data, dict):
        request_data = [request_data]

    # names of the initial data features (identical for every record, so
    # computed once rather than per record)
    feature_names = (list(ml_model_artifacts['cat_init_features'])
                     + list(ml_model_artifacts['num_init_features']))

    records = []
    for record in request_data:
        # work on a copy so the caller's request payload is left untouched
        completed = dict(record)
        for feature in feature_names:
            # an absent field is missing by definition, so it gets NaN
            # directly instead of a string token that would only be cleaned
            # up when that token happens to be listed in na_values
            completed.setdefault(feature, np.nan)
        records.append(completed)

    # convert list of record dicts into a dataframe
    request_df = pd.DataFrame(records)
    # convert missing value tokens to NaNs
    # (np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0)
    for token in na_values:
        request_df = request_df.replace({token: np.nan})

    return request_df

In [102]:
# Build the request DataFrame from the multi-record sample and display it
request_df = form_dataset(request_data=request_data2,
                          ml_model_artifacts=ml_artifacts)
request_df

Unnamed: 0,education,workclass,occupation,capital.gain,capital.loss,hours.per.week,native.country,sex,education.num,race,marital.status,relationship,age
0,HS-grad,,,0,4356,40.0,,,,,,,
1,HS-grad,Private,Exec-managerial,0,4356,18.0,United-States,Female,9.0,White,Widowed,Not-in-family,82.0
2,Some-college,,,0,4356,,United-States,Female,10.0,,Widowed,Unmarried,66.0


# Step 3: Impute and Encode Features

In [103]:
def impute_and_encode_features(request_df, ml_model_artifacts):
    """Turn a raw request DataFrame into the feature frame passed to the model.

    The numeric columns are run through the numeric imputer; the categorical
    columns are run through the categorical imputer and then the one-hot
    (dummy) encoder. Both parts are joined column-wise and the columns are
    selected in the order stored under 'column_names_order'.

    Parameters
    ----------
    request_df : pandas.DataFrame
        Must contain every column named in 'cat_init_features' and
        'num_init_features'.
    ml_model_artifacts : dict
        Uses the keys 'cat_init_features', 'num_init_features',
        'cat_imputer', 'num_imputer', 'dummy_encoder', 'cat_ohe_features'
        and 'column_names_order'. The imputers and the encoder only need a
        ``transform`` method (presumably fitted scikit-learn transformers --
        confirm against the training code); the encoder's result must
        support ``.toarray()``.

    Returns
    -------
    pandas.DataFrame
        Imputed numeric features plus one-hot encoded categorical features.
    """
    artifacts = ml_model_artifacts
    cat_columns = artifacts['cat_init_features']
    num_columns = artifacts['num_init_features']

    # numeric branch: impute only
    num_values = artifacts['num_imputer'].transform(request_df[num_columns])
    numeric_part = pd.DataFrame(num_values, columns=num_columns)

    # categorical branch: impute, then one-hot encode
    cat_values = artifacts['cat_imputer'].transform(request_df[cat_columns])
    cat_imputed = pd.DataFrame(cat_values, columns=cat_columns)
    # the encoder's result is densified with toarray() before labelling it
    ohe_values = artifacts['dummy_encoder'].transform(cat_imputed).toarray()
    categorical_part = pd.DataFrame(ohe_values,
                                    columns=artifacts['cat_ohe_features'])

    # join both branches side by side, then pick the columns in stored order
    combined = pd.concat([numeric_part, categorical_part], axis=1)
    return combined[artifacts['column_names_order']]

In [104]:
# NOTE: this cell rebinds request_df to the encoded frame, so running it a
# second time (without first re-running the form_dataset cell above) passes
# the already-encoded frame back into impute_and_encode_features.
request_df = impute_and_encode_features(request_df=request_df, 
                                        ml_model_artifacts=ml_artifacts)
request_df

Unnamed: 0,age,hours.per.week,education.num,capital.loss,capital.gain,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,...,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college
0,57.8,40.0,7.8,4356.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,82.0,18.0,9.0,4356.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,66.0,40.0,10.0,4356.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Step 4: Load and make ML model predictions

In [105]:
def make_model_predictions(request_df, ml_model_artifacts):
    """Predict a class for every row of ``request_df``.

    Parameters
    ----------
    request_df : pandas.DataFrame
        Feature frame that is handed to the model's ``predict`` method.
    ml_model_artifacts : dict
        Must contain the model under the key 'xgb_model'.

    Returns
    -------
    dict
        ``{'predicted_classes': [...]}`` with one entry per input row, as
        plain Python values so the response can be serialized to JSON.
    """
    # fetch the model from the already-loaded artifacts
    ml_model = ml_model_artifacts['xgb_model']

    # make model predictions
    predictions = ml_model.predict(request_df)

    # tolist() converts NumPy scalars to native Python types; list() would
    # keep e.g. numpy.int64 elements, which json.dumps rejects
    return {
        'predicted_classes': np.asarray(predictions).tolist()
    }

In [106]:
# Predict on the encoded features from the previous step; the returned dict
# is displayed as the cell output
make_model_predictions(request_df=request_df, 
                       ml_model_artifacts=ml_artifacts)



{'predicted_classes': ['<=50K', '<=50K', '<=50K']}

# Step 5: Build ML inference pipeline

In [107]:
ML_ARTIFACTS_PATH = "../ml_app/saved_models/census_xgb_artifacts.pkl"


def ml_inference_pipeline(request_data, ml_artifacts=None):
    """Run the full inference pipeline on raw request data.

    Parameters
    ----------
    request_data : dict or list of dict
        One record or several records, each mapping feature name -> value.
    ml_artifacts : dict, optional
        Already-loaded model artifacts. When omitted, they are loaded from
        ``ML_ARTIFACTS_PATH`` on this call. A serving process can load them
        once and pass them in, instead of deserializing the file for every
        request.

    Returns
    -------
    dict
        The response produced by ``make_model_predictions``.
    """
    # 1. Load model artifacts (only when the caller did not supply them)
    if ml_artifacts is None:
        ml_artifacts = load_model_artifacts(path=ML_ARTIFACTS_PATH)

    # 2. Create request dataset
    request_df = form_dataset(request_data=request_data,
                              ml_model_artifacts=ml_artifacts)

    # 3. Impute and Encode Features
    request_df = impute_and_encode_features(request_df=request_df,
                                            ml_model_artifacts=ml_artifacts)

    # 4. Make ML model predictions
    pred_response = make_model_predictions(request_df=request_df,
                                           ml_model_artifacts=ml_artifacts)

    # return response
    return pred_response


# Test inference pipeline

In [108]:
# Single-record request passed as a plain dict
ml_inference_pipeline(request_data=request_data1)

{'predicted_classes': ['<=50K']}

In [109]:
# Multi-record request (a list of dicts) with missing values and missing fields
ml_inference_pipeline(request_data=request_data2)



{'predicted_classes': ['<=50K', '<=50K', '<=50K']}

In [110]:
# Re-read the CSV so the 'income' column (removed from df above) is available
# for comparing predictions against the actual labels
df_raw = pd.read_csv('../datasets/census.csv')

In [111]:
# Ten consecutive rows (positions 20000-20009) as a list of record dicts
request_data3 = df.iloc[20000:20010].to_dict(orient='records')

In [112]:
# Predictions for the ten sampled records
ml_inference_pipeline(request_data=request_data3)



{'predicted_classes': ['<=50K',
  '>50K',
  '<=50K',
  '<=50K',
  '>50K',
  '>50K',
  '<=50K',
  '<=50K',
  '<=50K',
  '<=50K']}

In [113]:
# 'income' values recorded in the raw data for the same ten row positions
df_raw.iloc[20000:20010]['income'].tolist()

['<=50K',
 '>50K',
 '<=50K',
 '<=50K',
 '>50K',
 '>50K',
 '<=50K',
 '<=50K',
 '<=50K',
 '<=50K']