## Build Data Preprocessing Steps and Modeling with XGBoost

To build a data pipeline flexible enough to accommodate future data changes:
1. Build a module for each data preparation step, which makes the pipeline easier to scale
2. Handle special dirty data (such as negative values and extreme outliers)

**Key decisions from data understanding**
1. Remove the "last vet visit" date column and the index column from model building
2. Handle extreme outliers and negative values: drop the values beyond the 99.9% quantile, such as the extreme hair length, but keep the others
3. Keep the dirty data, such as "Die of age" 0; future data might contain the same kind of records, so model training needs to take them into consideration.

In [1]:
! pip install hyperopt | tail -n 1

Successfully installed cloudpickle-1.6.0 future-0.18.2 hyperopt-0.2.5 networkx-2.5


In [2]:
import numpy as np
import pandas as pd
%matplotlib inline

In [3]:
# load data from csv file
def load_data(filepath):
    """Read the CSV at ``filepath`` into a DataFrame with fixed column names.

    The first line of the file is treated as a header row and is replaced
    by the eight column names listed below.
    """
    column_names = [
        'index',
        'age',
        'breed',
        'last_vet_visit',
        'hair_length',
        'height',
        'num_vet_visit',
        'weight',
    ]
    frame = pd.read_csv(filepath, header=0, names=column_names)
    return frame

In [4]:
# remove all negative values from data
def remove_negative_value(df, columns):
    """Return the rows of ``df`` that are >= 0 in every listed column.

    df: DataFrame holding the data.
    columns: list of column names that must not contain negative values.

    Columns are applied one after another, each on the rows that survived
    the previous ones. Rows whose value fails the ``>= 0`` comparison
    (negative numbers, and NaN) are dropped. The original row index is
    preserved.
    """
    no_columns = len(columns) == 0
    if no_columns and df.empty:
        return df

    remaining = df
    for name in columns:
        is_non_negative = remaining[name] >= 0
        remaining = remaining[is_non_negative]

    return remaining

In [5]:
def remove_extreme_outliers(df, columns, quantile=0.999):
    """Drop rows whose value lies beyond an upper quantile in any listed column.

    df: DataFrame holding the data.
    columns: list of column names to check for extreme outliers.
    quantile: upper quantile used as the cut-off (default 0.999, i.e. 99.9%).

    Returns a DataFrame containing only the rows whose value is less than
    or equal to the cut-off in every listed column. Columns are processed
    in order, and each cut-off is computed on the rows that survived the
    previous columns. If ``df`` is empty or ``columns`` is empty, ``df`` is
    returned unchanged.
    """
    # Nothing to filter: avoid computing quantiles on empty data.
    if len(columns) == 0 or df.empty:
        return df

    for col in columns:
        limit = df[col].quantile(quantile)
        # Only values strictly greater than the limit are outliers. Using
        # `<=` keeps values equal to the limit, so rows tied at the maximum
        # (or a constant column) are not wiped out.
        df = df[df[col] <= limit]

    return df

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


# Data preprocess and split testing and training set

def preprocess(filepath='data.csv', test_size=0.2, random_state=None):
    """Load the raw data, clean it and split it into train and test frames.

    filepath: path of the CSV file to load (default 'data.csv').
    test_size: fraction of the cleaned rows put in the test frame
        (default 0.2).
    random_state: seed forwarded to ``train_test_split``; the default
        ``None`` keeps the split random, pass an int for a reproducible
        split.

    Returns ``(df_train, df_test)``. No separate validation set is
    produced.
    """
    df = load_data(filepath)

    # These two columns are not used for model building.
    df = df.drop(columns=['index', 'last_vet_visit'])

    # The same numeric columns go through both cleaning steps.
    numeric_columns = ['age', 'hair_length', 'height', 'num_vet_visit', 'weight']

    # Remove rows with any negative value.
    df = remove_negative_value(df, numeric_columns)

    # Remove rows with any extreme outlier.
    df = remove_extreme_outliers(df, numeric_columns)

    # Split into training and test sets.
    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    return df_train, df_test

    
    

In [7]:
# Run the preprocessing pipeline once and report the shape of each split.
train, test = preprocess()
print(
    'train size {}, test size {}'.format(train.shape, test.shape)
)

train size (232, 6), test size (58, 6)


In [8]:
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder



# Further data processing: standardise the numerical columns and one-hot
# encode the categorical ones.
# For categories with extremely few examples, such as 'Donald', consider
# renaming all of them to a common name such as "Other".

category_cols = ['breed']
numerical_cols = ['hair_length', 'height', 'num_vet_visit', 'weight']

transformer = make_pipeline(
    ColumnTransformer(
        transformers=[
            # Scale each numerical feature with StandardScaler.
            ('num', StandardScaler(), numerical_cols),
            # Categories not seen during fit are ignored at transform time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), category_cols),
        ],
    ),
)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# perform data process

# Build a train/test split, then separate the `age` column from the features.
df_train, df_test = preprocess()

# `age` is kept aside as plain NumPy arrays.
age_train = df_train['age'].copy().to_numpy()
age_test = df_test['age'].copy().to_numpy()

# Fit the transformer on the training features only, then apply the fitted
# transformer to the test features.
x_train = transformer.fit_transform(df_train.drop(columns=['age']))
x_test = transformer.transform(df_test.drop(columns=['age']))





In [None]:
# hyperparameter tuning

from hyperopt.pyll.base import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import make_scorer, mean_squared_error

def objective(space):
    """Train one XGBoost candidate and return its held-out MSE for hyperopt.

    space: dict with one sampled value for each of ``eta``, ``max_depth``,
        ``n_estimators``, ``min_child_weight``, ``colsample_bytree`` and
        ``subsample``.

    Returns ``{"loss": mse, "status": STATUS_OK}``, where ``mse`` is the
    mean squared error of the predictions on ``x_test`` / ``age_test``.

    Reads the module-level ``x_train``, ``age_train``, ``x_test`` and
    ``age_test``. The same held-out split is used for early stopping and
    for the loss, so the tuned score is optimistic.
    """
    reg = XGBRegressor(
        n_jobs=-1,
        eval_metric="rmse",
        eta=space["eta"],
        max_depth=space["max_depth"],
        n_estimators=space["n_estimators"],
        min_child_weight=space["min_child_weight"],
        colsample_bytree=space["colsample_bytree"],
        subsample=space["subsample"],
        seed=1,
        # `silent` is no longer an XGBoost parameter; `verbosity=0` is the
        # supported way to mute training messages.
        verbosity=0,
    )
    # NOTE(review): newer XGBoost releases may expect early_stopping_rounds
    # on the estimator rather than in fit() - confirm against the installed
    # version.
    reg.fit(
        x_train,
        age_train,
        eval_set=[(x_train, age_train), (x_test, age_test)],
        early_stopping_rounds=200,
        verbose=False,
    )
    age_pred = reg.predict(x_test)
    mse = mean_squared_error(age_test, age_pred)
    return {"loss": mse, "status": STATUS_OK}

# Search space sampled by hyperopt; every key matches a name read in
# `objective`.
space = {
    # Integer-valued settings: sampled with quniform (step q) and wrapped in
    # scope.int so the value is handed over as an int.
    "max_depth": scope.int(hp.quniform("max_depth", 3, 8, q=1)),
    "n_estimators": scope.int(hp.quniform("n_estimators", 150, 450, q=50)),
    # Remaining settings: sampled with quniform between the two bounds, using
    # the last argument as the step.
    "eta": hp.quniform("eta", 0.05, 0.2, 0.05),
    "min_child_weight": hp.quniform("min_child_weight", 0.5, 1.8, 0.1),
    "subsample": hp.quniform("subsample", 0.5, 1, 0.1),
    "colsample_bytree": hp.quniform("colsample_bytree", 0.5, 1, 0.1)
}

# Minimise `objective` over `space` with hyperopt's TPE suggester, for at most
# 1000 evaluations. `rstate` is given a fixed seed (12345).
# NOTE(review): newer hyperopt releases may expect a numpy Generator for
# `rstate` instead of a RandomState - confirm against the installed version.
best = fmin(
    fn = objective,
    space = space,
    algo = tpe.suggest,
    max_evals = 1000,
    rstate = np.random.RandomState(12345)
)

In [None]:
best

In [None]:
# Train the model with the identified hyperparameters
# Final regressor; the hyperparameter values below are fixed by hand.
model = XGBRegressor(
    n_jobs=-1,
    eval_metric="rmse",
    subsample=0.8,
    colsample_bytree=0.7,
    eta=0.2,
    max_depth=4,
    min_child_weight=1.6,
    n_estimators=400,
    seed=1,
)

# Fit on the training data, using both splits as evaluation sets for early
# stopping.
model.fit(
    x_train,
    age_train,
    eval_set=[(x_train, age_train), (x_test, age_test)],
    early_stopping_rounds=200,
    verbose=False,
)



In [None]:
# Score the final model on the held-out split. Arguments are passed as
# (true values, predictions), the same order used in `objective`.
predict_test = model.predict(x_test)
mean_squared_error(age_test, predict_test)

In [None]:
## save the pipeline and model

from joblib import dump, load

# Persist the fitted regressor and the fitted preprocessing pipeline as two
# separate joblib files, written relative to the current working directory.
dump(model, 'model.joblib')
dump(transformer,'transformer.joblib')

