## Goal of data pipeline 

To build a data pipeline flexible enough to accommodate future data changes:
1. Build a module for each data preparation step, so the pipeline is easier to scale.
2. Handle special dirty data (such as negative values and extreme outliers).

In [1]:
# Install hyperopt with the %pip magic so the package lands in the environment
# of the running kernel; -q keeps the output short. Pin a version here once
# the working one is known.
%pip install -q hyperopt



In [2]:
import numpy as np
import pandas as pd
%matplotlib inline

In [3]:
def load_data(filepath):
    """Read the CSV at ``filepath`` into a DataFrame with fixed column names.

    The first row of the file is consumed as a header (``header=0``) and its
    labels are replaced by the names listed below.
    """
    column_names = [
        'index',
        'age',
        'breed',
        'last_vet_visit',
        'hair_length',
        'height',
        'num_vet_visit',
        'weight',
    ]
    frame = pd.read_csv(filepath, header=0, names=column_names)
    return frame

In [4]:
# Raw data, used by the exploratory cells below.
df = load_data(filepath='data.csv')

In [5]:
def remove_negative_value(df, columns):
    """Drop every row that holds a negative value in any of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to filter. It is not modified.
    columns : list of str
        Columns that must be non-negative.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when there is nothing to filter (no columns requested
        or no data), otherwise the rows whose value is ``>= 0`` in every
        listed column. Rows with NaN in a listed column are dropped as well,
        because ``NaN >= 0`` evaluates to False.
    """
    # Either condition alone means there is nothing to filter; with `and`, an
    # empty frame plus a non-empty column list fell through to a KeyError.
    if len(columns) == 0 or df.empty:
        return df

    # One combined mask instead of re-slicing the frame once per column.
    keep = (df[list(columns)] >= 0).all(axis=1)
    return df[keep]

In [6]:
# Summary statistics of the raw data once rows with negative values are dropped.
remove_negative_value(
    df,
    ['age', 'hair_length', 'height', 'num_vet_visit', 'weight'],
).describe()

Unnamed: 0,index,age,hair_length,height,num_vet_visit,weight
count,295.0,295.0,295.0,295.0,295.0,295.0
mean,148.738983,9.00339,3068759000.0,20.498551,8.420339,3.94339
std,86.317853,2.893223,52707670000.0,5.087124,3.709648,2.025471
min,0.0,0.0,0.6904537,5.077179,0.0,0.0
25%,74.5,7.0,0.9377535,16.94558,6.0,2.0
50%,149.0,9.0,1.003818,20.74863,8.0,4.0
75%,222.5,11.0,1.067127,23.945353,11.0,5.0
max,299.0,18.0,905284000000.0,33.043014,19.0,10.0


In [7]:
def remove_extreme_outliers(df, columns, quantile=0.999):
    """Drop rows whose value exceeds a high quantile in any of ``columns``.

    A value strictly greater than the column's ``quantile`` (99.9% by
    default) is treated as an extreme outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to filter. It is not modified.
    columns : list of str
        Columns to remove extreme outliers from.
    quantile : float, optional
        Quantile used as the upper limit, between 0 and 1.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when there is nothing to filter (no columns requested
        or no data), otherwise the rows that are at or below the limit in
        every listed column. Rows with NaN in a listed column are dropped,
        because the comparison with the limit evaluates to False.
    """
    # Either condition alone means there is nothing to filter.
    if len(columns) == 0 or df.empty:
        return df

    for col in columns:
        # The limit is computed on the rows that survived the previous columns.
        limit = df[col].quantile(quantile)
        # `<=` keeps values equal to the limit; a strict `<` would drop every
        # row of a column whose largest value is repeated (e.g. a constant one).
        df = df[df[col] <= limit]

    return df

In [8]:
# Summary statistics of the raw data once the extreme outliers are dropped.
remove_extreme_outliers(
    df,
    ['age', 'hair_length', 'height', 'num_vet_visit', 'weight'],
).describe()

Unnamed: 0,index,age,hair_length,height,num_vet_visit,weight
count,293.0,293.0,293.0,293.0,293.0,293.0
mean,149.938567,8.989761,1.002831,20.443126,8.262799,3.919113
std,86.513577,2.857919,0.100257,5.095106,3.736512,2.000148
min,0.0,0.0,0.690454,5.077179,-1.0,0.0
25%,76.0,7.0,0.937148,16.86038,6.0,2.0
50%,150.0,9.0,1.003253,20.789497,8.0,4.0
75%,224.0,11.0,1.065585,23.889272,11.0,5.0
max,299.0,17.0,1.238662,32.216511,18.0,9.0


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder



def preprocess(filepath='data.csv', test_size=0.2, random_state=42):
    """Load, clean and split the data into a training and a test set.

    Steps: read the CSV, drop the ``index`` and ``last_vet_visit`` columns,
    remove rows with negative values, remove extreme outliers, then split.

    Parameters
    ----------
    filepath : str, optional
        Path of the CSV file to load.
    test_size : float, optional
        Share of the cleaned rows assigned to the test set.
    random_state : int or None, optional
        Seed passed to ``train_test_split``. It is fixed by default so that
        repeated calls return the same split; pass ``None`` for a random one.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)``.
    """
    numeric_columns = ['age', 'hair_length', 'height', 'num_vet_visit', 'weight']

    df = load_data(filepath)

    # Columns that are not used as features.
    df = df.drop(columns=['index', 'last_vet_visit'])

    df = remove_negative_value(df, numeric_columns)
    df = remove_extreme_outliers(df, numeric_columns)

    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    return df_train, df_test

    
    

In [None]:
# Run the preparation pipeline once and report the shape of each split.
train, test = preprocess()
print('train size {}, test size {}'.format(train.shape, test.shape))

In [None]:
# Show only the first rows rather than dumping the whole training frame.
train.head()

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder



# Feature columns, grouped by the preprocessing they receive.
category_cols = ['breed']
numerical_cols = ['hair_length', 'height', 'num_vet_visit', 'weight']

# Single-step pipeline: standardise the numeric columns and one-hot encode the
# categorical ones (handle_unknown='ignore', so unseen categories do not raise).
transformer = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), category_cols),
        ],
    ),
)

In [None]:
# Build the train/test split, then separate the target (age) from the features.
df_train, df_test = preprocess()

age_train = df_train['age'].copy().to_numpy()
age_test = df_test['age'].copy().to_numpy()

# Fit the transformer on the training features only, then reuse it on the test set.
x_train = transformer.fit_transform(df_train.drop(columns=['age']))
x_test = transformer.transform(df_test.drop(columns=['age']))




In [None]:
# hyperparameter tuning

from hyperopt.pyll.base import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import make_scorer, mean_squared_error

def objective(space):
    """Hyperopt objective: fit an XGBRegressor and return its test-set MSE.

    Parameters
    ----------
    space : dict
        Sampled hyperparameters with the keys ``max_depth``,
        ``n_estimators``, ``min_child_weight``, ``colsample_bytree`` and
        ``subsample``.

    Returns
    -------
    dict
        ``{"loss": <MSE on (x_test, age_test)>, "status": STATUS_OK}``.

    Reads the notebook-level ``x_train``, ``age_train``, ``x_test`` and
    ``age_test``.
    """
    # NOTE(review): the test set drives early stopping and the tuning loss, and
    # the final model is later scored on the same set, so that score is likely
    # optimistic. Consider a separate validation split.
    reg = XGBRegressor(
        n_jobs=-1,
        eval_metric="rmse",
        max_depth=space["max_depth"],
        n_estimators=space["n_estimators"],
        min_child_weight=space["min_child_weight"],
        colsample_bytree=space["colsample_bytree"],
        subsample=space["subsample"],
        seed=1,
        # `silent` is no longer an XGBoost parameter; `verbosity=0` is the
        # supported way to silence the library.
        verbosity=0,
    )
    reg.fit(
        x_train,
        age_train,
        eval_set=[(x_train, age_train), (x_test, age_test)],
        early_stopping_rounds=200,
        verbose=False,
    )
    age_pred = reg.predict(x_test)
    mse = mean_squared_error(age_test, age_pred)
    return {"loss": mse, "status": STATUS_OK}

# Search space for the tuning run. Every parameter is drawn with quniform;
# the two integer-valued ones are cast through scope.int. The learning rate
# (eta) is not part of the search.
space = {
    "max_depth": scope.int(hp.quniform("max_depth", 3, 8, q=1)),
    "n_estimators": scope.int(hp.quniform("n_estimators", 150, 450, q=50)),
    "min_child_weight": hp.quniform("min_child_weight", 0.5, 1.8, q=0.1),
    "subsample": hp.quniform("subsample", 0.5, 1, q=0.1),
    "colsample_bytree": hp.quniform("colsample_bytree", 0.5, 1, q=0.1),
}

# Run the TPE search over `space`, minimising the loss returned by `objective`.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=1000,
    # Current hyperopt releases expect a numpy Generator here; a legacy
    # RandomState fails because it has no `integers` method.
    rstate=np.random.default_rng(12345),
)

In [None]:
# Display the result returned by the fmin search above.
best

In [None]:
# Final regressor with fixed hyperparameters, fitted on the training set while
# RMSE is tracked on both the training and the test set.
model = XGBRegressor(
    n_estimators=200,
    max_depth=7,
    min_child_weight=1.8,
    subsample=0.8,
    colsample_bytree=0.5,
    eta=0.05,
    eval_metric="rmse",
    n_jobs=-1,
    seed=1,
)

model.fit(
    x_train,
    age_train,
    eval_set=[(x_train, age_train), (x_test, age_test)],
    early_stopping_rounds=200,
    verbose=False,
)



In [None]:
# Mean squared error of the final model on the test set.
predict_test = model.predict(x_test)
mean_squared_error(age_test, predict_test)

In [None]:
# Persist the trained model and the fitted preprocessing pipeline with joblib.

from joblib import dump, load

# Each object is written to its own file in the current working directory.
dump(model, 'model.joblib')
dump(transformer,'transformer.joblib')

