# Modelling

## 1.0 Importing libraries and loading data

In [320]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, cross_validate, KFold
from sklearn.ensemble import VotingRegressor
import time
from pycaret.regression import *
import pickle

In [321]:
mainpath = '/Users/chinmayasukumar/Documents/Springboard/Capstone #2/data/interim/'

steel = pd.read_csv('/Users/chinmayasukumar/Documents/Springboard/Capstone #2/data/interim/steel_clean_final.csv')
X = np.array(pd.read_csv(mainpath + 'X.csv'))
y = np.array(pd.read_csv(mainpath + 'y.csv'))
X_train = np.array(pd.read_csv(mainpath + 'X_train.csv'))
y_train = np.array(pd.read_csv(mainpath + 'y_train.csv'))
X_valid = np.array(pd.read_csv(mainpath + 'X_valid.csv'))
y_valid = np.array(pd.read_csv(mainpath + 'y_valid.csv'))
X_test = np.array(pd.read_csv(mainpath + 'X_test.csv'))
y_test = np.array(pd.read_csv(mainpath + 'y_test.csv'))

In [322]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_valid.shape, y_valid.shape, y.shape

((432, 14), (432, 1), (124, 14), (124, 1), (62, 14), (62, 1), (618, 1))

In [323]:
y, y_train, y_valid, y_test = y.reshape(y.shape[0],), y_train.reshape(y_train.shape[0],), \
                            y_valid.reshape(y_valid.shape[0],), y_test.reshape(y_test.shape[0],)

In [324]:
y_train.shape, y_test.shape, y.shape

((432,), (124,), (618,))

In [325]:
np.random.seed(132)

## 2.0 Top 3 models using PyCaret

### Using pycaret, the top 3 models will be chosen to build a weighted Ensemble model

In [326]:
steel.head()

Unnamed: 0,c,si,mn,p,s,ni,cr,mo,cu,v,al,n,nb+ta,temp,yield
0,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,27,342
1,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,100,338
2,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,200,337
3,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,300,346
4,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,400,316


In [327]:
s = setup(steel, target='yield', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,yield
2,Target type,regression
3,Data shape,"(618, 15)"
4,Train data shape,"(432, 15)"
5,Test data shape,"(186, 15)"
6,Numeric features,14
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


In [328]:
top3 = compare_models(n_select = 3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,14.9141,735.798,24.5215,0.9557,0.0699,0.0454,0.468
lightgbm,Light Gradient Boosting Machine,16.5407,824.1776,26.0469,0.9508,0.0731,0.0493,0.183
et,Extra Trees Regressor,17.2057,812.8257,26.4365,0.9508,0.0759,0.0511,0.115
gbr,Gradient Boosting Regressor,17.9938,875.5949,27.4723,0.9475,0.0789,0.0545,0.049
xgboost,Extreme Gradient Boosting,16.9166,894.2253,27.1273,0.9468,0.0756,0.05,0.07
rf,Random Forest Regressor,17.9213,882.6691,27.795,0.9468,0.0789,0.0529,0.133
dt,Decision Tree Regressor,22.7973,1534.3729,35.714,0.9083,0.1026,0.0646,0.022
ada,AdaBoost Regressor,35.2418,2148.1269,45.6406,0.8711,0.1412,0.113,0.056
knn,K Neighbors Regressor,35.2107,2448.9948,48.7628,0.8511,0.1421,0.1054,0.036
lar,Least Angle Regression,35.669,2459.3196,48.0963,0.8506,0.1394,0.107,0.029


#### The top 3 models are a Cat Boost Regressor, Light Gradient Boosting Regressor and an Extra Trees Regressor

In [10]:
# Cat Boost Regressor
evaluate_model(top3[0]) 

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [11]:
# Light Gradient Boosting Machine 
evaluate_model(top3[1])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [12]:
# Extra Trees Regressor
evaluate_model(top3[2])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [350]:
cat, lgbm, xt = top3[0], top3[1], top3[2] 

### Each regressor will be tuned before use in a Voting Regressor

## 3.0 CatBoostRegressor

### 3.1 Hyperparameter tuning

In [345]:
# Finding an optimal CatBoostRegressor
params = {'depth':[2,4,6,8], 'learning_rate':[0.001,0.05,0.1,0.2,0.3,0.5,0.7,1.0], \
          'iterations':[30,50,100,150,200,500]}

cat_cv = GridSearchCV(cat, params)
cat_cv.fit(X_train, y_train)
print('CV score - CatBoostRegressor:',cat_cv.best_score_)
print('Best params: ',cat_cv.best_params_)

# We will assign a name to the best regressor 
best_cat = cat_cv.best_estimator_

CV score - CatBoostRegressor: 0.9568347253427676
Best params:  {'depth': 6, 'iterations': 200, 'learning_rate': 0.3}


### 3.2 Scoring

In [228]:
# Returns CV score
def cv(model, scoring):
    return np.abs(cross_val_score(model, X, y, scoring=scoring, cv=KFold(5, shuffle=True, random_state=132)).mean())

# Returns multiple metrics
def scorer(model, X, y):
    y_pred = model.predict(X)
    return model.score(X, y), mean_absolute_error(y_pred, y), mean_squared_error(y_pred, y)

# Returns CV score
def scorer_cv(model):
    return cv(model, 'r2'), cv(model, 'neg_mean_absolute_error'), cv(model, 'neg_mean_squared_error')

#### The untuned model will be scored to serve as a benchmark

In [256]:
# Scoring on the untuned model
cat.fit(X_train, y_train)

train_cat0_r2, train_cat0_mae, train_cat0_mse = scorer(cat, X_train, y_train)
valid_cat0_r2, valid_cat0_mae, valid_cat0_mse = scorer(cat, X_valid, y_valid)
test_cat0_r2, test_cat0_mae, test_cat0_mse = scorer(cat, X_test, y_test)
entire_cat0_r2, entire_cat0_mae, entire_cat0_mse = scorer_cv(cat)

#### Scoring on the optimal regressor

In [269]:
# Scoring on the training set
best_cat.fit(X_train, y_train)

train_cat_r2, train_cat_mae, train_cat_mse = scorer(best_cat, X_train, y_train)

In [270]:
# Scoring on the test set
test_cat_r2, test_cat_mae, test_cat_mse = scorer(best_cat, X_test, y_test)

In [271]:
# Scoring on the validation set
valid_cat_r2, valid_cat_mae, valid_cat_mse = scorer(best_cat, X_valid, y_valid)

In [272]:
# Scoring on the entire set
entire_cat_r2, entire_cat_mae, entire_cat_mse = scorer_cv(best_cat)

### 3.3 Comparing results

In [273]:
index = [['Train', 'Train', 'Train', 'Valid', 'Valid', 'Valid', 'Test', 'Test', 'Test', 'CV entire', 'CV entire',\
          'CV entire'],
         ['R2', 'MAE', 'MSE', 'R2', 'MAE', 'MSE', 'R2', 'MAE', 'MSE', 'R2', 'MAE', 'MSE']]

data = {'Untuned CatBoost':[train_cat0_r2, train_cat0_mae, train_cat0_mse, valid_cat0_r2, valid_cat0_mae, \
                           valid_cat0_mse, test_cat0_r2, test_cat0_mae, test_cat0_mse, entire_cat0_r2, \
                            entire_cat0_mae, entire_cat0_mse],
       'CatBoost': [train_cat_r2, train_cat_mae, train_cat_mse, valid_cat_r2, valid_cat_mae, \
                           valid_cat_mse, test_cat_r2, test_cat_mae, test_cat_mse, entire_cat_r2, \
                            entire_cat_mae, entire_cat_mse]}
catboost_df = pd.DataFrame(data, index=index)

catboost_df

Unnamed: 0,Unnamed: 1,Untuned CatBoost,CatBoost
Train,R2,0.998044,0.999429
Train,MAE,4.381501,2.509424
Train,MSE,34.05589,9.948996
Valid,R2,0.983944,0.982376
Valid,MAE,12.45279,13.496551
Valid,MSE,311.425003,341.836147
Test,R2,0.927087,0.923388
Test,MAE,16.969241,17.255153
Test,MSE,1303.243063,1369.360729
CV entire,R2,0.956715,0.955564


#### Since the tuned regressor is performing poorly compared to the default model, the default model will be chosen


## 4.0 Light Gradient Boosting Machine

### 4.1 Hyperparameter tuning

In [274]:
# Finding an optimal LGBM model
params = {'num_leaves':[2,3,4,6,8,10,15,20,30], 'max_depth':[-1,2,3,4,5,10,15,20], \
          'n_estimators':[10,20,50,100,150,200,300,500], 'learning_rate':[0.001, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]}

lgbm_cv = GridSearchCV(lgbm, params)
lgbm_cv.fit(X_train, y_train)
print('CV score - LGBM: ', lgbm_cv.best_score_)
print('Best params: ', lgbm_cv.best_params_)

best_lgbm = lgbm_cv.best_estimator_

CV score - LGBM:  0.9566370645451399
Best params:  {'learning_rate': 0.3, 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 8}


### 4.2 Scoring

#### Scoring on untuned model

In [219]:
# Scoring on the untuned model
lgbm.fit(X_train, y_train)

train_lgbm0_r2, train_lgbm0_mae, train_lgbm0_mse = scorer(lgbm, X_train, y_train)
valid_lgbm0_r2, valid_lgbm0_mae, valid_lgbm0_mse = scorer(lgbm, X_valid, y_valid)
test_lgbm0_r2, test_lgbm0_mae, test_lgbm0_mse = scorer(lgbm, X_test, y_test)
entire_lgbm0_r2, entire_lgbm0_mae, entire_lgbm0_mse = scorer_cv(lgbm)

#### Scoring with best model

In [275]:
# Scoring on the training set
best_lgbm.fit(X_train, y_train)

train_lgbm_r2, train_lgbm_mae, train_lgbm_mse = scorer(best_lgbm, X_train, y_train)

In [276]:
# Scoring on the test set
test_lgbm_r2, test_lgbm_mae, test_lgbm_mse = scorer(best_lgbm, X_test, y_test)

In [277]:
# Scoring on validation set
valid_lgbm_r2, valid_lgbm_mae, valid_lgbm_mse = scorer(best_lgbm, X_valid, y_valid)

In [278]:
# Scoring on the entire set
entire_lgbm_r2, entire_lgbm_mae, entire_lgbm_mse = scorer_cv(best_lgbm)

### 4.3 Comparing results

In [279]:
data = {'Untuned LGBM':[train_lgbm0_r2, train_lgbm0_mae, train_lgbm0_mse, valid_lgbm0_r2, valid_lgbm0_mae, \
                           valid_lgbm0_mse, test_lgbm0_r2, test_lgbm0_mae, test_lgbm0_mse, entire_lgbm0_r2, \
                            entire_lgbm0_mae, entire_lgbm0_mse],
       'LGBM': [train_lgbm_r2, train_lgbm_mae, train_lgbm_mse, valid_lgbm_r2, valid_lgbm_mae, \
                           valid_lgbm_mse, test_lgbm_r2, test_lgbm_mae, test_lgbm_mse, entire_lgbm_r2, \
                            entire_lgbm_mae, entire_lgbm_mse]}
lgbm_df = pd.DataFrame(data, index=index)

lgbm_df

Unnamed: 0,Unnamed: 1,Untuned LGBM,LGBM
Train,R2,0.984709,0.988166
Train,MAE,8.633846,8.457923
Train,MSE,266.223392,206.045577
Valid,R2,0.982793,0.984886
Valid,MAE,13.783181,13.106688
Valid,MSE,333.740443,293.156436
Test,R2,0.919589,0.920844
Test,MAE,16.959305,18.45185
Test,MSE,1437.257607,1414.834575
CV entire,R2,0.956984,0.953459


#### The tuned regressor will be chosen

In [332]:
# Combined DataFrame

combined = pd.concat([catboost_df, lgbm_df, xt_df], axis=1)
combined

Unnamed: 0,Unnamed: 1,Untuned CatBoost,CatBoost,Untuned LGBM,LGBM,Untuned ExtraTrees,ExtraTrees
Train,R2,0.998044,0.999429,0.984709,0.988166,1.0,0.997733
Train,MAE,4.381501,2.509424,8.633846,8.457923,0.0,3.277293
Train,MSE,34.05589,9.948996,266.223392,206.045577,0.0,39.463553
Valid,R2,0.983944,0.982376,0.982793,0.984886,0.982518,0.982405
Valid,MAE,12.45279,13.496551,13.783181,13.106688,15.066129,15.224032
Valid,MSE,311.425003,341.836147,333.740443,293.156436,339.085687,341.274511
Test,R2,0.927087,0.923388,0.919589,0.920844,0.916665,0.922419
Test,MAE,16.969241,17.255153,16.959305,18.45185,19.18129,18.308929
Test,MSE,1303.243063,1369.360729,1437.257607,1414.834575,1489.526024,1386.68081
CV entire,R2,0.956715,0.955564,0.956984,0.953459,0.947689,0.95065


## 5.0 Extra Trees Regressor

### 5.1 Hyperparameter tuning

In [304]:
# Finding an optimal Extra Trees Regressor
params = {'n_estimators':[10,20,30,40,50,100], 'max_depth':[5,10,15,20,30,50], 'min_samples_split':[1,2,3,4,5,10],\
         'min_samples_leaf':[1,2,3,4,5,10]}

xt_cv = RandomizedSearchCV(xt, params)
xt_cv.fit(X_train, y_train)
print('CV score - Extra Trees Regressor: ', xt_cv.best_score_)
print('Best params: ', xt_cv.best_params_)

best_xt = xt_cv.best_estimator_

CV score - Extra Trees Regressor:  0.9459190789532078
Best params:  {'n_estimators': 50, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_depth': 20}


### 5.2 Scoring

#### Scoring on untuned model

In [118]:
# Scoring untuned model
xt.fit(X_train, y_train)

train_xt0_r2, train_xt0_mae, train_xt0_mse = scorer(xt, X_train, y_train)
valid_xt0_r2, valid_xt0_mae, valid_xt0_mse = scorer(xt, X_valid, y_valid)
test_xt0_r2, test_xt0_mae, test_xt0_mse = scorer(xt, X_test, y_test)
entire_xt0_r2, entire_xt0_mae, entire_xt0_mse = scorer_cv(xt)

#### Scoring on best model

In [305]:
# Scoring on the training set
best_xt.fit(X_train, y_train)

train_xt_r2, train_xt_mae, train_xt_mse = scorer(best_xt, X_train, y_train)

In [306]:
# Scoring on the test set
test_xt_r2, test_xt_mae, test_xt_mse = scorer(best_xt, X_test, y_test)

In [307]:
# Scoring on the validation set
valid_xt_r2, valid_xt_mae, valid_xt_mse = scorer(best_xt, X_valid, y_valid)

In [308]:
# Scoring on the entire set
entire_xt_r2, entire_xt_mae, entire_xt_mse = scorer_cv(best_xt)

### 5.3 Comparing results

In [309]:
data = {'Untuned ExtraTrees':[train_xt0_r2, train_xt0_mae, train_xt0_mse, valid_xt0_r2, valid_xt0_mae, \
                           valid_xt0_mse, test_xt0_r2, test_xt0_mae, test_xt0_mse, entire_xt0_r2, \
                            entire_xt0_mae, entire_xt0_mse],
       'ExtraTrees': [train_xt_r2, train_xt_mae, train_xt_mse, valid_xt_r2, valid_xt_mae, \
                           valid_xt_mse, test_xt_r2, test_xt_mae, test_xt_mse, entire_xt_r2, \
                            entire_xt_mae, entire_xt_mse]}
xt_df = pd.DataFrame(data, index=index)

xt_df

Unnamed: 0,Unnamed: 1,Untuned ExtraTrees,ExtraTrees
Train,R2,1.0,0.997733
Train,MAE,0.0,3.277293
Train,MSE,0.0,39.463553
Valid,R2,0.982518,0.982405
Valid,MAE,15.066129,15.224032
Valid,MSE,339.085687,341.274511
Test,R2,0.916665,0.922419
Test,MAE,19.18129,18.308929
Test,MSE,1489.526024,1386.68081
CV entire,R2,0.947689,0.95065


#### The tuned regressor will be chosen

## 6.0 Creating an Ensemble Model

### 6.1 Finding weights

In [310]:
# Weights will be assigned iteratively to each model in a Voting Regressor to discover the most accurate model
weights1 = []
weights2 = []
weights3 = []
scores = []

for i in np.arange(0.1,1,0.1):
    for j in np.arange(0.1,1,0.1):
        for k in np.arange(0.1,1,0.1):
            vote_reg = VotingRegressor([('cat', cat), ('lgbm', best_lgbm), ('xt', best_xt)], weights = [i,j,k])
            vote_reg.fit(X_train, y_train)
            y_pred = vote_reg.predict(X_test)
            score = r2_score(y_pred, y_test)
            scores.append(score)
            weights1.append(i)
            weights2.append(j)
            weights3.append(k)

In [311]:
#Combining weights into a DataFrame
scores_df = pd.DataFrame({'cat weights':weights1, 'lgbm weights':weights2, 'xt weights':weights3, 'score':scores})

#The sum of the weights must equal 1 so combinations of weights with a sum less than 1 are excluded
scores_df = scores_df[scores_df[['cat weights', 'lgbm weights', 'xt weights']].sum(axis=1) == 1]
scores_df.sort_values(by='score', ascending=False)

Unnamed: 0,cat weights,lgbm weights,xt weights,score
487,0.7,0.1,0.2,0.921013
567,0.8,0.1,0.1,0.920955
407,0.6,0.1,0.3,0.92093
327,0.5,0.1,0.4,0.920706
415,0.6,0.2,0.2,0.920643
335,0.5,0.2,0.3,0.92056
247,0.4,0.1,0.5,0.920342
255,0.4,0.2,0.4,0.920336
343,0.5,0.3,0.2,0.920175
423,0.6,0.3,0.1,0.920118


In [312]:
#The weights [0.3, 0.2, 0.5] are the ones which result in the highest score
vote_reg = VotingRegressor([ ('cat', cat), ('lgbm', best_lgbm), ('xt', best_xt),], weights = [0.7, 0.1, 0.2])
vote_reg.fit(X_train, y_train)

VotingRegressor(estimators=[('cat',
                             <catboost.core.CatBoostRegressor object at 0x7f8da23e5790>),
                            ('lgbm',
                             LGBMRegressor(learning_rate=0.3, max_depth=4,
                                           num_leaves=8, random_state=123)),
                            ('xt',
                             ExtraTreesRegressor(max_depth=20,
                                                 min_samples_split=3,
                                                 n_estimators=50, n_jobs=-1,
                                                 random_state=123))],
                weights=[0.7, 0.1, 0.2])

### 6.2 Scoring

In [313]:
# Scoring on the train set
vote_reg.fit(X_train, y_train)

train_vote_reg_r2, train_vote_reg_mae, train_vote_reg_mse = scorer(vote_reg, X_train, y_train)

In [314]:
# Scoring on the test set
test_vote_reg_r2, test_vote_reg_mae, test_vote_reg_mse = scorer(vote_reg, X_test, y_test)

In [315]:
# Scoring on the validation set
valid_vote_reg_r2, valid_vote_reg_mae, valid_vote_reg_mse = scorer(vote_reg, X_valid, y_valid)

In [316]:
# Scoring on the entire set
entire_vote_reg_r2, entire_vote_reg_mae, entire_vote_reg_mse = scorer_cv(vote_reg)

In [317]:
data = {'VotingRegressor':[train_vote_reg_r2, train_vote_reg_mae, train_vote_reg_mse,\
                           valid_vote_reg_r2, valid_vote_reg_mae, valid_vote_reg_mse, \
                           test_vote_reg_r2, test_vote_reg_mae, test_vote_reg_mse, \
                           entire_vote_reg_r2, entire_vote_reg_mae, entire_vote_reg_mse]}
voting_df = pd.DataFrame(data, index=index)
voting_df

Unnamed: 0,Unnamed: 1,VotingRegressor
Train,R2,0.997801
Train,MAE,4.254489
Train,MSE,38.284181
Valid,R2,0.985509
Valid,MAE,12.410774
Valid,MSE,281.061241
Test,R2,0.927101
Test,MAE,16.846251
Test,MSE,1302.991689
CV entire,R2,0.957005


## 7.0 Comparing the models

In [318]:
entire_df = pd.concat([catboost_df['Untuned CatBoost'], lgbm_df['LGBM'], xt_df['ExtraTrees'], voting_df], axis=1)

entire_df

Unnamed: 0,Unnamed: 1,Untuned CatBoost,LGBM,ExtraTrees,VotingRegressor
Train,R2,0.998044,0.988166,0.997733,0.997801
Train,MAE,4.381501,8.457923,3.277293,4.254489
Train,MSE,34.05589,206.045577,39.463553,38.284181
Valid,R2,0.983944,0.984886,0.982405,0.985509
Valid,MAE,12.45279,13.106688,15.224032,12.410774
Valid,MSE,311.425003,293.156436,341.274511,281.061241
Test,R2,0.927087,0.920844,0.922419,0.927101
Test,MAE,16.969241,18.45185,18.308929,16.846251
Test,MSE,1303.243063,1414.834575,1386.68081,1302.991689
CV entire,R2,0.956715,0.953459,0.95065,0.957005


## 8.0 Saving the model

In [319]:
path = '/Users/chinmayasukumar/Documents/Springboard/Capstone #2/models/Final model'
with open(path, "wb") as f:
    pickle.dump(vote_reg, f)