# Machine Learning model for MIL

The work below examines the data from Mechatherm International Limited and experimentation of linear and nonlinear models to determine which one generalizes well on the data.

## Module imports

In [23]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.core.tools.datetimes import Scalar
from sklearn.linear_model import Ridge, LinearRegression, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.model_selection import cross_val_score, RepeatedKFold, GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import pickle
import warnings

from sklearn.utils import shuffle

warnings.filterwarnings('ignore')

## Loading and splitting data

In [24]:
# loading the data
data = pd.read_excel('Data.xlsx')
# split data into train and test
X_train = data.iloc[0:10, 0:5]
Y_train = data.iloc[0:10, 5:18]

X_test = data.iloc[10:,0:5]
Y_test = data.iloc[10:,5:18] 

In [25]:
##

## Inital model evaluation 

In [26]:
# create a list for linear models
l_models = []
l_models.append(('LR',LinearRegression()))
l_models.append(('R',Ridge()))
l_models.append(('LASSO',Lasso()))
l_models.append(('EN',ElasticNet()))

# create for nonlinear models
nl_models = []
nl_models.append(('DT',DecisionTreeRegressor()))
nl_models.append(('RF',RandomForestRegressor()))
nl_models.append(('eT',ExtraTreesRegressor()))

In [27]:
#Evaluate linear models in turn
results = []
names = []
scoring = 'neg_mean_absolute_error'

for name, model in l_models:
    kfold = KFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: -98.357823 (56.725117)
R: -98.437962 (60.887168)
LASSO: -106.619964 (57.900019)
EN: -120.702393 (75.189692)


In [28]:
#Evaluate lnon-inear models in turn
results = []
names = []
scoring = 'neg_mean_absolute_error'

for name, model in nl_models:
    kfold = KFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

DT: -1238.392308 (251.882711)
RF: -726.630846 (774.580901)
eT: -520.741923 (521.042578)


### Observations from the initial model evaluation (before standardization)

* The linear models performed better in general compared to the performance of the nonlinear models. 
* Linear regression and Ridge regression outperformed all the linear models.
* Elastic Net did better than all the non linear models but it's cross validated mean is worse than the worse performing linear model.




In [29]:
# Evaluate algorithms on standardized dataset
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
pipelines = []
# Linear regression models
pipelines.append(('ScaledLR',Pipeline([('Scaler',StandardScaler()),('LR',LinearRegression())])))
pipelines.append(('ScaledR',Pipeline([('Scaler',StandardScaler()),('R',Ridge())])))
pipelines.append(('ScaledLASSO',Pipeline([('Scaler',StandardScaler()),('LASSO',Lasso())])))
pipelines.append(('ScaledEN',Pipeline([('Scaler',StandardScaler()),('EN',ElasticNet())])))
# nonlinear regression models
pipelines.append(('ScaledDT',Pipeline([('Scaler',StandardScaler()),('DT',DecisionTreeRegressor())])))
pipelines.append(('ScaledRF',Pipeline([('Scaler',StandardScaler()),('RF',RandomForestRegressor())])))
pipelines.append(('ScaledET',Pipeline([('Scaler',StandardScaler()),('ET',ExtraTreesRegressor())])))

In [30]:
# Evaluate each model in turn
results = []
names = []
scoring = 'neg_mean_absolute_error'

for name, model in pipelines:
    kfold = KFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

ScaledLR: -98.357823 (56.725117)
ScaledR: -229.000443 (114.576791)
ScaledLASSO: -79.482398 (36.843980)
ScaledEN: -407.832096 (249.682816)
ScaledDT: -1184.015385 (275.192692)
ScaledRF: -715.547077 (769.654577)
ScaledET: -558.306154 (514.397951)


### Observations after standardising the data

* Linear regression maintained it's values compared to its performance before standardisation. It could be said that it generalized well
* Ridge regression which was close to the Linear regression did not do so well after standardisation
* Lasso regression outperformed all the models and hence the optimal model for the data
* Elastic Net performed poorly of all the linear models
* The nonlinear models did not improve significantly. This can be attributed to the volume of training data




## Hyperparameter tuning for Lasso Regression

In [31]:
# Lasso performs better
# Tune Lasso Regressor
# defing model
model = Lasso()
# define model evaluation method
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)


In [32]:
# define grid
grid = dict()
grid['alpha'] = np.arange(0,1,0.01)

# define search
search = GridSearchCV(model, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

# perform the search
sc = StandardScaler()
X_scaled = sc.fit_transform(X_train)
results = search.fit(X_scaled, Y_train)

# summerize
print('MAE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

MAE: -88.715
Config: {'alpha': 0.99}


In [33]:
# Build Final Model
# initialize scaler
scaler = sc.fit(X_train)
# saving scaler for deployment
with open('scaler_pkl','wb') as files:
    pickle.dump(scaler, files)
#  transform train data with scaler
X_scaled = sc.transform(X_train)
final_model = Lasso(alpha=0.99)
final_model.fit(X_scaled, Y_train)


Lasso(alpha=0.99)

In [34]:
# Save final model
with open('tuned_pkl','wb') as files:
    pickle.dump(final_model, files)


In [35]:
# make predictions
x_test_scaled = scaler.transform(X_test)
y_pred = final_model.predict(x_test_scaled)


In [36]:
# Evaluate performance
print("MAE", mean_absolute_error(Y_test, y_pred))
print("MSE", mean_squared_error(Y_test, y_pred))
print("RMSE", np.sqrt(mean_squared_error(Y_test, y_pred)))
print("R_score", r2_score(Y_test, y_pred))

pred_columns = ['Back Ramp', 'Centre Base', 'Front Ramp', 'Back Wall', 'Left Wall','Right Wall', 'Roof Beams',
'Lintel Beam', 'Door Shaft','Door Fabrication', 'Heat Shield', 'Door Surround Casting','Refractory']
# save output to dataframe
output_df = pd.DataFrame(y_pred, columns=pred_columns)
output_df

MAE 722.9709849245822
MSE 2159423.9487632955
RMSE 1469.4978559913911
R_score 0.8739648233905039


Unnamed: 0,Back Ramp,Centre Base,Front Ramp,Back Wall,Left Wall,Right Wall,Roof Beams,Lintel Beam,Door Shaft,Door Fabrication,Heat Shield,Door Surround Casting,Refractory
0,5636.027323,8107.477482,10757.283293,7648.130588,5016.578827,5015.065686,4030.560334,5409.926498,2944.128457,7613.645507,4118.740196,4591.038166,161856.402453
1,6811.920068,10029.496988,12658.466718,8861.13587,5431.744623,5428.730312,4854.111241,5903.474464,3045.598775,8280.143958,4438.922197,4889.373744,186589.906809
2,7863.130995,11749.313333,14633.811415,9366.34785,5753.794884,5748.248255,5603.163981,6386.117481,3147.069093,8764.061926,4783.80145,5159.80879,209820.967898
3,8901.326004,13459.77465,16572.849791,9874.00798,6075.315457,6067.262238,6348.878241,6866.851065,3248.539411,9195.371325,5118.367653,5426.651803,233005.773459
4,10016.592468,15301.206342,18639.875521,10428.073929,6404.251669,6393.331664,7141.331232,7374.31672,3350.009729,9650.616268,5452.933857,5693.494816,257355.148972
5,11111.417472,16891.726449,20841.952658,11012.620788,6617.86042,6603.120139,7898.019681,7946.396128,3485.303487,10154.584158,5906.673265,6049.285499,280740.430315


## Performance of the Lasso regression

* Using a grid search, the best penalty paramter was 0.99
* After implementation, the model was evaluated on the Mean Absolute Error (MAE), Mean Squared Error (MSE), Root Mean Square Error (RMSE) and the r-squared
* values for the final evaluation are on the hihger side which could be becuase of the amount of training data.

In [37]:
data

Unnamed: 0,Capacity,Door Height,Door Width,Metal Depth,Bath Length,Back Ramp,Centre Base,Front Ramp,Back Wall,Left Wall,Right Wall,Roof Beams,Lintel Beam,Door Shaft,Door Fabrication,Heat Shield,Door Surround Casting,Refractory,Total steel
0,5.09,1500,2750,500,2200,1272,607,2589,5313,2339,2333,422,3238,2232,4750,1698,2619,36300,26793
1,10.35,1500,3750,530,2900,1631,1281,3322,5251,2791,2785,888,3507,2368,4911,2177,3024,52424,30912
2,15.49,1500,4500,580,3300,1911,1710,4073,5371,2996,2997,1070,3703,2470,5318,2475,3251,64252,34094
3,20.6,1500,5000,650,3600,2185,2018,4733,5371,3200,3199,1328,3884,2538,5589,2754,3437,73658,36799
4,25.08,1500,5250,700,3900,2490,2613,5247,5413,3375,3371,1576,3991,2572,5725,2938,3580,81174,39311
5,30.37,1500,5500,720,4300,2670,3098,5647,5502,3663,3662,1848,4089,2606,5861,2957,3620,90370,41603
6,40.84,1800,5750,770,5000,3394,4536,6292,6668,4299,4298,2499,4326,2640,6261,3052,3765,110076,48265
7,50.15,1800,6250,830,5300,4019,5969,7209,6862,4559,4568,2732,4438,2708,6539,3242,3959,122556,52845
8,60.68,1800,7000,870,5500,4466,6373,8562,7153,4672,4671,3248,4869,2810,6955,3629,4211,136998,57408
9,72.11,1800,7750,940,5600,5158,7141,9878,7445,4809,4808,3682,5243,2912,7371,4016,4505,150273,62463


In [38]:
data.describe()

Unnamed: 0,Capacity,Door Height,Door Width,Metal Depth,Bath Length,Back Ramp,Centre Base,Front Ramp,Back Wall,Left Wall,Right Wall,Roof Beams,Lintel Beam,Door Shaft,Door Fabrication,Heat Shield,Door Surround Casting,Refractory,Total steel
count,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
mean,70.4675,1750.0,7062.5,846.25,5143.75,5232.5,7473.1875,9897.25,7221.125,4471.75,4470.6875,3710.5,5083.8125,2835.5625,6986.0625,3699.0,4283.625,139032.625,61081.4375
std,57.273002,216.02469,2691.808562,217.190393,1643.154994,3506.795802,6447.996528,6394.112339,1805.711305,1297.032536,1298.020016,2751.20662,1474.489345,393.009409,1639.299016,1265.321145,1079.120004,73573.075808,28001.892786
min,5.09,1500.0,2750.0,500.0,2200.0,1272.0,607.0,2589.0,5251.0,2339.0,2333.0,422.0,3238.0,2232.0,4750.0,1698.0,2619.0,36300.0,26793.0
25%,23.96,1500.0,5187.5,687.5,3825.0,2413.75,2464.25,5118.5,5402.5,3331.25,3328.0,1514.0,3964.25,2563.5,5691.0,2892.0,3544.25,79295.0,38683.0
50%,55.415,1800.0,6625.0,850.0,5400.0,4242.5,6171.0,7885.5,7007.5,4615.5,4619.5,2990.0,4653.5,2759.0,6747.0,3435.5,4085.0,129777.0,55126.5
75%,106.945,2000.0,8937.5,1077.5,6300.0,7679.0,9775.5,13554.5,8970.0,5547.5,5547.5,5421.5,6106.5,3073.5,8230.75,4502.5,5063.25,189491.25,78408.75
max,184.9,2000.0,12000.0,1100.0,7600.0,12256.0,22520.0,23514.0,10227.0,6353.0,6354.0,9471.0,8235.0,3525.0,9954.0,6127.0,6450.0,275413.0,118501.0


In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Capacity               16 non-null     float64
 1   Door Height            16 non-null     int64  
 2   Door Width             16 non-null     int64  
 3   Metal Depth            16 non-null     int64  
 4   Bath Length            16 non-null     int64  
 5   Back Ramp              16 non-null     int64  
 6   Centre Base            16 non-null     int64  
 7   Front Ramp             16 non-null     int64  
 8   Back Wall              16 non-null     int64  
 9   Left Wall              16 non-null     int64  
 10  Right Wall             16 non-null     int64  
 11  Roof Beams             16 non-null     int64  
 12  Lintel Beam            16 non-null     int64  
 13  Door Shaft             16 non-null     int64  
 14  Door Fabrication       16 non-null     int64  
 15  Heat Shi

In [40]:
X_train

Unnamed: 0,Capacity,Door Height,Door Width,Metal Depth,Bath Length
0,5.09,1500,2750,500,2200
1,10.35,1500,3750,530,2900
2,15.49,1500,4500,580,3300
3,20.6,1500,5000,650,3600
4,25.08,1500,5250,700,3900
5,30.37,1500,5500,720,4300
6,40.84,1800,5750,770,5000
7,50.15,1800,6250,830,5300
8,60.68,1800,7000,870,5500
9,72.11,1800,7750,940,5600


In [41]:
Y_train

Unnamed: 0,Back Ramp,Centre Base,Front Ramp,Back Wall,Left Wall,Right Wall,Roof Beams,Lintel Beam,Door Shaft,Door Fabrication,Heat Shield,Door Surround Casting,Refractory
0,1272,607,2589,5313,2339,2333,422,3238,2232,4750,1698,2619,36300
1,1631,1281,3322,5251,2791,2785,888,3507,2368,4911,2177,3024,52424
2,1911,1710,4073,5371,2996,2997,1070,3703,2470,5318,2475,3251,64252
3,2185,2018,4733,5371,3200,3199,1328,3884,2538,5589,2754,3437,73658
4,2490,2613,5247,5413,3375,3371,1576,3991,2572,5725,2938,3580,81174
5,2670,3098,5647,5502,3663,3662,1848,4089,2606,5861,2957,3620,90370
6,3394,4536,6292,6668,4299,4298,2499,4326,2640,6261,3052,3765,110076
7,4019,5969,7209,6862,4559,4568,2732,4438,2708,6539,3242,3959,122556
8,4466,6373,8562,7153,4672,4671,3248,4869,2810,6955,3629,4211,136998
9,5158,7141,9878,7445,4809,4808,3682,5243,2912,7371,4016,4505,150273


In [42]:
coeffs = final_model.coef_ # extracting the coefficients from the lasso regression
type(coeffs)
ls = np.array
(coeffs).tolist() # convert the coefficient results from numpy to a list

[[1058.4900803152982,
  82.79003882912308,
  59.36449354244909,
  34.84330592519298,
  -0.0],
 [1798.731352466549,
  196.04254506846655,
  -444.2806691780879,
  -0.0,
  663.4507297778883],
 [1757.7629066587106,
  -112.21466739541886,
  718.7011887635206,
  126.01781115899084,
  -325.748715940621],
 [637.3319289144886,
  511.8540522132286,
  -0.0,
  -26.73436116402791,
  -274.21752098116224],
 [101.84548964156043, 109.91504235928467, 0.0, 0.0, 626.6022352575352],
 [96.89860248023219, 111.06608242489094, 0.0, 0.0, 632.7292855700022],
 [641.904100599633,
  69.767379194704,
  113.80693754149218,
  -0.0,
  207.73587919325075],
 [367.1350581871838,
  1.836449800852517,
  316.69836325709554,
  -0.0,
  -105.38789924720784],
 [0.0, 0.0, 189.65270245671618, 0.0, 0.0],
 [328.72787995988165,
  55.90885090502082,
  264.0593946515502,
  236.12427422940283,
  -62.83707907164709],
 [0.0,
  -34.99244891854718,
  642.4800515762312,
  47.84311458960408,
  -25.605144132166334],
 [0.0, 15.223431311867254, 

In [43]:
# intercepts for the each output column
intercept = final_model.intercept_
intercept

array([ 2919.6,  3534.6,  5755.2,  6034.9,  3670.3,  3669.2,  1929.3,
        4128.8,  2585.6,  5928. ,  2893.8,  3597.1, 91808.1])

In [44]:
data.corr(method='pearson')

Unnamed: 0,Capacity,Door Height,Door Width,Metal Depth,Bath Length,Back Ramp,Centre Base,Front Ramp,Back Wall,Left Wall,Right Wall,Roof Beams,Lintel Beam,Door Shaft,Door Fabrication,Heat Shield,Door Surround Casting,Refractory,Total steel
Capacity,1.0,0.901352,0.984995,0.92649,0.946595,0.999505,0.986942,0.998288,0.981041,0.956029,0.95565,0.999115,0.996256,0.975534,0.990623,0.985761,0.988241,0.993353,0.999087
Door Height,0.901352,1.0,0.908574,0.947743,0.941885,0.903565,0.853419,0.877351,0.963909,0.96063,0.960672,0.898201,0.894798,0.899924,0.930028,0.894781,0.905069,0.926683,0.904074
Door Width,0.984995,0.908574,1.0,0.963422,0.978678,0.983108,0.961663,0.981337,0.970442,0.980114,0.979913,0.984829,0.9886,0.986331,0.996589,0.99904,0.997937,0.996036,0.988193
Metal Depth,0.92649,0.947743,0.963422,1.0,0.979725,0.925967,0.869858,0.909771,0.95358,0.98524,0.985206,0.921879,0.928826,0.951526,0.966381,0.955087,0.953132,0.958205,0.929375
Bath Length,0.946595,0.941885,0.978678,0.979725,1.0,0.943233,0.916635,0.934711,0.954574,0.997,0.997052,0.946984,0.947557,0.963948,0.976605,0.972486,0.970961,0.976983,0.954173
Back Ramp,0.999505,0.903565,0.983108,0.925967,0.943233,1.0,0.984728,0.997573,0.982762,0.954179,0.953811,0.998736,0.99606,0.973989,0.989461,0.983612,0.987321,0.991896,0.998181
Centre Base,0.986942,0.853419,0.961663,0.869858,0.916635,0.984728,1.0,0.991562,0.948542,0.921016,0.920666,0.989479,0.983616,0.947543,0.965229,0.966305,0.970008,0.97336,0.988997
Front Ramp,0.998288,0.877351,0.981337,0.909771,0.934711,0.997573,0.991562,1.0,0.969371,0.942307,0.941898,0.998507,0.996109,0.96897,0.984632,0.984186,0.986244,0.988161,0.997692
Back Wall,0.981041,0.963909,0.970442,0.95358,0.954574,0.982762,0.948542,0.969371,1.0,0.97218,0.971919,0.977969,0.975886,0.962636,0.985494,0.964627,0.971782,0.984103,0.979683
Left Wall,0.956029,0.96063,0.980114,0.98524,0.997,0.954179,0.921016,0.942307,0.97218,1.0,0.999997,0.95492,0.954842,0.967571,0.982846,0.972773,0.973478,0.982238,0.961291


## Correlation coefficients

* The correlation coefficients were based on the pearson correlation.
* From the correlation values, it can be concluded that the values are highly correlated