<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Selecting-and-Training-Models" data-toc-modified-id="Selecting-and-Training-Models-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Selecting and Training Models</a></span><ul class="toc-item"><li><span><a href="#From-raw-data-to-processed-data-in-2-steps" data-toc-modified-id="From-raw-data-to-processed-data-in-2-steps-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>From raw data to processed data in 2 steps</a></span></li><li><span><a href="#Selecting-and-Training-Models" data-toc-modified-id="Selecting-and-Training-Models-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Selecting and Training Models</a></span><ul class="toc-item"><li><span><a href="#Mean-Squared-Error" data-toc-modified-id="Mean-Squared-Error-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Mean Squared Error</a></span></li></ul></li><li><span><a href="#Decision-Tree" data-toc-modified-id="Decision-Tree-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Decision Tree</a></span></li><li><span><a href="#Model-Evaluation-using-Cross-Validation" data-toc-modified-id="Model-Evaluation-using-Cross-Validation-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Model Evaluation using Cross Validation</a></span></li><li><span><a href="#Random-Forest-model" data-toc-modified-id="Random-Forest-model-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Random Forest model</a></span></li><li><span><a href="#Support-Vector-Machine-Regressor" data-toc-modified-id="Support-Vector-Machine-Regressor-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Support Vector Machine Regressor</a></span></li><li><span><a href="#Hyperparameter-Tuning-using-GridSearchCV" data-toc-modified-id="Hyperparameter-Tuning-using-GridSearchCV-1.7"><span class="toc-item-num">1.7&nbsp;&nbsp;</span>Hyperparameter Tuning using GridSearchCV</a></span></li><li><span><a href="#Checking-Feature-importance" data-toc-modified-id="Checking-Feature-importance-1.8"><span class="toc-item-num">1.8&nbsp;&nbsp;</span>Checking Feature importance</a></span></li><li><span><a href="#Evaluating-the-entire-system-on-Test-Data" data-toc-modified-id="Evaluating-the-entire-system-on-Test-Data-1.9"><span class="toc-item-num">1.9&nbsp;&nbsp;</span>Evaluating the entire system on Test Data</a></span></li><li><span><a href="#Creating-a-function-to-cover-this-entire-flow" data-toc-modified-id="Creating-a-function-to-cover-this-entire-flow-1.10"><span class="toc-item-num">1.10&nbsp;&nbsp;</span>Creating a function to cover this entire flow</a></span></li><li><span><a href="#Save-the-Model" data-toc-modified-id="Save-the-Model-1.11"><span class="toc-item-num">1.11&nbsp;&nbsp;</span>Save the Model</a></span></li></ul></li></ul></div>

# Predicting Fuel Efficiency of Vehicles - Part 3
## Selecting and Training Models

1. Select and Train a few Algorithms(Linear Regression, Decision Tree, RandomForest)
2. Evaluation using Mean Squared Error
3. Model Evaluation using Cross Validation
4. Hyperparameter Tuning using GridSearchCV
5. Check Feature Importance
6. Evaluate the Final System on test data
7. Saving the Model


In [1]:
##importing a few general use case libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer



import warnings
warnings.filterwarnings('ignore')

In [2]:
# reading the .data file using pandas

cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv('./auto-mpg.data', names=cols, na_values = "?",
                comment = '\t',
                sep= " ",
                skipinitialspace=True)

df = df.drop(['Origin'], axis=1)
data = df.copy()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Cylinders"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [3]:
##segregate the feature and target variable
data = strat_train_set.drop("MPG", axis=1)
data_labels = strat_train_set["MPG"].copy()
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year
145,4,83.0,61.0,2003.0,19.0,74
151,4,79.0,67.0,2000.0,16.0,74
388,4,156.0,92.0,2585.0,14.5,82
48,6,250.0,88.0,3139.0,14.5,71
114,4,98.0,90.0,2265.0,15.5,73
...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74
156,8,400.0,170.0,4668.0,11.5,75
395,4,135.0,84.0,2295.0,11.6,82
14,4,113.0,95.0,2372.0,15.0,70


In [None]:
##preprocess the Origin column in data
# def preprocess_origin_cols(df):
#     df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
#     return df

In [4]:
##creating custom attribute adder class
acc_ix, hpower_ix, cyl_ix = 4,2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True): # no *args or **kargs
        self.acc_on_power = acc_on_power
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]
        
        return np.c_[X, acc_on_cyl]

In [5]:
def num_pipeline_transformer(data):
    '''
    Function to process numerical transformations
    Argument:
        data: original dataframe 
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
        
    '''
    numerics = ['float64', 'int64']

    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline


def pipeline_transformer(data):
    '''
    Complete transformation pipeline for both
    nuerical and categorical data.
    
    Argument:
        data: original dataframe 
    Returns:
        prepared_data: transformed data, ready to use
    '''
#     cat_attrs = ["Origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
#         ("cat", OneHotEncoder(), cat_attrs),
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

### From raw data to processed data in 2 steps

In [6]:
##from raw data to processed data in 2 steps
# preprocessed_df = preprocess_origin_cols(data)
prepared_data = pipeline_transformer(data)
prepared_data

array([[-0.85657842, -1.07804475, -1.15192977, ..., -0.54436373,
         1.70952741,  1.29565517],
       [-0.85657842, -1.1174582 , -0.9900351 , ..., -0.54436373,
         0.79867454,  0.666186  ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  1.63652025,
        -0.21906787,  0.35145142],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  1.63652025,
        -0.46365334, -0.25703544],
       [-0.85657842, -0.78244384, -0.23452666, ..., -1.63480572,
        -0.21548258,  0.45636295],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.36390976,
        -0.75313354, -0.76061078]])

In [7]:
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517])

### Selecting and Training Models

1. Linear Regression
2. Decision Tree
3. Random Forest
4. SVM regressor

In [8]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

LinearRegression()

In [9]:
##testing the predictions with the 
sample_data = data.iloc[:5]
sample_labels = data_labels.iloc[:5]

sample_data_prepared = pipeline_transformer(sample_data)

print("Prediction of samples: ", lin_reg.predict(sample_data_prepared))

Prediction of samples:  [27.84899392 27.78225885 26.47235545 12.21626047 22.37038288]


In [10]:
print("Actual Labels of samples: ", list(sample_labels))

Actual Labels of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


#### Mean Squared Error

In [11]:
from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


3.013004922657614

### Decision Tree

In [12]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor()

In [13]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

But no model is perfect, this means that our model has overfit the data to a great extent.

We won't be touching out test data until we finalize our model. So, how do we check for what's happening?

### Model Evaluation using Cross Validation

Scikit-Learn’s K-fold cross-validation feature randomly splits the training set into `K` distinct subsets called folds, then it trains and evaluates the model K times, picking a different fold for evaluation every time and training on the other K-1 folds. 

The result is an array containing the K evaluation scores:


In [14]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, 
                         prepared_data, 
                         data_labels, 
                         scoring="neg_mean_squared_error", 
                         cv = 10)
tree_reg_rmse_scores = np.sqrt(-scores)


In [15]:
tree_reg_rmse_scores

array([3.26984327, 3.03901711, 3.40307765, 3.4191556 , 2.24053565,
       3.31106101, 3.42084602, 3.6054646 , 4.1944664 , 2.34121557])

In [16]:
tree_reg_rmse_scores.mean()

3.224468287592594

In [17]:
scores = cross_val_score(lin_reg, prepared_data, data_labels, scoring="neg_mean_squared_error", cv = 10)
lin_reg_rmse_scores = np.sqrt(-scores)
lin_reg_rmse_scores

array([3.55607199, 3.44040236, 3.84137434, 2.64287754, 2.44982472,
       2.83954804, 3.35920836, 2.42913501, 3.59460835, 2.90480891])

In [18]:
lin_reg_rmse_scores.mean()

3.105785962225056

### Random Forest model

In [19]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg,
                                         prepared_data,
                                         data_labels,
                                         scoring='neg_mean_squared_error',
                                         cv = 10)

forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_scores.mean()

2.5648040779637773

### Support Vector Machine Regressor

In [20]:
from sklearn.svm import SVR

svm_reg = SVR(kernel='linear')
svm_reg.fit(prepared_data, data_labels)
svm_cv_scores = cross_val_score(svm_reg, prepared_data, data_labels,
                                scoring='neg_mean_squared_error',
                                cv = 10)
svm_rmse_scores = np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()



3.149018656476251

### Hyperparameter Tuning using GridSearchCV

In [21]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid,
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           cv=10,
                          )

grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [22]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 10}

In [23]:
cv_scores = grid_search.cv_results_

##printing all the parameters along with their scores
for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores["params"]):
    print(np.sqrt(-mean_score), params)


3.4785930890278407 {'max_features': 2, 'n_estimators': 3}
2.942002823744286 {'max_features': 2, 'n_estimators': 10}
2.842846265866842 {'max_features': 2, 'n_estimators': 30}
3.2435712022807874 {'max_features': 4, 'n_estimators': 3}
2.9193407376909493 {'max_features': 4, 'n_estimators': 10}
2.6737646839476072 {'max_features': 4, 'n_estimators': 30}
3.0216037393586364 {'max_features': 6, 'n_estimators': 3}
2.6649749060732257 {'max_features': 6, 'n_estimators': 10}
2.7047530682016494 {'max_features': 6, 'n_estimators': 30}
2.931230937225464 {'max_features': 8, 'n_estimators': 3}
2.7137915629571516 {'max_features': 8, 'n_estimators': 10}
2.7009702182452275 {'max_features': 8, 'n_estimators': 30}
3.4816002048711936 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
3.075292683749923 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.322726268688233 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.8179001866094455 {'bootstrap': False, 'max_features': 3, 'n_esti

### Checking Feature importance

In [24]:
# feature importances 

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.06455734, 0.20620092, 0.17239425, 0.35479719, 0.01438541,
       0.12405189, 0.04484678, 0.01876621])

In [25]:
extra_attrs = ["acc_on_power", "acc_on_cyl"]
numerics = ['float64', 'int64']
num_attrs = list(data.select_dtypes(include=numerics))

attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importances), reverse=True)

[('acc_on_power', 0.04484678123108904),
 ('acc_on_cyl', 0.018766212170111974),
 ('Weight', 0.3547971901963191),
 ('Model Year', 0.12405189173889639),
 ('Horsepower', 0.17239425106792017),
 ('Displacement', 0.2062009235022128),
 ('Cylinders', 0.06455734374343307),
 ('Acceleration', 0.014385406350017558)]

### Evaluating the entire system on Test Data

In [26]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("MPG", axis=1)
y_test = strat_test_set["MPG"].copy()

# X_test_preprocessed = preprocess_origin_cols(X_test)
X_test_prepared = pipeline_transformer(X_test)

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [27]:
final_rmse

3.112326099559621

### Creating a function to cover this entire flow

In [30]:
def predict_mpg(config, model):
    
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
    
#     preproc_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(df)
    y_pred = model.predict(prepared_df)
    return y_pred
    

In [31]:
##checking it on a random sample
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78]
}

predict_mpg(vehicle_config, final_model)

array([33.34, 16.65, 18.5 ])

### Save the Model

In [32]:
import pickle

In [33]:
##saving the model
with open("model.bin", 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [158]:
##loading the model from the saved file
with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)

predict_mpg(vehicle_config, model)

array([34.83333333, 18.50666667, 20.56333333])