<a href="https://colab.research.google.com/github/esiri01/Fuel-Efficiency/blob/main/FuelEfficiency3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings ('ignore')

In [None]:
!wget "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

--2021-12-27 13:15:12--  http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30286 (30K) [application/x-httpd-php]
Saving to: ‘auto-mpg.data’


2021-12-27 13:15:12 (466 KB/s) - ‘auto-mpg.data’ saved [30286/30286]



In [None]:
#read from .data file

cols = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
        'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv('./auto-mpg.data', names=cols, na_values = "?",
                comment = '\t',
                sep= " ",
                skipinitialspace=True)

data = df.copy()

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state=30)
for train_index, test_index in split.split(data, data["Cylinders"]):
  strat_train_set = data.loc[train_index]
  strat_test_set = data.loc[test_index]

In [None]:
#segregate the feature and target variable
data = strat_train_set.drop("MPG", axis=1)
data_labels = strat_train_set["MPG"].copy()
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
286,8,302.0,129.0,3725.0,13.4,79,1
377,4,91.0,68.0,1970.0,17.6,82,3
117,4,68.0,49.0,1867.0,19.5,73,2
114,4,98.0,90.0,2265.0,15.5,73,2
361,6,168.0,116.0,2900.0,12.6,81,3
...,...,...,...,...,...,...,...
226,6,231.0,105.0,3425.0,16.9,77,1
295,4,98.0,80.0,1915.0,14.4,79,1
187,8,305.0,140.0,4215.0,13.0,76,1
340,4,156.0,92.0,2620.0,14.4,81,1


In [None]:
#Preprocess the origin column in data
def preprocess_origin_cols(df):
  df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
  return df

In [None]:
#Creating custom attribute adder class
acc_ix, hpower_ix, cyl_ix = 4, 2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
  def __init__(self, acc_on_power=True): #no *args or **kwargs
      self.acc_on_power = acc_on_power
  def fit(self, X, y=None):
    return self
  def transform(self, X):
    acc_on_cyl = X[:, acc_ix] / X[:, hpower_ix]
    if self.acc_on_power:
      acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
      return np.c_[X, acc_on_power, acc_on_cyl]
    
    return np.c_[X, acc_on_cyl]

In [None]:
def num_pipeline_transformer(data):
  '''
   Function to process numerical transformations
    Argument:
        data: original dataframe 
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
        
    '''
  numerics = ['float64', 'int64']

  num_attrs = data.select_dtypes(include=numerics)

  num_pipeline = Pipeline([
      ('imputer', SimpleImputer(strategy="median")),
      ('attrs_adder', CustomAttrAdder()),
      ('std_scaler', StandardScaler()),
      ])
  return num_attrs, num_pipeline

def pipeline_transformer(data):
  '''
    Complete transformation pipeline for both
    numerical and categorical data.
    
    Argument:
        data: original dataframe 
    Returns:
        prepared_data: transformed data, ready to use
    '''
  cat_attrs = ["Origin"]
  num_attrs, num_pipeline = num_pipeline_transformer(data)
  full_pipeline = ColumnTransformer([
      ("num", num_pipeline, list(num_attrs)),
      ("cat", OneHotEncoder(), cat_attrs),                               
      ])
  prepared_data = full_pipeline.fit_transform(data)
  return prepared_data

From raw data to processed data

In [None]:
preprocessed_df = preprocess_origin_cols(data)
prepared_data = pipeline_transformer(preprocessed_df)
prepared_data

array([[ 1.50179333,  1.04560678,  0.61204505, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.98006484, -0.96485094, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.20087264, -1.45601526, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.50179333,  1.07440779,  0.89640334, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.35604278, -0.34443284, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.50964821, -0.39613435, ...,  0.        ,
         1.        ,  0.        ]])

In [None]:
prepared_data[:,-1]

array([0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

Selecting and Training Models

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

LinearRegression()

In [None]:
#test sample
sample_data = data.iloc[:5]
sample_labels = data_labels.iloc[:5]

sample_data_prepared = pipeline_transformer(sample_data)

print("Prediction of samples: ", lin_reg.predict(sample_data_prepared))

Prediction of samples:  [16.20221838 30.14043493 27.03285904 21.44759904 24.329     ]


In [None]:
print("Actual Labels of samples: ", list(sample_labels))

Actual Labels of samples:  [17.6, 31.0, 29.0, 26.0, 25.4]


MSE

In [None]:
from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mse

8.208891631933296

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor()

In [None]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg,
                         prepared_data,
                         data_labels,
                         scoring="neg_mean_squared_error",
                         cv=10)
tree_reg_rmse_scores = np.sqrt(-scores)
tree_reg_rmse_scores

array([4.18718282, 3.14473767, 3.82973563, 4.06151911, 4.62756686,
       2.3508642 , 3.95296566, 2.85782916, 3.18868969, 4.26296883])

In [None]:
tree_reg_rmse_scores.mean()

3.6464059627310967

Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg,
                                       prepared_data,
                                       data_labels,
                                       scoring='neg_mean_squared_error',
                                       cv = 10)

forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_scores.mean()

2.5960910375496895

SVM Regressor

In [None]:
from sklearn.svm import SVR

svm_reg = SVR(kernel='linear')
svm_reg.fit(prepared_data, data_labels)
svm_cv_scores = cross_val_score(svm_reg, prepared_data, data_labels,
                                scoring='neg_mean_squared_error',
                                cv=10)

svm_rmse_scores = np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()

3.0106185595960557

Hyperparameter Tuning using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
      {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
      {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor ()

grid_search = GridSearchCV(forest_reg, param_grid,
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           cv=10,
                           )
grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [None]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [None]:
cv_scores = grid_search.cv_results_

#print parameters with respective scores
for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores["params"]):
  print(np.sqrt(-mean_score), params)

3.464658307951913 {'max_features': 2, 'n_estimators': 3}
3.0558025432720197 {'max_features': 2, 'n_estimators': 10}
2.922423683274368 {'max_features': 2, 'n_estimators': 30}
3.332633847979004 {'max_features': 4, 'n_estimators': 3}
3.0138865534817443 {'max_features': 4, 'n_estimators': 10}
2.7460284901358896 {'max_features': 4, 'n_estimators': 30}
3.1681035577028966 {'max_features': 6, 'n_estimators': 3}
2.7087881191774255 {'max_features': 6, 'n_estimators': 10}
2.6334401188025898 {'max_features': 6, 'n_estimators': 30}
2.901685256837671 {'max_features': 8, 'n_estimators': 3}
2.7597767612938227 {'max_features': 8, 'n_estimators': 10}
2.6586881287740822 {'max_features': 8, 'n_estimators': 30}
3.301788202102191 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.9403510370821646 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
2.87870427172542 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.8815162321893952 {'bootstrap': False, 'max_features': 3, 'n_estima

Check Feature Importance

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.1483486 , 0.26259311, 0.16438161, 0.21823615, 0.02216228,
       0.09289666, 0.03935991, 0.0433186 , 0.00237656, 0.00240365,
       0.00392287])

In [None]:
extra_attrs = ["acc_on_power", "acc_on_cyl"]
numerics = ["float64", "int64"]
num_attrs = list(data.select_dtypes(include=numerics))

attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importances), reverse=True)

[('acc_on_power', 0.03935991238081223),
 ('acc_on_cyl', 0.04331859890727371),
 ('Weight', 0.21823615496422336),
 ('Model Year', 0.0928966584514255),
 ('Horsepower', 0.16438160910382976),
 ('Displacement', 0.26259310531260666),
 ('Cylinders', 0.14834859602623632),
 ('Acceleration', 0.0221622822474937)]

Evaluating the entiree system on Test Data

In [None]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("MPG", axis=1)
y_test = strat_test_set["MPG"].copy()

X_test_preprocessed = preprocess_origin_cols(X_test)
X_test_prepared = pipeline_transformer(X_test_preprocessed)

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
final_rmse

3.3438661090952126

Create a function for entire process

In [None]:
def predict_mpg(config, model):

  if type(config) == dict:
    df = pd.DataFrame(config)
  else:
    df = config
  
  preproc_df = preprocess_origin_cols(df)
  prepared_df = pipeline_transformer(preproc_df)
  y_pred = model.predict(prepared_df)

  return y_pred

In [None]:
#Checking on sample
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

predict_mpg(vehicle_config, final_model)

array([35.54333333, 19.41333333, 20.35      ])

Save the Model

In [None]:
import pickle

In [None]:
with open("model.bin", "wb") as f_out:
  pickle.dump(final_model, f_out)
  f_out.close()

In [None]:
#Loading model from saved file
with open('model.bin', 'rb') as f_in:
  model = pickle.load(f_in)

predict_mpg(vehicle_config, model)

array([35.54333333, 19.41333333, 20.35      ])