In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split


In [5]:
!wget "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

--2020-07-17 14:54:08--  http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30286 (30K) [application/x-httpd-php]
Saving to: ‘auto-mpg.data.1’


2020-07-17 14:54:09 (879 KB/s) - ‘auto-mpg.data.1’ saved [30286/30286]



In [69]:
# reading the .data file using pandas

cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv('./auto-mpg.data', names=cols, na_values = "?",
                comment = '\t',
                sep= " ",
                skipinitialspace=True)

data = df.copy()
data2 = df.copy()

In [20]:
# set aside the test data
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [21]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Cylinders"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [22]:
data = strat_train_set.drop("MPG", axis=1)
data_labels = strat_train_set["MPG"].copy()


In [23]:
data.describe().transpose()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Cylinders,318.0,5.45283,1.698759,3.0,4.0,4.0,8.0,8.0
Displacement,318.0,192.408805,101.648135,68.0,105.0,146.0,261.5,455.0
Horsepower,314.0,103.840764,37.332289,46.0,75.25,92.0,125.0,230.0
Weight,318.0,2977.081761,832.293528,1755.0,2237.0,2844.0,3597.25,5140.0
Acceleration,318.0,15.638679,2.768898,8.0,13.925,15.5,17.3,24.8
Model Year,318.0,75.996855,3.674019,70.0,73.0,76.0,79.0,82.0
Origin,318.0,1.566038,0.790546,1.0,1.0,1.0,2.0,3.0


### Data Cleaning

### Handling Categorical Attributes

In [24]:
def preprocess_origin_cols(df):
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
    return df
data_tr = preprocess_origin_cols(data)

### Attribute adder - Custom Transformation

In [115]:
from sklearn.base import BaseEstimator, TransformerMixin

acc_ix, hpower_ix, cyl_ix = 3, 5, 1

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True): # no *args or **kargs
        self.acc_on_power = acc_on_power
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
#             print(np.c_[X, acc_on_power, acc_on_cyl][:3])
            return np.c_[X, acc_on_power, acc_on_cyl]
        return np.c_[X, acc_on_cyl]

# attr_adder = CustomAttrAdder(acc_on_power=False)
# data_tr_extra_attrs = attr_adder.transform(data_tr.values)

In [72]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

numerics = ['float64', 'int64']

data_num = data_tr.select_dtypes(include=numerics)

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
    ])

# data_num_tr = num_pipeline.fit_transform(data_num)

In [114]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_attrs = list(data_num)
cat_attrs = ["Origin"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attrs),
        ("cat", OneHotEncoder(), cat_attrs),
    ])

full_pipeline.fit_transform(data_tr)
# print(data_tr)
prepared_data = full_pipeline.transform(data_tr)
(prepared_data[0])


[[   4.           83.           61.         2003.           19.
    74.           27.06756757   24.13253012]
 [   4.           79.           67.         2000.           16.
    74.           27.02702703   25.3164557 ]
 [   4.          156.           92.         2585.           14.5
    82.           31.52439024   16.57051282]]
[[   4.           83.           61.         2003.           19.
    74.           27.06756757   24.13253012]
 [   4.           79.           67.         2000.           16.
    74.           27.02702703   25.3164557 ]
 [   4.          156.           92.         2585.           14.5
    82.           31.52439024   16.57051282]]


array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373, -1.04758895,  1.33350893])

In [74]:
# ##test
# data2 = data2.drop('MPG', axis=1)
# preprocess_df = preprocess_origin_cols(data2)
# fin_data = full_pipeline.transform(preprocess_df)
# print(fin_data[0])

df = pd.DataFrame(vehicle_config)
preprocess_df = preprocess_origin_cols(df)
fin_data = full_pipeline.transform(preprocess_df)
print(len(fin_data[0]))

[[   4.          155.           93.         2500.           15.
    81.           30.86419753   16.12903226]]
11


In [123]:
def num_pipeline_transformer(data):
    numerics = ['float64', 'int64']

    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline


def pipeline_transformer(data):
    
    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    print(list(num_attrs))
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
        ])
    prep_data = full_pipeline.fit_transform(data)
    prepared_data = full_pipeline.transform(data)
    return prepared_data



In [78]:
##test
data3 = data2.copy()
data3 = data2.drop(['MPG', 'Origin'], axis=1)
# preproc_df = preprocess_origin_cols(data3)
print(preproc_df)
prepared_df = pipeline_transformer(preproc_df)
print(len(prepared_df[0]))


df1 = pd.DataFrame(vehicle_config)
preproc_df = preprocess_origin_cols(df1)
# print(preproc_df)
prepared_df = pipeline_transformer(preproc_df)
print(len(prepared_df[0]))

   Cylinders  Displacement  Horsepower  Weight  Acceleration  Model Year  \
0          4         155.0        93.0  2500.0          15.0          81   

    Origin  
0  Germany  
['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year']
[[   4.          155.           93.         2500.           15.
    81.           30.86419753   16.12903226]]
[[   4.          155.           93.         2500.           15.
    81.           30.86419753   16.12903226]]
9
['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year']
[[   4.          155.           93.         2500.           15.
    81.           30.86419753   16.12903226]]
[[   4.          155.           93.         2500.           15.
    81.           30.86419753   16.12903226]]
9


In [62]:
vehicle_config = {
    'Cylinders': [4],
    'Displacement': [155.0],
    'Horsepower': [93.0],
    'Weight': [2500.0],
    'Acceleration': [15.0],
    'Model Year': [81],
    'Origin': [3]
}


### Selecting and Training a Model

### Linear Regression

In [31]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

LinearRegression()

In [82]:
sample_data = data_tr.iloc[:5]
sample_labels = data_labels.iloc[:5]

sample_data_prepared = full_pipeline.transform(sample_data)

print("Prediction of samples: ", lin_reg.predict(sample_data_prepared))

Prediction of samples:  [28.97985713 29.3339377  29.32029657 19.36703559 25.41426738]


In [83]:
print("Actual Labels of samples: ", list(sample_labels))

Actual Labels of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


In [84]:
from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


2.9247905148684326

### Decision Tree

In [85]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor()

In [86]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

But no model is perfect, this means that our model has overfit the data to a great extent.

We won't be touching out test data until we finalize our model. So, how do we check for what's happening?

### Model Evaluation using Cross Validation

In [87]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, prepared_data, data_labels, scoring="neg_mean_squared_error", cv = 10)
tree_reg_rmse_scores = np.sqrt(-scores)


In [88]:
tree_reg_rmse_scores

array([2.55935783, 3.45425462, 3.77317837, 3.09823538, 2.57706228,
       4.1290132 , 4.17892779, 2.93140325, 4.72303893, 2.59646161])

In [89]:
tree_reg_rmse_scores.mean()

3.402093327314705

In [90]:
scores = cross_val_score(lin_reg, prepared_data, data_labels, scoring="neg_mean_squared_error", cv = 10)
lin_reg_rmse_scores = np.sqrt(-scores)
lin_reg_rmse_scores

array([2.66655902, 3.38815592, 3.50816896, 2.91192052, 2.36554584,
       2.65794004, 3.31187408, 2.90686777, 3.56159392, 2.97528158])

In [91]:
lin_reg_rmse_scores.mean()

3.025390765144343

### Checking for a randomforest model

In [92]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg,
                                         prepared_data,
                                         data_labels,
                                         scoring='neg_mean_squared_error',
                                         cv = 10)

forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)

In [93]:
forest_reg_rmse_scores.mean()

2.723661398008636

### Checking for Support Vector Machine Regressor

In [94]:
from sklearn.svm import SVR

svm_reg = SVR(kernel='linear', C=1.0, epsilon=0.1)
svm_reg.fit(prepared_data, data_labels)
svm_cv_scores = cross_val_score(svm_reg, prepared_data, data_labels,
                                scoring='neg_mean_squared_error',
                                cv = 10)
svm_rmse_scores = np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()



3.4686728377046037

## Hyperparameter Tuning with GridSearchCV

In [95]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid,
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           cv=10,
                          )

grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [96]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 3, 'n_estimators': 10}

In [97]:
cv_scores = grid_search.cv_results_

for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores["params"]):
    print(np.sqrt(-mean_score), params)


3.448298694847778 {'max_features': 2, 'n_estimators': 3}
2.827147426393699 {'max_features': 2, 'n_estimators': 10}
2.8469656701663504 {'max_features': 2, 'n_estimators': 30}
3.233481179415524 {'max_features': 4, 'n_estimators': 3}
2.8587811436023096 {'max_features': 4, 'n_estimators': 10}
2.751763184352872 {'max_features': 4, 'n_estimators': 30}
3.141329307918628 {'max_features': 6, 'n_estimators': 3}
2.9515046924571835 {'max_features': 6, 'n_estimators': 10}
2.796712367020264 {'max_features': 6, 'n_estimators': 30}
3.20925021809855 {'max_features': 8, 'n_estimators': 3}
2.7821679637701657 {'max_features': 8, 'n_estimators': 10}
2.7874434634316727 {'max_features': 8, 'n_estimators': 30}
3.1257697976589682 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.853911710514411 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.071734577630721 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.6867854001185734 {'bootstrap': False, 'max_features': 3, 'n_estimator

### Checking Feature importance

In [98]:
# feature importances 

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.01484341, 0.17718559, 0.13820851, 0.13935788, 0.02301053,
       0.09290038, 0.26394044, 0.13492716, 0.00810856, 0.00549644,
       0.0020211 ])

In [99]:
extra_attrs = ["acc_on_power", "acc_on_cyl"]
cat_encoder = full_pipeline.named_transformers_['cat']
cat_one_hot_attrs = list(cat_encoder.categories_[0])
attrs = num_attrs + extra_attrs + cat_one_hot_attrs
sorted(zip(attrs, feature_importances), reverse=True)

[('acc_on_power', 0.26394043679799295),
 ('acc_on_cyl', 0.13492715670501107),
 ('Weight', 0.1393578826538036),
 ('USA', 0.002021099157065391),
 ('Model Year', 0.09290037970076451),
 ('India', 0.005496435765267739),
 ('Horsepower', 0.13820851328632847),
 ('Germany', 0.008108561514362875),
 ('Displacement', 0.17718558912208673),
 ('Cylinders', 0.014843411963056765),
 ('Acceleration', 0.023010533334259876)]

In [100]:
forest_reg = RandomForestRegressor(bootstrap= False, 
                                   max_features= 3, 
                                   n_estimators= 10)

forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg, prepared_data, data_labels,
                                       scoring='neg_mean_squared_error',
                                       cv = 10)
forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_scores.mean()


2.8279770901195525

### Evaluating the entire system on Test Data

In [101]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("MPG", axis=1)
y_test = strat_test_set["MPG"].copy()

X_test_preprocessed = preprocess_origin_cols(X_test)
X_test_prepared = full_pipeline.transform(X_test_preprocessed)

final_predictions = final_model.predict(X_test_prepared)
print(len(X_test_prepared[0]))
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

11


In [102]:
final_rmse

3.08075518339254

### Creating a function to cover this entire flow

In [108]:
def predict_mpg(config, model):
    
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
    
    preproc_df = preprocess_origin_cols(df)
    prepared_df = full_pipeline.transform(preproc_df)
    y_pred = model.predict(prepared_df)
    return y_pred
    

In [104]:
## for creating a random example and comparing it
df.sample(5)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
282,22.3,4,140.0,88.0,2890.0,17.3,79,1
47,19.0,6,250.0,100.0,3282.0,15.0,71,1
27,11.0,8,318.0,210.0,4382.0,13.5,70,1
32,25.0,4,98.0,,2046.0,19.0,71,1
7,14.0,8,440.0,215.0,4312.0,8.5,70,1


In [124]:
# checking for a random example, creating a 

vehicle_config = {
    'Cylinders': [4, 6,8],
    'Displacement': [155.0, 160.0,180.0],
    'Horsepower': [93.0, 130.0, 150.0],
    'Weight': [2500.0, 3150.0, 3300.0],
    'Acceleration': [15.0, 14.0, 18.0],
    'Model Year': [82, 81, 80],
    'Origin': [3, 2, 1]
}


In [109]:
list(predict_mpg(vehicle_config, final_model))

[27.79, 19.799999999999997, 20.06]

### Saving the model

In [220]:
## we'll save the model as a pickle file
import pickle

In [221]:
##saving the model
with open("model.bin", 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [222]:
##loading the model from the saved file
with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)


In [223]:
predict_mpg(vehicle_config, model)

[[-0.85657842 -0.36860256 -0.28849155 -0.57411676 -0.23102543  1.36390976
  -0.72550314 -0.33568446  1.          0.          0.        ]
 [ 0.32260746 -0.31933574  0.7098589   0.20808857 -0.59274914  1.09129926
  -0.00349209  0.4064624   0.          0.          1.        ]]


array([31.59, 20.33])

### Launch And Monitor!!!

In [112]:
import requests

url = "http://localhost:9696/predict"
r = requests.post(url, json = vehicle_config)
r.text


'{\n  "mpg_prediction": [\n    34.35, \n    20.39, \n    15.77\n  ]\n}\n'