In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [4]:
automobile_df = pd.read_csv('datasets/auto-mpg-processed.csv')
automobile_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,age
0,20.0,4,130.0,102,3150,15.7,44
1,15.0,6,250.0,100,3336,17.0,46
2,14.0,8,440.0,215,4312,8.5,50
3,26.0,4,97.0,46,1950,21.0,47
4,26.4,4,140.0,88,2870,18.1,40


In [5]:
X = automobile_df.drop(['mpg', 'age'], axis = 1)
Y = automobile_df['mpg']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [6]:
parameters = {'alpha': [0.2, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0]}
#3 fold cross validation. all dataset will be splited into 3. the third part will be used for testing
grid_search = GridSearchCV(Lasso(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'alpha': 1.0}

In [7]:
for i in range(len(parameters['alpha'])):
    print('Parameters: ', grid_search.cv_results_['params'][i])
    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'alpha': 0.2}
Mean Test Score:  0.6952620799947287
Rank:  7
Parameters:  {'alpha': 0.4}
Mean Test Score:  0.696715723956034
Rank:  6
Parameters:  {'alpha': 0.6}
Mean Test Score:  0.696960117146641
Rank:  5
Parameters:  {'alpha': 0.7}
Mean Test Score:  0.6969669049738881
Rank:  4
Parameters:  {'alpha': 0.8}
Mean Test Score:  0.6969727673608815
Rank:  3
Parameters:  {'alpha': 0.9}
Mean Test Score:  0.6969778275178827
Rank:  2
Parameters:  {'alpha': 1.0}
Mean Test Score:  0.6969817787637104
Rank:  1


In [8]:
lasso_model = Lasso(alpha=grid_search.best_params_['alpha']).fit(x_train, y_train)

In [9]:
y_pred = lasso_model.predict(x_test)

print('Training score: ', lasso_model.score(x_train, y_train))
print('Test score: ', r2_score(y_test, y_pred))

Training score:  0.7002640174708283
Test score:  0.7226381982464427


In [11]:
parameters = {'n_neighbors': [10, 12, 14, 18, 20, 25, 30, 35, 50]}

grid_search = GridSearchCV(KNeighborsRegressor(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'n_neighbors': 30}

In [12]:
for i in range(len(parameters['n_neighbors'])):
    print('Parameters: ', grid_search.cv_results_['params'][i])
    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'n_neighbors': 10}
Mean Test Score:  0.6636341308642912
Rank:  9
Parameters:  {'n_neighbors': 12}
Mean Test Score:  0.6740369316859048
Rank:  8
Parameters:  {'n_neighbors': 14}
Mean Test Score:  0.6812101660889937
Rank:  7
Parameters:  {'n_neighbors': 18}
Mean Test Score:  0.687316956156609
Rank:  6
Parameters:  {'n_neighbors': 20}
Mean Test Score:  0.6936379951139617
Rank:  5
Parameters:  {'n_neighbors': 25}
Mean Test Score:  0.7001266377692298
Rank:  4
Parameters:  {'n_neighbors': 30}
Mean Test Score:  0.7049922712055819
Rank:  1
Parameters:  {'n_neighbors': 35}
Mean Test Score:  0.7040520307589496
Rank:  2
Parameters:  {'n_neighbors': 50}
Mean Test Score:  0.7035109499460314
Rank:  3


In [13]:
grid_search.cv_results_

{'mean_fit_time': array([0.00399772, 0.00399478, 0.0040137 , 0.00521429, 0.0052096 ,
        0.00167632, 0.        , 0.        , 0.        ]),
 'std_fit_time': array([1.05845200e-05, 3.48413948e-06, 2.08006452e-05, 7.37412501e-03,
        7.36749390e-03, 2.37067594e-03, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00]),
 'mean_score_time': array([0.00400639, 0.00266592, 0.0013384 , 0.        , 0.0052077 ,
        0.00521723, 0.        , 0.        , 0.00520221]),
 'std_score_time': array([7.69123210e-06, 1.88508811e-03, 1.89278687e-03, 0.00000000e+00,
        7.36479650e-03, 7.37828350e-03, 0.00000000e+00, 0.00000000e+00,
        7.35704148e-03]),
 'param_n_neighbors': masked_array(data=[10, 12, 14, 18, 20, 25, 30, 35, 50],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 10},
  {'n_neighbors': 12},
  {'n_neighbors': 14},
  {'n_neighbors': 18},
  

In [14]:
kneighbors_model = KNeighborsRegressor(n_neighbors=grid_search.best_params_['n_neighbors']).fit(x_train, y_train)

In [16]:
y_pred = kneighbors_model.predict(x_test)

print('Training score: ', kneighbors_model.score(x_train, y_train))
print('Test score: ', r2_score(y_test, y_pred))

Training score:  0.7177282385169992
Test score:  0.7396024824879233


In [18]:
#Now decision Tree
parameters = {'max_depth':[1,2,3,4,5,6,7,8]}

grid_search = GridSearchCV(DecisionTreeRegressor(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'max_depth': 3}

In [19]:
for i in range(len(parameters['max_depth'])):
    print('Parameters: ', grid_search.cv_results_['params'][i])
    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'max_depth': 1}
Mean Test Score:  0.538095683335316
Rank:  8
Parameters:  {'max_depth': 2}
Mean Test Score:  0.6625724470741386
Rank:  3
Parameters:  {'max_depth': 3}
Mean Test Score:  0.694867580940357
Rank:  1
Parameters:  {'max_depth': 4}
Mean Test Score:  0.6860194502729998
Rank:  2
Parameters:  {'max_depth': 5}
Mean Test Score:  0.6186832439626622
Rank:  4
Parameters:  {'max_depth': 6}
Mean Test Score:  0.6118268299577987
Rank:  5
Parameters:  {'max_depth': 7}
Mean Test Score:  0.5872380224564758
Rank:  6
Parameters:  {'max_depth': 8}
Mean Test Score:  0.5647264359278423
Rank:  7


In [21]:
decision_tree_model = DecisionTreeRegressor(max_depth=grid_search.best_params_['max_depth']).fit(x_train, y_train)

In [22]:
y_pred = decision_tree_model.predict(x_test)

print('Training score: ', decision_tree_model.score(x_train, y_train))
print('Test score: ', r2_score(y_test, y_pred))

Training score:  0.7666580458529545
Test score:  0.7468411722508991


In [24]:
#Support Vector Regression
parameters = {'epsilon': [0.05, 0.1, 0.2,  0.3],
              'C':[0.2,0.3]}

grid_search = GridSearchCV(SVR(kernel='linear'), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'C': 0.3, 'epsilon': 0.05}

In [25]:
svr_model = SVR(kernel='linear',
                epsilon=grid_search.best_params_['epsilon'],
                C = grid_search.best_params_['C']).fit(x_train, y_train)

In [26]:
y_pred = svr_model.predict(x_test)

print('Training score: ', svr_model.score(x_train, y_train))
print('Test score: ', r2_score(y_test, y_pred))

Training score:  0.6874549492376445
Test score:  0.7151106967299472
