In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = sns.load_dataset('mpg')
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [3]:
df.drop('name', axis = 1, inplace=True)

In [4]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa
394,44.0,4,97.0,52.0,2130,24.6,82,europe
395,32.0,4,135.0,84.0,2295,11.6,82,usa
396,28.0,4,120.0,79.0,2625,18.6,82,usa


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [7]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [8]:
#replace with median in horsepower
df['horsepower'].median()

np.float64(93.5)

In [9]:
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

In [10]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [12]:
df.origin.value_counts()

origin
usa       249
japan      79
europe     70
Name: count, dtype: int64

In [15]:
#data encoding
#map
df['origin'] = df['origin'].map({'usa': 1, 'japan': 2, 'europe': 3})

In [16]:
df['origin'] = df['origin'].astype('int')

In [17]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,1
1,15.0,8,350.0,165.0,3693,11.5,70,1
2,18.0,8,318.0,150.0,3436,11.0,70,1
3,16.0,8,304.0,150.0,3433,12.0,70,1
4,17.0,8,302.0,140.0,3449,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,1
394,44.0,4,97.0,52.0,2130,24.6,82,3
395,32.0,4,135.0,84.0,2295,11.6,82,1
396,28.0,4,120.0,79.0,2625,18.6,82,1


In [18]:
X= df.drop('mpg', axis=1)
y = df['mpg']

In [19]:
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504,12.0,70,1
1,8,350.0,165.0,3693,11.5,70,1
2,8,318.0,150.0,3436,11.0,70,1
3,8,304.0,150.0,3433,12.0,70,1
4,8,302.0,140.0,3449,10.5,70,1
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,1
394,4,97.0,52.0,2130,24.6,82,3
395,4,135.0,84.0,2295,11.6,82,1
396,4,120.0,79.0,2625,18.6,82,1


In [20]:
y

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.3, random_state=1)

In [22]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model

In [23]:
model.fit(X_train, y_train)

In [25]:
model.coef_

array([-0.31761423,  0.02623748, -0.01827076, -0.00748775,  0.05040673,
        0.84709514,  1.51909584])

In [28]:
for i, col_name in enumerate(X_train.columns):
    print(f"The coefficient value for {col_name} is {model.coef_[i]}")

The coefficient value for cylinders is -0.3176142302799369
The coefficient value for displacement is 0.026237482599078946
The coefficient value for horsepower is -0.018270764913124595
The coefficient value for weight is -0.007487750398361897
The coefficient value for acceleration is 0.0504067346197138
The coefficient value for model_year is 0.8470951427061365
The coefficient value for origin is 1.5190958387975024


In [None]:
#observations--
#coef are relatively small, if one IV changes there will be not much difference in prediction
#sometimes called smoother model
#these features are not contibuting much in training

In [35]:
from sklearn.metrics import r2_score
y_pred = model.predict(X_test)
r2_linear = r2_score(y_test, y_pred)
print(f"The Rsquare of linear regression {r2_linear}")

The Rsquare of linear regression 0.8348001123742285


In [31]:
#regularization
#ridge
from sklearn.linear_model import Ridge
ridge_model = Ridge(alpha=0.1)
ridge_model

In [32]:
ridge_model.fit(X_train, y_train)

In [33]:
for i, col_name in enumerate(X_train.columns):
    print(f"The coefficient value for {col_name} is {ridge_model.coef_[i]}")

The coefficient value for cylinders is -0.317003210100688
The coefficient value for displacement is 0.026213249757983868
The coefficient value for horsepower is -0.018263252481448933
The coefficient value for weight is -0.007487326050213144
The coefficient value for acceleration is 0.05036896947442996
The coefficient value for model_year is 0.8470062938903175
The coefficient value for origin is 1.5174528285653952


In [None]:
y_pred_ridge = model.predict(X_test)
r2_linear_ridge = r2_score(y_test, y_pred_ridge)
print(f"The Rsquare of ridge regression {r2_linear_ridge}")
#not much change

The Rsquare of ridge regression 0.8348001123742285


In [37]:
#lasso
from sklearn.linear_model import Lasso
lasso_reg_model = Lasso(alpha=0.5)
lasso_reg_model

In [38]:
lasso_reg_model.fit(X_train, y_train)

In [39]:
for i, col_name in enumerate(X_train.columns):
    print(f"The coefficient value for {col_name} is {lasso_reg_model.coef_[i]}")

The coefficient value for cylinders is -0.0
The coefficient value for displacement is 0.006208198888300358
The coefficient value for horsepower is -0.011058382987169565
The coefficient value for weight is -0.0069826731680230885
The coefficient value for acceleration is 0.0
The coefficient value for model_year is 0.744654952003819
The coefficient value for origin is 0.0


In [None]:
#few of the coef (3) is 0--- feature selection

In [None]:
y_pred_lasso = lasso_reg_model.predict(X_test)
r2_linear_lasso = r2_score(y_test, y_pred_lasso)
print(f"The Rsquare of lasso regression {r2_linear_lasso}")

The Rsquare of ridge regression 0.8277934716635555


In [None]:
#removing the 3 non important features rsquare is affected by 1%
#we are achieving better r2 value by removing the unnecessary features

In [41]:
#elastic net
from sklearn.linear_model import ElasticNet
ElasticNet_model = ElasticNet(alpha=1, l1_ratio=0.5)
ElasticNet_model

In [42]:
ElasticNet_model.fit(X_train,y_train)

In [43]:
for i, col_name in enumerate(X_train.columns):
    print(f"The coefficient value for {col_name} is {ElasticNet_model.coef_[i]}")

The coefficient value for cylinders is -0.0
The coefficient value for displacement is 0.005888869953667563
The coefficient value for horsepower is -0.012403874933570126
The coefficient value for weight is -0.006934550516257631
The coefficient value for acceleration is 0.0
The coefficient value for model_year is 0.7133150744603874
The coefficient value for origin is 0.0


In [44]:
y_pred_elastic = ElasticNet_model.predict(X_test)
r2_linear_elastic = r2_score(y_test, y_pred_elastic)
print(f"The Rsquare of lasso regression {r2_linear_elastic}")

The Rsquare of lasso regression 0.8284840073256804


In [45]:
#regularization with cross validation
from sklearn.linear_model import LassoCV
lasso_cv= LassoCV(cv = 5, verbose=1)
lasso_cv

In [46]:
lasso_cv.fit(X_train, y_train)

....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [48]:
y_pred = lasso_cv.predict(X_test)
score = r2_score(y_test, y_pred)
print(f"The Rsquare of lassoCV regression {score}")

The Rsquare of lassoCV regression 0.8082805983844751


In [50]:
from sklearn.linear_model import RidgeCV
Ridge_cv= RidgeCV(cv = 5)
Ridge_cv

In [52]:
Ridge_cv.fit(X_train, y_train)

In [53]:
y_pred = Ridge_cv.predict(X_test)
score = r2_score(y_test, y_pred)
print(f"The Rsquare of RidgeCV regression {score}")

The Rsquare of RidgeCV regression 0.8354145247502055


In [54]:
Ridge_cv.get_params()

{'alpha_per_target': False,
 'alphas': (0.1, 1.0, 10.0),
 'cv': 5,
 'fit_intercept': True,
 'gcv_mode': None,
 'scoring': None,
 'store_cv_results': None,
 'store_cv_values': 'deprecated'}

In [55]:
from sklearn.linear_model import ElasticNetCV
elasctic_cv = ElasticNetCV()
elasctic_cv

In [56]:
elasctic_cv.fit(X_train, y_train)

In [58]:
y_pred =elasctic_cv.predict(X_test)
elasctic_score = r2_score(y_test, y_pred)
print(f"The Rsquare of elascticCV regression {elasctic_score}")

The Rsquare of elascticCV regression 0.792863401804916


In [59]:
#CV with hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [60]:
lasso = Lasso()
lasso

In [64]:
#param grid
param_grid = {'alpha': [0.001, 0.01, 0.1, 1,10,100]}

In [65]:
GridSearch = GridSearchCV(estimator = lasso, param_grid=param_grid, cv = 5, scoring='r2', verbose = 2)
GridSearch

In [66]:
GridSearch.fit(X_train , y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ........................................

In [67]:
GridSearch.best_params_

{'alpha': 0.1}

In [68]:
GridSearch.best_score_

np.float64(0.7964209726696481)

In [69]:
GridSearch.best_estimator_

In [70]:
y_pred = GridSearch.best_estimator_.predict(X_test)
r2_score(y_test, y_pred)

0.8345318641232303

In [79]:
#random search cv
param_distribuition = {'alpha': [0.001, 0.01, 0.1, 1,10,100]}
random_search = RandomizedSearchCV(estimator=lasso, n_iter=3, param_distributions=param_distribuition, cv=5, scoring='r2', verbose=2)
random_search

In [80]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................

In [81]:
random_search.best_estimator_

In [82]:
random_search.best_score_

np.float64(0.7964209726696481)

In [83]:
y_pred = random_search.best_estimator_.predict(X_test)
r2_score(y_test, y_pred)

0.8345318641232303

In [85]:
#ridge grid cv
RidgeCV = GridSearchCV(estimator=ridge_model, param_grid=param_grid, cv =5, scoring='r2', verbose=2)
RidgeCV

In [86]:
RidgeCV.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ........................................

In [87]:
RidgeCV.best_params_

{'alpha': 10}

In [88]:
y_pred = RidgeCV.best_estimator_.predict(X_test)
r2_score(y_test, y_pred)

0.8354145247502055

In [89]:
#Ridge Randomized search cv CV
ridge = Ridge()
param_distributions = {'alpha' : [0.001, 0.01, 0.1, 1, 10, 100]}
ridge = RandomizedSearchCV(estimator=ridge, param_distributions = param_distributions,n_iter=2, cv=5, scoring = 'r2', verbose=2)
ridge.fit(X_train, y_train)
print(ridge.best_params_)
print(ridge.best_estimator_)
y_pred = ridge.best_estimator_.predict(X_test)
print(r2_score(y_test, y_pred))

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
{'alpha': 0.1}
Ridge(alpha=0.1)
0.8348084889168356


In [90]:
#elastic net Grid Search CV
model = ElasticNet()
param_grid = {'alpha' : [0.001, 0.01, 0.1, 1, 10, 100],
             'l1_ratio': [0.1,0.4, 0.9]}
model = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring = 'r2', verbose=2)
model.fit(X_train, y_train)
print(model.best_params_)
print(model.best_estimator_)
y_pred = model.best_estimator_.predict(X_test)
print(r2_score(y_test, y_pred))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.1; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.4; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.4; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.4; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.4; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.4; total time=   0.0s
[CV] END ..........................alpha=0.001, l1_ratio=0.9; total time=   0.0s
[CV] END ..........................alpha=0.001, 

In [91]:
#elastic netRandomized Search CV
model = ElasticNet()
param_distributions = {'alpha' : [0.001, 0.01, 0.1, 1, 10, 100],
             'l1_ratio': [0.1,0.4, 0.9]}
model = RandomizedSearchCV(estimator=model, param_distributions = param_distributions,n_iter=2, cv=5, scoring = 'r2', verbose=2)
model.fit(X_train, y_train)
print(model.best_params_)
print(model.best_estimator_)
y_pred = model.best_estimator_.predict(X_test)
print(r2_score(y_test, y_pred))

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] END .............................alpha=10, l1_ratio=0.9; total time=   0.0s
[CV] END .............................alpha=10, l1_ratio=0.9; total time=   0.0s
[CV] END .............................alpha=10, l1_ratio=0.9; total time=   0.0s
[CV] END .............................alpha=10, l1_ratio=0.9; total time=   0.0s
[CV] END .............................alpha=10, l1_ratio=0.9; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
{'l1_ratio': 0.1, 'alpha': 0.1}
ElasticNet(alpha=0.1, l1_ratio=0.1)
0.8356976438132142
