In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Fetch the "mpg" example dataset through seaborn's dataset loader.
data = sns.load_dataset(name="mpg")

In [3]:
# Remove the "name" column so it is not carried into the feature matrix that
# is later built from all remaining non-target columns.
# Rebinding `data` (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError on the already-removed column.
data = data.drop(columns="name", errors="ignore")

In [4]:
# Display the frame after the "name" column was dropped; the rendered output
# elides the middle rows.
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa
394,44.0,4,97.0,52.0,2130,24.6,82,europe
395,32.0,4,135.0,84.0,2295,11.6,82,usa
396,28.0,4,120.0,79.0,2625,18.6,82,usa


In [5]:
# Per-column count of missing values.
data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [6]:
# Median of the observed horsepower values, used in the next cell as the fill
# value for the missing entries.
# NOTE(review): this is computed over every row, i.e. before the train/test
# split further down, so the held-out rows contribute to the imputation value.
med = data['horsepower'].median()

In [7]:
# Fill the missing horsepower entries with the previously computed median.
# `assign` swaps the column in place position-wise and returns a new frame,
# which is rebound to `data`.
data = data.assign(horsepower=lambda frame: frame['horsepower'].fillna(med))

In [8]:
# Re-check after imputation: every column should now report zero missing values.
data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [9]:
# Column dtypes and non-null counts; `origin` is still an object (string) column here.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [10]:
# Distinct origin labels, in order of first appearance.
pd.unique(data['origin'])

array(['usa', 'japan', 'europe'], dtype=object)

In [11]:
# How many rows fall under each origin label.
data['origin'].value_counts()

usa       249
japan      79
europe     70
Name: origin, dtype: int64

In [12]:
# Encode the origin labels as integer codes.
# The lookup passes through any value that is not a known label, so running
# this cell a second time keeps the existing codes. A plain dict-based .map
# would instead turn the already-encoded column entirely into NaN.
# NOTE(review): integer codes impose an order and spacing on a nominal
# category that the linear models below treat as numeric; one-hot encoding
# may be more appropriate.
origin_codes = {'usa': 1, 'japan': 2, 'europe': 3}
data['origin'] = data['origin'].map(lambda label: origin_codes.get(label, label))

# Fail early if a label outside the mapping slipped through un-encoded.
assert data['origin'].isin(list(origin_codes.values())).all(), "unexpected origin label"

In [13]:
# Confirm the dtypes after encoding: `origin` should now be an integer column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(4), int64(4)
memory usage: 25.0 KB


In [14]:
# Separate the regression target (mpg) from the predictor columns.
y = data['mpg']
x = data.drop(columns='mpg')

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [16]:
# (rows, columns) of the training and test feature matrices.
x_train.shape, x_test.shape

((278, 7), (120, 7))

In [17]:
from sklearn.linear_model import LinearRegression
# Unregularised linear regression with the library's default settings; this is
# the baseline the penalised models further down are compared against.
model = LinearRegression()

In [18]:
# Fit the baseline on the training split only.
model.fit(X=x_train, y=y_train)

In [19]:
# List the fitted coefficient next to the feature it belongs to
# (coef_ follows the column order of x_train).
for col_name, coef in zip(x_train.columns, model.coef_):
    print(f"the coefficient for {col_name} is {coef}")

the coefficient for cylinders is -0.3176142302799304
the coefficient for displacement is 0.026237482599078935
the coefficient for horsepower is -0.018270764913124574
the coefficient for weight is -0.007487750398361904
the coefficient for acceleration is 0.0504067346197142
the coefficient for model_year is 0.847095142706137
the coefficient for origin is 1.5190958387975046


In [20]:
from sklearn.metrics import r2_score

# Predict on the held-out rows and report R^2 for the baseline model.
y_pred_linear = model.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred_linear)

0.8348001123742286

In [28]:
## Regularised linear models: Ridge, Lasso and ElasticNet
from sklearn.linear_model import ElasticNet, Ridge, LassoCV, Lasso

In [25]:
# Ridge regression with penalty strength alpha=0.1, then a per-feature
# coefficient listing for comparison with the unregularised fit.
# NOTE(review): the features go in on their raw scales (no standardisation
# step precedes this cell), so the penalty does not treat columns evenly.
ridge_reg_model = Ridge(alpha=0.1).fit(x_train, y_train)
for col_name, coef in zip(x_train.columns, ridge_reg_model.coef_):
    print(f"the coefficient for {col_name} is {coef}")

the coefficient for cylinders is -0.31700321010067906
the coefficient for displacement is 0.02621324975798342
the coefficient for horsepower is -0.018263252481449534
the coefficient for weight is -0.00748732605021309
the coefficient for acceleration is 0.050368969474425776
the coefficient for model_year is 0.8470062938903167
the coefficient for origin is 1.5174528285653937


In [26]:
# Held-out R^2 for the ridge model.
y_pred_ridge = ridge_reg_model.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred_ridge)

0.8348084889168355

In [29]:
# Lasso regression with penalty strength alpha=0.1, then a per-feature
# coefficient listing; coefficients printed as 0 have been dropped by the
# L1 penalty.
# NOTE(review): the features go in on their raw scales (no standardisation
# step precedes this cell), so the penalty does not treat columns evenly.
lasso_reg_model = Lasso(alpha=0.1).fit(x_train, y_train)
for col_name, coef in zip(x_train.columns, lasso_reg_model.coef_):
    print(f"the coefficient for {col_name} is {coef}")

the coefficient for cylinders is -0.0
the coefficient for displacement is 0.017751964528123634
the coefficient for horsepower is -0.019307657818043437
the coefficient for weight is -0.007285668218708932
the coefficient for acceleration is 0.011179016975864573
the coefficient for model_year is 0.8258205650724195
the coefficient for origin is 1.1922557769714675


In [30]:
# Held-out R^2 for the lasso model.
y_pred_lasso = lasso_reg_model.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred_lasso)

0.8345318641232303

In [36]:
# Elastic net with penalty strength alpha=0.4 (the L1/L2 mix is left at the
# library default), then a per-feature coefficient listing.
# NOTE(review): the features go in on their raw scales (no standardisation
# step precedes this cell), so the penalty does not treat columns evenly.
elastic_regression_model = ElasticNet(alpha=0.4).fit(x_train, y_train)
for col_name, coef in zip(x_train.columns, elastic_regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

The coefficient for cylinders is -0.0
The coefficient for displacement is 0.01162881749082777
The coefficient for horsepower is -0.01594937620461702
The coefficient for weight is -0.007101799886156715
The coefficient for acceleration is 0.0
The coefficient for model_year is 0.7782693989831739
The coefficient for origin is 0.5532990656888843
