In [1]:
import pandas as pd
import numpy as np

#Import Graphical plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline

#Import Linear Regression Machine Learning Library
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score

In [2]:

data = pd.read_csv(r"C:\Users\rosha\OneDrive\Desktop\Gitesh\NIT\TOPICS\ML\Regularization technique project\car-mpg.csv")
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino


In [3]:
data = data.drop(['car_name'], axis=1)
data['origin'] = data['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
data = pd.get_dummies(data, columns=['origin'], dtype= int)
data = data.replace('?', np.nan)

In [4]:
data = data.apply(pd.to_numeric, errors='ignore')

#Fill missing values with median only for numeric columns
numeric_cols = data.select_dtypes(include=[np.number]).columns
data[numeric_cols] = data[numeric_cols].apply(lambda x: x.fillna(x.median()))

  data = data.apply(pd.to_numeric, errors='ignore')


In [5]:
data

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130.0,3504,12.0,70,0,1,0,0
1,15.0,8,350.0,165.0,3693,11.5,70,0,1,0,0
2,18.0,8,318.0,150.0,3436,11.0,70,0,1,0,0
3,16.0,8,304.0,150.0,3433,12.0,70,0,1,0,0
4,17.0,8,302.0,140.0,3449,10.5,70,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,1,1,0,0
394,44.0,4,97.0,52.0,2130,24.6,82,1,0,0,1
395,32.0,4,135.0,84.0,2295,11.6,82,1,1,0,0
396,28.0,4,120.0,79.0,2625,18.6,82,1,1,0,0


Model Building

In [6]:
x = data.drop(['mpg'], axis=1)   
y = data['mpg']

In [7]:
#Scaling the data
x_s = preprocessing.scale(x)
x_s = pd.DataFrame(x_s, columns=x.columns) #converting scaled data into dataframe

y_s = preprocessing.scale(y)
y_s = pd.DataFrame(y_s, columns =["target"]) #ideally train, test data should be in calumns

In [8]:
x_s

Unnamed: 0,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,1.498191,1.090604,0.673118,0.630870,-1.295498,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
1,1.498191,1.503514,1.589958,0.854333,-1.477038,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
2,1.498191,1.196232,1.197027,0.550470,-1.658577,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
3,1.498191,1.061796,1.197027,0.546923,-1.295498,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
4,1.498191,1.042591,0.935072,0.565841,-1.840117,-1.627426,-1.062235,0.773559,-0.497643,-0.461968
...,...,...,...,...,...,...,...,...,...,...
393,-0.856321,-0.513026,-0.479482,-0.213324,0.011586,1.621983,0.941412,0.773559,-0.497643,-0.461968
394,-0.856321,-0.925936,-1.370127,-0.993671,3.279296,1.621983,0.941412,-1.292726,-0.497643,2.164651
395,-0.856321,-0.561039,-0.531873,-0.798585,-1.440730,1.621983,0.941412,0.773559,-0.497643,-0.461968
396,-0.856321,-0.705077,-0.662850,-0.408411,1.100822,1.621983,0.941412,0.773559,-0.497643,-0.461968


In [9]:
y_s

Unnamed: 0,target
0,-0.706439
1,-1.090751
2,-0.706439
3,-0.962647
4,-0.834543
...,...
393,0.446497
394,2.624265
395,1.087017
396,0.574601


In [10]:
#Split into train, test sets
X_train, X_test, y_train, y_test = train_test_split(x_s, y_s, test_size = 0.30, random_state = 1)
X_train.shape

(278, 10)

A. Simple Linear Model

In [11]:
#Fit simple linear model and find coefficients
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

for idx, col_name in enumerate(X_train.columns):
    print('The coefficient for {} is {}'. format(col_name, regression_model.coef_[idx]))
    
intercept = regression_model.intercept_
print('The intercept for our model is {}'.format(intercept))

The coefficient for cyl is [ 0.32102239  0.32483431 -0.2291695  -0.71121019  0.01471368  0.37558119
  0.38147695 -0.07472248  0.04451525  0.04834855]


IndexError: index 1 is out of bounds for axis 0 with size 1

B. Regularized Ridge Regression

In [12]:
# alpha factor here is lambda (penalty term) which helps to reduce the magnitude of the coefficients

ridge_model = Ridge(alpha=0.3)
ridge_model.fit(X_train, y_train)

print('Ridge model coef : {}'.format(ridge_model.coef_))
#as the data has 10 columns hence 10 coefficients appear here

Ridge model coef : [ 0.31649043  0.31320707 -0.22876025 -0.70109447  0.01295851  0.37447352
  0.37725608 -0.07423624  0.04441039  0.04784031]


C Regularized Lasso Regression

In [13]:
#alpha factor here is lambda (penalty term) which helps to reduce the magnitude of coeff

lasso_model = Lasso(alpha = 0.1)
lasso_model.fit(X_train, y_train)

print('Lasso model coef: {}'.format(lasso_model.coef_))
#As the data has 10 columns hence 10 coefficients appear here   

Lasso model coef: [-0.         -0.         -0.01690287 -0.51890013  0.          0.28138241
  0.1278489  -0.01642647  0.          0.        ]


Score Comparison

In [14]:
#Model score - r^2 or coeff of determinant
#r^2 = 1-(RSS/TSS) = Regression error/TSS 


#Simple Linear Model
print(regression_model.score(X_train, y_train))
print(regression_model.score(X_test, y_test))

print('*************************')
#Ridge
print(ridge_model.score(X_train, y_train))
print(ridge_model.score(X_test, y_test))

print('*************************')
#Lasso
print(lasso_model.score(X_train, y_train))
print(lasso_model.score(X_test, y_test))

0.8343770256960538
0.8513421387780065
*************************
0.8343617931312616
0.8518882171608508
*************************
0.7938010766228453
0.8375229615977084


Model Parameter Tuning

In [15]:
data_train_test = pd.concat([X_train, y_train], axis =1)
data_train_test.head()

Unnamed: 0,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe,target
350,-0.856321,-0.849116,-1.081977,-0.893172,-0.24257,1.351199,0.941412,0.773559,-0.497643,-0.461968,1.432898
59,-0.856321,-0.925936,-1.317736,-0.847061,2.879909,-1.085858,0.941412,-1.292726,-0.497643,2.164651,-0.065919
120,-0.856321,-0.695475,0.2016,-0.121101,-0.024722,-0.815074,0.941412,-1.292726,-0.497643,2.164651,-0.578335
12,1.498191,1.983643,1.197027,0.934732,-2.203196,-1.627426,-1.062235,0.773559,-0.497643,-0.461968,-1.090751
349,-0.856321,-0.983552,-0.951,-1.165111,0.156817,1.351199,0.941412,-1.292726,2.009471,-0.461968,1.356035


In [16]:
import statsmodels.formula.api as smf
ols1 = smf.ols(formula = 'mpg ~ cyl+disp+hp+wt+acc+yr+car_type+origin_america+origin_europe+origin_asia', data = data_train_test).fit()
ols1.params

ImportError: cannot import name '_lazywhere' from 'scipy._lib._util' (c:\Users\rosha\anaconda3\Lib\site-packages\scipy\_lib\_util.py)

In [17]:
print(ols1.summary())

NameError: name 'ols1' is not defined

In [18]:
#Lets check Sum of Squared Errors (SSE) by predicting value of y for test cases and subtracting from the actual y for the test cases
mse  = np.mean((regression_model.predict(X_test)-y_test)**2)

# root of mean_sq_error is standard deviation i.e. avg variance between predicted and actual
import math
rmse = math.sqrt(mse)
print('Root Mean Squared Error: {}'.format(rmse))

Root Mean Squared Error: 0.3776693425408785


In [19]:
# predict mileage (mpg) for a set of attributes not in the training or test set
y_pred = regression_model.predict(X_test)

# Since this is regression, plot the predicted y value vs actual y values for the test data
# A good model's prediction will be close to actual leading to high R and R2 values
#plt.rcParams['figure.dpi'] = 500
plt.scatter(y_test['mpg'], y_pred)

KeyError: 'mpg'