In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.linear_model import RidgeCV
import seaborn as sns

In [6]:
##Importing the dataset

In [10]:
data_=pd.read_csv("ToyotaCorolla.csv",encoding='ANSI')
data_.head()

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,...,Central_Lock,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,...,1,1,1,0,0,0,1,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,...,1,0,1,0,0,0,1,0,0,0
2,3,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,90,1,...,0,0,1,0,0,0,1,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,90,0,...,0,0,1,0,0,0,1,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,90,0,...,1,1,1,0,1,0,1,0,0,0


In [8]:
data_.shape

(1436, 38)

In [11]:
data_new=data_[["Price","Age_08_04","KM","HP","cc","Doors","Gears","Quarterly_Tax","Weight"]]

In [12]:
data_new.isnull().sum() ##Checking for null values

Price            0
Age_08_04        0
KM               0
HP               0
cc               0
Doors            0
Gears            0
Quarterly_Tax    0
Weight           0
dtype: int64

In [13]:
x=data_new[["Age_08_04","KM","HP","cc","Doors","Gears","Quarterly_Tax","Weight"]].values
y=data_new[["Price"]].values

In [14]:
train,test=train_test_split(data_new,test_size=0.2) ##Splitting the data into training and test data

In [15]:
regressor=LinearRegression()

In [16]:
regressor.fit(x,y)

LinearRegression()

In [17]:
regressor.coef_

array([[-1.21658402e+02, -2.08171292e-02,  3.16809058e+01,
        -1.21100301e-01, -1.61664095e+00,  5.94319936e+02,
         3.94908076e+00,  1.69586318e+01]])

In [18]:
regressor.intercept_

array([-5573.10635791])

In [19]:
regressor.score(x,y)

0.8637627463428192

In [20]:
pred_reg=regressor.predict(train.iloc[:,1:]) ##Predicted values



In [None]:
##Applying Ridge regression

In [None]:
RM=Ridge(alpha=0.01,normalize=True)

In [None]:
RM.fit(train.iloc[:,1:],train['Price'])

In [None]:
RM.coef_

In [None]:
RM.intercept_

In [None]:
RM.alpha

In [None]:
pred_RM=RM.predict(train.iloc[:,1:])

In [None]:
RM.score(train.iloc[:,1:],train['Price'])

In [None]:
np.sqrt(np.mean((pred_RM-train.Price)**2))

In [None]:
##Running Ridge regressor on a set of alpha values and observing how the  R-squared,train and test RMSE changes

In [None]:
train_rmse=[]
test_rmse=[]
R_sqrd=[]

In [None]:
alphas= np.arange(0.01,0.012,0.0001)

In [None]:
for i in alphas:
    RM1=Ridge(alpha=i,normalize=True)
    RM1.fit(train.iloc[:,1:],train['Price'])
    R_sqrd.append(RM1.score(train.iloc[:,1:],train['Price']))
    train_rmse.append(np.sqrt(np.mean((RM1.predict(train.iloc[:,1:])-train.Price)**2)))
    test_rmse.append(np.sqrt(np.mean((RM1.predict(test.iloc[:,1:])-test.Price)**2)))

In [None]:
##alpha vs R-squared values

In [None]:
plt.scatter(alphas,R_sqrd)
plt.xlabel("alphas")
plt.ylabel("R-squared")

In [None]:
plt.scatter(alphas,train_rmse)
plt.xlabel("alphas")
plt.ylabel("train_rmse")

In [None]:
plt.scatter(alphas,test_rmse)
plt.xlabel("alphas")
plt.ylabel("test_rmse")

In [None]:
##Applying Lasso regression

In [None]:
LM=Lasso(alpha=0.001,normalize=True)

In [None]:
LM.fit(train.iloc[:,1:],train['Price'])

In [None]:
pred_LM=LM.predict(train.iloc[:,1:])

In [None]:
LM.score(train.iloc[:,1:],train['Price'])

In [None]:
np.sqrt(np.mean((pred_RM-train.Price)**2))

In [None]:
##Running Lasso regressor on a set of alpha values and observing how the  R-squared,train and test RMSE changes

In [None]:
train_rmse=[]
test_rmse=[]
R_sqrd=[]

In [None]:
alphas= np.arange(0.001,0.01,0.001)

In [None]:
for i in alphas:
    LM1=Lasso(alpha=i,normalize=True)
    LM1.fit(train.iloc[:,1:],train['Price'])
    R_sqrd.append(LM1.score(train.iloc[:,1:],train['Price']))
    train_rmse.append(np.sqrt(np.mean((LM1.predict(train.iloc[:,1:])-train.Price)**2)))
    test_rmse.append(np.sqrt(np.mean((LM1.predict(test.iloc[:,1:])-test.Price)**2)))

In [None]:
##alpha vs R-squared values

In [None]:
plt.scatter(alphas,R_sqrd)
plt.xlabel("alphas")
plt.ylabel("R-squared")

In [None]:
plt.scatter(alphas,train_rmse)
plt.xlabel("alphas")
plt.ylabel("train_rmse")

In [None]:
plt.scatter(alphas,test_rmse)
plt.xlabel("alphas")
plt.ylabel("test_rmse")

In [None]:
##Visualisations - Plotting pred. values vs actual values for linear, ridge and lasso regression models.

In [None]:
sns.regplot(train.Price,pred_RM)

In [None]:
sns.regplot(train.Price,pred_LM)

In [None]:
sns.regplot(train.Price,pred_reg)