In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("cars.csv")

In [3]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [4]:
# preprocessing

df["horsepower"].replace("?", np.nan, inplace=True)
df["normalized-losses"].replace("?", np.nan, inplace=True)

# convert to float

df["normalized-losses"] = df["normalized-losses"].astype("float64")

df["horsepower"] = df["horsepower"].astype("float64")

In [5]:
print(df["normalized-losses"].mean())
print(df["horsepower"].mean())

122.0
104.25615763546799


In [6]:
# fill null

df["normalized-losses"].fillna(122, inplace=True)
df["horsepower"].fillna(104, inplace=True)

In [7]:
# feature and target

X = df.drop("price",axis=1)
y = df["price"]

# train test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [8]:
# label encoding on categorical columns
from sklearn.preprocessing import LabelEncoder
categorical_columns =["make","fuel-type","body-style","drive-wheels","engine-location","engine-type"]


for col in categorical_columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

In [9]:
# Linear Regression

lr = LinearRegression()
lr.fit(X_train,y_train)

LinearRegression()

In [10]:
# train score
lr.score(X_train,y_train)

0.8504568715278972

In [11]:
# test score
lr.score(X_test,y_test)

0.7965544858354369

In [12]:
# coefficients
lr.coef_

array([ 4.51306786e+01,  1.53226590e+00, -2.00105431e+02, -6.22733455e+02,
       -1.70254151e+02,  1.86861852e+03,  1.64130546e+04,  7.89414119e+02,
        3.62704004e+02,  2.83207856e+02,  9.83648625e+01, -1.08091293e+01,
        3.08067510e+02, -4.17054418e+02])

In [13]:
# Regularization
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [14]:
# Ridge 
 
for alpha in range(1,5):
    l2 = Ridge(alpha)
    l2.fit(X_train,y_train)
    
    train_score = l2.score(X_train,y_train)
    test_score = l2.score(X_test,y_test)
    
    print("alpha: ",alpha)
    print("train score: ",train_score)
    print("test score: ",test_score)
    print("-----------------------")

alpha:  1
train score:  0.843601737444792
test score:  0.8074548621129003
-----------------------
alpha:  2
train score:  0.8356568441584951
test score:  0.811034506445969
-----------------------
alpha:  3
train score:  0.8295968895778684
test score:  0.8126999425979509
-----------------------
alpha:  4
train score:  0.825005387860624
test score:  0.8136223491977842
-----------------------


In [15]:
# Lasso 
 
for alpha in range(100,151,10):
    l1 = Lasso(alpha)
    l1.fit(X_train,y_train)
    
    train_score = l1.score(X_train,y_train)
    test_score = l1.score(X_test,y_test)
    
    print("alpha: ",alpha)
    print("train score: ",train_score)
    print("test score: ",test_score)
    print("-----------------------")

alpha:  100
train score:  0.8372913548415435
test score:  0.8090021821374433
-----------------------
alpha:  110
train score:  0.8345926568500504
test score:  0.8098695559256961
-----------------------
alpha:  120
train score:  0.8316369953253453
test score:  0.8106533577754124
-----------------------
alpha:  130
train score:  0.8284242259406098
test score:  0.8113536564089348
-----------------------
alpha:  140
train score:  0.8249544618886407
test score:  0.8119703935273413
-----------------------
alpha:  150
train score:  0.8212275306537296
test score:  0.8125036163224489
-----------------------


In [16]:
# Final model

l2 = Ridge(2)
l2.fit(X_train,y_train)

Ridge(alpha=2)

In [17]:
l2.coef_

array([ 1.66481207e+02, -8.84441662e-01, -1.94640409e+02, -1.13899099e+03,
       -4.80930429e+02,  1.88116260e+03,  7.76063665e+03,  5.06205088e+02,
        5.02070049e+02,  4.65935098e+02,  1.00152739e+02,  1.04161416e+01,
        2.44076856e+02, -3.27705725e+02])

In [18]:
# Final model

l1 = Lasso(120)
l1.fit(X_train,y_train)

Lasso(alpha=120)

In [20]:
l1.coef_

array([ 1.75867421e+01,  1.78544194e+00, -1.79795294e+02, -0.00000000e+00,
       -3.07858721e+02,  1.44112573e+03,  7.91251805e+03,  5.26900910e+02,
        4.28198637e+02,  2.93469750e+02,  1.09030232e+02,  3.55672490e+00,
        1.54215859e+02, -2.45172409e+02])

In [21]:
# cross validation

from sklearn.model_selection import cross_val_score

In [22]:
categorical_columns =["make","fuel-type","body-style","drive-wheels","engine-location","engine-type"]


for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

In [23]:
# cross validation on lasso

cv1 = cross_val_score(l1,X,y,cv=4)

print(cv1)

print("mean -",np.mean(cv1))

[0.73599708 0.83762008 0.40943319 0.47427828]
mean 0.6143321548746503


In [24]:
# cross validation on ridge

cv2 = cross_val_score(l2,X,y,cv=4)

print(cv2)

print("mean - ",np.mean(cv2))

[0.71178047 0.86472552 0.37642252 0.47023102]
mean -  0.605789882648044
