In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("../cars.csv")

In [3]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [3]:
df["normalized-losses"].replace("?",np.nan,inplace=True)
df["normalized-losses"] = df["normalized-losses"].astype(float)
losses_mean = df["normalized-losses"].mean()
df["normalized-losses"].fillna(losses_mean,inplace=True)

In [4]:
df["horsepower"].replace("?",np.nan,inplace=True)
df["horsepower"] = df["horsepower"].astype(float)
horsepower_mean = df["horsepower"].mean()
df["horsepower"].fillna(horsepower_mean,inplace=True)

In [5]:
df_num = df.select_dtypes(["int64","float64"])
df_cat = df.select_dtypes("object")

In [6]:
for col in df_cat:
    le = LabelEncoder()
    df_cat[col] = le.fit_transform(df_cat[col])

In [7]:
df_new = pd.concat([df_num,df_cat],axis=1)

In [8]:
df_new.head()

Unnamed: 0,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,3,122.0,64.1,48.8,130,111.0,21,27,13495,0,1,0,2,0,0
1,3,122.0,64.1,48.8,130,111.0,21,27,16500,0,1,0,2,0,0
2,1,122.0,65.5,52.4,152,154.0,19,26,16500,0,1,2,2,0,5
3,2,164.0,66.2,54.3,109,102.0,24,30,13950,1,1,3,1,0,3
4,2,164.0,66.4,54.3,136,115.0,18,22,17450,1,1,3,0,0,3


In [10]:
df.shape

(205, 15)

In [9]:
X = df_new.drop("price",axis=1)
y = df_new["price"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

### Baseline model

In [10]:
lin = LinearRegression()

lin.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
lin.score(X_test,y_test)

0.7965566780397374

### Filter Method
<p>1. Correlation Coefficient</p>
<p>2. Chi2 Test</p>
<p>3. ANOVA Test</p>

<h4>Chi2 and ANOVA Test</h4>

In [12]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest

In [13]:
# k -> no. of good feature
anova = SelectKBest(score_func=f_regression, k=6)

In [14]:
X_train_f = anova.fit_transform(X_train,y_train)

In [15]:
X_test_f = anova.transform(X_test)

In [16]:
lin_anova = LinearRegression()

lin_anova.fit(X_train_f,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
lin_anova.score(X_test_f,y_test)

0.7982503502453704

In [18]:
# Perform chi2 on non-negative values
#chi = SelectKBest(score_func=chi2,k=7)

### Wrapper method
<p>1. Forward method</p>
<p>2. Backward method</p>

<h4>Forward selection</h4>

In [19]:
def adjusted_r2(k,r2,n):
    adj_r2 = 1 - (1-r2)*((n-1)/(n-(k+1)))
    return adj_r2

In [21]:
columns = []
i = len(columns) + 1
for col in X:
    columns.append(col)
    
    X_new = df_new[columns]
    
    X_train,X_test,y_train,y_test = train_test_split(X_new,y,test_size=0.3,random_state=1)
    
    lin = LinearRegression()
    
    lin.fit(X_train,y_train)
    
    r2 = lin.score(X_test,y_test)
    
    n = len(X_test)
    
    adj_r2 = adjusted_r2(i,r2,n)
    
    print("cols: ",i,"\tr2 score: ", r2,"\tadjusted r2: ",adj_r2)
    
    i += 1

cols:  1 	r2 score:  -0.0017837050450488778 	adjusted r2:  -0.018480100129133037
cols:  2 	r2 score:  0.041612508167990114 	adjusted r2:  0.00912479658046439
cols:  3 	r2 score:  0.6171737569085426 	adjusted r2:  0.5973723995072604
cols:  4 	r2 score:  0.6183283499764454 	adjusted r2:  0.5915443745361959
cols:  5 	r2 score:  0.7589560567217414 	adjusted r2:  0.7374342760718968
cols:  6 	r2 score:  0.7741546338326669 	adjusted r2:  0.7495169575235032
cols:  7 	r2 score:  0.7768961441883281 	adjusted r2:  0.7479752739905188
cols:  8 	r2 score:  0.7777776382689501 	adjusted r2:  0.7442346402718105
cols:  9 	r2 score:  0.7924572281432336 	adjusted r2:  0.7565363637834086
cols:  10 	r2 score:  0.794854541611225 	adjusted r2:  0.7546299419271515
cols:  11 	r2 score:  0.7972954924014397 	adjusted r2:  0.7527005007297565
cols:  12 	r2 score:  0.8135463420259219 	adjusted r2:  0.7678842217057396
cols:  13 	r2 score:  0.7934456703438277 	adjusted r2:  0.7375038727286145
cols:  14 	r2 score:  0.7

<h4>Backward selection</h4>

In [61]:
columns = []
columns.extend(X.columns)
i = len(columns)
for col in X:
    if i != 0:
        
    
    X_new = df_new[columns]
    
    X_train,X_test,y_train,y_test = train_test_split(X_new,y,test_size=0.3,random_state=1)
    
    lin = LinearRegression()
    
    lin.fit(X_train,y_train)
    
    r2 = lin.score(X_test,y_test)
    
    n = len(X_test)
    
    adj_r2 = adjusted_r2(i,r2,n)
    
    print("cols: ",i,"\tr2 score: ", r2,"\tadjusted r2: ",adj_r2)
    
    i -= 1
    columns.remove(col)

cols:  14 	r2 score:  0.7965566780397374 	adjusted r2:  0.735956539583489
cols:  13 	r2 score:  0.7965551751947559 	adjusted r2:  0.7414555351433356
cols:  12 	r2 score:  0.7997645640326905 	adjusted r2:  0.7507273144080433
cols:  11 	r2 score:  0.7957287200179317 	adjusted r2:  0.7507890384218767
cols:  10 	r2 score:  0.7135419436000406 	adjusted r2:  0.6573736972471074
cols:  9 	r2 score:  0.6295854209531733 	adjusted r2:  0.5654752053489147
cols:  8 	r2 score:  0.6320803654668075 	adjusted r2:  0.576545326291986
cols:  7 	r2 score:  0.36589600468809474 	adjusted r2:  0.28369733862914415
cols:  6 	r2 score:  0.30668706390741907 	adjusted r2:  0.231052925424592
cols:  5 	r2 score:  0.30759536269511856 	adjusted r2:  0.2457735200786113
cols:  4 	r2 score:  0.3065222192658225 	adjusted r2:  0.25785711184588034
cols:  3 	r2 score:  0.03734277093681637 	adjusted r2:  -0.012449844359555096
cols:  2 	r2 score:  -0.017883552394139413 	adjusted r2:  -0.05238807959394065
cols:  1 	r2 score:  -

### Principal component analysis (PCA)

In [62]:
from sklearn.decomposition import PCA

In [63]:
# n_components -> no. of pc's 
pc = PCA(n_components=4,random_state=1)

In [64]:
X = df_new.drop("price",axis=1)
y = df_new["price"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [65]:
X_train_pc = pc.fit_transform(X_train,y_train)
X_test_pc = pc.transform(X_test)

In [66]:
lin = LinearRegression()
lin.fit(X_train_pc,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [67]:
lin.score(X_test_pc,y_test)

0.7764931460093274