# Data Analysis Tutorial: Part 5 - Model Evaluation

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from 'imports3.csv' into a DataFrame and display it so the
# available columns can be inspected before choosing features.
df = pd.read_csv('imports3.csv')
df

Unnamed: 0,symboling,normalized-losses,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,stroke,compression-ratio,horsepower,peak-rpm,city-L/100km,highway-mpg,price,price-binned,fuel-type_diesel,fuel-type_gas
0,3,122.0,alfa-romero,std,two,convertible,rwd,front,88.6,0.413433,...,2.68,9.0,111.0,5000.0,11.190476,27,13495,Low,0,1
1,3,122.0,alfa-romero,std,two,convertible,rwd,front,88.6,0.413433,...,2.68,9.0,111.0,5000.0,11.190476,27,16500,Low,0,1
2,1,122.0,alfa-romero,std,two,hatchback,rwd,front,94.5,0.449254,...,3.47,9.0,154.0,5000.0,12.368421,26,16500,Low,0,1
3,2,164.0,audi,std,four,sedan,fwd,front,99.8,0.529851,...,3.40,10.0,102.0,5500.0,9.791667,30,13950,Low,0,1
4,2,164.0,audi,std,four,sedan,4wd,front,99.4,0.529851,...,3.40,8.0,115.0,5500.0,13.055556,22,17450,Low,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-1,95.0,volvo,std,four,sedan,rwd,front,109.1,0.711940,...,3.15,9.5,114.0,5400.0,10.217391,28,16845,Low,0,1
197,-1,95.0,volvo,turbo,four,sedan,rwd,front,109.1,0.711940,...,3.15,8.7,160.0,5300.0,12.368421,25,19045,Medium,0,1
198,-1,95.0,volvo,std,four,sedan,rwd,front,109.1,0.711940,...,2.87,8.8,134.0,5500.0,13.055556,23,21485,Medium,0,1
199,-1,95.0,volvo,turbo,four,sedan,rwd,front,109.1,0.711940,...,3.40,23.0,106.0,4800.0,9.038462,27,22470,Medium,1,0


In [4]:
# Predictors used for the multi-feature models, and the regression target.
feature_columns = [
    'symboling',
    'normalized-losses',
    'horsepower',
    'peak-rpm',
    'highway-mpg',
]
x_data = df[feature_columns]
y_data = df['price']

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=0,
)

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

# Baseline: ordinary least squares evaluated with 3-fold cross-validation
# over the full feature set.
lr = LinearRegression()

# One score per fold (the estimator's default scorer, R^2 for a regressor).
scores = cross_val_score(estimator=lr, X=x_data, y=y_data, cv=3)

# Out-of-fold predictions: each row is predicted by a model that did not see it.
yhat = cross_val_predict(estimator=lr, X=x_data, y=y_data, cv=3)

# Average score across the folds.
scores.mean()

0.45907695866113946

### Overfitting and Underfitting and Model Selection

In [61]:
from sklearn.preprocessing import PolynomialFeatures

# Compare polynomial regressions of increasing degree on a single feature
# (horsepower) by scoring each fitted model on the held-out split.
x1 = df[['horsepower']]
y1 = df.price
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.3, random_state=0)

R2 = []  # test-set R^2, one entry per degree in `order`
order = [2, 4, 6, 8]
for n in order:
    pr = PolynomialFeatures(degree=n)
    # Fit the transformer on the training split only, then apply the *same*
    # fitted transformer to the test split. Calling fit_transform on the test
    # data re-fits on held-out rows, which is the pattern that leaks test
    # information for any data-dependent transformer.
    x1_train_pr = pr.fit_transform(x1_train)
    x1_test_pr = pr.transform(x1_test)
    # `lr` is the LinearRegression instance created in an earlier cell;
    # fit() discards the previous fit and retrains on the new features.
    lr.fit(x1_train_pr, y1_train)
    R2.append(lr.score(x1_test_pr, y1_test))

print(R2)

[0.6971541932568319, 0.7046122604773232, 0.7402691979652276, 0.5744982044997138]


#### A drop in the test R² score (or a negative score) indicates overfitting: here the orders below n=8 are still underfitting, while from n=8 onward the model overfits

### Ridge Regression

In [62]:
from sklearn.linear_model import Ridge

In [63]:
# Ridge regression (alpha=0.1) on the single-feature horsepower split.
# fit() returns the estimator itself, so construction and fitting can be chained.
RidgeModel = Ridge(alpha=0.1).fit(x1_train, y1_train)

# Predictions for the held-out rows.
yhat_r = RidgeModel.predict(x1_test)

# Score on the held-out split (R^2 for a regressor).
RidgeModel.score(x1_test, y1_test)

0.6960379521661612

### Grid Search

In [64]:
from sklearn.model_selection import GridSearchCV

In [73]:
# Search over the Ridge regularisation strength with 4-fold cross-validation
# on the full feature set.
RR = Ridge()
parameters1 = [{'alpha': [0.1, 1, 10, 100, 1000, 10000, 100000]}]

Grid1 = GridSearchCV(estimator=RR, param_grid=parameters1, cv=4)
Grid1.fit(x_data, y_data)

# Estimator refit with the best-scoring alpha.
Grid1.best_estimator_

In [79]:
# cv_results_ collects the cross-validation results for every candidate in the
# grid; 'mean_test_score' holds one fold-averaged score per alpha, in grid order.
scores1 = Grid1.cv_results_
scores1['mean_test_score']

array([0.50472704, 0.50504395, 0.50792038, 0.5216538 , 0.53320977,
       0.54077307, 0.49904168])

In [81]:
# Report each candidate alpha next to its mean cross-validated score
# (averaged over the held-out folds of the grid search).
candidate_params = scores1['params']
candidate_means = scores1['mean_test_score']
for candidate, fold_mean in zip(candidate_params, candidate_means):
    print(candidate, "R^2 on test data:", fold_mean)

{'alpha': 0.1} R^2 on test data: 0.5047270420272603
{'alpha': 1} R^2 on test data: 0.50504394668893
{'alpha': 10} R^2 on test data: 0.5079203801285013
{'alpha': 100} R^2 on test data: 0.5216538033226372
{'alpha': 1000} R^2 on test data: 0.5332097677900529
{'alpha': 10000} R^2 on test data: 0.5407730692807112
{'alpha': 100000} R^2 on test data: 0.4990416767690025
