# Model Evaluation and Refinement

#### Importing Libraries

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

In [5]:
filepath = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/Data%20files/module_5_auto.csv'
df = pd.read_csv(filepath)


In [6]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,symboling,normalized-losses,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,...,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,horsepower-binned,diesel,gas
0,0,0,3,122,alfa-romero,std,two,convertible,rwd,front,...,9.0,111.0,5000.0,21,27,13495.0,11.190476,Medium,0,1
1,1,1,3,122,alfa-romero,std,two,convertible,rwd,front,...,9.0,111.0,5000.0,21,27,16500.0,11.190476,Medium,0,1
2,2,2,1,122,alfa-romero,std,two,hatchback,rwd,front,...,9.0,154.0,5000.0,19,26,16500.0,12.368421,Medium,0,1
3,3,3,2,164,audi,std,four,sedan,fwd,front,...,10.0,102.0,5500.0,24,30,13950.0,9.791667,Medium,0,1
4,4,4,2,164,audi,std,four,sedan,4wd,front,...,8.0,115.0,5500.0,18,22,17450.0,13.055556,Medium,0,1


##### Use numeric data only

In [8]:
df = df._get_numeric_data()
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,...,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,diesel,gas
0,0,0,3,122,88.6,0.811148,0.890278,48.8,2548,130,...,2.68,9.0,111.0,5000.0,21,27,13495.0,11.190476,0,1
1,1,1,3,122,88.6,0.811148,0.890278,48.8,2548,130,...,2.68,9.0,111.0,5000.0,21,27,16500.0,11.190476,0,1
2,2,2,1,122,94.5,0.822681,0.909722,52.4,2823,152,...,3.47,9.0,154.0,5000.0,19,26,16500.0,12.368421,0,1
3,3,3,2,164,99.8,0.84863,0.919444,54.3,2337,109,...,3.4,10.0,102.0,5500.0,24,30,13950.0,9.791667,0,1
4,4,4,2,164,99.4,0.84863,0.922222,54.3,2824,136,...,3.4,8.0,115.0,5500.0,18,22,17450.0,13.055556,0,1


In [10]:
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'symboling', 'normalized-losses',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size',
       'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
       'city-mpg', 'highway-mpg', 'price', 'city-L/100km', 'diesel', 'gas'],
      dtype='object')

As in the above data, there are columns 'Unnamed:0.1' and 'Unnamed:0', which are not valuable in our context. So we remove these two columns. 

In [11]:
df.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis= 1, inplace=True)
df.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,diesel,gas
0,3,122,88.6,0.811148,0.890278,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0,11.190476,0,1
1,3,122,88.6,0.811148,0.890278,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0,11.190476,0,1
2,1,122,94.5,0.822681,0.909722,52.4,2823,152,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0,12.368421,0,1
3,2,164,99.8,0.84863,0.919444,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0,9.791667,0,1
4,2,164,99.4,0.84863,0.922222,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0,13.055556,0,1


#### Function for plotting

In [13]:
def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title):
    width = 12
    height = 10
    plt.figure(figsize=(width, height))
    
    ax1 = sns.kdeplot(RedFunction, color="r", label=RedName)
    ax2 = sns.kdeplot(BlueFunction, color="b", label=BlueName, ax=ax1)


    plt.title(Title)
    plt.xlabel('Price (in dollars)')
    plt.ylabel('Proportion of Cars')
    plt.show()
    plt.close()

In [14]:
def PollyPlot(xtrain, xtest, y_train, y_test, lr,poly_transform):
    width = 12
    height = 10
    plt.figure(figsize=(width, height))
    
    
    #training data 
    #testing data 
    # lr:  linear regression object 
    #poly_transform:  polynomial transformation object 
 
    xmax=max([xtrain.values.max(), xtest.values.max()])


    xmin=min([xtrain.values.min(), xtest.values.min()])


    x=np.arange(xmin, xmax, 0.1)


    plt.plot(xtrain, y_train, 'ro', label='Training Data')
    plt.plot(xtest, y_test, 'go', label='Test Data')
    plt.plot(x, lr.predict(poly_transform.fit_transform(x.reshape(-1, 1))), label='Predicted Function')
    plt.ylim([-10000, 60000])
    plt.ylabel('Price')
    plt.legend()




### Part 1: Training and Testing

split data into training and testing data.
place the target data 'price' in a separate dataframe y_data.

In [15]:
y_data = df['price']

In [16]:
#drop price from dataframe
x_data = df.drop('price', axis=1)

Now, randomly split data into training and testing data using the function **train_test_split**

In [18]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size= 0.10, random_state=1)

print("number of test samples: ", x_test.shape[0])
print("number of training samples: ", x_train.shape[0])

number of test samples:  21
number of training samples:  180


Import linear regression

In [20]:
from sklearn.linear_model import LinearRegression

In [21]:
#create a linear regression object
lre = LinearRegression()

In [22]:
#fit the model using 'horsepower' feature
lre.fit(x_train[['horsepower']], y_train)

In [23]:
#caluclate R2 on the test data
lre.score(x_test[['horsepower']], y_test)

0.36358755750788263

In [24]:
# calcuate R2 on the train data
lre.score(x_train[['horsepower']], y_train)

0.6619724197515104

The $R^2$ is much smaller using the test data compared to the training data

#### Again split up the dataset into 40% testing. But this time, seting 'random_state' = 0

In [27]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_data, y_data, test_size=0.40, random_state=0)
print("Nummber of test samples: ", x_test1.shape[0])
print("Number of training samples: ",  x_train.shape[0])

Nummber of test samples:  81
Number of training samples:  180


In [31]:
lre.fit(x_train1[['horsepower']], y_train1)
lre.score(x_test1[['horsepower']], y_test1)

0.7139364665406973

Sometimes, we may not have sufficient testing data, so we may want to perform cross-validation. 

#### Cross-validation