# 5.3 Cross Validation and Bootstrap


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


In [2]:
#load the data
df = pd.read_csv("Auto.csv")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
mpg             397 non-null float64
cylinders       397 non-null int64
displacement    397 non-null float64
horsepower      397 non-null object
weight          397 non-null int64
acceleration    397 non-null float64
year            397 non-null int64
origin          397 non-null int64
name            397 non-null object
dtypes: float64(3), int64(4), object(2)
memory usage: 28.0+ KB


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


Since the data type of horsepower in this case is object instead of int64, we have to do some data preprocessing to convert it to int64

In [3]:
df['horsepower']= df['horsepower'].astype('float')

ValueError: could not convert string to float: '?'

since there are some strings in the horsepower column, first we need to remove the string, add in some index and change all the data type to float 

In [17]:
#Change all the horsepower columns to numeric
rows_select = df['horsepower'].apply(lambda x: x.isnumeric()) #return a list of true and false

df= df.loc[rows_select, :]
df['horsepower'] = np.float64(df['horsepower'])    
df.index = range(1, len(df)+1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 1 to 392
Data columns (total 9 columns):
mpg             392 non-null float64
cylinders       392 non-null int64
displacement    392 non-null float64
horsepower      392 non-null float64
weight          392 non-null int64
acceleration    392 non-null float64
year            392 non-null int64
origin          392 non-null int64
name            392 non-null object
dtypes: float64(4), int64(4), object(1)
memory usage: 27.6+ KB


In [18]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
1,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
2,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
3,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
4,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
5,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


# 5.3.1 the validation set approach

In [25]:
from sklearn.cross_validation import train_test_split, LeaveOneOut 
X = df['horsepower'].values.reshape(-1, 1) #reshape into 1 columns with uknown rows 
# --> [[130.]
#  [165.]
#  [150.]]
Y = df['mpg']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.5,random_state = 1)

In this case, we use the random_state =1 aka seeding so that later we can access the seeding to get the same results. It doesnt matter what number do we use, but as long as we use the same random_state within each time we try, we will be guaranteed to get the same number.fit a linear regression using only
the observations corresponding to the training set

In [27]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train, Y_train)
Y_prediction = linreg.predict(X_test)

#Now we want to print out the metric of the models that we have just trained 
from sklearn import metrics
print ("The MAE is : {}".format(metrics.mean_absolute_error(Y_test, Y_prediction)))
print ("The MSE is: {}".format( metrics.mean_squared_error(Y_test, Y_prediction) )  )
print ("The RMSE is: {}".format( np.sqrt( metrics.mean_squared_error(Y_test, Y_prediction))))


The MAE is : 3.8955470772080734
The MSE is: 24.80212062059356
The RMSE is: 4.980172750075399


In [40]:
for randomstate in [1, 5, 1234,1]:
    print("\nrandom_seed = %d" % randomstate)
    X = df['horsepower'].values.reshape(-1, 1)
    Y = df['mpg']
    X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X,Y , test_size=0.5, random_state=randomstate)
    lr_model = LinearRegression()
    lr_model.fit(X_train2, Y_train2)
    Y_prediction2 = lr_model.predict(X_test2)
    mse = metrics.mean_squared_error(Y_test, Y_prediction)
    print ("The MSE is: {}".format(mse) )


random_seed = 1
The MSE is: 24.80212062059356

random_seed = 5
The MSE is: 24.80212062059356

random_seed = 1234
The MSE is: 24.80212062059356

random_seed = 1
The MSE is: 24.80212062059356


From this example we can see that with different seeds we have different MSE, with the same seed we obtain the same MSE 