In [1]:
# conventional way to import pandas
import pandas as pd
# conventional way to import numpy
import numpy as np

from sklearn import metrics
import matplotlib.pyplot as plt

# Location of the ISLR "Auto" CSV in the Rdatasets repository.
AUTO_CSV_URL = "https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/ISLR/Auto.csv"

# The first CSV column is used as the row index rather than as a data column.
data = pd.read_csv(AUTO_CSV_URL, index_col=0)

# Print (rows, columns), then show the leading rows as the cell's output.
print(data.shape)
data.head()

(392, 9)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
1,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
2,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
3,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
4,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
5,17.0,8,302.0,140,3449,10.5,70,1,ford torino


ISLR Auto is a data frame with 392 observations on the following 9 variables:

    mpg: miles per gallon
    cylinders: Number of cylinders between 4 and 8
    displacement: Engine displacement (cu. inches)
    horsepower: Engine horsepower
    weight: Vehicle weight (lbs.)
    acceleration: Time to accelerate from 0 to 60 mph (sec.)
    year: Model year (modulo 100)
    origin: Origin of car (1. American, 2. European, 3. Japanese)
    name: Vehicle name
    
We draw a random sample of 196 observations from the data to use as the training set. Because we fix our own random seed, the split (and therefore our results) will differ from those in the book:

In [2]:
# Fix the global NumPy seed so the same 196 rows are drawn on every run.
np.random.seed(1)
# Positional indices of the training rows, sampled without replacement.
train = np.random.choice(data.shape[0], 196, replace=False)
# Boolean mask over row positions: True marks a training row, False a held-out row.
# np.isin is the supported replacement for np.in1d, which is deprecated as of NumPy 2.0;
# for this 1-D input it produces the same mask.
select = np.isin(np.arange(data.shape[0]), train)

traindata = data[select]

print(traindata.shape)
traindata.head()

(196, 9)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
1,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
5,17.0,8,302.0,140,3449,10.5,70,1,ford torino
6,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
7,14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
9,14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina


In [3]:
import statsmodels.formula.api as smf
# First-order model: mpg regressed on horsepower, fitted on the training rows only.
lm = smf.ols(formula='mpg~horsepower', data=traindata).fit()

print(lm.summary())

# Predict for every row, then average the squared residuals over the rows
# that were NOT used for fitting (the complement of the training mask).
preds = lm.predict(data)
square_error = np.square(data['mpg'] - preds)
print('--------Test Error for 1st order--------')
print(np.mean(square_error[np.logical_not(select)]))

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.620
Model:                            OLS   Adj. R-squared:                  0.618
Method:                 Least Squares   F-statistic:                     316.4
Date:                Fri, 09 Mar 2018   Prob (F-statistic):           1.28e-42
Time:                        11:58:58   Log-Likelihood:                -592.07
No. Observations:                 196   AIC:                             1188.
Df Residuals:                     194   BIC:                             1195.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     40.3338      1.023     39.416      0.0

In [4]:
# Second-order model: horsepower plus its square, fitted on the training rows only.
lm2 = smf.ols(formula='mpg~horsepower + I(horsepower ** 2.0)', data=traindata).fit()
preds2 = lm2.predict(data)
# Squared residuals for every row; only rows outside the training mask enter the mean.
square_error2 = np.square(data['mpg'] - preds2)
print('--------Test Error for 2nd order--------')
print(np.mean(square_error2[np.logical_not(select)]))

--------Test Error for 2nd order--------
20.25269085835005


In [5]:
# Third-order model: horsepower, its square and its cube, fitted on the training rows only.
lm3 = smf.ols(
    formula='mpg~horsepower + I(horsepower ** 2.0) + I(horsepower ** 3.0)',
    data=traindata,
).fit()
preds3 = lm3.predict(data)
# Squared residuals for every row; only rows outside the training mask enter the mean.
square_error3 = np.square(data['mpg'] - preds3)
print('--------Test Error for 3rd order--------')
print(np.mean(square_error3[np.logical_not(select)]))

--------Test Error for 3rd order--------
20.325609365773605
