In [97]:
import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import feature_selection, linear_model

# Keep rendered DataFrames compact: cap the number of displayed rows and
# columns at 10 and use the HTML representation in the notebook.
for option_name, option_value in (
    ('display.max_rows', 10),
    ('display.notebook_repr_html', True),
    ('display.max_columns', 10),
):
    pd.set_option(option_name, option_value)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'ggplot' style sheet to every plot drawn after this cell.
plt.style.use('ggplot')

In [98]:
# Load the auto-mpg data set; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='auto-mpg.csv')


In [99]:
# Add `gpm` as the element-wise reciprocal of `mpg`; it is the response
# variable used by the regressions further down.
df['gpm'] = 1.0 / df['mpg']

In [100]:
# Preview the first rows, including the newly added `gpm` column.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name,gpm
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,0.055556
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,0.066667
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,0.055556
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,0.0625
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,0.058824


Ivan's comment: drop the `cylinders` column.

In [101]:
# Remove the `cylinders` column from df in place.
# `inplace` has to be the boolean True: the string 'True' is rejected by
# pandas' boolean-argument validation (ValueError) in current releases.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df.drop('cylinders', axis=1, inplace=True, errors='ignore')


In [103]:
# Pairwise Pearson correlations between the numeric columns.
# The text column `car_name` is excluded explicitly: older pandas skipped
# non-numeric columns silently, but recent versions raise on them.
df.select_dtypes(include=[np.number]).corr()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model_year,origin,gpm
mpg,1.0,-0.804203,-0.778427,-0.831741,0.420289,0.579267,0.56345,-0.935364
displacement,-0.804203,1.0,0.897257,0.932824,-0.543684,-0.370164,-0.609409,0.866631
horsepower,-0.778427,0.897257,1.0,0.864538,-0.689196,-0.416361,-0.455171,0.854809
weight,-0.831741,0.932824,0.864538,1.0,-0.417457,-0.306564,-0.581024,0.885219
acceleration,0.420289,-0.543684,-0.689196,-0.417457,1.0,0.288137,0.205873,-0.45575
model_year,0.579267,-0.370164,-0.416361,-0.306564,0.288137,1.0,0.180662,-0.557427
origin,0.56345,-0.609409,-0.455171,-0.581024,0.205873,0.180662,1.0,-0.530213
gpm,-0.935364,0.866631,0.854809,0.885219,-0.45575,-0.557427,-0.530213,1.0


In [None]:
# Scatter of weight against gpm. The legend label previously said "MPG"
# although the y-axis is gpm; the label now matches the plotted column.
df.plot(x='weight', y='gpm', kind='scatter', color='green', label='Weight&GPM')

In [None]:
# Scatter of horsepower against gpm. The legend label previously said "mpg"
# although the y-axis is gpm; the label now matches the plotted column.
df.plot(x='horsepower', y='gpm', kind='scatter', color='black', label='HP&gpm')

In [None]:
# Scatter of acceleration against gpm. The legend label previously said "mpg"
# although the y-axis is gpm; the label now matches the plotted column.
df.plot(x='acceleration', y='gpm', kind='scatter', color='blue', label='Acceleration&gpm')

In [None]:
# `cylinders` has already been dropped from df, so selecting it from df
# raises a KeyError. Re-read the raw file and derive gpm on the fly so the
# cylinders-vs-gpm scatter can still be drawn without touching df.
raw_df = pd.read_csv('auto-mpg.csv')
raw_df.assign(gpm=1. / raw_df.mpg).plot(x='cylinders', y='gpm', kind='scatter', color='blue')

In [None]:
# Scatter of origin code against mpg.
df.plot.scatter(x='origin', y='mpg', color='red', label='origin&mpg')

In [104]:
# Add square-root-transformed versions of displacement and horsepower as
# new columns; the originals are kept.
df['displacement_sqrt'] = np.sqrt(df['displacement'])
df['horsepower_sqrt'] = np.sqrt(df['horsepower'])

In [105]:
# Preview the first rows to confirm the two *_sqrt columns were added.
df.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,...,origin,car_name,gpm,displacement_sqrt,horsepower_sqrt
0,18.0,307.0,130.0,3504.0,12.0,...,1,chevrolet chevelle malibu,0.055556,17.521415,11.401754
1,15.0,350.0,165.0,3693.0,11.5,...,1,buick skylark 320,0.066667,18.708287,12.845233
2,18.0,318.0,150.0,3436.0,11.0,...,1,plymouth satellite,0.055556,17.832555,12.247449
3,16.0,304.0,150.0,3433.0,12.0,...,1,amc rebel sst,0.0625,17.435596,12.247449
4,17.0,302.0,140.0,3449.0,10.5,...,1,ford torino,0.058824,17.378147,11.83216


In [106]:
# Correlation matrix again, now including the *_sqrt columns.
# Restrict to numeric columns explicitly: the text column `car_name` makes
# DataFrame.corr() raise on recent pandas versions.
df.select_dtypes(include=[np.number]).corr()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model_year,origin,gpm,displacement_sqrt,horsepower_sqrt
mpg,1.0,-0.804203,-0.778427,-0.831741,0.420289,0.579267,0.56345,-0.935364,-0.820815,-0.802311
displacement,-0.804203,1.0,0.897257,0.932824,-0.543684,-0.370164,-0.609409,0.866631,0.994882,0.897344
horsepower,-0.778427,0.897257,1.0,0.864538,-0.689196,-0.416361,-0.455171,0.854809,0.879372,0.995152
weight,-0.831741,0.932824,0.864538,1.0,-0.417457,-0.306564,-0.581024,0.885219,0.939395,0.875859
acceleration,0.420289,-0.543684,-0.689196,-0.417457,1.0,0.288137,0.205873,-0.45575,-0.52157,-0.696702
model_year,0.579267,-0.370164,-0.416361,-0.306564,0.288137,1.0,0.180662,-0.557427,-0.352616,-0.409083
origin,0.56345,-0.609409,-0.455171,-0.581024,0.205873,0.180662,1.0,-0.530213,-0.640991,-0.471495
gpm,-0.935364,0.866631,0.854809,0.885219,-0.45575,-0.557427,-0.530213,1.0,0.866133,0.859499
displacement_sqrt,-0.820815,0.994882,0.879372,0.939395,-0.52157,-0.352616,-0.640991,0.866133,1.0,0.88647
horsepower_sqrt,-0.802311,0.897344,0.995152,0.875859,-0.696702,-0.409083,-0.471495,0.859499,0.88647,1.0


In [107]:
# Baseline OLS fit of gpm on both raw and square-root displacement,
# square-root horsepower, weight and acceleration.
model = smf.ols(
    formula='gpm~displacement_sqrt+horsepower_sqrt+displacement+weight+acceleration',
    data=df,
).fit()

In [108]:
# Display the regression summary table for the fitted baseline model.
model.summary()

0,1,2,3
Dep. Variable:,gpm,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.817
Method:,Least Squares,F-statistic:,349.8
Date:,"Thu, 09 Jun 2016",Prob (F-statistic):,6.58e-141
Time:,20:25:33,Log-Likelihood:,1385.1
No. Observations:,392,AIC:,-2758.0
Df Residuals:,386,BIC:,-2734.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-0.0344,0.010,-3.384,0.001,-0.054 -0.014
displacement_sqrt,0.0002,0.001,0.186,0.853,-0.002 0.002
horsepower_sqrt,0.0044,0.001,6.603,0.000,0.003 0.006
displacement,1.808e-05,3.66e-05,0.494,0.622,-5.39e-05 9.01e-05
weight,7.336e-06,1.53e-06,4.808,0.000,4.34e-06 1.03e-05
acceleration,0.0007,0.000,2.957,0.003,0.000 0.001

0,1,2,3
Omnibus:,16.855,Durbin-Watson:,1.049
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29.821
Skew:,0.262,Prob(JB):,3.35e-07
Kurtosis:,4.245,Cond. No.,88000.0


Dropping `displacement` (p = 0.622, not significant).

In [111]:
# Refit without the raw displacement term.
model = smf.ols(
    formula='gpm~displacement_sqrt+horsepower_sqrt+weight+acceleration',
    data=df,
).fit()

In [112]:
# Display the summary of the model refitted without `displacement`.
model.summary()

0,1,2,3
Dep. Variable:,gpm,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.817
Method:,Least Squares,F-statistic:,438.0
Date:,"Thu, 09 Jun 2016",Prob (F-statistic):,3.33e-142
Time:,20:29:27,Log-Likelihood:,1385.0
No. Observations:,392,AIC:,-2760.0
Df Residuals:,387,BIC:,-2740.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-0.0380,0.007,-5.448,0.000,-0.052 -0.024
displacement_sqrt,0.0007,0.000,2.181,0.030,6.86e-05 0.001
horsepower_sqrt,0.0044,0.001,6.907,0.000,0.003 0.006
weight,7.257e-06,1.52e-06,4.787,0.000,4.28e-06 1.02e-05
acceleration,0.0007,0.000,2.940,0.003,0.000 0.001

0,1,2,3
Omnibus:,16.613,Durbin-Watson:,1.054
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28.895
Skew:,0.264,Prob(JB):,5.32e-07
Kurtosis:,4.221,Cond. No.,60300.0


`displacement_sqrt` is only marginally significant (p = 0.030), so try dropping it as well.

In [113]:
# Reduced model: square-root horsepower, weight and acceleration only.
model = smf.ols(
    formula='gpm~horsepower_sqrt+weight+acceleration',
    data=df,
).fit()

In [114]:
# Display the summary of the three-predictor model.
model.summary()

0,1,2,3
Dep. Variable:,gpm,R-squared:,0.817
Model:,OLS,Adj. R-squared:,0.815
Method:,Least Squares,F-statistic:,576.9
Date:,"Thu, 09 Jun 2016",Prob (F-statistic):,1.36e-142
Time:,20:30:29,Log-Likelihood:,1382.6
No. Observations:,392,AIC:,-2757.0
Df Residuals:,388,BIC:,-2741.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-0.0357,0.007,-5.155,0.000,-0.049 -0.022
horsepower_sqrt,0.0046,0.001,7.174,0.000,0.003 0.006
weight,9.625e-06,1.06e-06,9.052,0.000,7.53e-06 1.17e-05
acceleration,0.0006,0.000,2.526,0.012,0.000 0.001

0,1,2,3
Omnibus:,15.601,Durbin-Watson:,1.07
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22.556
Skew:,0.315,Prob(JB):,1.26e-05
Kurtosis:,3.992,Cond. No.,59600.0


In [118]:
# Three-predictor model plus the horsepower_sqrt x weight interaction.
# In the formula, `a*b` expands to a + b + a:b, so the main effects listed
# before it are repeated harmlessly.
model_ic_hp_wei = smf.ols(
    formula='gpm~horsepower_sqrt+weight+acceleration+horsepower_sqrt*weight',
    data=df,
).fit()

In [119]:
# Display the summary of the model with the horsepower_sqrt:weight interaction.
model_ic_hp_wei.summary()

0,1,2,3
Dep. Variable:,gpm,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.817
Method:,Least Squares,F-statistic:,436.4
Date:,"Thu, 09 Jun 2016",Prob (F-statistic):,6.08e-142
Time:,20:41:53,Log-Likelihood:,1384.4
No. Observations:,392,AIC:,-2759.0
Df Residuals:,387,BIC:,-2739.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-0.0202,0.011,-1.881,0.061,-0.041 0.001
horsepower_sqrt,0.0030,0.001,2.823,0.005,0.001 0.005
weight,4.964e-06,2.69e-06,1.843,0.066,-3.33e-07 1.03e-05
acceleration,0.0006,0.000,2.525,0.012,0.000 0.001
horsepower_sqrt:weight,4.628e-07,2.46e-07,1.882,0.061,-2.07e-08 9.46e-07

0,1,2,3
Omnibus:,14.577,Durbin-Watson:,1.066
Prob(Omnibus):,0.001,Jarque-Bera (JB):,20.416
Skew:,0.306,Prob(JB):,3.69e-05
Kurtosis:,3.935,Cond. No.,1040000.0


Interaction between all three predictors (`horsepower_sqrt`, `weight`, `acceleration`).

In [120]:
# Full factorial model: all main effects, all pairwise interactions and the
# three-way interaction of horsepower_sqrt, weight and acceleration.
model_ic_all = smf.ols(
    formula='gpm~horsepower_sqrt+weight+acceleration+horsepower_sqrt*weight*acceleration',
    data=df,
).fit()

In [121]:
# Display the summary of the model with every interaction term.
model_ic_all.summary()

0,1,2,3
Dep. Variable:,gpm,R-squared:,0.831
Model:,OLS,Adj. R-squared:,0.828
Method:,Least Squares,F-statistic:,269.7
Date:,"Thu, 09 Jun 2016",Prob (F-statistic):,5.840000000000001e-144
Time:,20:43:53,Log-Likelihood:,1398.3
No. Observations:,392,AIC:,-2781.0
Df Residuals:,384,BIC:,-2749.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-0.1436,0.041,-3.504,0.001,-0.224 -0.063
horsepower_sqrt,0.0166,0.004,3.808,0.000,0.008 0.025
weight,5.024e-05,1.33e-05,3.782,0.000,2.41e-05 7.64e-05
acceleration,0.0097,0.003,3.768,0.000,0.005 0.015
horsepower_sqrt:weight,-4.309e-06,1.19e-06,-3.608,0.000,-6.66e-06 -1.96e-06
horsepower_sqrt:acceleration,-0.0010,0.000,-3.410,0.001,-0.002 -0.000
weight:acceleration,-3.414e-06,8.16e-07,-4.184,0.000,-5.02e-06 -1.81e-06
horsepower_sqrt:weight:acceleration,3.592e-07,7.96e-08,4.511,0.000,2.03e-07 5.16e-07

0,1,2,3
Omnibus:,15.672,Durbin-Watson:,1.063
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.423
Skew:,0.273,Prob(JB):,3.02e-06
Kurtosis:,4.122,Cond. No.,59100000.0


In [None]:
# `df_new` is not defined anywhere earlier in the notebook, so this cell
# raised a NameError on a fresh kernel. Create it here as an explicit copy
# of df so exploratory columns added to df_new in later cells leave df alone.
df_new = df.copy()
df_new.plot(x='displacement_sqrt', y='mpg', kind='scatter', color='red', label='dispsq&mpg')

In [None]:
# `horsepower_sqrt` is a column of df, not a variable; the bare name raised
# a NameError. Show the first values of the column instead.
df['horsepower_sqrt'].head()

In [None]:
# Scatter of square-root horsepower against mpg. The legend label was a
# copy-paste of the displacement plot ('dispsq&mpg'); it now names the
# column actually plotted.
# NOTE(review): relies on `df_new` being defined by an earlier cell.
df_new.plot(x='horsepower_sqrt', y='mpg', kind='scatter', color='black', label='hpsq&mpg')

In [None]:
# Add natural-log-transformed displacement and horsepower (computed from df)
# as new columns of df_new.
df_new['displacement_log'] = np.log(df['displacement'])
df_new['horsepower_log'] = np.log(df['horsepower'])

In [None]:
# Preview the first rows of df_new.
df_new.head()

In [None]:
# Scatter of square-root displacement against mpg.
df_new.plot.scatter(x='displacement_sqrt', y='mpg', color='red', label='dispsq&mpg')

In [None]:
# Scatter of raw displacement against mpg, for comparison with the
# square-root version.
df_new.plot.scatter(x='displacement', y='mpg', color='Magenta')

In [None]:
# Define (but do not yet fit) an OLS model of mpg on the untransformed
# predictors; fitting happens in the next cell.
# Note: this rebinds `model`, which previously held fitted results.
model = smf.ols(
    formula='mpg ~ displacement + horsepower + weight + acceleration + model_year + origin ',
    data=df_new,
)

In [None]:
# Fit the model defined in the previous cell and display its summary.
model.fit().summary()

P > .025 for the following:
cylinders
horsepower
acceleration
year

In [None]:
# Same mpg model without the horsepower term; defined here, fitted in the
# next cell.
model1 = smf.ols(
    formula='mpg ~  displacement  + weight + acceleration + model_year + origin ',
    data=df,
)

In [None]:
# Fit model1 and display its summary.
model1.fit().summary()

### 

### 

### SCIKIT LEARN

In [123]:
# Drop every row that has a missing value in any of the three modelling
# columns; other columns are not checked.
df = df.dropna(how='any', subset=['gpm', 'horsepower_sqrt', 'weight'])

In [124]:
# Preview the first rows after removing rows with missing values.
df.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,...,origin,car_name,gpm,displacement_sqrt,horsepower_sqrt
0,18.0,307.0,130.0,3504.0,12.0,...,1,chevrolet chevelle malibu,0.055556,17.521415,11.401754
1,15.0,350.0,165.0,3693.0,11.5,...,1,buick skylark 320,0.066667,18.708287,12.845233
2,18.0,318.0,150.0,3436.0,11.0,...,1,plymouth satellite,0.055556,17.832555,12.247449
3,16.0,304.0,150.0,3433.0,12.0,...,1,amc rebel sst,0.0625,17.435596,12.247449
4,17.0,302.0,140.0,3449.0,10.5,...,1,ford torino,0.058824,17.378147,11.83216


In [125]:
# 60/40 train/test split; the fixed random_state makes the sample repeatable.
train_df = df.sample(frac=0.6, random_state=0)
# Every row that was not sampled into the training set goes to the test set.
test_df = df.loc[~df.index.isin(train_df.index)]

In [127]:
# List the columns available in the training frame.
train_df.columns

Index([u'mpg', u'displacement', u'horsepower', u'weight', u'acceleration',
       u'model_year', u'origin', u'car_name', u'gpm', u'displacement_sqrt',
       u'horsepower_sqrt'],
      dtype='object')

In [None]:
# Select the feature columns as a 2-D DataFrame (note the double brackets).
# The original `train_df['displacement','']` looked up a single column keyed
# by a tuple and raised a KeyError; the empty-string placeholder is dropped.
# TODO(review): extend this list with the remaining intended predictors.
X_train = train_df[['displacement']]

In [None]:
# Create a scikit-learn linear regression estimator with default settings;
# it is not fitted in this cell.
model_sk = linear_model.LinearRegression()

In [None]:
# Add the element-wise reciprocal of horsepower as a new column.
df_new['horsepower_inv'] = 1.0 / df_new['horsepower']

In [None]:
# Preview the first rows of df_new.
df_new.head()

In [None]:
# Scatter of inverse horsepower against mpg.
df_new.plot.scatter(x='horsepower_inv', y='mpg', color='Magenta')