In [291]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
from seaborn import plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

%matplotlib inline


In [292]:
# read in the car dataset
df=pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data',header=None)

columns= ['symboling','normalized-losses','make','fuel-type','aspiration','num-of-doors','body-style','drive-wheels','engine-location','wheel-base','length','width','height','curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore','stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']
df.columns=columns
# Use head to view the first few rows
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [293]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    205 non-null object
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         205 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 205 non-null object
stroke               205 non-null object
compression-ratio    205 non-null float64
horsepower           205 non-nul

In [294]:
c = ['price','horsepower','symboling','wheel-base','length','width','height','curb-weight','engine-size','compression-ratio','city-mpg','highway-mpg','num-of-cylinders','num-of-doors']

In [295]:
df2 = df[c]

In [296]:
df2.head()

Unnamed: 0,price,horsepower,symboling,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg,num-of-cylinders,num-of-doors
0,13495,111,3,88.6,168.8,64.1,48.8,2548,130,9.0,21,27,four,two
1,16500,111,3,88.6,168.8,64.1,48.8,2548,130,9.0,21,27,four,two
2,16500,154,1,94.5,171.2,65.5,52.4,2823,152,9.0,19,26,six,two
3,13950,102,2,99.8,176.6,66.2,54.3,2337,109,10.0,24,30,four,four
4,17450,115,2,99.4,176.6,66.4,54.3,2824,136,8.0,18,22,five,four


In [297]:
df2 = df2[df2.price != '?']

In [298]:
df2['price'] = df2['price'].map(lambda x: int(x))

In [299]:
cylTypes = sorted(list(set(df2['num-of-cylinders'])))

In [300]:
cylTypes

['eight', 'five', 'four', 'six', 'three', 'twelve', 'two']

In [301]:
values = [8,5,4,6,3,12,2]

In [302]:
d = {key:value for (key, value) in zip(cylTypes,values)}

In [303]:
d.keys()

['twelve', 'six', 'three', 'two', 'four', 'five', 'eight']

In [304]:
df2.replace({'num-of-cylinders':d},inplace=True)

In [305]:
df2.head()

Unnamed: 0,price,horsepower,symboling,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg,num-of-cylinders,num-of-doors
0,13495,111,3,88.6,168.8,64.1,48.8,2548,130,9.0,21,27,4,two
1,16500,111,3,88.6,168.8,64.1,48.8,2548,130,9.0,21,27,4,two
2,16500,154,1,94.5,171.2,65.5,52.4,2823,152,9.0,19,26,6,two
3,13950,102,2,99.8,176.6,66.2,54.3,2337,109,10.0,24,30,4,four
4,17450,115,2,99.4,176.6,66.4,54.3,2824,136,8.0,18,22,5,four


In [306]:
df2.replace({'num-of-doors':d},inplace=True)

In [307]:
set(df2['num-of-doors'])

{2, 4, '?'}

In [327]:
df2 = df2[df2['horsepower'] != '?']

In [335]:
df2['horsepower'] = df2['horsepower'].map(lambda x: int(x))

In [336]:
set(df2['horsepower'])

{48,
 52,
 55,
 56,
 58,
 60,
 62,
 68,
 69,
 70,
 72,
 73,
 76,
 78,
 82,
 84,
 85,
 86,
 88,
 90,
 92,
 94,
 95,
 97,
 100,
 101,
 102,
 106,
 110,
 111,
 112,
 114,
 115,
 116,
 120,
 121,
 123,
 134,
 135,
 140,
 142,
 143,
 145,
 152,
 154,
 155,
 156,
 160,
 161,
 162,
 175,
 176,
 182,
 184,
 200,
 207,
 262}

In [330]:
df2.to_pickle('autoproblem.pkl')

In [331]:
df2.sort('price',ascending=False,inplace=True)

  if __name__ == '__main__':


In [332]:
df2.head()

Unnamed: 0,price,horsepower,length,width,height,curbweight,enginesize,citympg,highwaympg,cyls
74,45400,184,199.2,72.0,55.4,3715,304,14,16,8
16,41315,182,193.8,67.9,53.7,3380,209,16,22,6
73,40960,184,208.1,71.7,56.7,3900,308,14,16,8
128,37028,207,168.9,65.0,51.6,2800,194,17,25,6
17,36880,182,197.0,70.9,56.3,3505,209,15,20,6


In [None]:
#sns.pairplot(df2.corr())

In [337]:
df2.corr()

Unnamed: 0,price,horsepower,length,width,height,curbweight,enginesize,citympg,highwaympg,cyls
price,1.0,0.811953,0.694556,0.754249,0.133103,0.834732,0.873708,-0.692948,-0.708659,0.707885
horsepower,0.811953,1.0,0.58611,0.620057,-0.084637,0.759925,0.825286,-0.82196,-0.803658,0.66955
length,0.694556,0.58611,1.0,0.856495,0.489997,0.882718,0.684888,-0.676796,-0.710573,0.443018
width,0.754249,0.620057,0.856495,1.0,0.304199,0.867289,0.729046,-0.642865,-0.690699,0.522276
height,0.133103,-0.084637,0.489997,0.304199,1.0,0.306149,0.07193,-0.059294,-0.113119,0.005509
curbweight,0.834732,0.759925,0.882718,0.867289,0.306149,1.0,0.848932,-0.755559,-0.800113,0.604206
enginesize,0.873708,0.825286,0.684888,0.729046,0.07193,0.848932,1.0,-0.655737,-0.684662,0.848789
citympg,-0.692948,-0.82196,-0.676796,-0.642865,-0.059294,-0.755559,-0.655737,1.0,0.972407,-0.441964
highwaympg,-0.708659,-0.803658,-0.710573,-0.690699,-0.113119,-0.800113,-0.684662,0.972407,1.0,-0.474739
cyls,0.707885,0.66955,0.443018,0.522276,0.005509,0.604206,0.848789,-0.441964,-0.474739,1.0


In [338]:
df2.drop(['symboling','num-of-doors','compression-ratio'],axis=1,inplace=True)

ValueError: labels ['symboling' 'num-of-doors' 'compression-ratio'] not contained in axis

In [None]:
df2.rename(columns = {},inplace=True)

In [378]:
df2.columns

Index([u'price', u'horsepower', u'length', u'width', u'height', u'curbweight',
       u'enginesize', u'citympg', u'highwaympg', u'cyls', u'Intercept',
       u'make[T.audi]', u'make[T.bmw]', u'make[T.chevrolet]', u'make[T.dodge]',
       u'make[T.honda]', u'make[T.isuzu]', u'make[T.jaguar]', u'make[T.mazda]',
       u'make[T.mercedes-benz]', u'make[T.mercury]', u'make[T.mitsubishi]',
       u'make[T.nissan]', u'make[T.peugot]', u'make[T.plymouth]',
       u'make[T.porsche]', u'make[T.renault]', u'make[T.saab]',
       u'make[T.subaru]', u'make[T.toyota]', u'make[T.volkswagen]',
       u'make[T.volvo]'],
      dtype='object')

In [379]:
# Create your feature matrix (X) and target vector (y)
y, X = patsy.dmatrices('price ~ horsepower + length + width + curbweight + enginesize + citympg + highwaympg + cyls + horsepower:cyls + width:curbweight + Intercept + makeTaudi + makeTbmw + makeTchevrolet + makeTdodge + makeThonda + makeTisuzu + makeTjaguar + makeTmazda + makeTmercedes-benz + makeTmercury + makeTmitsubishi + makeTnissan + makeTpeugot + makeTplymouth + makeTporsche + makeTrenault + makeTsaab + makeTsubaru + makeTtoyota + makeTvolkswagen + makeTvolvo', data=df2, return_type="dataframe")
# Create your model
model = sm.OLS(np.log(y), X)
# Fit your model to your training set
fit = model.fit()
# Print summary statistics of the model's performance
fit.summary()

NameError: name 'make' is not defined

In [None]:
df2.drop(['wheelbase'],axis=1,inplace=True)

In [348]:
X=patsy.dmatrix('make',data=df,return_type='dataframe')
X.head()

Unnamed: 0,Intercept,make[T.audi],make[T.bmw],make[T.chevrolet],make[T.dodge],make[T.honda],make[T.isuzu],make[T.jaguar],make[T.mazda],make[T.mercedes-benz],...,make[T.nissan],make[T.peugot],make[T.plymouth],make[T.porsche],make[T.renault],make[T.saab],make[T.subaru],make[T.toyota],make[T.volkswagen],make[T.volvo]
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [349]:
df2 = df2.join(X)

In [350]:
X.columns

Index([u'Intercept', u'make[T.audi]', u'make[T.bmw]', u'make[T.chevrolet]',
       u'make[T.dodge]', u'make[T.honda]', u'make[T.isuzu]', u'make[T.jaguar]',
       u'make[T.mazda]', u'make[T.mercedes-benz]', u'make[T.mercury]',
       u'make[T.mitsubishi]', u'make[T.nissan]', u'make[T.peugot]',
       u'make[T.plymouth]', u'make[T.porsche]', u'make[T.renault]',
       u'make[T.saab]', u'make[T.subaru]', u'make[T.toyota]',
       u'make[T.volkswagen]', u'make[T.volvo]'],
      dtype='object')

In [372]:
templist=[]

In [363]:
X.columns[21]

'make[T.volvo]'

In [373]:
for i in range(0,22):
    templist.append(X.columns[i])

In [376]:
aString = ''

for item in templist:
    aString = aString + ' + ' + item
print aString
    
    
    

 + Intercept + make[T.audi] + make[T.bmw] + make[T.chevrolet] + make[T.dodge] + make[T.honda] + make[T.isuzu] + make[T.jaguar] + make[T.mazda] + make[T.mercedes-benz] + make[T.mercury] + make[T.mitsubishi] + make[T.nissan] + make[T.peugot] + make[T.plymouth] + make[T.porsche] + make[T.renault] + make[T.saab] + make[T.subaru] + make[T.toyota] + make[T.volkswagen] + make[T.volvo]


In [382]:
import re

In [390]:
newString3 = re.sub(r'\.', "", newString2)

In [396]:
print newString3

 + Intercept + makeTaudi + makeTbmw + makeTchevrolet + makeTdodge + makeThonda + makeTisuzu + makeTjaguar + makeTmazda + makeTmercedes-benz + makeTmercury + makeTmitsubishi + makeTnissan + makeTpeugot + makeTplymouth + makeTporsche + makeTrenault + makeTsaab + makeTsubaru + makeTtoyota + makeTvolkswagen + makeTvolvo


In [401]:
newList = newString3.split('+')
for item in newList:
    item = item.rstrip().lstrip()

In [403]:
newList.remove(newList[0])

In [405]:
for item in newList:
    item = item.strip()

In [406]:
newList

[' Intercept ',
 ' makeTaudi ',
 ' makeTbmw ',
 ' makeTchevrolet ',
 ' makeTdodge ',
 ' makeThonda ',
 ' makeTisuzu ',
 ' makeTjaguar ',
 ' makeTmazda ',
 ' makeTmercedes-benz ',
 ' makeTmercury ',
 ' makeTmitsubishi ',
 ' makeTnissan ',
 ' makeTpeugot ',
 ' makeTplymouth ',
 ' makeTporsche ',
 ' makeTrenault ',
 ' makeTsaab ',
 ' makeTsubaru ',
 ' makeTtoyota ',
 ' makeTvolkswagen ',
 ' makeTvolvo']