In [1]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
# make sure pandas is updated to 0.22
from scipy.stats import ttest_ind

# for linear regression model
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score


# UCI "imports-85" automobile data. The raw file has no header row, so the
# column names are supplied here; missing values appear as the literal '?'.
location = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
col_names = [
    'symboling', 'normalizedlosses', 'make', 'fueltype', 'aspiration', 'numofdoors',
    'bodystyle', 'drivewheels', 'enginelocation',
    'wheelbase', 'length', 'width', 'height', 'curbweight',
    'enginetype', 'numofcylinders', 'enginesize', 'fuelsystem',
    'bore', 'stroke', 'compressionratio', 'horsepower', 'peakrpm',
    'citympg', 'highwaympg', 'price',
]
automobile = pd.read_csv(location, header=None, names=col_names)

In [2]:
# Preview the first rows of the raw frame (note the '?' placeholders in normalizedlosses).
automobile.head()

Unnamed: 0,symboling,normalizedlosses,make,fueltype,aspiration,numofdoors,bodystyle,drivewheels,enginelocation,wheelbase,...,enginesize,fuelsystem,bore,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [3]:
# (rows, columns) of the raw frame, before any cleaning.
automobile.shape

(205, 26)

In [4]:
# Several numeric-looking columns (e.g. normalizedlosses, price) are read as object;
# presumably because of the '?' placeholders seen in the preview - confirm per column.
automobile.dtypes

symboling             int64
normalizedlosses     object
make                 object
fueltype             object
aspiration           object
numofdoors           object
bodystyle            object
drivewheels          object
enginelocation       object
wheelbase           float64
length              float64
width               float64
height              float64
curbweight            int64
enginetype           object
numofcylinders       object
enginesize            int64
fuelsystem           object
bore                 object
stroke               object
compressionratio    float64
horsepower           object
peakrpm              object
citympg               int64
highwaympg            int64
price                object
dtype: object

In [5]:
# Treat '?' as missing, then keep only rows with no missing value in any column.
automobile_dropped = (
    automobile
    .replace('?', np.nan)
    .dropna()
)

In [6]:
# Rows with missing values are gone; the index is not reset, so its labels now have gaps.
automobile_dropped.head()

Unnamed: 0,symboling,normalizedlosses,make,fueltype,aspiration,numofdoors,bodystyle,drivewheels,enginelocation,wheelbase,...,enginesize,fuelsystem,bore,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
6,1,158,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710
8,1,158,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875
10,2,192,bmw,gas,std,two,sedan,rwd,front,101.2,...,108,mpfi,3.5,2.8,8.8,101,5800,23,29,16430


In [7]:
# Row count remaining after the incomplete rows were dropped (columns unchanged).
automobile_dropped.shape

(159, 26)

In [8]:
# Dropping rows does not convert anything: the string-valued columns are still object.
automobile_dropped.dtypes

symboling             int64
normalizedlosses     object
make                 object
fueltype             object
aspiration           object
numofdoors           object
bodystyle            object
drivewheels          object
enginelocation       object
wheelbase           float64
length              float64
width               float64
height              float64
curbweight            int64
enginetype           object
numofcylinders       object
enginesize            int64
fuelsystem           object
bore                 object
stroke               object
compressionratio    float64
horsepower           object
peakrpm              object
citympg               int64
highwaympg            int64
price                object
dtype: object

In [9]:
# Convert normalizedlosses from object (strings) to float.
# The same pattern is repeated for every other column that needs converting.
automobile_dropped['normalizedlosses'] = automobile_dropped['normalizedlosses'].astype(float)
automobile_dropped.dtypes

symboling             int64
normalizedlosses    float64
make                 object
fueltype             object
aspiration           object
numofdoors           object
bodystyle            object
drivewheels          object
enginelocation       object
wheelbase           float64
length              float64
width               float64
height              float64
curbweight            int64
enginetype           object
numofcylinders       object
enginesize            int64
fuelsystem           object
bore                 object
stroke               object
compressionratio    float64
horsepower           object
peakrpm              object
citympg               int64
highwaympg            int64
price                object
dtype: object

In [10]:
#1. Change column types here

# price, peakrpm and horsepower hold whole numbers -> int;
# bore and stroke hold decimals -> float.
automobile_dropped = automobile_dropped.astype({
    'price': int,
    'peakrpm': int,
    'horsepower': int,
    'bore': float,
    'stroke': float,
})

In [11]:
# `numofdoors` is spelled out in words; list its distinct values before mapping them to numbers.
automobile_dropped['numofdoors'].unique()

array(['four', 'two'], dtype=object)

In [12]:
# Integer door count. 'four' and 'two' are the only values present; any other
# value would map to NaN and make the integer cast fail.
automobile_dropped['numofdoors_int'] = (
    automobile_dropped['numofdoors']
    .map({'four': 4, 'two': 2})
    .astype(int)
)

In [13]:
# Sanity check: the new column should contain only the integers 4 and 2.
automobile_dropped['numofdoors_int'].unique()

array([4, 2])

In [14]:
# Re-check the dtypes after the conversions: the converted columns should now be
# numeric and the new `numofdoors_int` column should be listed.
automobile_dropped.dtypes

symboling             int64
normalizedlosses    float64
make                 object
fueltype             object
aspiration           object
numofdoors           object
bodystyle            object
drivewheels          object
enginelocation       object
wheelbase           float64
length              float64
width               float64
height              float64
curbweight            int64
enginetype           object
numofcylinders       object
enginesize            int64
fuelsystem           object
bore                float64
stroke              float64
compressionratio    float64
horsepower            int64
peakrpm               int64
citympg               int64
highwaympg            int64
price                 int64
numofdoors_int        int64
dtype: object

In [15]:
#2. Generate matrix plot

# Pairwise scatter plots with a kernel-density estimate on the diagonal.
# The returned grid of axes is kept in `axs`.
axs = scatter_matrix(
    automobile_dropped,
    alpha=0.2,
    diagonal='kde',
    figsize=(24, 24),
)

**Comments**: `citympg` and `highwaympg` have the strongest correlation, for obvious reasons. `horsepower` and `*mpg` appear to have a roughly linear relationship. `enginesize` and `curbweight` also appear to be related to `*mpg`, as does `price`, though less strongly.

In [16]:
#3. Perform a t-test to determine whether there is a difference in mpg for different types of aspiration

# ref. https://dss.princeton.edu/online_help/analysis/interpreting_regression.htm
# ref. https://towardsdatascience.com/inferential-statistics-series-t-test-using-numpy-2718f8f9bf2f

# Two-sample t-test without assuming equal variances (Welch), run column-wise
# on both mpg measures: standard vs. turbo aspiration.
mpg_cols = ['citympg', 'highwaympg']
std_mpg = automobile_dropped.loc[automobile_dropped['aspiration'] == 'std', mpg_cols]
turbo_mpg = automobile_dropped.loc[automobile_dropped['aspiration'] == 'turbo', mpg_cols]
ttest_ind(std_mpg, turbo_mpg, equal_var=False)

Ttest_indResult(statistic=array([ 4.45622681,  5.26320363]), pvalue=array([  5.43019922e-05,   3.40196816e-06]))

In [17]:
# Same comparison, this time assuming both groups share one variance (pooled t-test).
mpg_cols = ['citympg', 'highwaympg']
std_mpg = automobile_dropped.loc[automobile_dropped['aspiration'] == 'std', mpg_cols]
turbo_mpg = automobile_dropped.loc[automobile_dropped['aspiration'] == 'turbo', mpg_cols]
ttest_ind(std_mpg, turbo_mpg, equal_var=True)

Ttest_indResult(statistic=array([ 3.79166612,  4.35539076]), pvalue=array([  2.13011970e-04,   2.38802913e-05]))

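**Comments:** With p-values much smaller than 0.05 in both the unequal-variance and the equal-variance test, we can reject the null hypothesis that `std` and `turbo` cars have the same mean `citympg` and the same mean `highwaympg`. The t-statistics are positive, so the `std` group (the first sample passed to `ttest_ind`) has the higher mean `*mpg`. As for whether the lower t-scores in the equal-variance test indicate a smaller difference: they do not. Both tests compare the same difference in group means and differ only in how the standard error is estimated.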

In [18]:
#4. Create a linear model with mpg as the response variable

# First attempt: explain city mpg with the numeric predictors that looked
# informative in the matrix plot.
reg = linear_model.LinearRegression()
reg.fit(
    X=automobile_dropped[['horsepower', 'enginesize', 'curbweight', 'price']],
    y=automobile_dropped['citympg'],
)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
# Fitted coefficients, in the order the columns were passed to fit:
# horsepower, enginesize, curbweight, price.
reg.coef_

array([ -1.40818043e-01,   5.00446675e-02,  -6.21179273e-03,
         7.80303750e-05])

In [20]:
# Fitted intercept: the predicted citympg when every predictor equals zero.
reg.intercept_

48.445838411706646

In [22]:
# In-sample predictions: the same rows and feature columns the model was fitted on.
citympg_pred = reg.predict(
    automobile_dropped[['horsepower', 'enginesize', 'curbweight', 'price']]
)

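**Comments:** Judging by the raw coefficients, `horsepower` and `enginesize` have some impact on `citympg`, `curbweight` much less so, and the coefficient for `price` is close enough to zero that we treat it as not very impactful and remove it. Caveat: each coefficient is the change in `citympg` per one unit of an unscaled feature, so a feature measured on a large scale (`price` is in the tens of thousands) can have a tiny coefficient and still matter.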

In [23]:
# Mean squared error of the in-sample predictions against the observed city mpg.
mean_squared_error(
    y_true=automobile_dropped['citympg'],
    y_pred=citympg_pred,
)

9.5558132673992606

In [24]:
# Coefficient of determination (R^2) of the in-sample predictions.
r2_score(
    y_true=automobile_dropped['citympg'],
    y_pred=citympg_pred,
)

0.74132466671115249

**Comments:** Overall, not a very impressive model (R² ≈ 0.74 on the data it was fitted to). What if we added some of the categorical variables, such as `fuelsystem`, `fueltype` and `enginetype`? We will have to create dummy variables for them.

In [25]:
# Inspect the levels of fuelsystem, fueltype and enginetype before encoding them,
# starting with fuelsystem.
automobile_dropped['fuelsystem'].unique()

array(['mpfi', '2bbl', 'mfi', '1bbl', 'idi', 'spdi'], dtype=object)

In [26]:
# Level frequencies. `mfi` has only one record, so it is the dummy level to drop.
automobile_dropped['fuelsystem'].value_counts()

mpfi    64
2bbl    63
idi     15
1bbl    11
spdi     5
mfi      1
Name: fuelsystem, dtype: int64

In [27]:
# Distinct fuel types present after cleaning.
automobile_dropped['fueltype'].unique()

array(['gas', 'diesel'], dtype=object)

In [28]:
# Level frequencies. `diesel` is the dummy level to drop.
automobile_dropped['fueltype'].value_counts()

gas       144
diesel     15
Name: fueltype, dtype: int64

In [29]:
# Distinct engine types present after cleaning.
automobile_dropped['enginetype'].unique()

array(['ohc', 'l', 'dohc', 'ohcv', 'ohcf'], dtype=object)

In [30]:
# Level frequencies. `l` is the dummy level to drop.
automobile_dropped['enginetype'].value_counts()

ohc     123
ohcf     12
dohc      8
ohcv      8
l         8
Name: enginetype, dtype: int64

In [31]:
# One-hot encode the three categorical predictors. Each listed column is replaced
# by one indicator column per level; all other columns are carried over.
automobile_dummies = pd.get_dummies(
    automobile_dropped,
    columns=['fuelsystem', 'fueltype', 'enginetype'],
)

In [32]:
# Preview the encoded frame: the three source columns are replaced by
# `<column>_<level>` indicator columns.
automobile_dummies.head()

Unnamed: 0,symboling,normalizedlosses,make,aspiration,numofdoors,bodystyle,drivewheels,enginelocation,wheelbase,length,...,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fueltype_diesel,fueltype_gas,enginetype_dohc,enginetype_l,enginetype_ohc,enginetype_ohcf,enginetype_ohcv
3,2,164.0,audi,std,four,sedan,fwd,front,99.8,176.6,...,0,1,0,0,1,0,0,1,0,0
4,2,164.0,audi,std,four,sedan,4wd,front,99.4,176.6,...,0,1,0,0,1,0,0,1,0,0
6,1,158.0,audi,std,four,sedan,fwd,front,105.8,192.7,...,0,1,0,0,1,0,0,1,0,0
8,1,158.0,audi,turbo,four,sedan,fwd,front,105.8,192.7,...,0,1,0,0,1,0,0,1,0,0
10,2,192.0,bmw,std,two,sedan,rwd,front,101.2,176.8,...,0,1,0,0,1,0,0,1,0,0


In [33]:
# Drop one indicator per encoded variable (its reference level) so the dummies
# are not perfectly multicollinear.
# NOTE(review): this rebinds `automobile_dropped` to the dummy-encoded frame. The
# categorical source columns no longer exist under that name, so earlier cells
# that read them (including the get_dummies cell) fail if re-run without
# restarting from the top.
automobile_dropped = automobile_dummies.drop(
    columns=['fueltype_diesel', 'enginetype_l', 'fuelsystem_mfi']
)

In [34]:
# Start the second model from a fresh, unfitted estimator; this rebinds `reg`,
# so the first fitted model is no longer reachable under that name.
reg = linear_model.LinearRegression()

In [35]:
# Second model: the numeric predictors (without price) plus selected dummy indicators.
reg.fit(
    X=automobile_dropped[[
        'horsepower', 'enginesize', 'curbweight',
        'fueltype_gas',
        'fuelsystem_2bbl', 'fuelsystem_mpfi',
        'enginetype_ohc',
    ]],
    y=automobile_dropped['citympg'],
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [36]:
# Fitted coefficients, in the order the columns were passed to fit: horsepower,
# enginesize, curbweight, fueltype_gas, fuelsystem_2bbl, fuelsystem_mpfi, enginetype_ohc.
reg.coef_

array([-0.08017022,  0.04127001, -0.0093517 , -6.95172165, -0.10110397,
       -0.16166932, -0.76494316])

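**Comments:** By the looks of these coefficients, the fuel type (gas versus diesel) has a substantial effect on `citympg`: the `fueltype_gas` coefficient is about -6.95, so with the other predictors held fixed a gas car is predicted to get roughly 7 fewer city mpg than a diesel one.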

In [37]:
# Fitted intercept of the second model: the predicted citympg when every predictor equals zero.
reg.intercept_

59.293380714199202

In [38]:
# In-sample predictions from the second model, using the same feature columns
# (in the same order) that it was fitted on.
citympg_pred = reg.predict(
    automobile_dropped[[
        'horsepower', 'enginesize', 'curbweight',
        'fueltype_gas',
        'fuelsystem_2bbl', 'fuelsystem_mpfi',
        'enginetype_ohc',
    ]]
)

In [39]:
# Mean squared error of the second model's in-sample predictions.
mean_squared_error(
    y_true=automobile_dropped['citympg'],
    y_pred=citympg_pred,
)

6.8546966983835231

In [40]:
# R^2 of the second model's in-sample predictions.
r2_score(
    y_true=automobile_dropped['citympg'],
    y_pred=citympg_pred,
)

0.81444374189504187

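**Comments**: Looks like we've gotten a little improvement on our model (R² up from about 0.74 to 0.81, MSE down from about 9.56 to 6.85) by including whether the fueltype is gas or not and whether the enginetype is ohc or not. We also noticed that factoring in the fuelsystem indicators (`mpfi` and `2bbl`) did not make much of a difference; their coefficients are close to zero.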