In [1]:
#Importing the Libaries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
url='https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

In [3]:
columnNames=['mpg','cylinders','displacement','horsepower','weight','acceleration','modelYear','origin','carName']

In [4]:
dataset=pd.read_csv(url,
delim_whitespace=True, 
                 header = None, 
                 names = columnNames,na_values='?')

In [5]:
#Since CarName is string and Unique for each instance we cannot use it
dataset=dataset.drop('carName',axis=1)

In [6]:
#Replacing '?' mark with Nan
dataset=dataset.replace('?', np.NaN)
dataset['origin'] = dataset['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
dataset = pd.get_dummies(dataset, columns=['origin'])
dataset=dataset.dropna()

In [7]:
#Preparing feature and lables
X=dataset.iloc[:,1:].values
y=dataset.iloc[:,0:1].values

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [10]:
from sklearn.linear_model import LinearRegression

In [11]:
linear_regressor=LinearRegression()

In [12]:
linear_regressor.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
linear_regressor.predict(X_test)

array([[25.51469604],
       [25.27886826],
       [10.53305685],
       [32.15477571],
       [32.95814026],
       [23.43992958],
       [34.21485266],
       [26.88615822],
       [20.23865435],
       [26.51556003],
       [28.55590056],
       [13.70730486],
       [24.86251701],
       [ 8.78102974],
       [16.82256734],
       [32.92133689],
       [26.61239728],
       [12.68417729],
       [26.27106596],
       [26.86332078],
       [20.99953477],
       [17.83198525],
       [23.53930949],
       [26.91428864],
       [32.56194353],
       [11.23834236],
       [28.62475595],
       [25.45513011],
       [22.92977219],
       [14.89347866],
       [22.18050151],
       [30.74416878],
       [24.77460688],
       [29.027514  ],
       [21.77982546],
       [29.18586549],
       [28.24585686],
       [12.25338166],
       [14.8345176 ],
       [10.63112153],
       [14.01825216],
       [34.3497955 ],
       [30.51149418],
       [13.54998863],
       [31.61464295],
       [15

In [14]:
y_test

array([[28. ],
       [22.3],
       [12. ],
       [38. ],
       [33.8],
       [19.4],
       [38.1],
       [30. ],
       [20. ],
       [20. ],
       [27. ],
       [16.5],
       [24.5],
       [11. ],
       [16.9],
       [33.7],
       [21.6],
       [14. ],
       [26. ],
       [28.4],
       [13. ],
       [16. ],
       [20. ],
       [25. ],
       [41.5],
       [14. ],
       [25.8],
       [25.1],
       [20. ],
       [17. ],
       [20. ],
       [31.6],
       [22. ],
       [26. ],
       [21. ],
       [29.8],
       [31. ],
       [13. ],
       [16. ],
       [14. ],
       [15. ],
       [44.6],
       [31.3],
       [16. ],
       [29. ],
       [16. ],
       [29. ],
       [13. ],
       [17.5],
       [18. ],
       [26. ],
       [15. ],
       [10. ],
       [22. ],
       [34.3],
       [30.7],
       [20.2],
       [22. ],
       [33. ],
       [21. ],
       [22. ],
       [24. ],
       [31.5],
       [15. ],
       [26. ],
       [16. ],
       [14

In [18]:
linear_regressor.score(X_train,y_train)

0.8194239716903474

In [19]:
#Using Backward Elimination Method to choose the required features only

In [20]:
import statsmodels.formula.api as sfm

In [21]:
X=np.append(arr=np.ones((392,1)).astype(int),values=X,axis=1)

In [22]:
X=X[:,:9]

In [23]:
X_opt=X[:,[0,1,2,3,4,5,6,7,8]]

In [24]:
regressor_OLS=sfm.OLS(endog=y,exog=X_opt).fit()

In [25]:
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.824
Model:,OLS,Adj. R-squared:,0.821
Method:,Least Squares,F-statistic:,224.5
Date:,"Wed, 25 Jul 2018",Prob (F-statistic):,1.79e-139
Time:,05:07:55,Log-Likelihood:,-1020.5
No. Observations:,392,AIC:,2059.0
Df Residuals:,383,BIC:,2095.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-15.3246,4.602,-3.330,0.001,-24.374,-6.276
x1,-0.4897,0.321,-1.524,0.128,-1.121,0.142
x2,0.0240,0.008,3.133,0.002,0.009,0.039
x3,-0.0182,0.014,-1.326,0.185,-0.045,0.009
x4,-0.0067,0.001,-10.243,0.000,-0.008,-0.005
x5,0.0791,0.098,0.805,0.421,-0.114,0.272
x6,0.7770,0.052,15.005,0.000,0.675,0.879
x7,-2.6300,0.566,-4.643,0.000,-3.744,-1.516
x8,0.2232,0.566,0.394,0.694,-0.890,1.336

0,1,2,3
Omnibus:,23.395,Durbin-Watson:,1.291
Prob(Omnibus):,0.0,Jarque-Bera (JB):,34.452
Skew:,0.444,Prob(JB):,3.3e-08
Kurtosis:,4.15,Cond. No.,85600.0


In [26]:
X_opt=X[:,[0,1,2,3,4,5,7,8]]
regressor_OLS=sfm.OLS(endog=y,exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.721
Model:,OLS,Adj. R-squared:,0.716
Method:,Least Squares,F-statistic:,141.7
Date:,"Wed, 25 Jul 2018",Prob (F-statistic):,2.78e-102
Time:,05:07:56,Log-Likelihood:,-1111.1
No. Observations:,392,AIC:,2238.0
Df Residuals:,384,BIC:,2270.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,45.8943,2.680,17.122,0.000,40.624,51.164
x1,-0.5662,0.404,-1.401,0.162,-1.361,0.229
x2,0.0114,0.010,1.194,0.233,-0.007,0.030
x3,-0.0613,0.017,-3.636,0.000,-0.094,-0.028
x4,-0.0048,0.001,-5.948,0.000,-0.006,-0.003
x5,-0.0320,0.123,-0.259,0.795,-0.274,0.210
x6,-1.1255,0.702,-1.604,0.109,-2.505,0.254
x7,1.8070,0.700,2.582,0.010,0.431,3.183

0,1,2,3
Omnibus:,34.295,Durbin-Watson:,0.921
Prob(Omnibus):,0.0,Jarque-Bera (JB):,47.569
Skew:,0.635,Prob(JB):,4.68e-11
Kurtosis:,4.14,Cond. No.,39800.0


In [27]:
X_opt=X[:,[0,1,3,4,5,7,8]]
regressor_OLS=sfm.OLS(endog=y,exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.72
Model:,OLS,Adj. R-squared:,0.715
Method:,Least Squares,F-statistic:,164.8
Date:,"Wed, 25 Jul 2018",Prob (F-statistic):,4.2e-103
Time:,05:07:58,Log-Likelihood:,-1111.8
No. Observations:,392,AIC:,2238.0
Df Residuals:,385,BIC:,2265.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,44.6093,2.456,18.163,0.000,39.780,49.438
x1,-0.2515,0.307,-0.820,0.412,-0.854,0.351
x2,-0.0548,0.016,-3.433,0.001,-0.086,-0.023
x3,-0.0044,0.001,-5.956,0.000,-0.006,-0.003
x4,-0.0454,0.123,-0.369,0.712,-0.287,0.196
x5,-0.8492,0.663,-1.282,0.201,-2.152,0.454
x6,1.8239,0.700,2.605,0.010,0.447,3.201

0,1,2,3
Omnibus:,38.033,Durbin-Watson:,0.921
Prob(Omnibus):,0.0,Jarque-Bera (JB):,54.486
Skew:,0.676,Prob(JB):,1.47e-12
Kurtosis:,4.229,Cond. No.,36300.0


In [28]:
regressor_OLS.conf_int()

array([[ 3.97803797e+01,  4.94381446e+01],
       [-8.54277088e-01,  3.51240328e-01],
       [-8.61362487e-02, -2.33989056e-02],
       [-5.89534707e-03, -2.96906557e-03],
       [-2.86828667e-01,  1.96093882e-01],
       [-2.15202074e+00,  4.53596664e-01],
       [ 4.47370746e-01,  3.20050281e+00]])