In [1]:
import pandas as pd
import numpy as np
# Load the raw auto-mpg table.
df = pd.read_csv("auto-mpg.csv")
# Store the integer cast back on the frame. Previously the converted Series
# was computed and immediately discarded, so the statement had no effect.
df["horsepower"] = df["horsepower"].astype(str).astype(int)

# Acceleration is used as-is; displacement, horsepower and weight are
# log-transformed before scaling.
acc = df["acceleration"]
logdisp = np.log(df["displacement"])
loghorse = np.log(df["horsepower"])
logweight = np.log(df["weight"])

# Series methods (.min/.max/.mean/.std) replace the Python builtins so the
# reductions are vectorized and NaN-aware.
# - acc:          min-max scaling
# - disp, weight: standardization with the population std (ddof=0), which is
#                 what np.sqrt(np.var(...)) evaluated to
# - horse:        mean-centered, divided by the range
scaled_acc = (acc - acc.min()) / (acc.max() - acc.min())
scaled_disp = (logdisp - logdisp.mean()) / logdisp.std(ddof=0)
scaled_horse = (loghorse - loghorse.mean()) / (loghorse.max() - loghorse.min())
scaled_weight = (logweight - logweight.mean()) / logweight.std(ddof=0)

# Collect the scaled continuous features in a single constructor call.
df_fin = pd.DataFrame(
    {
        "acc": scaled_acc,
        "disp": scaled_disp,
        "horse": scaled_horse,
        "weight": scaled_weight,
    }
)

# One-hot encode the categorical columns. The dtype is pinned to uint8 so the
# dummies are numeric 0/1 regardless of pandas version; pandas >= 2.0 would
# otherwise return bool columns, which the regression cells treat differently.
cyl_dummies = pd.get_dummies(df["cylinders"], prefix="cyl", dtype=np.uint8)
yr_dummies = pd.get_dummies(df["model year"], prefix="yr", dtype=np.uint8)
orig_dummies = pd.get_dummies(df["origin"], prefix="orig", dtype=np.uint8)

# Final modelling frame: target first, then scaled features, then dummies.
mpg = df["mpg"]
df_fin = pd.concat([mpg, df_fin, cyl_dummies, yr_dummies, orig_dummies], axis=1)

In [2]:
# Side-by-side frame of the target, two scaled features and the origin
# dummies; the first rows are shown as the cell output.
data_ols = pd.concat(
    objs=[mpg, scaled_acc, scaled_weight, orig_dummies],
    axis="columns",
)
data_ols.head()

Unnamed: 0,mpg,acceleration,weight,orig_1,orig_2,orig_3
0,18.0,0.238095,0.720986,1,0,0
1,15.0,0.208333,0.908047,1,0,0
2,18.0,0.178571,0.651205,1,0,0
3,16.0,0.238095,0.648095,1,0,0
4,17.0,0.14881,0.664652,1,0,0


In [4]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Regress mpg on scaled acceleration, scaled weight and the origin dummies
# using the statsmodels formula interface.
df_ols = pd.concat(
    objs=[mpg, scaled_acc, scaled_weight, orig_dummies],
    axis="columns",
)
outcome = 'mpg'
# Every column except the target is a predictor.
predictors = df_ols.drop(columns=outcome)
# Builds "mpg~<col>+<col>+..." from the predictor column names.
pred_sum = "+".join(list(predictors))
formula = f"{outcome}~{pred_sum}"
# NOTE(review): all three origin dummies enter alongside the intercept, so the
# design matrix is rank-deficient (the summary reports Df Model: 4 for five
# predictors). Consider dropping one dummy level.
model = ols(formula=formula, data=df_ols).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.726
Model:                            OLS   Adj. R-squared:                  0.723
Method:                 Least Squares   F-statistic:                     256.7
Date:                Tue, 16 Jul 2019   Prob (F-statistic):          1.86e-107
Time:                        14:35:31   Log-Likelihood:                -1107.2
No. Observations:                 392   AIC:                             2224.
Df Residuals:                     387   BIC:                             2244.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       16.1041      0.509     31.636   

In [6]:
import statsmodels.api as sm
# Same regression through the array interface: sm.OLS adds no intercept on its
# own, so a constant column is added to the predictors first.
predictors_int = sm.add_constant(predictors)
model = sm.OLS(endog=df["mpg"], exog=predictors_int).fit()
model.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.726
Model:,OLS,Adj. R-squared:,0.723
Method:,Least Squares,F-statistic:,256.7
Date:,"Tue, 16 Jul 2019",Prob (F-statistic):,1.86e-107
Time:,14:37:15,Log-Likelihood:,-1107.2
No. Observations:,392,AIC:,2224.0
Df Residuals:,387,BIC:,2244.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,16.1041,0.509,31.636,0.000,15.103,17.105
acceleration,5.0494,1.389,3.634,0.000,2.318,7.781
weight,-5.8764,0.282,-20.831,0.000,-6.431,-5.322
orig_1,4.6566,0.363,12.839,0.000,3.944,5.370
orig_2,5.0690,0.454,11.176,0.000,4.177,5.961
orig_3,6.3785,0.430,14.829,0.000,5.533,7.224

0,1,2,3
Omnibus:,37.427,Durbin-Watson:,0.84
Prob(Omnibus):,0.0,Jarque-Bera (JB):,55.989
Skew:,0.648,Prob(JB):,6.95e-13
Kurtosis:,4.322,Cond. No.,2760000000000000.0


In [10]:
from sklearn.linear_model import LinearRegression
# Fit the same regression with scikit-learn, using orig_3 as the reference
# category (its dummy column is left out).
y = df_ols["mpg"]
# Rebuild the predictors from df_ols rather than dropping "orig_3" from the
# existing `predictors` frame. The self-referential drop raised a KeyError
# when the cell was run a second time; this form is idempotent and yields the
# same columns in the same order.
predictors = df_ols.drop(columns=["mpg", "orig_3"])
linreg = LinearRegression()
linreg.fit(predictors, y)
# Coefficients follow the column order of `predictors`; intercept is printed last.
print(linreg.coef_)
print(linreg.intercept_)

[ 5.04941007 -5.87640551 -1.72184708 -1.30947254]
22.48260416045567
