In [59]:
%matplotlib inline


# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

# to handle data in form of rows and columns 
import pandas as pd    

# importing ploting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [60]:
mpg_df = pd.read_csv("./dataset/auto-mpg.csv")  
mpg_df = mpg_df.drop('car_name', axis=1)
mpg_df['origin'] = mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130,3504,12.0,70,america
1,15.0,8,350.0,165,3693,11.5,70,america
2,18.0,8,318.0,150,3436,11.0,70,america
3,16.0,8,304.0,150,3433,12.0,70,america
4,17.0,8,302.0,140,3449,10.5,70,america


In [61]:
mpg_df = pd.get_dummies(mpg_df, columns=['origin'])
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130,3504,12.0,70,True,False,False
1,15.0,8,350.0,165,3693,11.5,70,True,False,False
2,18.0,8,318.0,150,3436,11.0,70,True,False,False
3,16.0,8,304.0,150,3433,12.0,70,True,False,False
4,17.0,8,302.0,140,3449,10.5,70,True,False,False


In [62]:
# hp is missing cause it does not seem to be reqcognized as a numerical column!
mpg_df.dtypes

mpg               float64
cylinders           int64
displacement      float64
horsepower         object
weight              int64
acceleration      float64
model_year          int64
origin_america       bool
origin_asia          bool
origin_europe        bool
dtype: object

In [63]:
# isdigit()? on 'horsepower' 
hpIsDigit = pd.DataFrame(mpg_df.horsepower.str.isdigit())  # if the string is made of digits store True else False

#print isDigit = False!
mpg_df[hpIsDigit['horsepower'] == False]   # from temp take only those rows where hp has false

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_america,origin_asia,origin_europe
32,25.0,4,98.0,?,2046,19.0,71,True,False,False
126,21.0,6,200.0,?,2875,17.0,74,True,False,False
330,40.9,4,85.0,?,1835,17.3,80,False,False,True
336,23.6,4,140.0,?,2905,14.3,80,True,False,False
354,34.5,4,100.0,?,2320,15.8,81,False,False,True
374,23.0,4,151.0,?,3035,20.5,82,True,False,False


In [64]:
# Missing values have a'?''
# Replace missing values with NaN
mpg_df = mpg_df.replace('?', np.nan)
mpg_df[hpIsDigit['horsepower'] == False] 

#mpg_df = mpg_df.apply(lambda x: x.fillna(x.median()),axis=0)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_america,origin_asia,origin_europe
32,25.0,4,98.0,,2046,19.0,71,True,False,False
126,21.0,6,200.0,,2875,17.0,74,True,False,False
330,40.9,4,85.0,,1835,17.3,80,False,False,True
336,23.6,4,140.0,,2905,14.3,80,True,False,False
354,34.5,4,100.0,,2320,15.8,81,False,False,True
374,23.0,4,151.0,,3035,20.5,82,True,False,False


In [65]:
#instead of dropping the rows, lets replace the missing values with median value. 
mpg_df.median()

mpg                 23.0
cylinders            4.0
displacement       148.5
horsepower          93.5
weight            2803.5
acceleration        15.5
model_year          76.0
origin_america       1.0
origin_asia          0.0
origin_europe        0.0
dtype: object

In [66]:
# replace the missing values with median value.
# Note, we do not need to specify the column names below
# every column's missing value is replaced with that column's median respectively  (axis =0 means columnwise)
#cData = cData.fillna(cData.median())

medianFiller = lambda x: x.fillna(x.median())
mpg_df = mpg_df.apply(medianFiller,axis=0)

mpg_df['horsepower'] = mpg_df['horsepower'].astype('float64')  # converting the hp column from object / string type to float

In [67]:
mpg_df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.30402,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.222625,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,76.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,125.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


# separate independent and dependent variables

In [68]:
# Copy all the predictor variables into X dataframe. Since 'mpg' is dependent variable drop it
X = mpg_df.drop('mpg', axis=1)

# Copy the 'mpg' column alone into the y dataframe. This is the dependent variable
y = mpg_df[['mpg']]


In [69]:
from sklearn import preprocessing

# scale all the columns of the mpg_df. This will produce a numpy array
X_scaled = preprocessing.scale(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)  # ideally the training and test should be 

y_scaled = preprocessing.scale(y)
y_scaled = pd.DataFrame(y_scaled, columns=y.columns)  # ideally the training and test should be 

In [70]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.30, random_state=1)

# fit a simple linear model

In [71]:
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)


In [72]:
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for cylinders is -0.0859226425444867
The coefficient for displacement is 0.38615017668954216
The coefficient for horsepower is -0.10637514644618906
The coefficient for weight is -0.7965737428612094
The coefficient for acceleration is 0.021846813318919477
The coefficient for model_year is 0.3959410531014954
The coefficient for origin_america is -0.09399896644893826
The coefficient for origin_asia is 0.04491789013804896
The coefficient for origin_europe is 0.07243059852959066


In [73]:
intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is 0.015510225561902385


# Create a regularized RIDGE model and note the coefficients

In [74]:
ridge = Ridge(alpha=.3)
ridge.fit(X_train,y_train)
print ("Ridge model:", (ridge.coef_))
    

Ridge model: [[-0.0800581   0.36661042 -0.10890119 -0.78324655  0.01917898  0.39442138
  -0.0930884   0.04466769  0.07153523]]


# Create a regularized LASSO model and note the coefficients

In [75]:
lasso = Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))

# Observe, many of the coefficients have become 0 indicating drop of those dimensions from the model

Lasso model: [-0.         -0.         -0.01464723 -0.60711757  0.          0.29460087
 -0.04017427  0.          0.        ]


## Let us compare their scores

In [76]:
print(regression_model.score(X_train, y_train))
print(regression_model.score(X_test, y_test))


0.8141025501610559
0.8433135132808833


In [77]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))

0.8140828080856513
0.8437999817350271


In [78]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))

0.7878910251573478
0.8315130533007058


In [79]:
# More or less similar results but with less complex models.  Complexity is a function of variables and coefficients
## Note - with Lasso, we get equally good result in test though not so in training.  Further, the number of dimensions is much less
# in LASSO model than ridge or un-regularized model

# Let us generate polynomial models reflecting the non-linear interaction between some dimensions

In [80]:
from sklearn.preprocessing import PolynomialFeatures

In [81]:
poly = PolynomialFeatures(degree = 2, interaction_only=True)

#poly = PolynomialFeatures(2)

In [82]:
X_poly = poly.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.30, random_state=1)
X_train.shape

(278, 46)

# Fit a simple non regularized linear model on poly features-

In [83]:
regression_model.fit(X_train, y_train)
print(regression_model.coef_[0])


[-3.68062164e-13 -1.56671819e-02 -5.13530107e-01 -1.92736388e+00
 -5.27786850e+00 -6.16513638e-01  3.06647253e+00 -5.34999114e+09
 -1.14415891e+12  1.05885111e+12 -1.82241655e+00  1.81409645e-01
  2.00332922e+00  1.51361675e+00 -1.53851151e+00  1.46475595e+12
  1.20720590e+12  1.15228101e+12  2.84910186e-01  1.61581097e+00
 -1.02081090e+00  2.60712904e+00  2.91447275e+11  2.40201701e+11
  2.29273116e+11 -6.26577848e-01 -2.61508615e-01 -1.53232412e+00
  1.01422925e+11  8.35895933e+10  7.97864730e+10 -1.36548293e-01
  2.45765883e-01 -5.85654082e+11 -4.82677722e+11 -4.60717076e+11
  5.14993866e-01 -1.55654591e+11 -1.28285631e+11 -1.22448951e+11
 -3.58596593e+10 -2.95544062e+10 -2.82097536e+10 -9.64099384e+10
  1.61366242e+12 -9.53040389e+11]


In [84]:
ridge = Ridge(alpha=.3)
ridge.fit(X_train,y_train)
print ("Ridge model:", (ridge.coef_))

Ridge model: [[ 0.          0.06740642 -0.61900803 -1.97236759 -5.15141317 -0.62282102
   3.04381568  0.1723188   0.15891088 -0.38553368 -1.4895438   0.02925116
   1.72762625  1.4201127  -1.38679985 -0.05712906  1.13569653 -1.11720961
   0.30089657  1.53987731 -0.84218996  2.38658282  0.21457492  0.50684735
  -0.80377008 -0.47592772 -0.30069342 -1.50318104 -0.61710306  0.43440955
   0.32933252 -0.14480549  0.25597746  0.47585604 -0.93880283  0.37865359
   0.4784103  -0.67039722  0.22915648  0.61211567 -0.49887694  0.4033774
   0.21155723 -0.29118156  0.41878367  0.1184465 ]]


In [85]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))


0.9025975935207239
0.8673792928418459


In [86]:
lasso = Lasso(alpha=0.01)
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))


Lasso model: [ 0.         -0.         -0.08692269 -1.94971176 -5.29180738 -0.47273225
  2.98385949 -0.          0.         -0.         -0.79527606 -0.05882027
  1.1561088   1.08164446 -0.9493974  -0.          1.23352833 -0.87530628
 -0.          1.38994992 -0.4329787   1.86995071 -0.          0.
 -0.         -0.         -0.24115467 -1.24997101 -0.6285489   0.
  0.         -0.16019703  0.          0.26564551 -0.61312578 -0.
  0.47635543 -0.89394     0.          0.3945189  -0.68253648  0.12935612
  0.         -0.23102258  0.38217499  0.        ]


In [87]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))


0.9013410674767774
0.8704180363604553
