In [1]:
%matplotlib inline


# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

# to handle data in form of rows and columns 
import pandas as pd    

# importing ploting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

In [44]:
mpg_df = pd.read_csv("car-mpg.csv")  
mpg_df = mpg_df.drop('car_name', axis=1)
mpg_df['origin'] = mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
# Dummy variable - drop_first=True to ensure one less dummy variable is defined
mpg_df = pd.get_dummies(mpg_df, columns=['origin'],drop_first=True)
mpg_df = mpg_df.replace('?', np.nan)
mpg_df = mpg_df.apply(lambda x: x.fillna(x.median()),axis=0)
mpg_df['hp'] = mpg_df['hp'].astype('float64')

In [45]:
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
mpg              398 non-null float64
cyl              398 non-null int64
disp             398 non-null float64
hp               398 non-null float64
wt               398 non-null int64
acc              398 non-null float64
yr               398 non-null int64
car_type         398 non-null int64
origin_asia      398 non-null int64
origin_europe    398 non-null int64
dtypes: float64(4), int64(6)
memory usage: 31.2 KB


# separate independent and dependent variables

In [46]:
# Copy all the predictor variables into X dataframe. Since 'mpg' is dependent variable drop it
X = mpg_df.drop('mpg', axis=1)

# Copy the 'mpg' column alone into the y dataframe. This is the dependent variable
y = mpg_df[['mpg']]


In [47]:
#from sklearn import preprocessing

# scale all the columns of the mpg_df. This will produce a numpy array
#X_scaled = preprocessing.scale(X)
#X_scaled = pd.DataFrame(X_scaled, columns=X.columns) 

In [48]:
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  
X_scaled = scaler.fit_transform(X)  
X_scaled = pd.DataFrame(X_scaled, columns=X.columns) 

In [49]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=1)

# fit a simple linear model

In [50]:
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

print("Intercept: ", regression_model.intercept_)
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

Intercept:  [23.66510774]
The coefficient for cyl is 2.505951804938502
The coefficient for disp is 2.5357082860560487
The coefficient for hp is -1.7889335736325305
The coefficient for wt is -5.551819873098723
The coefficient for acc is 0.11485734803440784
The coefficient for yr is 2.9318465482116087
The coefficient for car_type is 2.9778697376019427
The coefficient for origin_asia is 0.82822701429572
The coefficient for origin_europe is 0.8362781383948816


In [51]:
print(regression_model.score(X_train, y_train))
print(regression_model.score(X_test, y_test))


0.8343770256960538
0.8513421387780066


# Create a regularized RIDGE model and note the coefficients

In [52]:
ridge = Ridge(alpha=1)
ridge.fit(X_train,y_train)
print ("Ridge model:", (ridge.coef_))

Ridge model: [[ 2.38873168  2.24584874 -1.77595406 -5.30121587  0.0716231   2.90348182
   2.87200551  0.8120093   0.81048147]]


In [53]:
print("Intercept: ", ridge.intercept_)
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, ridge.coef_[0][idx]))

Intercept:  [23.6626684]
The coefficient for cyl is 2.3887316759448187
The coefficient for disp is 2.245848744133584
The coefficient for hp is -1.7759540641596028
The coefficient for wt is -5.30121587484919
The coefficient for acc is 0.07162310453347369
The coefficient for yr is 2.9034818184786206
The coefficient for car_type is 2.8720055111319427
The coefficient for origin_asia is 0.812009304699243
The coefficient for origin_europe is 0.8104814685705717


In [54]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))

0.8342199644938272
0.852973535261167


# Create a regularized LASSO model and note the coefficients

In [55]:
lasso = Lasso(alpha=1)
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))

# Observe, many of the coefficients have become 0 indicating drop of those dimensions from the model

Lasso model: [-0.         -0.         -0.05656463 -4.05009448  0.          2.03034296
  0.95988039  0.          0.        ]


In [56]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))

0.7821044353977104
0.8251153919895945


## Let us compare their scores

In [57]:
print(regression_model.score(X_train, y_train))
print(regression_model.score(X_test, y_test))


0.8343770256960538
0.8513421387780066


In [58]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))

0.8342199644938272
0.852973535261167


In [59]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))

0.7821044353977104
0.8251153919895945


# Let us generate polynomial models reflecting the non-linear interaction between some dimensions

In [60]:
from sklearn.preprocessing import PolynomialFeatures

In [61]:
poly = PolynomialFeatures(degree = 2, include_bias=False)


In [62]:
X_poly = poly.fit_transform(X_scaled )
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.30, random_state=1)
X_train.shape

(278, 54)

In [63]:
#Formating of output can be done by getting columns using following command
#poly.get_feature_names(X_scaled.columns)

# Fit a simple non regularized linear model on poly features-

In [76]:
regression_model.fit(X_train, y_train)
print(regression_model.coef_[0])

[-1.14014742e+12 -5.27307129e+00 -2.76361084e+00 -2.00994873e+00
 -1.58142090e+00  3.07208252e+00 -1.44300744e+12  2.82896700e+11
  1.82679488e+11  1.19471019e+12 -3.41394043e+00 -1.39099121e+00
  4.14916992e-01  1.98034668e+00 -1.71093750e+00 -1.80170924e+12
  3.40338119e+12  2.71306429e+12  1.97998047e+00  8.87451172e-02
 -8.35449219e-01 -5.10742188e-01  3.82397461e+00 -1.97119141e+00
 -6.44921875e+00 -1.02514648e+00 -4.48608398e-01  1.05834961e+00
 -1.51733398e+00 -1.15234375e+00 -1.40539551e+00  9.95117188e-01
  1.50341797e+00 -2.13439941e-01  4.14550781e-01 -2.79541016e-01
 -3.28442383e+00  2.19335938e+00  6.40869141e-02 -2.58911133e-01
  4.56665039e-01  8.45703125e-01 -7.20214844e-03  1.03594971e+00
  1.07716370e+00  2.28256226e-01  7.12219238e-01  1.08795166e+00
  1.74348325e+11  1.99967968e+12  1.59407931e+12  4.27691123e+11
 -2.21598517e+11  3.11045233e+11]


In [77]:
print(regression_model.score(X_train, y_train))
print(regression_model.score(X_test, y_test))

0.922796908951997
0.8484016423240605


## Ridge - modify ALPHA values to get better score

In [66]:
ridge = Ridge(alpha=1)
ridge.fit(X_train,y_train)
print ("Ridge model:", (ridge.coef_))

Ridge model: [[ 2.63865541e+00 -2.37408486e+00 -2.65212967e+00 -3.12107114e+00
  -1.12746233e+00  3.07077235e+00  1.52909161e+00  2.02604877e-01
  -2.87629850e-01 -1.37945713e+00 -1.09966208e+00 -1.01078797e+00
   3.32260057e-01  1.56812916e+00 -9.84590552e-01  1.30667566e-02
   2.36932639e+00 -1.39028192e+00  4.81491658e-01  2.39331914e-01
   9.28459487e-02 -2.52840506e-01  2.70741772e+00 -1.61539650e+00
  -9.84221502e-01 -4.98508396e-01 -1.74610449e-01  2.22020751e-01
  -9.95456295e-01 -1.05636540e+00 -1.36102812e+00  6.38279930e-01
   1.17200558e+00 -9.22402736e-02 -1.58827979e-02 -1.17973218e-01
  -2.39724426e+00  1.94066565e-01 -1.01230473e-03 -1.36767707e-01
   4.02066040e-01  4.16867222e-01  5.62555026e-01  1.02873950e+00
   1.09019774e+00  2.10712626e-01  6.45560484e-01  1.00792770e+00
  -1.84749264e-01  9.68725309e-01  4.50008326e-01  3.06303705e-01
   4.95400677e-02 -4.89742415e-01]]


In [67]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))

0.9200983304122193
0.8572159964789557


## LASSO - modify ALPHA values to get better score

In [74]:
lasso = Lasso(alpha=0.2)
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))

Lasso model: [-0.         -0.         -1.29512682 -5.13889594 -0.          2.81617244
  0.          0.          0.         -0.          0.          0.
  0.          0.         -0.          0.          0.         -0.
  0.          0.          0.1992994  -0.         -0.         -0.
 -0.         -0.          0.          0.         -0.         -0.528416
 -0.         -0.         -0.          1.09341203 -0.         -0.
 -0.         -0.         -0.          0.          0.12428471 -0.
  0.16431421  0.61434992  0.51474933  0.          0.07423823  0.08444451
 -0.          0.          0.          0.10224055 -0.          0.16432798]


In [75]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))

0.8862674336344545
0.8820190094307797
