In [25]:
%matplotlib inline


# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

# to handle data in form of rows and columns 
import pandas as pd    

# importing ploting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

In [26]:
mpg_df = pd.read_csv("./dataset/car-mpg.csv")  
mpg_df = mpg_df.drop('car_name', axis=1)
mpg_df['origin'] = mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
mpg_df = pd.get_dummies(mpg_df, columns=['origin'])
mpg_df = mpg_df.replace('?', np.nan)
mpg_df = mpg_df.apply(lambda x: x.fillna(x.median()),axis=0)

In [27]:
mpg_df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model_year,car_type
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,0.530151
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.499718
min,9.0,3.0,68.0,1613.0,8.0,70.0,0.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,0.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,1.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,1.0


# separate independent and dependent variables

In [28]:
# Copy all the predictor variables into X dataframe. Since 'mpg' is dependent variable drop it
X = mpg_df.drop('mpg', axis=1)

# Copy the 'mpg' column alone into the y dataframe. This is the dependent variable
y = mpg_df[['mpg']]


In [29]:
from sklearn import preprocessing

# scale all the columns of the mpg_df. This will produce a numpy array
X_scaled = preprocessing.scale(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)  # ideally the training and test should be 

y_scaled = preprocessing.scale(y)
y_scaled = pd.DataFrame(y_scaled, columns=y.columns)  # ideally the training and test should be 

In [30]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.30, random_state=1)

# fit a simple linear model

In [31]:
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for cylinders is 0.3210223856916098
The coefficient for displacement is 0.32483430918483935
The coefficient for horsepower is -0.22916950059437635
The coefficient for weight is -0.7112101905072296
The coefficient for acceleration is 0.014713682764190952
The coefficient for model_year is 0.37558119495107395
The coefficient for car_type is 0.3814769484233098
The coefficient for origin_america is -0.07472247547584174
The coefficient for origin_asia is 0.04451525203567835
The coefficient for origin_europe is 0.04834854953945395


In [32]:
intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is 0.0192841161036397


# Create a regularized RIDGE model and note the coefficients

In [33]:
ridge = Ridge(alpha=.3)
ridge.fit(X_train,y_train)
print ("Ridge model:", (ridge.coef_))
    

Ridge model: [[ 0.31649043  0.31320707 -0.22876025 -0.70109447  0.01295851  0.37447352
   0.37725608 -0.07423624  0.04441039  0.04784031]]


# Create a regularized LASSO model and note the coefficients

In [34]:
lasso = Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))

# Observe, many of the coefficients have become 0 indicating drop of those dimensions from the model

Lasso model: [-0.         -0.         -0.01690287 -0.51890013  0.          0.28138241
  0.1278489  -0.01642647  0.          0.        ]


## Let us compare their scores

In [35]:
print(regression_model.score(X_train, y_train))
print(regression_model.score(X_test, y_test))


0.8343770256960538
0.8513421387780067


In [36]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))

0.8343617931312616
0.8518882171608506


In [37]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))

0.7938010766228453
0.8375229615977083


In [38]:
# More or less similar results but with less complex models.  Complexity is a function of variables and coefficients
## Note - with Lasso, we get equally good result in test though not so in training.  Further, the number of dimensions is much less
# in LASSO model than ridge or un-regularized model

# Let us generate polynomial models reflecting the non-linear interaction between some dimensions

In [39]:
from sklearn.preprocessing import PolynomialFeatures

In [40]:
poly = PolynomialFeatures(degree = 2, interaction_only=True)

#poly = PolynomialFeatures(2)

In [41]:
X_poly = poly.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.30, random_state=1)
X_train.shape

(278, 56)

# Fit a simple non regularized linear model on poly features-

In [42]:
regression_model.fit(X_train, y_train)
print(regression_model.coef_[0])


[-9.67853872e-13 -6.30341061e+11 -4.44072903e+00 -2.22113191e+00
 -2.96781988e+00 -1.54792500e+00  3.00918310e+00 -8.39286182e+11
 -3.25494764e+11  2.00078715e+12 -1.58882282e+12 -1.22092481e+00
 -1.28176076e+00 -8.11378881e-02  2.72210039e+00 -1.95630981e+00
 -2.01950968e+12 -3.41000931e+12 -1.19646259e+12 -1.14202649e+12
  3.93177932e-01  1.81943802e-01 -4.80558889e-01  3.53461893e+00
 -2.04420743e+00 -4.42851696e+11 -3.64984475e+11 -3.48378581e+11
  2.09458171e-01 -6.43968644e-01 -1.90596098e+00 -6.36937020e-01
 -8.87781522e+10 -7.31681680e+10 -6.98391968e+10 -1.85668945e-01
  5.23010254e-01 -3.44946289e+00  9.33573028e+11  7.69421602e+11
  7.34414817e+11  5.46630859e-01  1.66870117e+00  1.99491820e+11
  1.64414899e+11  1.56934427e+11  3.82690430e-01  1.13796790e+11
  9.37877443e+10  8.95206332e+10 -4.80087219e+11  5.52620374e+11
  5.27477510e+11  9.99300159e+10 -2.67487213e+12  1.45700514e+12]


In [43]:
ridge = Ridge(alpha=.3)
ridge.fit(X_train,y_train)
print ("Ridge model:", (ridge.coef_))

Ridge model: [[ 0.          3.73512981 -2.93500874 -2.13974194 -3.56547812 -1.28898893
   3.01290805  2.04739082  0.0786974   0.21972225 -0.3302341  -1.46231096
  -1.17221896  0.00856067  2.48054694 -1.67596093  0.99537516 -2.29024279
   4.7699338  -2.08598898  0.34009408  0.35024058 -0.41761834  3.06970569
  -2.21649433  1.86339518 -2.62934278  0.38596397  0.12088534 -0.53440382
  -1.88265835 -0.7675926  -0.90146842  0.52416091  0.59678246 -0.26349448
   0.5827378  -3.02842915 -0.36548074  0.5956112  -0.15941014  0.49168856
   1.45652375 -0.43819158 -0.20964198  0.77665496  0.36489921 -0.4750838
   0.3551047   0.23188557 -1.42941282  2.06831543 -0.34986402 -0.32320394
   0.39054656  0.06283411]]


In [44]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))


0.9143225702003361
0.8613398053698548


In [45]:
lasso = Lasso(alpha=0.01)
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))


Lasso model: [ 0.          0.52263805 -0.5402102  -1.99423315 -4.55360385 -0.85285179
  2.99044036  0.00711821 -0.          0.76073274 -0.         -0.
 -0.19736449  0.          2.04221833 -1.00014513  0.         -0.
  4.28412669 -0.          0.          0.31442062 -0.          2.13894094
 -1.06760107  0.         -0.          0.          0.         -0.44991392
 -1.55885506 -0.         -0.68837902  0.          0.17455864 -0.34653644
  0.3313704  -2.84931966  0.         -0.34340563  0.00815105  0.47019445
  1.25759712 -0.69634581  0.          0.55528147  0.2948979  -0.67289549
  0.06490671  0.         -1.19639935  1.06711702  0.         -0.88034391
  0.         -0.        ]


In [46]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))


0.9098286193898273
0.8695296858772457


In [None]:
'''
Lasso model used in dimentionality reduction
Sigmoid curve - best illustration of classification problem

 - logistic regression (confusion matrix)
 - probit regression

'''