In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import preprocessing
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [4]:
# Read the auto-mpg table, discard the 'car name' column and swap the
# numeric origin codes for readable region labels, all in one chain.
mpg_df = (
    pd.read_csv("datasets/auto-mpg.csv")
    .drop(columns='car name')
    .assign(origin=lambda frame: frame['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'}))
)
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,america
1,15.0,8,350.0,165,3693,11.5,70,america
2,18.0,8,318.0,150,3436,11.0,70,america
3,16.0,8,304.0,150,3433,12.0,70,america
4,17.0,8,302.0,140,3449,10.5,70,america


In [5]:
# One-hot encode 'origin'. No level is dropped, so one indicator column is
# produced per region label.
mpg_df = pd.get_dummies(data=mpg_df, columns=['origin'])
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130,3504,12.0,70,True,False,False
1,15.0,8,350.0,165,3693,11.5,70,True,False,False
2,18.0,8,318.0,150,3436,11.0,70,True,False,False
3,16.0,8,304.0,150,3433,12.0,70,True,False,False
4,17.0,8,302.0,140,3449,10.5,70,True,False,False


In [6]:
# Summary statistics for the columns pandas currently treats as numeric.
mpg_df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year
count,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,5140.0,24.8,82.0


In [7]:
 # Show the dtype of every column. In the recorded output 'horsepower' is
 # listed as object, i.e. it has not been parsed as a number at this point.
 mpg_df .dtypes


mpg               float64
cylinders           int64
displacement      float64
horsepower         object
weight              int64
acceleration      float64
model year          int64
origin_america       bool
origin_asia          bool
origin_europe        bool
dtype: object

In [10]:
# Turn the literal '?' placeholder into a real missing value (NaN).
mpg_df = mpg_df.replace(to_replace='?', value=np.nan)


In [11]:
# Parse 'horsepower' as numbers; any entry that cannot be parsed becomes NaN.
mpg_df['horsepower'] = pd.to_numeric(mpg_df['horsepower'], errors='coerce')


def medianFiller(x):
    """Return the Series ``x`` with its missing entries replaced by its median."""
    return x.fillna(x.median())


# Impute the numeric columns only. The boolean indicator columns are skipped:
# a median is not a meaningful fill value for them.
numeric_columns = mpg_df.select_dtypes(include='number').columns
mpg_df[numeric_columns] = mpg_df[numeric_columns].apply(medianFiller, axis=0)

# Pin the dtype so 'horsepower' is float64 whether or not anything was coerced.
mpg_df['horsepower'] = mpg_df['horsepower'].astype('float64')


In [12]:
# Target: 'mpg', kept as a one-column DataFrame (note the double brackets).
y = mpg_df[['mpg']]
# Predictors: every remaining column.
X = mpg_df.drop(columns='mpg')


In [13]:
# Standardise every predictor column and the target. preprocessing.scale
# returns a plain NumPy array, so each result is wrapped back into a
# DataFrame to keep the column names.
# `preprocessing` is imported in the notebook's top import cell.
#
# NOTE(review): scaling happens here on the full data set, before the
# train/test split in the next cell, so the test rows contribute to the
# scaling statistics. Ideally the scaler would be fitted on the training
# split only and then applied to the test split.
X_scaled = pd.DataFrame(preprocessing.scale(X), columns=X.columns)

y_scaled = pd.DataFrame(preprocessing.scale(y), columns=y.columns)

In [14]:
# Hold out 30% of the standardised rows for testing; random_state is fixed so
# the split is the same on every run.
# `train_test_split` is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.30, random_state=1
)

In [15]:
# Ordinary least squares on the standardised training split.
regression_model = LinearRegression().fit(X_train, y_train)

# y_train is a one-column frame, so coef_ is 2-D; row 0 holds one weight per
# predictor, in the same order as X_train's columns.
for col_name, weight in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, weight))

The coefficient for cylinders is -0.08592264254448734
The coefficient for displacement is 0.3861501766895437
The coefficient for horsepower is -0.10637514644618916
The coefficient for weight is -0.7965737428612097
The coefficient for acceleration is 0.02184681331891979
The coefficient for model year is 0.3959410531014954
The coefficient for origin_america is -0.09399896644893509
The coefficient for origin_asia is 0.044917890138051704
The coefficient for origin_europe is 0.07243059852959383


In [16]:
# intercept_ holds one entry per target column; there is only one ('mpg').
(intercept,) = regression_model.intercept_[:1]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is 0.015510225561902383


In [17]:
# Ridge (L2-penalised) regression on the same training split, alpha = 0.3.
ridge = Ridge(alpha=.3).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)

Ridge model: [[-0.0800581   0.36661042 -0.10890119 -0.78324655  0.01917898  0.39442138
  -0.0930884   0.04466769  0.07153523]]


In [18]:
# Lasso (L1-penalised) regression on the same training split, alpha = 0.1.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)

Lasso model: [-0.         -0.         -0.01464723 -0.60711757  0.          0.29460087
 -0.04017427  0.          0.        ]


In [19]:
# Score of the plain linear model: training split first, then held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(regression_model.score(features, target))

0.8141025501610559
0.8433135132808832


In [20]:
# Score of the ridge model: training split first, then held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(ridge.score(features, target))

0.8140828080856514
0.8437999817350272


In [21]:
# Score of the lasso model: training split first, then held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lasso.score(features, target))

0.7878910251573478
0.8315130533007058


In [22]:
# Degree-2 expansion restricted to interaction terms: products of distinct
# features are generated, squared terms are not.
# `PolynomialFeatures` is imported in the notebook's top import cell.
poly = PolynomialFeatures(degree=2, interaction_only=True)


In [26]:
# Expand the standardised predictors with pairwise interaction terms and redo
# the 70/30 split with the same seed as before.
# NOTE(review): the target passed here is the raw `y`, whereas the earlier
# first-order models were trained on `y_scaled` -- confirm this is intended.
X_poly = poly.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, random_state=1, test_size=0.30
)
X_train.shape

(278, 46)

In [24]:
# Refit the existing LinearRegression estimator, this time on the
# interaction-expanded training split, and print its weight vector.
# NOTE(review): several weights in the recorded output are of order 1e10-1e12;
# presumably the expanded design matrix has (near-)collinear columns -- TODO confirm.
print(regression_model.fit(X_train, y_train).coef_[0])

[-3.68062164e-13  7.76532695e-02 -5.76620093e-01 -1.90640538e+00
 -5.29355791e+00 -6.12398171e-01  3.06569911e+00 -7.48647628e+10
  1.61115729e+12 -1.55750368e+12 -1.81900389e+00  1.94158758e-01
  1.86208505e+00  1.50413610e+00 -1.48461224e+00 -2.10197908e+12
 -1.73238522e+12 -1.65356596e+12  4.04633275e-01  1.65410460e+00
 -8.90737384e-01  2.46149134e+00 -4.47609825e+11 -3.68905976e+11
 -3.52121664e+11 -6.94999353e-01 -3.09415674e-01 -1.48338310e+00
 -1.15271960e+11 -9.50035332e+10 -9.06811067e+10 -1.55318722e-01
  2.51965010e-01  7.48932052e+11  6.17246303e+11  5.89163118e+11
  4.75131713e-01  2.54584007e+11  2.09820152e+11  2.00273853e+11
  6.39662437e+10  5.27189713e+10  5.03203881e+10  9.67662504e+10
 -2.36721691e+12  1.48529836e+12]


In [25]:
# Fresh ridge model (alpha = 0.3) fitted on the interaction-expanded split.
ridge = Ridge(alpha=.3).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)

Ridge model: [[ 0.          0.06740642 -0.61900803 -1.97236759 -5.15141317 -0.62282102
   3.04381568  0.1723188   0.15891088 -0.38553368 -1.4895438   0.02925116
   1.72762625  1.4201127  -1.38679985 -0.05712906  1.13569653 -1.11720961
   0.30089657  1.53987731 -0.84218996  2.38658282  0.21457492  0.50684735
  -0.80377008 -0.47592772 -0.30069342 -1.50318104 -0.61710306  0.43440955
   0.32933252 -0.14480549  0.25597746  0.47585604 -0.93880283  0.37865359
   0.4784103  -0.67039722  0.22915648  0.61211567 -0.49887694  0.4033774
   0.21155723 -0.29118156  0.41878367  0.1184465 ]]


In [27]:
# Score of the ridge model on the expanded features: train, then test.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(ridge.score(features, target))

0.9025975935207239
0.8673792928418451


In [28]:
# Fresh lasso model (alpha = 0.01) fitted on the interaction-expanded split.
lasso = Lasso(alpha=0.01).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)

Lasso model: [ 0.         -0.         -0.08692269 -1.94971176 -5.29180738 -0.47273225
  2.98385949 -0.          0.         -0.         -0.79527606 -0.05882027
  1.1561088   1.08164446 -0.9493974  -0.          1.23352833 -0.87530628
 -0.          1.38994992 -0.4329787   1.86995071 -0.          0.
 -0.         -0.         -0.24115467 -1.24997101 -0.6285489   0.
  0.         -0.16019703  0.          0.26564551 -0.61312578 -0.
  0.47635543 -0.89394     0.          0.3945189  -0.68253648  0.12935612
  0.         -0.23102258  0.38217499  0.        ]


In [29]:
# Score of the lasso model on the expanded features: train, then test.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lasso.score(features, target))

0.9013410674767774
0.8704180363604552
