In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
df = pd.read_csv('car_fuel_efficiency.csv')

In [4]:
len(df)

9704

In [5]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [9]:
df[["engine_displacement","horsepower","vehicle_weight","model_year","fuel_efficiency_mpg"]]

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.870990,2009,12.488369
...,...,...,...,...,...
9699,140,164.0,2981.107371,2013,15.101802
9700,180,154.0,2439.525729,2004,17.962326
9701,220,138.0,2583.471318,2008,17.186587
9702,230,177.0,2905.527390,2011,15.331551


In [10]:
df = df[["engine_displacement","horsepower","vehicle_weight","model_year","fuel_efficiency_mpg"]]

In [11]:
df.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


In [13]:
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [18]:
df["horsepower"].median()

149.0

In [19]:
np.random.seed(42)

In [20]:
n = len(df)

In [21]:
n

9704

In [22]:
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

In [23]:
idx = np.arange(n)

In [24]:
idx

array([   0,    1,    2, ..., 9701, 9702, 9703])

In [25]:
np.random.shuffle(idx)

In [26]:
df_shuffled = df.iloc[idx]

In [27]:
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

In [28]:
y_train_orig = df_train.fuel_efficiency_mpg.values
y_val_orig = df_val.fuel_efficiency_mpg.values
y_test_orig = df_test.fuel_efficiency_mpg.values

In [30]:
y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)

In [31]:
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [32]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [33]:
def prepare_X(df,z):
    df_num = df
    df_num = df_num.fillna(z)
    X = df_num.values
    return X

In [34]:
X_train_zeros = prepare_X(df_train,0)

In [36]:
w_0, w = train_linear_regression(X_train_zeros, y_train)

In [38]:
y_pred_zeros = w_0 + X_train_zeros.dot(w)

In [39]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return round(np.sqrt(mse),2)

In [40]:
rmse(y_train, y_pred_zeros)

np.float64(0.04)

In [41]:
mean_horsepower = df_train["horsepower"].mean()

In [42]:
mean_horsepower

np.float64(149.54476367006487)

In [43]:
X_train_mean = prepare_X(df_train,mean_horsepower)

In [44]:
w_0, w = train_linear_regression(X_train_mean, y_train)

In [45]:
y_pred_mean = w_0 + X_train_mean.dot(w)

In [46]:
rmse(y_train, y_pred_mean)

np.float64(0.04)

In [47]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [48]:
X_train = prepare_X(df_train,0)

In [50]:
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_train.dot(w)
    result = rmse(y_train, y_pred_mean)
    print('%5s -> %.2f' % (r, result))

    0 -> 0.04
 0.01 -> 0.04
  0.1 -> 0.04
    1 -> 0.04
    5 -> 0.04
   10 -> 0.04
  100 -> 0.04


In [51]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [54]:
result = []
for s in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    np.random.seed(s)
    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)
    idx = np.arange(n)
    np.random.shuffle(idx)
    df_shuffled = df.iloc[idx]
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()
    y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
    y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
    y_test = np.log1p(df_test.fuel_efficiency_mpg.values)
    X_train = prepare_X(df_train,0)
    w_0, w = train_linear_regression(X_train, y_train)
    y_pred = w_0 + X_train.dot(w)
    result.append(rmse(y_train, y_pred_mean))

In [55]:
result

[np.float64(0.23416990189398448),
 np.float64(0.23344069575394652),
 np.float64(0.2334966333541239),
 np.float64(0.22975021664415937),
 np.float64(0.2331717492951361),
 np.float64(0.23234964518530632),
 np.float64(0.2320871300101032),
 np.float64(0.23154118872533955),
 np.float64(0.23300503412340867),
 np.float64(0.23172162972469051)]

In [58]:
round(np.array(result).std(),3)

np.float64(0.001)

In [59]:
np.random.seed(9)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()
df_val_train = pd.concat([df_val,df_train])

In [63]:
len(df_val_train)

7764

In [64]:
df_val_train.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
4543,180,175.0,2775.381743,2014,15.931657
6826,160,166.0,3542.655183,2012,13.130355
5416,250,173.0,1354.78712,2007,22.858156
1366,260,75.0,2531.997079,2005,16.399893
8856,250,175.0,2609.369103,2004,17.464552


In [65]:
y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)


In [67]:
y_val_train = np.concat([y_val,y_train])

In [68]:
len(y_val_train)

7764

In [69]:
X_train = prepare_X(df_val_train,0)

In [71]:
w_0, w = train_linear_regression_reg(X_train, y_val_train, r=0.001)

In [72]:
y_pred = w_0 + X_train.dot(w)

In [74]:
round(rmse(y_val_train, y_pred),2)

np.float64(0.02)