In [63]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [64]:
# Source of the car fuel-efficiency data; named so the location is easy to spot and change.
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
df = pd.read_csv(DATA_URL)

In [65]:
# Keep only the four feature columns plus the target (fuel_efficiency_mpg).
selected_columns = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
df = df[selected_columns]
df

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.870990,2009,12.488369
...,...,...,...,...,...
9699,140,164.0,2981.107371,2013,15.101802
9700,180,154.0,2439.525729,2004,17.962326
9701,220,138.0,2583.471318,2008,17.186587
9702,230,177.0,2905.527390,2011,15.331551


## Q1

In [66]:
# Count missing values per column.
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

## Q2

In [67]:
# Median of the horsepower column (missing values are skipped by pandas).
df['horsepower'].median()

np.float64(149.0)

In [68]:
# Validation and test each get 20% of the rows (rounded down);
# training receives whatever is left so the three parts always sum to n.
n = len(df)
n_val = n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

In [111]:
# Positional row indices 0..n-1, to be shuffled before splitting.
idx = np.arange(0, n)

In [105]:
# Rebuild the index array right before shuffling so this cell is idempotent:
# np.random.shuffle works in place, so re-running the cell on an already
# shuffled idx would otherwise yield a different permutation for the same seed.
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

## Q3

In [70]:
# Slice the shuffled indices into train / validation / test partitions.
val_end = n_train + n_val
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:val_end]]
df_test = df.iloc[idx[val_end:]]

In [71]:
# Pull the target out of each partition as a plain NumPy array.
y_train = df_train['fuel_efficiency_mpg'].values
y_val = df_val['fuel_efficiency_mpg'].values
y_test = df_test['fuel_efficiency_mpg'].values

In [72]:
# Remove the target column in place so it cannot leak into the feature matrices.
for frame in (df_train, df_val, df_test):
    del frame['fuel_efficiency_mpg']

In [73]:
# Discard the shuffled row labels; each partition gets a fresh 0..k-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [74]:
def train_linear_regression(X, y):
    """Fit an unregularized linear regression with the normal equation.

    A column of ones is prepended to ``X`` so that the first solved weight
    acts as the bias term.

    Parameters
    ----------
    X : array-like with a ``.shape`` attribute
        Feature matrix; ``X.shape[0]`` is taken as the number of samples.
    y : array-like
        Target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(w_0, w)`` where ``w_0`` is the bias and ``w`` holds the remaining
        weights, one per feature column.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` (including the bias column) is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    # Solve (X^T X) w = X^T y directly instead of forming the explicit
    # inverse: same solution, but cheaper and numerically more accurate.
    w_full = np.linalg.solve(XTX, X.T.dot(y))

    return w_full[0], w_full[1:]

In [75]:
def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computes ``sqrt(mean((y_pred - y) ** 2))``; both arguments are expected
    to support element-wise subtraction (e.g. NumPy arrays of equal length).
    """
    squared_residuals = (y_pred - y) ** 2
    return np.sqrt(squared_residuals.mean())

### Option 1: fill missing values with 0

In [76]:
# Option 1: replace every missing value with 0, then convert to NumPy matrices.
X_train, X_val, X_test = (
    frame.fillna(0).values for frame in (df_train, df_val, df_test)
)

In [77]:
# Fit on the zero-imputed training matrix; w_0 is the bias, w the feature weights.
w_0, w = train_linear_regression(X=X_train, y=y_train)

In [78]:
# Display the fitted bias and weights.
(w_0, w)

(np.float64(28.827365475106156),
 array([ 9.93613218e-05,  3.54221891e-03, -5.01328015e-03,  3.42992869e-04]))

In [79]:
# Predict on the validation set and report RMSE rounded to two decimals.
y_pred = np.dot(X_val, w) + w_0
val_rmse = rmse(y_val, y_pred)
round(val_rmse, 2)

np.float64(0.52)

In [80]:
# Cross-check the hand-written solver against scikit-learn's LinearRegression.
model = LinearRegression().fit(X_train, y_train)
print(model.coef_, model.intercept_)

[ 9.93613218e-05  3.54221891e-03 -5.01328015e-03  3.42992869e-04] 28.827365474603408


In [81]:
# Validation RMSE of the scikit-learn model (square root of its MSE).
y_pred = model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print("RMSE:", np.sqrt(mse))

RMSE: 0.5173782638840942


### Option 2: fill missing values with the mean

In [82]:
# Option 2: impute horsepower with its mean, computed once from the training
# split only so no information from validation/test leaks into the features.
# The fill is targeted at the horsepower column (the only column with missing
# values per df.isna().sum()) instead of applying that mean to every column.
horsepower_mean = df_train.horsepower.mean()
fill_values = {'horsepower': horsepower_mean}
X_train = df_train.fillna(fill_values).values
X_val = df_val.fillna(fill_values).values
X_test = df_test.fillna(fill_values).values

In [83]:
# Refit on the mean-imputed training matrix.
w_0, w = train_linear_regression(X=X_train, y=y_train)

In [84]:
# Display the fitted bias and weights for the mean-imputed model.
(w_0, w)

(np.float64(28.925259952113596),
 array([ 0.00012093,  0.01030308, -0.00501009, -0.00023463]))

In [85]:
# Validation RMSE for the mean-imputed model, rounded to two decimals.
y_pred = np.dot(X_val, w) + w_0
val_rmse = rmse(y_val, y_pred)
round(val_rmse, 2)

np.float64(0.46)

In [86]:
# Cross-check the mean-imputed fit against scikit-learn's LinearRegression.
model = LinearRegression().fit(X_train, y_train)
print(model.coef_, model.intercept_)

[ 0.00012093  0.01030308 -0.00501009 -0.00023463] 28.92525995185654


In [87]:
# Validation RMSE of the scikit-learn model on mean-imputed features.
y_pred = model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print("RMSE:", np.sqrt(mse))

RMSE: 0.46359650423335236


## Q4

In [98]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit a ridge-style regularized linear regression with the normal equation.

    A column of ones is prepended to ``X`` for the bias, and ``r`` is added
    to every diagonal entry of ``X^T X`` (the bias entry included) before
    solving.

    Parameters
    ----------
    X : array-like with a ``.shape`` attribute
        Feature matrix; ``X.shape[0]`` is taken as the number of samples.
    y : array-like
        Target values, one per row of ``X``.
    r : float, default 0.0
        Regularization strength added to the diagonal; ``0`` gives the
        unregularized solution.

    Returns
    -------
    tuple
        ``(w_0, w)`` where ``w_0`` is the bias and ``w`` holds the remaining
        weights, one per feature column.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularized matrix is still singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    # Solve the linear system directly rather than forming the explicit
    # inverse: same solution, but cheaper and numerically more accurate.
    w_full = np.linalg.solve(XTX, X.T.dot(y))

    return w_full[0], w_full[1:]

In [99]:
# Switch back to zero imputation (the previous section left mean-imputed matrices in place).
X_train, X_val, X_test = (
    frame.fillna(0).values for frame in (df_train, df_val, df_test)
)

In [100]:
# Try each regularization strength and print the validation RMSE (4 decimals).
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
for r in r_values:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = np.dot(X_val, w) + w0
    score = round(rmse(y_val, y_pred), 4)
    print(r, score)

0 0.5174
0.01 0.5171
0.1 0.5188
1 0.5222
5 0.5229
10 0.523
100 0.5231


## Q5

In [121]:
# Repeat the whole split / zero-impute / fit / validate pipeline for ten
# different shuffle seeds and collect the validation RMSE of each run.
rmse_scores = []
for seed in range(10):
    # Fresh, seed-specific permutation of the row positions.
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    val_end = n_train + n_val
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:val_end]]
    df_test = df.iloc[idx[val_end:]]

    y_train = df_train['fuel_efficiency_mpg'].values
    y_val = df_val['fuel_efficiency_mpg'].values
    y_test = df_test['fuel_efficiency_mpg'].values

    # Strip the target from the feature frames.
    for frame in (df_train, df_val, df_test):
        del frame['fuel_efficiency_mpg']

    df_train, df_val, df_test = (
        part.reset_index(drop=True) for part in (df_train, df_val, df_test)
    )
    X_train, X_val, X_test = (
        part.fillna(0).values for part in (df_train, df_val, df_test)
    )

    w_0, w = train_linear_regression(X_train, y_train)
    y_pred = w_0 + X_val.dot(w)

    seed_rmse = rmse(y_val, y_pred)
    rmse_scores.append(seed_rmse)
    print(seed, round(seed_rmse, 4))
    
    

0 0.5207
1 0.5213
2 0.5228
3 0.516
4 0.5109
5 0.5283
6 0.5314
7 0.5091
8 0.5147
9 0.5132


In [122]:
# Spread of the validation RMSE across seeds (population standard deviation), 3 decimals.
std = np.asarray(rmse_scores).std()
round(std, 3)

np.float64(0.007)

## Q6

In [125]:
# Rebuild the seed-9 split explicitly so the partitions used here do not
# depend on whatever split an earlier cell happened to leave behind.
idx = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train + n_val]]
df_test = df.iloc[idx[n_train + n_val:]]

# The final model is trained on train + validation combined.
df_full_train = pd.concat([df_train, df_val])

# Note: y_train here holds the targets of the combined (train + val) set.
y_train = df_full_train.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

del df_full_train['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

# Reset the index of the two frames that are actually used below
# (previously df_val was reset here by mistake and df_test was left as is).
df_full_train = df_full_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

X_train = df_full_train.fillna(0).values
X_test = df_test.fillna(0).values


In [126]:
# Train the regularized model on train + validation and report RMSE on the held-out test set.
w_0, w = train_linear_regression_reg(X_train, y_train, r=0.001)
y_pred = np.dot(X_test, w) + w_0
print(rmse(y_test, y_pred))

0.5156261299167999
