In [129]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [130]:
# Read car_fuel_efficiency.csv
cfe_raw_data = pd.read_csv("car_fuel_efficiency.csv")
cfe_raw_data = cfe_raw_data[['engine_displacement', 'horsepower','vehicle_weight', 'model_year','fuel_efficiency_mpg']]
cfe_raw_data

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.870990,2009,12.488369
...,...,...,...,...,...
9699,140,164.0,2981.107371,2013,15.101802
9700,180,154.0,2439.525729,2004,17.962326
9701,220,138.0,2583.471318,2008,17.186587
9702,230,177.0,2905.527390,2011,15.331551


# Question 1. Missing values

In [131]:
cfe_raw_data.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

# Question 2. Median for horse power

In [132]:
np.median(cfe_raw_data["horsepower"])

np.float64(nan)

In [133]:
cfe_raw_data["horsepower"].mean() # ignoring NaN

np.float64(149.65729212983547)

# Prepare data

In [134]:
n = len(cfe_raw_data)

n_val = int(n* 0.2) 
n_test = int(n* 0.2) 
n_train = int(n * 0.6)

In [135]:
n_train, n_val, n_test

(5822, 1940, 1940)

In [136]:
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)


car_data_train = cfe_raw_data.iloc[idx[:n_train]]
car_data_val = cfe_raw_data.iloc[idx[n_train:n_train+n_val]]
car_data_test = cfe_raw_data.iloc[idx[n_train+n_val:]]

In [137]:

y_train = np.log1p(car_data_train.fuel_efficiency_mpg.values)
y_val = np.log1p(car_data_val.fuel_efficiency_mpg.values)
y_test = np.log1p(car_data_test.fuel_efficiency_mpg.values)

In [138]:
del car_data_train['fuel_efficiency_mpg']
del car_data_val['fuel_efficiency_mpg']
del car_data_test['fuel_efficiency_mpg']

In [139]:
len(y_train)

5822

In [140]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [141]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

# Question 3. Filling NAs

In [142]:
car_data_train.dtypes

engine_displacement      int64
horsepower             float64
vehicle_weight         float64
model_year               int64
dtype: object

In [143]:
X_train = car_data_train
X_train

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year
483,220,144.0,2535.887591,2009
7506,160,141.0,2741.170484,2019
8795,230,155.0,2471.880237,2017
1688,150,206.0,3748.164469,2015
6217,300,111.0,2135.716359,2006
...,...,...,...,...
8387,160,111.0,3038.134712,2002
7331,160,157.0,2877.624938,2006
1696,260,139.0,2606.972984,2009
5685,280,132.0,4004.214323,2014


In [144]:
y_train

array([2.8703359 , 2.85061269, 2.97511223, ..., 2.88837278, 2.40325308,
       2.92279789], shape=(5822,))

In [145]:
train_linear_regression(X_train, y_train)

(np.float64(nan), array([nan, nan, nan, nan]))

In [146]:
car_data_train.isnull().sum()

engine_displacement      0
horsepower             429
vehicle_weight           0
model_year               0
dtype: int64

## Option 1. Fillna 0

In [147]:
def prepare_X_0(df):
    df_num = df.copy()
    df_num["horsepower"] = df_num["horsepower"].fillna(0)
    X = df_num.values
    return X
    

In [148]:
X_train = prepare_X_0(car_data_train)
X_train

array([[ 220.        ,  144.        , 2535.88759124, 2009.        ],
       [ 160.        ,  141.        , 2741.17048439, 2019.        ],
       [ 230.        ,  155.        , 2471.88023726, 2017.        ],
       ...,
       [ 260.        ,  139.        , 2606.9729844 , 2009.        ],
       [ 280.        ,  132.        , 4004.21432295, 2014.        ],
       [ 210.        ,  152.        , 2500.17568746, 2020.        ]],
      shape=(5822, 4))

In [149]:
w_0, w = train_linear_regression(X_train, y_train)
y_pred = w_0 + X_train.dot(w)
rmse(y_train, y_pred)

np.float64(0.03909276558794462)

In [150]:
X_val = prepare_X_0(car_data_val)

w_0, w = train_linear_regression(X_val, y_val)
y_pred = w_0 + X_val.dot(w)
rmse(y_val, y_pred)


np.float64(0.03990506117826308)

## Option 2. Fillna mean

In [151]:
def prepare_X_mean(df):
    df_num = df.copy()
    mean_HP = df_num["horsepower"].mean()
    df_num["horsepower"] = df_num["horsepower"].fillna(mean_HP)
    X = df_num.values
    return X

In [152]:
X_train = prepare_X_mean(car_data_train)
X_train

array([[ 220.        ,  144.        , 2535.88759124, 2009.        ],
       [ 160.        ,  141.        , 2741.17048439, 2019.        ],
       [ 230.        ,  155.        , 2471.88023726, 2017.        ],
       ...,
       [ 260.        ,  139.        , 2606.9729844 , 2009.        ],
       [ 280.        ,  132.        , 4004.21432295, 2014.        ],
       [ 210.        ,  152.        , 2500.17568746, 2020.        ]],
      shape=(5822, 4))

In [153]:
w_0, w = train_linear_regression(X_train, y_train)
y_pred = w_0 + X_train.dot(w)
rmse(y_train, y_pred)

np.float64(0.03581670894125062)

In [154]:
X_val = prepare_X_mean(car_data_val)

w_0, w = train_linear_regression(X_val, y_val)
y_pred = w_0 + X_val.dot(w)
rmse(y_val, y_pred)

np.float64(0.03728585390515352)

# Question 4. Best regularization 

In [155]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [156]:
X_train = prepare_X_0(car_data_train)


In [157]:
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)

    X_val = prepare_X_0(car_data_val)
    y_pred = w_0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    print(r, round(w_0,15), round(score, 2))
    

0 3.665576873560184 0.04
0.01 3.169350835540468 0.04
0.1 1.42868384941689 0.04
1 0.220062722162232 0.04
5 0.046233733531194 0.04
10 0.023264028199729 0.04
100 0.002340587309107 0.04


# Question 5. RMSE Standard Deviation 

In [158]:
np.random.seed(42)

In [159]:
def RMSE_sd(raw_data, seed):
    n = len(raw_data)
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    car_data_train = cfe_raw_data.iloc[idx[:n_train]]
    car_data_val = cfe_raw_data.iloc[idx[n_train:n_train+n_val]]
    car_data_test = cfe_raw_data.iloc[idx[n_train+n_val:]]

    y_train = np.log1p(car_data_train.fuel_efficiency_mpg.values)
    y_val = np.log1p(car_data_val.fuel_efficiency_mpg.values)
    y_test = np.log1p(car_data_test.fuel_efficiency_mpg.values)

    del car_data_train['fuel_efficiency_mpg']
    del car_data_val['fuel_efficiency_mpg']
    del car_data_test['fuel_efficiency_mpg']

    X_train = prepare_X_0(car_data_train)
    X_val = prepare_X_0(car_data_val)
    X_test = prepare_X_0(car_data_test)


    w_0, w = train_linear_regression(X_val, y_val)
    y_pred = w_0 + X_val.dot(w)
    score_val = rmse(y_val, y_pred)

    return score_val

all_scores = []
for s in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    score = RMSE_sd(raw_data=cfe_raw_data, seed=s)
    all_scores.append(score)
std = np.std(all_scores)
print(round(std, 3))

0.001


# Question 6. Evaluation on test

In [170]:
def evaluation_raw_target(raw_data, seed=9):
    n = len(raw_data)
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    car_train = raw_data.iloc[idx[:n_train]].copy()
    car_val   = raw_data.iloc[idx[n_train:n_train+n_val]].copy()
    car_test  = raw_data.iloc[idx[n_train+n_val:]].copy()

    # === NO log transform here ===
    y_train = car_train.pop('fuel_efficiency_mpg').values
    y_val   = car_val.pop('fuel_efficiency_mpg').values
    y_test  = car_test.pop('fuel_efficiency_mpg').values

    # combine train+val, fill NAs with 0 as required
    full_train = pd.concat([car_train, car_val], ignore_index=True).fillna(0)
    car_test   = car_test.fillna(0)

    X_full = prepare_X_0(full_train)
    X_test = prepare_X_0(car_test)

    # train once with r=0.001 on full train
    w0, w = train_linear_regression_reg(X_full, np.concatenate([y_train, y_val]), r=0.001)

    # predict & evaluate directly in original units
    y_pred_test = w0 + X_test.dot(w)
    return rmse(y_test, y_pred_test)

rmse_test_raw = evaluation_raw_target(cfe_raw_data, seed=9)
print(f"Test RMSE without log transform: {rmse_test_raw}")


Test RMSE without log transform: 0.5155043369325081
