In [2]:
import pandas as pd
import numpy as np

# NOTE(review): this is an absolute, machine-specific location; the notebook
# only runs where this exact file exists.
csv_path = r"C:\Users\10086211\Documents\Py Projects\ML Zoomcamp\Assignment 2\car_fuel_efficiency.csv"
source_data = pd.read_csv(csv_path)

# Data Preparation: keep the four feature columns plus the target column.
selected_columns = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
data = source_data.loc[:, selected_columns]

Question 1: There's one column with missing values. What is it?

In [3]:
# Count the NaN entries in every column; a non-zero count marks a column with gaps.
missing_values_per_column = data.isna().sum()
print(missing_values_per_column)

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64


Question 2: What's the median (50th percentile) of the variable 'horsepower'?

In [4]:
# Median of the horsepower column (pandas skips NaN entries by default).
median_horsepower = data.horsepower.median()
print(median_horsepower)

149.0


Prepare and split the dataset. Shuffle the dataset (the filtered one you created above), use seed 42. Split the data in train/val/test sets, with 60%/20%/20% distribution.

In [5]:
# Seed the global NumPy RNG so the shuffle below is reproducible.
np.random.seed(42)

# Validation and test each take 20% (truncated to an int); training takes
# whatever is left, so any rounding remainder ends up in the training set.
n = len(data)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Shuffle the row positions in place, then reorder the frame with them.
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = data.iloc[idx]

# Positions where the training and validation partitions end.
train_end = n_train
val_end = n_train + n_val

# Independent copies with a fresh 0..k-1 index for each partition.
df_train = df_shuffled.iloc[:train_end].copy().reset_index(drop=True)
df_val = df_shuffled.iloc[train_end:val_end].copy().reset_index(drop=True)
df_test = df_shuffled.iloc[val_end:].copy().reset_index(drop=True)

Question 3: 
We need to deal with missing values for the column from Question 1.
We have two options: fill it with 0 or with the mean of this variable.
Try both options. For each, train a linear regression model without regularization using the code from the lessons.
For computing the mean, use the training set only!
Use the validation dataset to evaluate the models and compare the RMSE of each option.
Round the RMSE scores to 2 decimal digits using round(score, 2)
Which option gives better RMSE?

In [6]:
# Pull the target column out of each split and model it as log1p(mpg).
# DataFrame.pop returns the column and removes it from the frame in place,
# so the split frames hold features only afterwards.
# NOTE: because the frames are mutated, this cell cannot be re-run without
# first re-running the split cell.
y_train = np.log1p(df_train.pop('fuel_efficiency_mpg').values)
y_val = np.log1p(df_val.pop('fuel_efficiency_mpg').values)
y_test = np.log1p(df_test.pop('fuel_efficiency_mpg').values)

def train_linear_regression(X, y, r=0.0):
    """Fit a linear regression through the normal equations.

    A column of ones is prepended to ``X`` so the first weight acts as the
    bias. ``r`` times the identity matrix is added to ``X^T X`` before it is
    inverted; the identity covers every column, so the bias is penalized too.
    With ``r=0.0`` this is plain least squares.

    Parameters:
        X: 2-D feature matrix, one row per sample.
        y: target values, one per row of ``X``.
        r: regularization strength added to the diagonal of ``X^T X``.

    Returns:
        A ``(bias, weights)`` tuple: the first solved coefficient and the
        remaining coefficients, one per column of ``X``.

    Raises:
        numpy.linalg.LinAlgError: if the (regularized) ``X^T X`` is singular.
    """
    # Design matrix: bias column of ones followed by the features.
    design = np.column_stack([np.ones(X.shape[0]), X])

    # Gram matrix with the regularization term on its diagonal.
    gram = design.T.dot(design)
    gram = gram + r * np.eye(gram.shape[0])

    # Normal-equation solution: (X^T X + r I)^-1 X^T y.
    coefficients = np.linalg.inv(gram).dot(design.T).dot(y)

    bias, weights = coefficients[0], coefficients[1:]
    return bias, weights

def prepare_X(df, fill_value=0):
    """Turn a feature DataFrame into a NumPy matrix with NaNs replaced.

    Every missing entry, in any column, is replaced by ``fill_value``. The
    input frame is left untouched because ``fillna`` returns a new frame.

    Parameters:
        df: DataFrame holding the feature columns.
        fill_value: value substituted for each missing entry (default 0).

    Returns:
        The filled frame's underlying array (``.values``).
    """
    filled = df.fillna(fill_value)
    return filled.values

def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters:
        y: array of true values.
        y_pred: array of predicted values, same shape as ``y``.

    Returns:
        The square root of the mean of the squared differences.
    """
    squared_error = (y_pred - y) ** 2
    return np.sqrt(squared_error.mean())

# Option 1: Fill with 0
# Both zero-filled matrices are built up front; the regularization cell reuses them.
X_train_zero = prepare_X(df_train, fill_value=0)
X_val_zero = prepare_X(df_val, fill_value=0)

w_0_zero, w_zero = train_linear_regression(X_train_zero, y_train)
y_pred_zero = w_0_zero + X_val_zero.dot(w_zero)
rmse_zero = rmse(y_val, y_pred_zero)
print(f"RMSE with zero-fill: {round(rmse_zero, 2)}")

# Option 2: Fill with mean
# The mean comes from the training split only and is reused for validation.
# prepare_X applies the fill value to every column; per the missing-value
# count earlier in the notebook, only horsepower has gaps.
mean_hp = df_train.horsepower.mean()
X_train_mean = prepare_X(df_train, fill_value=mean_hp)
X_val_mean = prepare_X(df_val, fill_value=mean_hp)

w_0_mean, w_mean = train_linear_regression(X_train_mean, y_train)
y_pred_mean = w_0_mean + X_val_mean.dot(w_mean)
rmse_mean = rmse(y_val, y_pred_mean)
print(f"RMSE with mean-fill: {round(rmse_mean, 2)}")

# Compare the two options on the rounded scores, as the question asks.
rounded_zero = round(rmse_zero, 2)
rounded_mean = round(rmse_mean, 2)

if rounded_mean < rounded_zero:
    verdict = "Filling with the mean gives a better RMSE."
elif rounded_zero < rounded_mean:
    verdict = "Filling with zero gives a better RMSE."
else:
    verdict = "Both imputation methods give the same RMSE."
print(verdict)

RMSE with zero-fill: 0.04
RMSE with mean-fill: 0.04
Both imputation methods give the same RMSE.


Question 4: 
Now let's train a regularized linear regression.
For this question, fill the NAs with 0.
Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100]
Use RMSE to evaluate the model on the validation dataset.
Round the RMSE scores to 2 decimal digits.
Which r gives the best RMSE?
If multiple options give the same best RMSE, select the smallest r.

In [7]:
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]

# Refit on the zero-filled training matrix for each regularization strength
# and score on the zero-filled validation matrix. The raw score is printed
# next to the rounded one so ties after rounding can still be told apart.
for r in r_values:
    w_0, w = train_linear_regression(X_train_zero, y_train, r=r)
    y_pred = w_0 + X_val_zero.dot(w)
    score = rmse(y_val, y_pred)
    rounded_score = round(score, 2)
    print(f"r={r}, Rounded RMSE={rounded_score}, Raw RMSE={score}")

r=0, Rounded RMSE=0.04, Raw RMSE=0.0399792578229878
r=0.01, Rounded RMSE=0.04, Raw RMSE=0.03997787201794628
r=0.1, Rounded RMSE=0.04, Raw RMSE=0.04050873984549153
r=1, Rounded RMSE=0.04, Raw RMSE=0.04135385150826845
r=5, Rounded RMSE=0.04, Raw RMSE=0.04150611823274443
r=10, Rounded RMSE=0.04, Raw RMSE=0.041526801237522025
r=100, Rounded RMSE=0.04, Raw RMSE=0.04154575656081158


Question 5:
We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
For each seed, do the train/validation/test split with 60%/20%/20% distribution.
Fill the missing values with 0 and train a model without regularization.
For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
Round the result to 3 decimal digits (round(std, 3))

In [8]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_scores = []

# The split sizes depend only on the dataset length, not on the seed,
# so they are computed once before the loop.
n = len(data)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

for seed in seeds:
    # Reseed the global RNG so each iteration gets its own reproducible shuffle.
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)
    df_shuffled = data.iloc[idx]

    # Only the train and validation partitions are needed in this question.
    df_train_seed = df_shuffled.iloc[:n_train].copy().reset_index(drop=True)
    df_val_seed = df_shuffled.iloc[n_train:n_train + n_val].copy().reset_index(drop=True)

    # Targets on the log1p scale, as in the earlier cells.
    y_train_seed = np.log1p(df_train_seed.fuel_efficiency_mpg.values)
    y_val_seed = np.log1p(df_val_seed.fuel_efficiency_mpg.values)

    # Features: everything except the target, with NaNs filled with 0.
    X_train_seed = prepare_X(df_train_seed.drop(columns='fuel_efficiency_mpg'), fill_value=0)
    X_val_seed = prepare_X(df_val_seed.drop(columns='fuel_efficiency_mpg'), fill_value=0)

    # Unregularized fit, scored on the validation partition.
    w_0, w = train_linear_regression(X_train_seed, y_train_seed, r=0)
    y_pred_seed = w_0 + X_val_seed.dot(w)
    score = rmse(y_val_seed, y_pred_seed)
    rmse_scores.append(score)

# Spread of the validation RMSE across the seeds.
std_dev = np.std(rmse_scores)
print(f"Standard deviation of RMSE scores: {round(std_dev, 3)}")

Standard deviation of RMSE scores: 0.001


Question 6:
Split the dataset like previously, use seed 9.
Combine train and validation datasets.
Fill the missing values with 0 and train a model with r=0.001.
What's the RMSE on the test dataset?

In [9]:
# Reproduce the 60/20/20 split with seed 9.
np.random.seed(9)

n = len(data)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = data.iloc[idx]

# NOTE(review): this rebinds df_train/df_val/df_test from the seed-42 split,
# and these new frames still contain the target column.
train_end = n_train
val_end = n_train + n_val
df_train = df_shuffled.iloc[:train_end].copy()
df_val = df_shuffled.iloc[train_end:val_end].copy()
df_test = df_shuffled.iloc[val_end:].copy()

# Train and validation rows stacked into one frame with a fresh 0..k-1 index.
df_full_train = pd.concat([df_train, df_val], ignore_index=True)

# Targets on the log1p scale.
y_full_train = np.log1p(df_full_train.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)

# Features: the target is dropped and NaNs are filled with 0.
features_full_train = df_full_train.drop(columns='fuel_efficiency_mpg')
features_test = df_test.drop(columns='fuel_efficiency_mpg')

X_full_train = prepare_X(features_full_train, fill_value=0)
w_0, w = train_linear_regression(X_full_train, y_full_train, r=0.001)

X_test = prepare_X(features_test, fill_value=0)
y_pred = w_0 + X_test.dot(w)

# Undo the log1p transform so the error is expressed in mpg.
# NOTE(review): the earlier cells report RMSE on the log1p scale, whereas this
# score is on the original scale, so the two are not directly comparable.
# Confirm this is intended.
original_y_test = np.expm1(y_test)
original_y_pred = np.expm1(y_pred)

score = rmse(original_y_test, original_y_pred)

print(f"RMSE on the test set: {round(score, 3)}")

RMSE on the test set: 0.607
