# ML Zoomcamp Homework 2: Regression

In [1]:
import pandas as pd
import numpy as np

## Preparing the Dataset

In [2]:
# Load the raw car fuel-efficiency table from disk.
df = pd.read_csv('data/car_fuel_efficiency.csv')

# The four model features followed by the regression target.
columns = ['engine_displacement', 'horsepower', 'vehicle_weight',
           'model_year', 'fuel_efficiency_mpg']

# Work on an independent copy restricted to those columns.
df_filtered = df.loc[:, columns].copy()

## Question 1

There's one column with missing values. What is it?

In [3]:
# Number of missing entries in each column.
df_filtered.isna().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

## Question 2

What's the median (50th percentile) of the variable `horsepower`?

In [4]:
# Median of the horsepower column.
df_filtered.horsepower.median()

149.0

## Prepare and split the dataset

* Shuffle the dataset (the filtered one you created above), use seed `42`.
* Split your data into train/val/test sets, with a 60%/20%/20% distribution.

In [5]:
# Total row count; later cells reuse `n`, `n_train` and `n_val`.
n = len(df_filtered)

# Validation and test each take 20% (truncated); training takes the remainder.
n_val = n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Shuffle the row positions reproducibly with seed 42.
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)

# Cut the shuffled positions into three consecutive runs and select those rows.
train_idx, val_idx, test_idx = np.split(idx, [n_train, n_train + n_val])
df_train = df_filtered.iloc[train_idx]
df_val = df_filtered.iloc[val_idx]
df_test = df_filtered.iloc[test_idx]

## Question 3

* We need to deal with missing values for the column from Q1.
* We have two options: fill it with 0 or with the mean of this variable.
* Try both options. For each, train a linear regression model without regularization using the code from the lessons.
* For computing the mean, use the training set only!
* Use the validation dataset to evaluate the models and compare the RMSE of each option.
* Round the RMSE scores to 2 decimal digits using `round(score, 2)`
* Which option gives better RMSE?

### Helper Functions

In [6]:
def train_linear_regression(X, y):
    """Fit an unregularized linear regression with the normal equation.

    A column of ones is prepended to ``X`` so that the first solved
    coefficient plays the role of the intercept.

    Parameters
    ----------
    X : 2-D array with one row per observation and one column per feature.
    y : 1-D array of targets, one per row of ``X``.

    Returns
    -------
    (bias, weights) : the intercept and the array of per-feature coefficients.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    # Normal equation: w = (D^T D)^-1 D^T y
    gram = design.T.dot(design)
    coef = np.linalg.inv(gram).dot(design.T).dot(y)

    bias, weights = coef[0], coef[1:]
    return bias, weights

def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``."""
    residual = y - y_pred
    return np.sqrt((residual ** 2).mean())

### Prepare Target Variable

In [7]:
# Target vectors (fuel efficiency) for each split, as plain arrays.
y_train, y_val, y_test = (
    part['fuel_efficiency_mpg'].values for part in (df_train, df_val, df_test)
)

### Option 1: Fill with 0

In [8]:
# Feature columns shared by every model in this notebook.
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

# Feature matrices with missing values replaced by 0.
X_train_0, X_val_0 = (part[base].fillna(0).values for part in (df_train, df_val))

# Fit on the training rows, then score the validation rows.
w0_0, w_0 = train_linear_regression(X_train_0, y_train)
y_pred_val_0 = X_val_0.dot(w_0) + w0_0
rmse_val_0 = rmse(y_val, y_pred_val_0)

print(f'RMSE on validation (fill with 0): {round(rmse_val_0, 2)}')

RMSE on validation (fill with 0): 0.52


### Option 2: Fill with Mean

In [9]:
# Mean horsepower taken from the training rows only, so the imputation value
# carries no information from the validation split.
mean_hp = df_train['horsepower'].mean()

# Fill by column name: a bare fillna(mean_hp) would also write the horsepower
# mean into any other feature column that happened to contain NaN.
hp_fill = {'horsepower': mean_hp}

X_train_mean = df_train[base].fillna(hp_fill).values
w0_mean, w_mean = train_linear_regression(X_train_mean, y_train)

# The validation rows are imputed with the same training-derived mean.
X_val_mean = df_val[base].fillna(hp_fill).values
y_pred_val_mean = w0_mean + X_val_mean.dot(w_mean)
rmse_val_mean = rmse(y_val, y_pred_val_mean)

print(f'RMSE on validation (fill with mean): {round(rmse_val_mean, 2)}')

RMSE on validation (fill with mean): 0.46


## Question 4

* Now let's train a regularized linear regression.
* For this question, fill the NAs with 0.
* Try different values of `r` from this list: `[0, 0.01, 0.1, 1, 5, 10, 100]`.
* Use RMSE to evaluate the model on the validation dataset.
* Round the RMSE scores to 2 decimal digits.
* Which `r` gives the best RMSE?

In [10]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit a linear regression with a ridge-style penalty ``r``.

    A column of ones is prepended to ``X`` for the intercept, and ``r`` is
    added to every diagonal entry of the Gram matrix before inversion, which
    includes the entry that belongs to the intercept column.

    Parameters
    ----------
    X : 2-D array with one row per observation and one column per feature.
    y : 1-D array of targets, one per row of ``X``.
    r : amount added to the diagonal; the default ``0.0`` adds nothing.

    Returns
    -------
    (bias, weights) : the intercept and the array of per-feature coefficients.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    # Gram matrix with the penalty on its diagonal.
    penalized = design.T.dot(design)
    penalized = penalized + r * np.eye(penalized.shape[0])

    coef = np.linalg.inv(penalized).dot(design.T).dot(y)

    bias, weights = coef[0], coef[1:]
    return bias, weights

# Zero-filled feature matrices for the current train / validation split.
X_train_0, X_val_0 = (part[base].fillna(0).values for part in (df_train, df_val))

# Fit once per regularization strength and report the validation RMSE.
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
for r in r_values:
    w0, w = train_linear_regression_reg(X_train_0, y_train, r=r)
    y_pred_val = X_val_0.dot(w) + w0
    rmse_val = rmse(y_val, y_pred_val)
    print(f'r={r}, RMSE={round(rmse_val, 2)}')

r=0, RMSE=0.52
r=0.01, RMSE=0.52
r=0.1, RMSE=0.52
r=1, RMSE=0.52
r=5, RMSE=0.52
r=10, RMSE=0.52
r=100, RMSE=0.52


## Question 5

* We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
* Try different seed values: `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]`.
* For each seed, do the train/validation/test split with 60%/20%/20% distribution.
* Fill the missing values with 0 and train a model without regularization.
* For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
* What's the standard deviation of all the scores? To compute the standard deviation, use `np.std`.
* Round the result to 3 decimal digits (`round(std, 3)`)

In [11]:
def _validation_rmse_for_seed(seed):
    """Return the validation RMSE of a zero-filled, unregularized model.

    The rows of ``df_filtered`` are shuffled with ``seed`` and cut using the
    notebook-level ``n``, ``n_train`` and ``n_val``. Everything is kept in
    local names so the notebook-level ``df_train`` / ``df_val`` / ``y_train``
    / ``y_val`` / ``idx`` from the seed-42 split are not overwritten.
    """
    # Shuffle the row positions with the requested seed.
    np.random.seed(seed)
    shuffled = np.arange(n)
    np.random.shuffle(shuffled)

    train_part = df_filtered.iloc[shuffled[:n_train]]
    val_part = df_filtered.iloc[shuffled[n_train:n_train + n_val]]

    # Targets and zero-filled features for this split.
    y_tr = train_part.fuel_efficiency_mpg.values
    y_va = val_part.fuel_efficiency_mpg.values
    X_tr = train_part[base].fillna(0).values
    X_va = val_part[base].fillna(0).values

    # Train without regularization and score on the validation rows.
    bias, weights = train_linear_regression(X_tr, y_tr)
    return rmse(y_va, bias + X_va.dot(weights))


seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_scores = []

for seed in seeds:
    score = _validation_rmse_for_seed(seed)
    rmse_scores.append(score)
    print(f'Seed={seed}, RMSE={round(score, 3)}')

# Spread of the validation RMSE across the seeds.
std_dev = np.std(rmse_scores)
print(f'\nStandard Deviation of RMSE scores: {round(std_dev, 3)}')

Seed=0, RMSE=0.521
Seed=1, RMSE=0.521
Seed=2, RMSE=0.523
Seed=3, RMSE=0.516
Seed=4, RMSE=0.511
Seed=5, RMSE=0.528
Seed=6, RMSE=0.531
Seed=7, RMSE=0.509
Seed=8, RMSE=0.515
Seed=9, RMSE=0.513

Standard Deviation of RMSE scores: 0.007


## Question 6

* Split the dataset as before, using seed 9.
* Combine train and validation datasets.
* Fill the missing values with 0 and train a model with `r=0.001`.
* What's the RMSE on the test dataset?

In [12]:
# Reproduce the shuffle of row positions with seed 9.
np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)

# Boundaries of the three consecutive runs: train, validation, test.
val_start, test_start = n_train, n_train + n_val
df_train = df_filtered.iloc[idx[:val_start]]
df_val = df_filtered.iloc[idx[val_start:test_start]]
df_test = df_filtered.iloc[idx[test_start:]]

# The final model is fitted on the training and validation rows together.
df_full_train = pd.concat([df_train, df_val])

# Zero-filled features and targets for the combined set and the test set.
X_full_train = df_full_train[base].fillna(0).values
y_full_train = df_full_train['fuel_efficiency_mpg'].values
X_test = df_test[base].fillna(0).values
y_test = df_test['fuel_efficiency_mpg'].values

# Fit with a small regularization strength.
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

# Score on the held-out test rows.
y_pred_test = X_test.dot(w) + w0
rmse_test = rmse(y_test, y_pred_test)

print(f'RMSE on test set: {round(rmse_test, 2)}')

RMSE on test set: 0.52
