In [1]:
import pandas as pd

In [3]:
# Load the car fuel-efficiency CSV; the relative path is resolved against the
# kernel's current working directory.
csv_path = '../01-intro/car_fuel_efficiency.csv'
df = pd.read_csv(csv_path)

In [4]:
# Keep only the columns used for modelling: four predictors plus the
# 'fuel_efficiency_mpg' column that later cells treat as the target.
cols = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
df_selected = df[cols]

# Count NaN entries per column to see which columns need imputation.
missing_counts = df_selected.isna().sum()
print(missing_counts)

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64


In [5]:
# Median (50th percentile) of 'horsepower'; pandas ignores NaN entries here.
median_horsepower = df_selected['horsepower'].median()
print("Median horsepower:", median_horsepower)

# Shuffle every row reproducibly (seed 42), then renumber the index from 0.
df_shuffled = df_selected.sample(frac=1, random_state=42)
df_shuffled = df_shuffled.reset_index(drop=True)

# Row positions that cut the shuffled frame into 60% / 20% / 20%.
# int() truncates, so any leftover rows end up in the final (test) slice.
n = df_shuffled.shape[0]
train_end, val_end = int(0.6 * n), int(0.8 * n)

# Consecutive, non-overlapping slices: train, then validation, then test.
train_df, val_df, test_df = (
    df_shuffled.iloc[:train_end],
    df_shuffled.iloc[train_end:val_end],
    df_shuffled.iloc[val_end:],
)

print("Train set size:", len(train_df))
print("Validation set size:", len(val_df))
print("Test set size:", len(test_df))

Median horsepower: 149.0
Train set size: 5822
Validation set size: 1941
Test set size: 1941


In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
import numpy as np

def _impute_and_split(frame, fill_value):
    """Fill missing 'horsepower' values and separate predictors from target.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'horsepower' and 'fuel_efficiency_mpg'. It is not
        modified; the fill is applied to a copy.
    fill_value : scalar
        Replacement for NaN entries in 'horsepower'.

    Returns
    -------
    tuple
        ``(filled, X, y)``: the imputed copy, that copy without the
        'fuel_efficiency_mpg' column, and the 'fuel_efficiency_mpg' column.
    """
    filled = frame.assign(horsepower=frame['horsepower'].fillna(fill_value))
    X = filled.drop(columns='fuel_efficiency_mpg')
    y = filled['fuel_efficiency_mpg']
    return filled, X, y


# Option 1: missing 'horsepower' replaced by 0
train_0, X_train_0, y_train_0 = _impute_and_split(train_df, 0)
val_0, X_val_0, y_val_0 = _impute_and_split(val_df, 0)

model_0 = LinearRegression()
model_0.fit(X_train_0, y_train_0)
preds_0 = model_0.predict(X_val_0)
rmse_0 = round(root_mean_squared_error(y_val_0, preds_0), 2)

# Option 2: missing 'horsepower' replaced by the mean of the training rows.
# The mean comes from train_df only, so no validation data feeds the fill value.
mean_hp = train_df['horsepower'].mean()
train_mean, X_train_mean, y_train_mean = _impute_and_split(train_df, mean_hp)
val_mean, X_val_mean, y_val_mean = _impute_and_split(val_df, mean_hp)

model_mean = LinearRegression()
model_mean.fit(X_train_mean, y_train_mean)
preds_mean = model_mean.predict(X_val_mean)
rmse_mean = round(root_mean_squared_error(y_val_mean, preds_mean), 2)

print("RMSE (fill with 0):", rmse_0)
print("RMSE (fill with mean):", rmse_mean)

# The comparison uses the RMSEs rounded to two decimals, so a tie is possible.
if rmse_mean < rmse_0:
    print("With mean gives better RMSE")
elif rmse_0 < rmse_mean:
    print("With 0 gives better RMSE")
else:
    print("Both are equally good")

RMSE (fill with 0): 0.52
RMSE (fill with mean): 0.46
With mean gives better RMSE


In [10]:
from sklearn.linear_model import Ridge

# Regularization strengths to compare on the zero-filled train/validation
# split (X_train_0, y_train_0, X_val_0, y_val_0 come from the previous cell).
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = []

for r in r_values:
    # scikit-learn advises against Ridge(alpha=0) for numerical reasons and
    # recommends LinearRegression for the unregularized case, so r == 0 is
    # fitted with plain least squares instead.
    model = LinearRegression() if r == 0 else Ridge(alpha=r)
    model.fit(X_train_0, y_train_0)
    preds = model.predict(X_val_0)
    rmse = root_mean_squared_error(y_val_0, preds)
    # Scores are stored rounded to two decimals; selection below uses them.
    rmse_scores.append(round(rmse, 2))

for r, score in zip(r_values, rmse_scores):
    print(f"r={r}: RMSE={score}")

# Lowest rounded RMSE wins; if several r values tie, take the smallest r.
min_rmse = min(rmse_scores)
best_r_candidates = [r for r, score in zip(r_values, rmse_scores) if score == min_rmse]
best_r = min(best_r_candidates)
print("Best r:", best_r)

r=0: RMSE=0.52
r=0.01: RMSE=0.52
r=0.1: RMSE=0.52
r=1: RMSE=0.52
r=5: RMSE=0.52
r=10: RMSE=0.52
r=100: RMSE=0.52
Best r: 0


In [12]:
def rmse_for_seed(seed):
    """Return the validation RMSE of a linear model for one shuffle seed.

    Shuffles ``df_selected`` with ``random_state=seed``, takes the first 60%
    of rows as train and the next 20% as validation, fills missing
    'horsepower' with 0, fits ``LinearRegression`` on the train part and
    scores it on the validation part.

    All intermediates are local to this function, so the notebook-level
    split and feature variables built by earlier cells are left untouched.
    """
    shuffled = df_selected.sample(frac=1, random_state=seed).reset_index(drop=True)
    n_rows = len(shuffled)
    train_stop = int(0.6 * n_rows)
    val_stop = int(0.8 * n_rows)
    train_part = shuffled.iloc[:train_stop]
    val_part = shuffled.iloc[train_stop:val_stop]

    # Fill missing 'horsepower' with 0 (assign works on a copy of each part).
    train_filled = train_part.assign(horsepower=train_part['horsepower'].fillna(0))
    val_filled = val_part.assign(horsepower=val_part['horsepower'].fillna(0))

    X_train = train_filled.drop(columns='fuel_efficiency_mpg')
    y_train = train_filled['fuel_efficiency_mpg']
    X_val = val_filled.drop(columns='fuel_efficiency_mpg')
    y_val = val_filled['fuel_efficiency_mpg']

    fitted = LinearRegression()
    fitted.fit(X_train, y_train)
    return root_mean_squared_error(y_val, fitted.predict(X_val))


# One unrounded RMSE per seed 0..9.
seeds = list(range(10))
rmse_list = [rmse_for_seed(seed) for seed in seeds]

# Spread of the validation RMSE across seeds (np.std's default population
# standard deviation), reported to three decimals.
std = np.std(rmse_list)
print("Standard deviation of RMSE scores:", round(std, 3))

Standard deviation of RMSE scores: 0.007


In [13]:
# Reshuffle with seed 9 and recompute the 60% / 20% / 20% cut points.
df_shuffled = df_selected.sample(frac=1, random_state=9)
df_shuffled = df_shuffled.reset_index(drop=True)
n = df_shuffled.shape[0]
train_end, val_end = int(0.6 * n), int(0.8 * n)

train_df = df_shuffled.iloc[:train_end]
val_df = df_shuffled.iloc[train_end:val_end]
test_df = df_shuffled.iloc[val_end:]

# The final model is fitted on train and validation rows together.
trainval_df = pd.concat([train_df, val_df], ignore_index=True)

# Missing 'horsepower' becomes 0 in both the fitting and the test data
# (assign returns new frames, leaving trainval_df and test_df unchanged).
trainval_0 = trainval_df.assign(horsepower=trainval_df['horsepower'].fillna(0))
test_0 = test_df.assign(horsepower=test_df['horsepower'].fillna(0))

# Separate the predictors from the 'fuel_efficiency_mpg' target.
X_trainval_0 = trainval_0.drop(columns='fuel_efficiency_mpg')
y_trainval_0 = trainval_0['fuel_efficiency_mpg']
X_test_0 = test_0.drop(columns='fuel_efficiency_mpg')
y_test_0 = test_0['fuel_efficiency_mpg']

# Ridge regression with regularization strength 0.001
model = Ridge(alpha=0.001)
model.fit(X_trainval_0, y_trainval_0)
preds = model.predict(X_test_0)

# Score on the held-out test rows, reported to three decimals.
rmse = root_mean_squared_error(y_test_0, preds)
print("RMSE on test set:", round(rmse, 3))

RMSE on test set: 0.516
