In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-10-06 18:49:46--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-10-06 18:49:46 (3.46 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



In [3]:
# Columns kept for modelling: four numeric features followed by the target.
columns_to_use = [
    "engine_displacement",
    "horsepower",
    "vehicle_weight",
    "model_year",
    "fuel_efficiency_mpg",
]

# Load the CSV and immediately narrow it down to the columns listed above.
df = pd.read_csv("car_fuel_efficiency.csv")[columns_to_use]

# Preview the first rows
df.head()


Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


In [4]:
# Number of missing values in each column
df.isna().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [5]:
# Median of the horsepower column, taken over the whole (unsplit) frame
median_hp = df.horsepower.median()
print(median_hp)

149.0


In [6]:
# Randomly permute every row (seed 42 keeps it reproducible), then renumber the
# index from 0 so it no longer carries the pre-shuffle positions.
df_shuffled = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview the shuffled frame
df_shuffled.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,220,144.0,2535.887591,2009,16.642943
1,160,141.0,2741.170484,2019,16.298377
2,230,155.0,2471.880237,2017,18.591822
3,150,206.0,3748.164469,2015,11.818843
4,300,111.0,2135.716359,2006,19.402209


In [7]:
from sklearn.model_selection import train_test_split

# First cut: hold out 40% of the shuffled rows, leaving 60% for training
train_df, temp_df = train_test_split(df_shuffled, test_size=0.4, random_state=42)

# Second cut: halve the hold-out into validation and test (20% of the data each)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Report how many rows landed in each partition
for label, part in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{label}:", len(part))

Train: 5822
Validation: 1941
Test: 1941


In [8]:
# Question: is it better to impute missing `horsepower` with 0 or with the
# training-set mean? Both variants are scored by RMSE on the validation split.
# (pandas / numpy / sklearn names come from the import cell at the top.)

# 1. Split (same 60/20/20 proportions and seed as before)
train_df, temp_df = train_test_split(df_shuffled, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# 2. Separate target and features
target = "fuel_efficiency_mpg"
features = ["engine_displacement", "horsepower", "vehicle_weight", "model_year"]


def _coerce_features(frame):
    """Return a new frame whose feature columns are numeric (unparseable values become NaN).

    Uses `assign` rather than writing columns into `frame`, because frames returned
    by train_test_split are selections of another frame and in-place column writes
    on them can raise SettingWithCopyWarning.
    """
    return frame.assign(
        **{col: pd.to_numeric(frame[col], errors="coerce") for col in features}
    )


def _fit_and_score(train, val):
    """Fit LinearRegression on `train`; return (model, val predictions, val RMSE)."""
    model = LinearRegression()
    model.fit(train[features], train[target])
    pred = model.predict(val[features])
    return model, pred, np.sqrt(mean_squared_error(val[target], pred))


# Ensure numeric
train_df = _coerce_features(train_df)
val_df = _coerce_features(val_df)

# 3. Option 1 - fill missing horsepower with 0
train_zero = train_df.assign(horsepower=train_df["horsepower"].fillna(0))
val_zero = val_df.assign(horsepower=val_df["horsepower"].fillna(0))
model_zero, pred_zero, rmse_zero = _fit_and_score(train_zero, val_zero)

# 4. Option 2 - fill missing horsepower with the mean, computed from TRAIN only
#    so that no information from the validation rows leaks into the imputation.
hp_mean = train_df["horsepower"].mean()
train_mean = train_df.assign(horsepower=train_df["horsepower"].fillna(hp_mean))
val_mean = val_df.assign(horsepower=val_df["horsepower"].fillna(hp_mean))
model_mean, pred_mean, rmse_mean = _fit_and_score(train_mean, val_mean)

# 5. Print results
print("RMSE (fill with 0):", round(rmse_zero, 2))
print("RMSE (fill with mean):", round(rmse_mean, 2))

# Compare (a tie is reported as "zero", since the comparison is strict)
better = "mean" if rmse_mean < rmse_zero else "zero"
print(f"✅ Better RMSE with: {better}")


RMSE (fill with 0): 0.51
RMSE (fill with mean): 0.46
✅ Better RMSE with: mean


In [9]:
# Question: which Ridge regularisation strength `r` gives the lowest validation
# RMSE when missing values are filled with 0?
# (pandas / numpy / sklearn names come from the import cell at the top.)

# 1. Prepare train/validation split (shuffled df already available)
train_df, temp_df = train_test_split(df_shuffled, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

features = ["engine_displacement", "horsepower", "vehicle_weight", "model_year"]
target = "fuel_efficiency_mpg"

# Ensure numeric. `assign` builds new frames instead of writing columns into the
# objects returned by train_test_split, which can raise SettingWithCopyWarning.
train_df = train_df.assign(
    **{col: pd.to_numeric(train_df[col], errors="coerce") for col in features}
)
val_df = val_df.assign(
    **{col: pd.to_numeric(val_df[col], errors="coerce") for col in features}
)

# 2. Fill missing values with 0. Note this fills every column, not only horsepower.
train_df_filled = train_df.fillna(0)
val_df_filled = val_df.fillna(0)

# 3. List of regularization strengths (kept in ascending order)
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = {}

# 4. Train Ridge regression for each r and compute RMSE on the validation set
for r in r_values:
    model = Ridge(alpha=r)  # alpha = regularization strength
    model.fit(train_df_filled[features], train_df_filled[target])

    pred = model.predict(val_df_filled[features])
    rmse = np.sqrt(mean_squared_error(val_df_filled[target], pred))

    # Scores are compared at 2-decimal precision, so near-identical fits tie.
    rmse_scores[r] = round(rmse, 2)

# 5. Show results
for r, score in rmse_scores.items():
    print(f"r={r}: RMSE={score}")

# 6. Find best r. Ties on the rounded RMSE are broken explicitly in favour of the
#    smallest r, instead of relying on the insertion order of `rmse_scores`.
best_r = min(rmse_scores, key=lambda candidate: (rmse_scores[candidate], candidate))
print(f"✅ Best r: {best_r} with RMSE={rmse_scores[best_r]}")


r=0: RMSE=0.51
r=0.01: RMSE=0.51
r=0.1: RMSE=0.51
r=1: RMSE=0.51
r=5: RMSE=0.51
r=10: RMSE=0.51
r=100: RMSE=0.51
✅ Best r: 0 with RMSE=0.51


In [10]:
# Question: how much does the validation RMSE move when only the split seed changes?
# (pandas / numpy / sklearn names come from the import cell at the top.)

features = ["engine_displacement", "horsepower", "vehicle_weight", "model_year"]
target = "fuel_efficiency_mpg"

# Ensure numeric. The coerced columns go into a separate frame so that
# `df_shuffled`, which earlier cells created and displayed, is left untouched.
df_numeric = df_shuffled.assign(
    **{col: pd.to_numeric(df_shuffled[col], errors="coerce") for col in features}
)

seeds = list(range(10))
rmse_scores = []

for seed in seeds:
    # 1. Split data 60/20/20 using this seed for both cuts
    train_df, temp_df = train_test_split(df_numeric, test_size=0.4, random_state=seed)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed)

    # 2. Fill missing values with 0
    train_filled = train_df.fillna(0)
    val_filled = val_df.fillna(0)

    # 3. Train linear regression
    model = LinearRegression()
    model.fit(train_filled[features], train_filled[target])

    # 4. Predict & compute RMSE on the validation split
    pred = model.predict(val_filled[features])
    rmse = np.sqrt(mean_squared_error(val_filled[target], pred))
    rmse_scores.append(rmse)

# 5. Standard deviation of the unrounded RMSE scores, rounded to 3 decimals
std_rmse = round(np.std(rmse_scores), 3)
print("Standard deviation of RMSE:", std_rmse)


Standard deviation of RMSE: 0.01


In [11]:
# Final check: split with seed 9, train Ridge on train + validation combined, and
# report RMSE on the held-out test split.
# (pandas / numpy / sklearn names come from the import cell at the top.)

features = ["engine_displacement", "horsepower", "vehicle_weight", "model_year"]
target = "fuel_efficiency_mpg"

# Ensure numeric. The coerced columns go into a separate frame so that
# `df_shuffled`, which earlier cells created and displayed, is left untouched.
df_numeric = df_shuffled.assign(
    **{col: pd.to_numeric(df_shuffled[col], errors="coerce") for col in features}
)

# 1. Split with seed 9
train_df, temp_df = train_test_split(df_numeric, test_size=0.4, random_state=9)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=9)

# 2. Combine train + validation; the old row labels are discarded
train_val_df = pd.concat([train_df, val_df], ignore_index=True)

# 3. Fill missing values with 0
train_val_filled = train_val_df.fillna(0)
test_filled = test_df.fillna(0)

# 4. Train Ridge regression with r=0.001
model = Ridge(alpha=0.001)
model.fit(train_val_filled[features], train_val_filled[target])

# 5. Predict on test set
pred_test = model.predict(test_filled[features])

# 6. Compute RMSE
rmse_test = np.sqrt(mean_squared_error(test_filled[target], pred_test))
print("RMSE on test dataset:", round(rmse_test, 3))



RMSE on test dataset: 0.531
