In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from scipy.stats import skew    

df = pd.read_csv('car_fuel_efficiency.csv')

# Sample skewness of the fuel-efficiency column
skewness = df['fuel_efficiency_mpg'].skew()
print(f"Skewness of column 'fuel_efficiency_mpg': {skewness:.4f}")

# Describe the skewness: rules are checked in order and the first match wins;
# a value matching none of them is reported as approximately symmetric.
skew_rules = (
    (skewness > 1,
     "The distribution is highly positively skewed (long right tail)."),
    (skewness < -1,
     "The distribution is highly negatively skewed (long left tail)."),
    (0.5 < skewness <= 1,
     "The distribution is moderately positively skewed (long right tail)."),
    (-1 <= skewness < -0.5,
     "The distribution is moderately negatively skewed (long left tail)."),
)
print(next(
    (message for matched, message in skew_rules if matched),
    "The distribution is approximately symmetric.",
))

def prepare_X(df_input):
    """Build the design matrix for a feature DataFrame.

    Takes the values of a copy of ``df_input`` and prepends a column of
    ones (the bias term), so the returned array has one more column than
    the input frame. The input frame itself is not modified.
    """
    features = df_input.copy().values
    bias = np.ones(features.shape[0])
    return np.column_stack([bias, features])

def train_linear_regression(X, y, r=0.0):
    """Fit regularized linear regression via the normal equation.

    Solves ``(X.T @ X + r * I) w = X.T @ y`` for ``w``.

    Args:
        X: 2-D feature matrix, one row per sample.
        y: Target values, one per row of ``X``.
        r: Regularization strength added to the diagonal of ``X.T @ X``;
            ``0.0`` gives ordinary least squares.

    Returns:
        The weight vector ``w``, one entry per column of ``X``.

    Raises:
        numpy.linalg.LinAlgError: If ``X.T @ X + r * I`` is singular.
    """
    XTX = X.T.dot(X)
    # r is added to every diagonal entry, so all weights are penalized alike
    XTX = XTX + r * np.eye(XTX.shape[0])

    # Solve the linear system directly instead of forming the explicit
    # inverse: same solution, but cheaper and numerically more stable.
    return np.linalg.solve(XTX, X.T.dot(y))

def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions."""
    residual = y_true - y_pred
    mean_squared = (residual * residual).mean()
    return np.sqrt(mean_squared)

def predict_and_evaluate(df_input, w, y_true):
    """Score weights ``w`` on a feature frame.

    Builds the design matrix with ``prepare_X``, predicts with a dot
    product against ``w``, and returns the RMSE against ``y_true``.
    """
    predictions = prepare_X(df_input).dot(w)
    return rmse(y_true, predictions)

# --- 1. Data Loading and Filtering ---

# Columns kept for the rest of the analysis
cols = [
    'engine_displacement', 'horsepower', 'vehicle_weight', 'model_year',
    'fuel_efficiency_mpg',
]

# Read the raw CSV, then keep an independent copy restricted to `cols`
df_original = pd.read_csv('car_fuel_efficiency.csv')
df = df_original.loc[:, cols].copy()

# --- 2. EDA: Long Tail Check ---

print("\n--- EDA: Fuel Efficiency Long Tail Check ---")
mpg_skewness = df['fuel_efficiency_mpg'].skew()
print(f"Skewness of 'fuel_efficiency_mpg': {mpg_skewness:.4f}")

# Derive the conclusion from the computed value instead of hard-coding it, so
# the printed answer stays correct if the data changes. |skewness| <= 0.5 is
# treated as "relatively symmetric".
if abs(mpg_skewness) > 0.5:
    tail_side = "right" if mpg_skewness > 0 else "left"
    print(f"Conclusion: Yes, the variable has a long {tail_side} tail (skewed distribution).")
else:
    print("Conclusion: No, the variable does not have a strong long tail (relatively symmetric).")

# --- 3. Question 1: Column with missing values ---

print("\n--- Question 1: Column with Missing Values ---")
# Per-column NaN counts; take the first column (in column order) with any NaN
missing_counts = df.isnull().sum()
has_missing = missing_counts > 0
missing_col_name = missing_counts[has_missing].index[0]
print(f"The column with missing values is: '{missing_col_name}'")

# --- 4. Question 2: Median Horsepower ---

print("\n--- Question 2: Median Horsepower ---")
# Median over the whole (unsplit) frame
median_hp = df['horsepower'].median()
print(f"Median (50% percentile) for 'horsepower': {median_hp:.0f}")

# --- 5. Shuffle and split into train / validation / test ---

np.random.seed(2)

# 20% validation, 20% test (both truncated to whole rows), remainder for training
n = len(df)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

# Random row order under the seed set above
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx].reset_index(drop=True)

# Consecutive slices of the shuffled frame
val_end = n_train + n_val
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:val_end].copy()
df_test = df_shuffled.iloc[val_end:].copy()

# pop() removes the target column from each feature frame and hands it back
y_train = df_train.pop('fuel_efficiency_mpg').values
y_val = df_val.pop('fuel_efficiency_mpg').values
y_test = df_test.pop('fuel_efficiency_mpg').values


# --- 6. Question 3: Missing Value Imputation Comparison ---

print("\n--- Question 3: Missing Value Imputation Comparison (r=0) ---")

# Option 1: replace NaNs in the incomplete column with 0
zero_fill = {missing_col_name: 0}
df_train_0 = df_train.fillna(zero_fill)
df_val_0 = df_val.fillna(zero_fill)

X_train_0 = prepare_X(df_train_0)
w_0 = train_linear_regression(X_train_0, y_train, r=0)
rmse_0 = predict_and_evaluate(df_val_0, w_0, y_val)
rmse_0_rounded = round(rmse_0, 2)
print(f"RMSE (Fill with 0): {rmse_0_rounded}")

# Option 2: replace NaNs with the column mean, computed on the training
# split only and reused for the validation split
mean_hp_train = df_train[missing_col_name].mean()
mean_fill = {missing_col_name: mean_hp_train}
df_train_mean = df_train.fillna(mean_fill)
df_val_mean = df_val.fillna(mean_fill)

X_train_mean = prepare_X(df_train_mean)
w_mean = train_linear_regression(X_train_mean, y_train, r=0)
rmse_mean = predict_and_evaluate(df_val_mean, w_mean, y_val)
rmse_mean_rounded = round(rmse_mean, 2)
print(f"RMSE (Fill with mean): {rmse_mean_rounded}")

# --- 7. Question 4: Regularization (r) Tuning (NAs filled with 0) ---

print("\n--- Question 4: Regularization Tunning (NAs filled with 0) ---")

r_list = [0, 0.01, 0.1, 1, 5, 10, 100]
scores = {}

# The design matrix does not depend on r, so build it once outside the loop
# (uses the NA-filled-with-0 training data)
X_train = prepare_X(df_train_0)

for r in r_list:
    w = train_linear_regression(X_train, y_train, r=r)
    score = predict_and_evaluate(df_val_0, w, y_val)
    # Store plain Python floats so the dict prints as {0: 0.52, ...}
    # instead of {0: np.float64(0.52), ...}
    scores[r] = round(float(score), 2)

print("RMSE scores for different r values (rounded to 2 decimals):")
print(scores)

# Lowest rounded RMSE wins; ties are broken in favour of the smallest r
best_score = min(scores.values())
best_r_candidates = [
    candidate for candidate, value in scores.items() if value == best_score
]
best_r = min(best_r_candidates)
print(f"Best r: {best_r} with RMSE: {best_score}")


# --- 8. Question 5: Seed Influence ---

print("\n--- Question 5: Seed Influence on RMSE ---")

seed_list = list(range(10))
rmse_scores = []

for seed in seed_list:
    # Reshuffle the row order under this seed
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)
    df_shuffled_s = df.iloc[idx].reset_index(drop=True)

    # Train / validation slices, using the n_train and n_val sizes set earlier
    df_train_s = df_shuffled_s.iloc[:n_train].copy()
    df_val_s = df_shuffled_s.iloc[n_train:n_train + n_val].copy()

    # pop() removes the target column and returns it
    y_train_s = df_train_s.pop('fuel_efficiency_mpg').values
    y_val_s = df_val_s.pop('fuel_efficiency_mpg').values

    # Missing values in the incomplete column become 0
    df_train_s = df_train_s.fillna({missing_col_name: 0})
    df_val_s = df_val_s.fillna({missing_col_name: 0})

    # Unregularized fit (r=0), scored on this seed's validation slice
    X_train_s = prepare_X(df_train_s)
    w_s = train_linear_regression(X_train_s, y_train_s, r=0)
    rmse_scores.append(predict_and_evaluate(df_val_s, w_s, y_val_s))

# Spread of the validation RMSE across the seeds
std_rmse = np.std(rmse_scores)
std_rmse_rounded = round(std_rmse, 3)

print(f"Standard deviation of all RMSE scores: {std_rmse_rounded}")


# --- 9. Question 6: Final Model (Seed 9, r=0.001) ---

print("\n--- Question 6: Final Model Evaluation on Test Set ---")

# Reshuffle the rows under seed 9
np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled_f = df.iloc[idx].reset_index(drop=True)

# Consecutive train / validation / test slices of the shuffled frame
val_end_f = n_train + n_val
df_train_f = df_shuffled_f.iloc[:n_train].copy()
df_val_f = df_shuffled_f.iloc[n_train:val_end_f].copy()
df_test_f = df_shuffled_f.iloc[val_end_f:].copy()

# pop() removes the target column from the test features and returns it
y_test_f = df_test_f.pop('fuel_efficiency_mpg').values

# Train and validation rows together form the final training set
df_full_train = pd.concat([df_train_f, df_val_f])
y_full_train = df_full_train.pop('fuel_efficiency_mpg').values

# Missing values in the incomplete column become 0
df_full_train = df_full_train.fillna({missing_col_name: 0})
df_test_f = df_test_f.fillna({missing_col_name: 0})

# Fit with r=0.001 on the combined set, then score on the held-out test rows
X_full_train = prepare_X(df_full_train)
w_final = train_linear_regression(X_full_train, y_full_train, r=0.001)
rmse_test = predict_and_evaluate(df_test_f, w_final, y_test_f)
rmse_test_rounded = round(rmse_test, 4)

print(f"RMSE on the test dataset (r=0.001): {rmse_test_rounded}")


Skewness of column 'fuel_efficiency_mpg': -0.0121
The distribution is approximately symmetric.

--- EDA: Fuel Efficiency Long Tail Check ---
Skewness of 'fuel_efficiency_mpg': -0.0121
Conclusion: No, the variable does not have a strong long tail (relatively symmetric).

--- Question 1: Column with Missing Values ---
The column with missing values is: 'horsepower'

--- Question 2: Median Horsepower ---
Median (50% percentile) for 'horsepower': 149

--- Question 3: Missing Value Imputation Comparison (r=0) ---
RMSE (Fill with 0): 0.52
RMSE (Fill with mean): 0.47

--- Question 4: Regularization Tunning (NAs filled with 0) ---
RMSE scores for different r values (rounded to 2 decimals):
{0: np.float64(0.52), 0.01: np.float64(0.52), 0.1: np.float64(0.53), 1: np.float64(0.53), 5: np.float64(0.53), 10: np.float64(0.53), 100: np.float64(0.53)}
Best r: 0 with RMSE: 0.52

--- Question 5: Seed Influence on RMSE ---
Standard deviation of all RMSE scores: 0.007

--- Question 6: Final Model Evaluatio