In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the dataset
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(url)

print(f"{df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

(9704, 11)

Columns: ['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight', 'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain', 'num_doors', 'fuel_efficiency_mpg']


Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [6]:
# Select only the required columns
columns = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
df = df[columns]

print(f"Selected dataset shape: {df.shape}")
df.info()

Selected dataset shape: (9704, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   horsepower           8996 non-null   float64
 2   vehicle_weight       9704 non-null   float64
 3   model_year           9704 non-null   int64  
 4   fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(3), int64(2)
memory usage: 379.2 KB


In [7]:
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values)

Missing values per column:
engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64


In [8]:
horsepower_median = df['horsepower'].median()
print(f"Median for horsepower: {horsepower_median}")

Median for horsepower: 149.0


In [9]:
# Set up shuffling and splitting
n = len(df)
print(f'Total records: {n}')

# Calculate split sizes (60%/20%/20%)
n_val = int(n * 0.2)
n_test = int(n * 0.2) 
n_train = n - n_val - n_test

print(f'Train: {n_train}, Validation: {n_val}, Test: {n_test}')

Total records: 9704
Train: 5824, Validation: 1940, Test: 1940


In [10]:
# Shuffle with seed 42
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)

In [11]:
# Split the data
df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
df_val = df.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True)
df_test = df.iloc[idx[n_train+n_val:]].reset_index(drop=True)

print(f'\nActual split sizes:')
print(f'Train: {len(df_train)}, Validation: {len(df_val)}, Test: {len(df_test)}')


Actual split sizes:
Train: 5824, Validation: 1940, Test: 1940


In [12]:

# Verify the split
print(f'Total after split: {len(df_train) + len(df_val) + len(df_test)}')

# Show first few rows of training set
print('\nFirst 3 rows of training set:')
print(df_train.head(3))

Total after split: 9704

First 3 rows of training set:
   engine_displacement  horsepower  vehicle_weight  model_year  \
0                  220       144.0     2535.887591        2009   
1                  160       141.0     2741.170484        2019   
2                  230       155.0     2471.880237        2017   

   fuel_efficiency_mpg  
0            16.642943  
1            16.298377  
2            18.591822  


In [13]:
def train_linear_regression(X, y):
    """Train linear regression using normal equation"""
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]  # return bias and weights

def rmse(y_true, y_pred):
    """Calculate Root Mean Square Error"""
    mse = np.mean((y_true - y_pred) ** 2)
    return np.sqrt(mse)

In [14]:
# Option 1: Fill missing values with 0
print("=== Option 1: Fill missing values with 0 ===")

# Prepare training data with 0 fill
X_train_0 = df_train.copy()
X_train_0['horsepower'] = X_train_0['horsepower'].fillna(0)

# Prepare validation data with 0 fill  
X_val_0 = df_val.copy()
X_val_0['horsepower'] = X_val_0['horsepower'].fillna(0)

# Use all features except target
feature_columns = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
X_train_matrix_0 = X_train_0[feature_columns].values
X_val_matrix_0 = X_val_0[feature_columns].values

# Target variable
y_train = df_train['fuel_efficiency_mpg'].values
y_val = df_val['fuel_efficiency_mpg'].values

# Train model
w0_0, w_0 = train_linear_regression(X_train_matrix_0, y_train)

# Make predictions
y_pred_0 = w0_0 + X_val_matrix_0.dot(w_0)

# Calculate RMSE
rmse_0 = rmse(y_val, y_pred_0)
rmse_0_rounded = round(rmse_0, 2)

print(f"RMSE with 0 fill: {rmse_0_rounded}")
print(f"Model parameters: w0={w0_0:.4f}, w={w_0}")

=== Option 1: Fill missing values with 0 ===
RMSE with 0 fill: 0.52
Model parameters: w0=28.8274, w=[ 9.93613218e-05  3.54221891e-03 -5.01328015e-03  3.42992869e-04]


In [15]:
# Option 2: Fill missing values with mean (computed from training set only)
print("\n=== Option 2: Fill missing values with mean ===")

# Calculate mean from training set only (excluding missing values)
train_horsepower_mean = df_train['horsepower'].mean()
print(f"Training set horsepower mean: {train_horsepower_mean:.2f}")

# Prepare training data with mean fill
X_train_mean = df_train.copy()
X_train_mean['horsepower'] = X_train_mean['horsepower'].fillna(train_horsepower_mean)

# Prepare validation data with same mean from training set
X_val_mean = df_val.copy()
X_val_mean['horsepower'] = X_val_mean['horsepower'].fillna(train_horsepower_mean)

# Extract feature matrices
X_train_matrix_mean = X_train_mean[feature_columns].values
X_val_matrix_mean = X_val_mean[feature_columns].values

# Train model
w0_mean, w_mean = train_linear_regression(X_train_matrix_mean, y_train)

# Make predictions
y_pred_mean = w0_mean + X_val_matrix_mean.dot(w_mean)

# Calculate RMSE
rmse_mean = rmse(y_val, y_pred_mean)
rmse_mean_rounded = round(rmse_mean, 2)

print(f"RMSE with mean fill: {rmse_mean_rounded}")
print(f"Model parameters: w0={w0_mean:.4f}, w={w_mean}")


=== Option 2: Fill missing values with mean ===
Training set horsepower mean: 149.54
RMSE with mean fill: 0.46
Model parameters: w0=28.9253, w=[ 0.00012093  0.01030308 -0.00501009 -0.00023463]


In [16]:
# Compare the results
print("\n=== COMPARISON ===")
print(f"RMSE with 0 fill:    {rmse_0_rounded}")
print(f"RMSE with mean fill: {rmse_mean_rounded}")

# Determine which is better
if rmse_0_rounded < rmse_mean_rounded:
    better_option = "With 0"
    print(f"\nBetter option: With 0 (RMSE: {rmse_0_rounded})")
elif rmse_mean_rounded < rmse_0_rounded:
    better_option = "With mean"
    print(f"\nBetter option: With mean (RMSE: {rmse_mean_rounded})")
else:
    better_option = "Both are equally good"
    print(f"\nBoth options are equally good (RMSE: {rmse_0_rounded})")

print(f"\nANSWER TO QUESTION 3: {better_option}")


=== COMPARISON ===
RMSE with 0 fill:    0.52
RMSE with mean fill: 0.46

Better option: With mean (RMSE: 0.46)

ANSWER TO QUESTION 3: With mean


In [17]:
def train_linear_regression_reg(X, y, r=0.0):
    """Train regularized linear regression using normal equation with regularization"""
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    
    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])  # Add regularization term
    
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]  # return bias and weights

In [18]:
#Prepare data for regularized regression (fill NAs with 0)
print("=== Regularized Linear Regression ===")
print("Filling missing values with 0 as specified...")

# Prepare training data with 0 fill
X_train_reg = df_train.copy()
X_train_reg['horsepower'] = X_train_reg['horsepower'].fillna(0)

# Prepare validation data with 0 fill
X_val_reg = df_val.copy() 
X_val_reg['horsepower'] = X_val_reg['horsepower'].fillna(0)

# Extract feature matrices
feature_columns = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
X_train_matrix_reg = X_train_reg[feature_columns].values
X_val_matrix_reg = X_val_reg[feature_columns].values

# Target variables
y_train = df_train['fuel_efficiency_mpg'].values
y_val = df_val['fuel_efficiency_mpg'].values

print(f"Training set shape: {X_train_matrix_reg.shape}")
print(f"Validation set shape: {X_val_matrix_reg.shape}")
print(f"Features: {feature_columns}")

=== Regularized Linear Regression ===
Filling missing values with 0 as specified...
Training set shape: (5824, 4)
Validation set shape: (1940, 4)
Features: ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']


In [19]:
# Test different regularization parameters
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
results = []

print("\n=== Testing different regularization parameters ===")
print("r\t\tRMSE")
print("-" * 20)

for r in r_values:
    # Train model with regularization
    w0, w = train_linear_regression_reg(X_train_matrix_reg, y_train, r=r)
    
    # Make predictions on validation set
    y_pred = w0 + X_val_matrix_reg.dot(w)
    
    # Calculate RMSE
    rmse_score = rmse(y_val, y_pred)
    rmse_rounded = round(rmse_score, 2)
    
    results.append((r, rmse_rounded))
    print(f"{r}\t\t{rmse_rounded}")

print("\n=== Results Summary ===")
for r, rmse_score in results:
    print(f"r = {r}: RMSE = {rmse_score}")


=== Testing different regularization parameters ===
r		RMSE
--------------------
0		0.52
0.01		0.52
0.1		0.52
1		0.52
5		0.52
10		0.52
100		0.52

=== Results Summary ===
r = 0: RMSE = 0.52
r = 0.01: RMSE = 0.52
r = 0.1: RMSE = 0.52
r = 1: RMSE = 0.52
r = 5: RMSE = 0.52
r = 10: RMSE = 0.52
r = 100: RMSE = 0.52


In [20]:
# Find the best regularization parameter
best_rmse = min(results, key=lambda x: x[1])[1]
best_r_candidates = [r for r, rmse_score in results if rmse_score == best_rmse]

print(f"\n=== Best Results ===")
print(f"Best RMSE: {best_rmse}")
print(f"r values with best RMSE: {best_r_candidates}")

# If multiple r values give the same best RMSE, select the smallest r
best_r = min(best_r_candidates)
print(f"Selected r (smallest among best): {best_r}")

# Check if best_r is in the given options
options = [0, 0.01, 1, 10, 100]
if best_r in options:
    print(f"\nANSWER TO QUESTION 4: {best_r}")
else:
    # Find the option with best RMSE from the given choices
    option_results = [(r, rmse_score) for r, rmse_score in results if r in options]
    best_option_rmse = min(option_results, key=lambda x: x[1])[1]
    best_option_candidates = [r for r, rmse_score in option_results if rmse_score == best_option_rmse]
    best_option = min(best_option_candidates)
    print(f"\nFrom given options, best r: {best_option} (RMSE: {best_option_rmse})")
    print(f"ANSWER TO QUESTION 4: {best_option}")


=== Best Results ===
Best RMSE: 0.52
r values with best RMSE: [0, 0.01, 0.1, 1, 5, 10, 100]
Selected r (smallest among best): 0

ANSWER TO QUESTION 4: 0


In [21]:
# Test different seeds for data splitting
seed_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_scores = []

print("=== Testing different seeds for data splitting ===")
print("Seed\t\tRMSE")
print("-" * 25)

for seed in seed_values:
    # Split data with current seed
    n = len(df)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2) 
    n_train = n - n_val - n_test
    
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)
    
    df_train_seed = df.iloc[idx[:n_train]].reset_index(drop=True)
    df_val_seed = df.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True)
    df_test_seed = df.iloc[idx[n_train+n_val:]].reset_index(drop=True)
    
    # Prepare data (fill missing values with 0)
    X_train_seed = df_train_seed.copy()
    X_train_seed['horsepower'] = X_train_seed['horsepower'].fillna(0)
    
    X_val_seed = df_val_seed.copy()
    X_val_seed['horsepower'] = X_val_seed['horsepower'].fillna(0)
    
    # Extract feature matrices
    feature_columns = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
    X_train_matrix = X_train_seed[feature_columns].values
    X_val_matrix = X_val_seed[feature_columns].values
    
    # Target variables
    y_train_seed = df_train_seed['fuel_efficiency_mpg'].values
    y_val_seed = df_val_seed['fuel_efficiency_mpg'].values
    
    # Train model without regularization (r=0)
    w0, w = train_linear_regression(X_train_matrix, y_train_seed)
    
    # Make predictions on validation set
    y_pred = w0 + X_val_matrix.dot(w)
    
    # Calculate RMSE
    rmse_score = rmse(y_val_seed, y_pred)
    rmse_scores.append(rmse_score)
    
    print(f"{seed}\t\t{rmse_score:.6f}")

print(f"\n=== RMSE Scores for all seeds ===")
for i, score in enumerate(rmse_scores):
    print(f"Seed {seed_values[i]}: {score:.6f}")

=== Testing different seeds for data splitting ===
Seed		RMSE
-------------------------
0		0.520653
1		0.521339
2		0.522807
3		0.515952
4		0.510913
5		0.528341
6		0.531391
7		0.509067
8		0.514740
9		0.513187

=== RMSE Scores for all seeds ===
Seed 0: 0.520653
Seed 1: 0.521339
Seed 2: 0.522807
Seed 3: 0.515952
Seed 4: 0.510913
Seed 5: 0.528341
Seed 6: 0.531391
Seed 7: 0.509067
Seed 8: 0.514740
Seed 9: 0.513187


In [22]:
# Calculate standard deviation of RMSE scores
rmse_std = np.std(rmse_scores)
rmse_std_rounded = round(rmse_std, 3)

print(f"\n=== Standard Deviation Calculation ===")
print(f"RMSE scores: {[round(score, 6) for score in rmse_scores]}")
print(f"Mean RMSE: {np.mean(rmse_scores):.6f}")
print(f"Standard deviation: {rmse_std:.6f}")
print(f"Standard deviation (rounded to 3 digits): {rmse_std_rounded}")

# Check which option it matches
options = [0.001, 0.006, 0.060, 0.600]
closest_option = min(options, key=lambda x: abs(x - rmse_std_rounded))

print(f"\nOptions: {options}")
print(f"Closest option: {closest_option}")
print(f"\nANSWER TO QUESTION 5: {rmse_std_rounded}")


=== Standard Deviation Calculation ===
RMSE scores: [np.float64(0.520653), np.float64(0.521339), np.float64(0.522807), np.float64(0.515952), np.float64(0.510913), np.float64(0.528341), np.float64(0.531391), np.float64(0.509067), np.float64(0.51474), np.float64(0.513187)]
Mean RMSE: 0.518839
Standard deviation: 0.006989
Standard deviation (rounded to 3 digits): 0.007

Options: [0.001, 0.006, 0.06, 0.6]
Closest option: 0.006

ANSWER TO QUESTION 5: 0.007


In [23]:
# Additional analysis for precision
print(f"\n=== Detailed Analysis ===")
print(f"Exact standard deviation: {rmse_std:.10f}")
print(f"Rounded to 3 digits: {rmse_std_rounded}")

# Distance to each option
options = [0.001, 0.006, 0.060, 0.600]
print(f"\nDistance to each option:")
for option in options:
    distance = abs(rmse_std - option)
    print(f"Distance to {option}: {distance:.10f}")

print(f"\nThe exact value ({rmse_std:.6f}) is closest to option: {closest_option}")
print(f"Although it rounds to {rmse_std_rounded}, the closest given option is {closest_option}")

print(f"\nFINAL ANSWER TO QUESTION 5: {closest_option}")


=== Detailed Analysis ===
Exact standard deviation: 0.0069894464
Rounded to 3 digits: 0.007

Distance to each option:
Distance to 0.001: 0.0059894464
Distance to 0.006: 0.0009894464
Distance to 0.06: 0.0530105536
Distance to 0.6: 0.5930105536

The exact value (0.006989) is closest to option: 0.006
Although it rounds to 0.007, the closest given option is 0.006

FINAL ANSWER TO QUESTION 5: 0.006


In [24]:
# Step 1: Split dataset with seed 9
print("=== Final Model Training and Evaluation ===")
print("Using seed 9 for data splitting...")

seed = 9
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2) 
n_train = n - n_val - n_test

np.random.seed(seed)
idx = np.arange(n)
np.random.shuffle(idx)

df_train_final = df.iloc[idx[:n_train]].reset_index(drop=True)
df_val_final = df.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True)
df_test_final = df.iloc[idx[n_train+n_val:]].reset_index(drop=True)

print(f"Train set size: {len(df_train_final)}")
print(f"Validation set size: {len(df_val_final)}")
print(f"Test set size: {len(df_test_final)}")

# Step 2: Combine train and validation datasets
print(f"\nCombining train and validation datasets...")
df_full_train = pd.concat([df_train_final, df_val_final], ignore_index=True)
print(f"Combined training set size: {len(df_full_train)}")
print(f"Test set size: {len(df_test_final)}")

=== Final Model Training and Evaluation ===
Using seed 9 for data splitting...
Train set size: 5824
Validation set size: 1940
Test set size: 1940

Combining train and validation datasets...
Combined training set size: 7764
Test set size: 1940


In [25]:
# Step 3: Prepare data (fill missing values with 0)
print(f"\nPreparing data (filling missing values with 0)...")

# Prepare combined training data
X_full_train = df_full_train.copy()
X_full_train['horsepower'] = X_full_train['horsepower'].fillna(0)

# Prepare test data  
X_test_final = df_test_final.copy()
X_test_final['horsepower'] = X_test_final['horsepower'].fillna(0)

# Extract feature matrices
feature_columns = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
X_full_train_matrix = X_full_train[feature_columns].values
X_test_matrix = X_test_final[feature_columns].values

# Target variables
y_full_train = df_full_train['fuel_efficiency_mpg'].values
y_test_final = df_test_final['fuel_efficiency_mpg'].values

print(f"Full training set shape: {X_full_train_matrix.shape}")
print(f"Test set shape: {X_test_matrix.shape}")
print(f"Features used: {feature_columns}")

# Check for missing values
print(f"\nMissing values in full training set:")
print(f"Before filling: {df_full_train.isnull().sum()}")
print(f"After filling: {X_full_train.isnull().sum()}")

print(f"\nMissing values in test set:")
print(f"Before filling: {df_test_final.isnull().sum()}")
print(f"After filling: {X_test_final.isnull().sum()}")


Preparing data (filling missing values with 0)...
Full training set shape: (7764, 4)
Test set shape: (1940, 4)
Features used: ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

Missing values in full training set:
Before filling: engine_displacement      0
horsepower             563
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64
After filling: engine_displacement    0
horsepower             0
vehicle_weight         0
model_year             0
fuel_efficiency_mpg    0
dtype: int64

Missing values in test set:
Before filling: engine_displacement      0
horsepower             145
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64
After filling: engine_displacement    0
horsepower             0
vehicle_weight         0
model_year             0
fuel_efficiency_mpg    0
dtype: int64


In [26]:
# Step 4: Train model with regularization r=0.001
print(f"\nTraining regularized linear regression with r=0.001...")

r = 0.001
w0, w = train_linear_regression_reg(X_full_train_matrix, y_full_train, r=r)

print(f"Model trained with regularization parameter: {r}")
print(f"Model bias (w0): {w0:.6f}")
print(f"Model weights shape: {w.shape}")
print(f"Model weights: {w}")

# Step 5: Make predictions on test set
print(f"\nMaking predictions on test set...")
y_pred_test = w0 + X_test_matrix.dot(w)

# Step 6: Calculate RMSE on test set
test_rmse = rmse(y_test_final, y_pred_test)
test_rmse_rounded = round(test_rmse, 3)

print(f"\n=== Final Results ===")
print(f"RMSE on test set: {test_rmse:.6f}")
print(f"RMSE on test set (rounded to 3 digits): {test_rmse_rounded}")

# Check which option it matches
options = [0.15, 0.515, 5.15, 51.5]
closest_option = min(options, key=lambda x: abs(x - test_rmse))

print(f"\nOptions: {options}")
print(f"Closest option: {closest_option}")
print(f"\nANSWER TO QUESTION 6: {closest_option}")


Training regularized linear regression with r=0.001...
Model trained with regularization parameter: 0.001
Model bias (w0): 26.946835
Model weights shape: (4,)
Model weights: [ 1.45147512e-05  3.56475200e-03 -5.01712541e-03  1.29155334e-03]

Making predictions on test set...

=== Final Results ===
RMSE on test set: 0.515626
RMSE on test set (rounded to 3 digits): 0.516

Options: [0.15, 0.515, 5.15, 51.5]
Closest option: 0.515

ANSWER TO QUESTION 6: 0.515
