In [1]:
import pandas as pd
import numpy as np

In [2]:
# Download the car fuel-efficiency dataset and keep only the four features
# plus the target column used in the rest of the notebook.
df = pd.read_csv(
    'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
).loc[:, [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]]
df.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


In [3]:
# Report every column that contains missing values.
# The null counts are computed once for the whole frame and reused in the
# message instead of being recomputed per column inside the f-string.
for col, n_missing in df.isnull().sum().items():
    if n_missing:
        print(f'Column: {col} has {n_missing} Nan values!')

Column: horsepower has 708 Nan values!


In [4]:
# Median of the horsepower column (displayed as the cell result).
df.horsepower.median()

np.float64(149.0)

In [5]:
def split_seed(s, data=None):
    """Shuffle a frame reproducibly and return its train and validation parts.

    The rows are permuted with a generator seeded by ``s`` and cut into
    train / validation / test parts (the last two are ``int(0.2 * n)`` rows
    each). Only the train and validation parts are returned.

    Parameters
    ----------
    s : int
        Seed for the shuffle.
    data : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level ``df`` so existing
        ``split_seed(seed)`` calls keep working.

    Returns
    -------
    (df_train, df_val) : tuple of pandas.DataFrame
        Independent copies of the first ``n - 2 * int(0.2 * n)`` shuffled rows
        and of the following ``int(0.2 * n)`` rows.
    """
    if data is None:
        data = df

    n = len(data)
    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    # A private RandomState gives the same permutation as
    # np.random.seed(s) followed by np.random.shuffle, but leaves the
    # process-wide generator untouched.
    idx = np.arange(n)
    np.random.RandomState(s).shuffle(idx)

    df_shuffled = data.iloc[idx]

    # The remaining rows (the test part) are intentionally not materialised:
    # callers of this helper only consume train and validation.
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
    return df_train, df_val

# Initial train/validation split, shuffled with seed 42.
df_train, df_val = split_seed(s=42)


In [6]:
# Feature columns fed to the regression models.
base = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
]

def train_linear_regression(X, y):
    """Fit ordinary least squares through the normal equation.

    A column of ones is prepended to ``X`` so that the first fitted weight
    acts as the intercept. A warning is printed if any weight is NaN.

    Returns
    -------
    (w0, w) : the intercept and the array of remaining weights, one per
        column of ``X``.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])
    gram = design.T.dot(design)
    weights = np.linalg.inv(gram).dot(design.T).dot(y)

    if np.isnan(weights).any():
        print("NaN detected in weights!")

    intercept, coefs = weights[0], weights[1:]
    return intercept, coefs


def filled_hp(dfr, use_mean, mean_value=None):
    """Return a copy of ``dfr`` with missing ``horsepower`` values imputed.

    Parameters
    ----------
    dfr : pandas.DataFrame
        Frame containing a ``horsepower`` column; it is not modified.
    use_mean : bool or int
        Truthy -> fill with a mean, falsy -> fill with 0.
    mean_value : float, optional
        Mean to use when ``use_mean`` is truthy. Pass the training-set mean
        here when imputing validation or test rows so that no statistic is
        derived from held-out data. When omitted, the mean of ``dfr`` itself
        is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``horsepower`` column has its missing values filled.
    """
    df_new = dfr.copy()
    if not use_mean:
        fill_value = 0
    elif mean_value is not None:
        fill_value = mean_value
    else:
        fill_value = df_new['horsepower'].mean()
    df_new['horsepower'] = df_new['horsepower'].fillna(fill_value)
    return df_new

def rmse(y, y_pred):
    """Root mean squared error between targets ``y`` and predictions ``y_pred``."""
    residual = y_pred - y
    return np.sqrt((residual * residual).mean())

# Target vectors (fuel efficiency) for the current train/validation split.
y_train = df_train['fuel_efficiency_mpg'].values
y_val = df_val['fuel_efficiency_mpg'].values

def label_pred(df_train, df_val, fill, y=None):
    """Fit the baseline regression on ``df_train`` and predict for ``df_val``.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Frames containing the ``base`` feature columns; ``df_train`` must also
        contain ``fuel_efficiency_mpg`` unless ``y`` is given.
    fill : bool or int
        Truthy -> impute missing horsepower with the training mean,
        falsy -> impute with 0.
    y : array-like, optional
        Training target. Defaults to ``df_train['fuel_efficiency_mpg']``.

    Returns
    -------
    Predictions for the rows of ``df_val``.
    """
    if y is None:
        # Take the target from the frame being fitted instead of relying on a
        # notebook-level variable that could belong to a different split.
        y = df_train['fuel_efficiency_mpg'].values

    # The imputation value is learned from the training rows only and then
    # reused for the validation rows, so no validation statistic is used.
    fill_value = df_train['horsepower'].mean() if fill else 0

    X_train = df_train[base].copy()
    X_val = df_val[base].copy()
    X_train['horsepower'] = X_train['horsepower'].fillna(fill_value)
    X_val['horsepower'] = X_val['horsepower'].fillna(fill_value)

    w0, w = train_linear_regression(X_train, y)
    return w0 + X_val.dot(w)

# Variant A: missing horsepower imputed with 0.
y_pred_0 = label_pred(df_train, df_val, fill=False)

# Variant B: missing horsepower imputed with the mean.
y_pred_1 = label_pred(df_train, df_val, fill=True)

# Report both validation RMSE values, rounded to two decimals.
print(f'Filling with "0" yields score: {round(rmse(y_val, y_pred_0), 2)}, while filling with mean yields score: {round(rmse(y_val, y_pred_1), 2)} !!!')


Filling with "0" yields score: 0.52, while filling with mean yields score: 0.46 !!!


In [7]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit regularised least squares through the normal equation.

    A column of ones is prepended to ``X`` for the intercept, and ``r`` is
    added to every diagonal entry of ``X^T X`` (the intercept entry included)
    before the matrix is inverted.

    Returns
    -------
    (w0, w) : the intercept and the array of remaining weights, one per
        column of ``X``.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T.dot(design)
    gram = gram + r * np.eye(gram.shape[0])

    weights = np.linalg.inv(gram).dot(design.T).dot(y)
    return weights[0], weights[1:]

# Zero-imputed feature frames shared by every regularisation run below.
X_train_0 = filled_hp(df_train[base], 0)
X_val_0 = filled_hp(df_val[base], 0)

# Sweep the regularisation strength and report the validation RMSE for each value.
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w0, w = train_linear_regression_reg(X_train_0.values, y_train, r)
    y_p = w0 + X_val_0.values.dot(w)

    # Score once and reuse the value for both roundings instead of calling
    # rmse twice inside the f-string.
    score = rmse(y_val, y_p)
    print(f"For r-value: {r}, we get RMSE: {round(score, 5)}, rounded to {round(score, 2)}!")

For r-value: 0, we get RMSE: 0.51738, rounded to 0.52!
For r-value: 0.01, we get RMSE: 0.51711, rounded to 0.52!
For r-value: 0.1, we get RMSE: 0.51875, rounded to 0.52!
For r-value: 1, we get RMSE: 0.52223, rounded to 0.52!
For r-value: 5, we get RMSE: 0.52289, rounded to 0.52!
For r-value: 10, we get RMSE: 0.52298, rounded to 0.52!
For r-value: 100, we get RMSE: 0.52306, rounded to 0.52!


In [8]:
# Repeat the zero-imputation experiment for ten different shuffles and measure
# how much the validation RMSE varies with the split.
seeds = list(range(10))
rmse_scores = []
for s in seeds:
    df_train, df_val = split_seed(s)
    # Refresh the notebook-level targets so they match this split.
    y_train, y_val = df_train.fuel_efficiency_mpg.values, df_val.fuel_efficiency_mpg.values
    y_pred = label_pred(df_train, df_val, 0)
    rmse_scores.append(rmse(y_val, y_pred))

std_rmse = round(np.std(rmse_scores), 3)

print("Standard deviation of RMSE:", std_rmse)


Standard deviation of RMSE: 0.007


In [9]:
# Final evaluation: shuffle with seed 9, hold out the last 20% of the rows as
# the test set and train on everything before it.
np.random.seed(9)

n = len(df)
n_test = int(0.2 * n)
n_train = n - n_test

idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

df_train, df_test = df_shuffled.iloc[:n_train].copy(), df_shuffled.iloc[n_train:].copy()

y_train, y_test = (part['fuel_efficiency_mpg'].values for part in (df_train, df_test))

# Missing horsepower is imputed with 0 in both parts.
X_train_0, X_test_0 = (filled_hp(part[base], 0) for part in (df_train, df_test))

# Fit with regularisation strength 0.001 and score on the held-out rows.
w0, w = train_linear_regression_reg(X_train_0.values, y_train, 0.001)
y_p = w0 + X_test_0.values.dot(w)
rmse(y_test, y_p)

np.float64(0.5156261299167999)