# Linear Regression Model

### Set Up

In [68]:
import pandas as pd 
import numpy as np 
import pickle
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [69]:
import pickle


with open("Data/train_test_split.pkl", "rb") as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)






(2876, 531) (720, 531) (2876,) (720,)


In [70]:

X_train = X_train.replace([np.inf, -np.inf], 0)
X_test = X_test.replace([np.inf, -np.inf], 0)

y_train = y_train.replace([np.inf, -np.inf], 0)
y_test = y_test.replace([np.inf, -np.inf], 0)



In [71]:
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)

print("X_test:", X_test.shape)
print("y_test:", y_test.shape)


X_train: (2876, 531)
y_train: (2876,)
X_test: (720, 531)
y_test: (720,)


In [73]:
# List all columns that are object (string) type
string_cols = X_train.select_dtypes(include='object').columns
print("Columns with strings:", string_cols)


Columns with strings: Index([], dtype='object')


Using ridge regression to regularise the logistic regression. This will handle multicollinearity from the multiple encoded columns.

In [74]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

y_pred = ridge.predict(X_test)


In [75]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

y_test_actual = np.exp(y_test)
y_pred_actual = np.exp(y_pred)

print("RMSE (£):", np.sqrt(mean_squared_error(y_test_actual, y_pred_actual)))
print("MAE (£):", mean_absolute_error(y_test_actual, y_pred_actual))
print("R²:", r2_score(y_test_actual, y_pred_actual))


RMSE (£): 1280.436304837947
MAE (£): 863.9882984717127
R²: 0.9229942160137281


## Evaluation

Residuals are centered around zero with no ovvious curve and a strong predictive accuracy.

Howver there is a larger spread at lower predicted values 7-8 and tighter spead at higher values 9-10. This means there is heteroscedacity and thew model predict expensive cars more accurately.

## Refine Features

- Removing features with coefficients near zero.

- Combing Sparse Categories

- Checking multicolinearity

In [78]:


X_numeric = X_train.select_dtypes(include=np.number)
vif = pd.DataFrame()
vif["feature"] = X_numeric.columns
vif["VIF"] = [variance_inflation_factor(X_numeric.values, i) for i in range(X_numeric.shape[1])]
print(vif.sort_values("VIF", ascending=False).head(30))


                feature          VIF
0        Mileage(miles)          inf
13        Mileage_Delta          inf
12     Expected_Mileage          inf
7               Mileage          inf
8               Car_Age          inf
1     Registration_Year  1794.714608
17      Car_Age_Squared    31.616327
16      Engine_per_Seat    11.608250
3                Engine    10.806069
18          Premium_Age     7.433333
11           Is_Premium     7.175358
10          Log_Mileage     6.448670
9      Mileage_per_Year     6.390838
5                 Seats     4.429321
14      Owners_per_Year     3.583434
2       Previous Owners     2.861193
15        Is_Family_Car     2.534683
4                 Doors     1.723911
6   Has_Service_History     1.060145


  vif = 1. / (1. - r_squared_i)


In [80]:
drop_cols = [
    "Registration_Year",
    "Mileage",      # keep Log_Mileage instead
    "Engine_per_Seat",     # redundant
    "Is_Premium"           # redundant if brand dummies exist
]

X_train = X_train.drop(columns=[c for c in drop_cols if c in X_train.columns])
X_test = X_test.drop(columns=[c for c in drop_cols if c in X_test.columns])


In [81]:
import pickle

# Save X
with open("Data/X_train.pkl", "wb") as f:
    pickle.dump(X_train, f)

with open("Data/X_test.pkl", "wb") as f:
    pickle.dump(X_test, f)

# Save y
with open("Data/y_train.pkl", "wb") as f:
    pickle.dump(y_train, f)

with open("Data/y_test.pkl", "wb") as f:
    pickle.dump(y_test, f)

print("Train/test sets saved successfully.")


Train/test sets saved successfully.


In [82]:
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

print(X_train.columns[:5])
print(y_train.head())


(2876, 527) (720, 527)
(2876,) (720,)
Index(['Mileage(miles)', 'Previous Owners', 'Engine', 'Doors', 'Seats'], dtype='object')
1348    9.302920
1792    8.004700
1712    8.697847
2526    8.778788
3374    8.516993
Name: Log_Price, dtype: float64
