# Linear Regression Model

### Set Up

In [334]:
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [336]:
# Importing test train splits
#
# Open the pickle read-only ("rb") and unpack the saved 4-tuple. Opening it
# with "wb" would truncate the stored split before anything is read back.
# Only unpickle files you produced yourself: pickle.load can execute
# arbitrary code from an untrusted file.
with open("Data/train_test_split.pkl", "rb") as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

# NOTE(review): train_df / test_df are not created anywhere in the visible
# part of this notebook - confirm where they are loaded before running on a
# fresh kernel.
# np.log returns -inf for a mileage of 0 (infinite values are replaced with 0
# when the features are separated from the target).
train_df['Log_Mileage'] = np.log(train_df['Mileage'])
test_df['Log_Mileage'] = np.log(test_df['Mileage'])






In [337]:
# Separating Features and Target

# Columns that are either the target itself or free-text identifiers and must
# not be fed to the model as features.
NON_FEATURE_COLS = ['Price', 'Log_Price', 'title', 'title_lower']
INF_VALUES = [np.inf, -np.inf]

# Features: drop the non-feature columns (missing ones are ignored) and
# replace infinite values with 0 so the estimators receive finite inputs.
X_train = train_df.drop(columns=NON_FEATURE_COLS, errors='ignore').replace(INF_VALUES, 0)
X_test = test_df.drop(columns=NON_FEATURE_COLS, errors='ignore').replace(INF_VALUES, 0)

# Target: select Log_Price first and clean it afterwards. Cleaning before the
# assignment would operate on stale variables and the result would be
# overwritten immediately.
y_train = train_df['Log_Price'].replace(INF_VALUES, 0)
y_test = test_df['Log_Price'].replace(INF_VALUES, 0)


In [338]:
# Sanity check: report the dimensions of every split, train first, then test.
split_shapes = [
    ("X_train:", X_train.shape),
    ("y_train:", y_train.shape),
    ("X_test:", X_test.shape),
    ("y_test:", y_test.shape),
]
for label, shape in split_shapes:
    print(label, shape)


X_train: (2876, 388)
y_train: (2876,)
X_test: (720, 388)
y_test: (720,)


Using ridge regression to regularise the linear regression. This will handle multicollinearity from the many encoded columns.

In [339]:
from sklearn.linear_model import Ridge

# Fit an L2-regularised linear model (alpha=1.0) on the training split and
# predict the log-price target for the held-out test features.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred = ridge.predict(X_test)


In [340]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# The model is trained on log prices; exponentiate both the truth and the
# predictions so the error metrics are expressed on the original price scale.
y_test_actual, y_pred_actual = np.exp(y_test), np.exp(y_pred)

rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
mae = mean_absolute_error(y_test_actual, y_pred_actual)
r2 = r2_score(y_test_actual, y_pred_actual)

print("RMSE (£):", rmse)
print("MAE (£):", mae)
print("R²:", r2)


RMSE (£): 1345.8227782167423
MAE (£): 909.328846103928
R²: 0.9149286867799103


## Evaluation

Residuals are centered around zero with no obvious curvature, and predictive accuracy is strong.

However, there is a larger spread at lower predicted values (log price 7-8) and a tighter spread at higher values (9-10). This indicates heteroscedasticity: the model predicts expensive cars more accurately than cheap ones.

## Refine Features

- Removing features with coefficients near zero.

- Combining Sparse Categories

- Checking multicollinearity

In [341]:


# Variance inflation factor for each numeric feature in the training set.
# Columns that are exact linear combinations of others yield VIF = inf (and a
# divide-by-zero RuntimeWarning from statsmodels).
X_numeric = X_train.select_dtypes(include=np.number)
vif = pd.DataFrame(
    {
        "feature": X_numeric.columns,
        "VIF": [
            variance_inflation_factor(X_numeric.values, col_idx)
            for col_idx, _ in enumerate(X_numeric.columns)
        ],
    }
)
# Show the 30 most collinear features first.
print(vif.sort_values("VIF", ascending=False).head(30))


                feature          VIF
0        Mileage(miles)          inf
13        Mileage_Delta          inf
12     Expected_Mileage          inf
7               Mileage          inf
8               Car_Age          inf
1     Registration_Year  1794.714608
17      Car_Age_Squared    31.616327
16      Engine_per_Seat    11.608250
3                Engine    10.806069
18          Premium_Age     7.433333
11           Is_Premium     7.175358
10          Log_Mileage     6.448670
9      Mileage_per_Year     6.390838
5                 Seats     4.429321
14      Owners_per_Year     3.583434
2       Previous Owners     2.861193
15        Is_Family_Car     2.534683
4                 Doors     1.723911
6   Has_Service_History     1.060145


  vif = 1. / (1. - r_squared_i)


In [342]:

# Define the estimator explicitly so the cell does not depend on a `model`
# variable left over in the kernel from an earlier session.
# NOTE(review): assumed to be an unregularised LinearRegression - the strongly
# negative fold scores recorded for this cell are consistent with OLS on
# collinear features. Confirm, or pass `ridge` to score the regularised model.
model = LinearRegression()

# 5-fold cross-validated R² on the training split (target is on the log scale).
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
print("CV R²:", scores, "Mean:", scores.mean())


CV R²: [   0.81383981  -35.65766985 -728.30339922 -152.37762763   -2.30494149] Mean: -183.56595967673044


## Fix Multicollinearity 

In [344]:
# Features removed to reduce multicollinearity.
drop_cols = [
    "Registration_Year",
    "Mileage(miles)",      # keep Log_Mileage instead
    "Engine_per_Seat",     # redundant
    "Is_Premium"           # redundant if brand dummies exist
]

# errors="ignore" skips any listed column that is not present, so the cell
# can be re-run safely after the columns have already been dropped.
X_train = X_train.drop(columns=drop_cols, errors="ignore")
X_test = X_test.drop(columns=drop_cols, errors="ignore")


In [345]:
# Display the feature columns that remain in the test set.
X_test.columns

Index(['Previous Owners', 'Engine', 'Doors', 'Seats', 'Has_Service_History',
       'Mileage', 'Car_Age', 'Mileage_per_Year', 'Log_Mileage',
       'Expected_Mileage',
       ...
       'Model_ZR', 'Model_ZT', 'Model_Zafira', 'Model_Zafira Tourer',
       'Usage_Level_Low', 'Usage_Level_Normal', 'Usage_Level_Very High',
       'Door_Category_Family', 'Door_Category_Sedan', 'Door_Category_Small'],
      dtype='object', length=384)

In [347]:
import pickle

# Save X
# Persist each split to its own pickle file (features first, then targets).
split_files = {
    "Data/X_train.pkl": X_train,
    "Data/X_test.pkl": X_test,
    "Data/y_train.pkl": y_train,
    "Data/y_test.pkl": y_test,
}

for path, split in split_files.items():
    with open(path, "wb") as f:
        pickle.dump(split, f)

print("Train/test sets saved successfully.")


Train/test sets saved successfully.


In [None]:
# Final sanity check of the saved splits: shapes of features and targets.
feature_shapes = (X_train.shape, X_test.shape)
target_shapes = (y_train.shape, y_test.shape)
print(*feature_shapes)
print(*target_shapes)

# Peek at the first five feature names and the first target values.
print(X_train.columns[:5])
print(y_train.head())
