# What drives the price of a car ? 

In [83]:
from statsmodels.tsa.seasonal import seasonal_decompose

#import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
#from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score , GridSearchCV

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.ensemble import RandomForestRegressor

In [45]:
import warnings
warnings.filterwarnings("ignore")

In [46]:
pd.set_option('display.max_colwidth', None)

In [78]:
# Reading from the CLEAN and PROCESSED data saved ( df_clean.csv) 

df=pd.read_csv('data/df_clean.csv')
df = df.apply(pd.to_numeric, errors='ignore')
df

Unnamed: 0,price,manufacturer,condition,odometer,title_status,type,car_age
0,33590,gmc,good,57923.0,clean,pickup,11.0
1,22590,chevrolet,good,71229.0,clean,pickup,15.0
2,39590,chevrolet,good,19160.0,clean,pickup,5.0
3,30990,toyota,good,41124.0,clean,pickup,8.0
4,15000,ford,excellent,128000.0,clean,truck,12.0
...,...,...,...,...,...,...,...
365714,23590,nissan,good,32226.0,clean,sedan,6.0
365715,30590,volvo,good,12029.0,clean,sedan,5.0
365716,34990,cadillac,good,4174.0,clean,hatchback,5.0
365717,28990,lexus,good,30112.0,clean,sedan,7.0


In [48]:
# Creating dummies for string values 
df2 = pd.get_dummies(df, drop_first=True)

In [49]:
# Set X to all input features by dropping the target column 'price' from the DataFrame
# Set y to the target variable 'price' from the DataFrame

X = ""
y = ""
X = df2.drop("price", axis=1)
y = df2["price"]

In [50]:
# Split the dataset into training (80%) and testing (20%) sets for model evaluation
train_X , test_X, train_y, test_y = train_test_split(X,y,test_size=0.2,random_state = 42) 

##### ------------------
##### Linear Regression 
##### ------------------

In [51]:
# Create and train a linear regression model to learn the relationship between features and price
reg = LinearRegression().fit(train_X,train_y)

In [81]:
# Evaluate the model performance on unseen test data using R² score
print(f"Test R² Score:" , reg.score(test_X,test_y))
print(f"Test Intercept:" , reg.intercept_)

Test R² Score: 0.5738324121815643
Test Intercept: 39971.86106515444


In [79]:
# Evaluate the model performance on training data using R² score
print(f"Training R² Score:" , reg.score(train_X,train_y))

Training R² Score: 0.575328296514303


In [55]:
# Printing the Coefficient values from linear regression 

coef_df = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": reg.coef_
}).sort_values(by="Coefficient", ascending=False)

print("Coefficient values - Top 10 \n ")
coef_df.head(10)

Coefficient values - Top 10 
 


Unnamed: 0,Feature,Coefficient
12,manufacturer_ferrari,52474.182452
10,manufacturer_datsun,19496.387434
3,manufacturer_aston-martin,16484.272674
39,manufacturer_tesla,14103.098848
34,manufacturer_porsche,10835.412427
61,type_truck,8739.523913
58,type_pickup,6267.037849
36,manufacturer_rover,5737.782177
57,type_offroad,5712.917392
2,manufacturer_alfa-romeo,3553.962975


##### Linear regression scores 
##### R2 on training : 0.575328296514303
##### R2 on testing : 0.5738324121815643
##### R2 score is bad  , on both training and testing. 

##### -------------------------------------------------
##### Linear Regression with forward selection 
##### -------------------------------------------------

In [59]:
# Initialize Variables

remaining_features = list(X.columns)  # All features
selected_features = []               # Start with no features
best_score = 0

In [60]:
# Forward Selection Loop (select top 5 features)
top_n = 5  # Select only top 5 features 

while len(selected_features) < top_n and len(remaining_features) > 0:
    scores_with_candidates = []
    
    for feature in remaining_features:
        # Try adding this feature to the current set
        features_to_try = selected_features + [feature]
        
        # Cross-validation score
        score = cross_val_score(
            LinearRegression(),
            X[features_to_try],
            y,
            cv=5,
            scoring='r2'
        ).mean()
        
        scores_with_candidates.append((score, feature))
    
    # Pick the feature with the highest CV score
    scores_with_candidates.sort(reverse=True)
    best_new_score, best_feature = scores_with_candidates[0]
    
    # Stop if score doesn’t improve
    if best_new_score > best_score:
        remaining_features.remove(best_feature)
        selected_features.append(best_feature)
        best_score = best_new_score
        print(f"Added {best_feature}, CV R² = {best_score:.4f}")
    else:
        break

print("Top 5 selected features:", selected_features)

Added car_age, CV R² = 0.3056
Added odometer, CV R² = 0.3863
Added type_truck, CV R² = 0.4314
Added type_pickup, CV R² = 0.4788
Added type_sedan, CV R² = 0.4973
Top 5 selected features: ['car_age', 'odometer', 'type_truck', 'type_pickup', 'type_sedan']


In [61]:
# Train Model on Selected Features

final_model = LinearRegression()
final_model.fit(train_X[selected_features], train_y)

print("Train R²:", final_model.score(train_X[selected_features], train_y))
print("Test R²:", final_model.score(test_X[selected_features], test_y))

Train R²: 0.5011719829121948
Test R²: 0.49937563759256665


In [62]:
# Feature Importance Table 

coef_df = pd.DataFrame({
    "Feature": selected_features,
    "Coefficient": final_model.coef_
}).sort_values(by="Coefficient", key=abs, ascending=False)

print("Feature importance table - Top 10")
coef_df.head(10)

Feature importance table - Top 10


Unnamed: 0,Feature,Coefficient
2,type_truck,11303.309939
3,type_pickup,8804.053341
4,type_sedan,-4883.452399
0,car_age,-757.842619
1,odometer,-0.084855


##### Lasso Regression 

In [63]:
# Train a Lasso regression model with regularization

lasso_reg = Lasso(alpha=10, max_iter = 10000 , tol = 0.0001)
lasso_reg.fit(train_X, train_y)

In [64]:
print("Lasso Test R²:", lasso_reg.score(test_X,test_y))

Lasso Test R²: 0.5688252785955601


In [82]:
print("Lasso Train R²:", lasso_reg.score(train_X,train_y))

Lasso Train R²: 0.5708030174146866


##### Lasso regression scores 
##### R2 on training :  0.5708030174146866
##### R2 on testing : 0.5688252785955601
##### R2 score is bad  , on both training and testing. 

##### Ridge Regression 

In [66]:
# Train a Ridge regression model with regularization

ridge_reg = Ridge(alpha=30 , max_iter = 10000 , tol = 0.0001)
ridge_reg.fit(train_X,train_y)

In [67]:
print("Ridge Regression Test R²:", ridge_reg.score(test_X, test_y))

Ridge Regression Test R²: 0.5730533171610614


In [68]:
print("Ridge Regression Train R²:", ridge_reg.score(train_X, train_y))

Ridge Regression Train R²: 0.5748613129548226


##### Ridge Regression Scores 
##### R2 on training : 0.5747237547974725
##### R2 on testing : 0.5730533171610614
##### R2 score is bad , on both training and testing.

#####  Ridge Regression with Hyper-Parmeter selection 

In [69]:
# Perform hyperparameter tuning for Ridge regression using GridSearchCV to find the optimal alpha value via 5-fold cross-validation


param_grid = {
    "alpha": [0.01, 0.08 , 0.09 , 0.1, 0.15 , 0.2, 0.3 , 0.4 , 0.5 , 0.6 ,  1, 10,  30, 50, 100, 300]
}

ridge = Ridge(max_iter=10000)

ridge_grid = GridSearchCV(
    ridge,
    param_grid,
    cv=5,
    scoring="r2"
)

ridge_grid.fit(train_X, train_y)

In [70]:
# Finding best alpha 
print("Best alpha:", ridge_grid.best_params_["alpha"])

Best alpha: 0.3


In [71]:
# Finding best training score 
best_ridge = ridge_grid.best_estimator_
print("Best R2 score with Ridge and Hyperparamter selection and 5 fold cross validation " , best_ridge.score(test_X, test_y))

Best R2 score with Ridge and Hyperparamter selection and 5 fold cross validation  0.5738264708199561


In [72]:
# Printing top 10 most important coefficients. 

coef_df = pd.DataFrame({
    "Feature": train_X.columns,
    "Coefficient": best_ridge.coef_
})

# Sort by absolute importance
top_features = coef_df.reindex(
    coef_df.Coefficient.abs().sort_values(ascending=False).index
)

top_features.head(10)

Unnamed: 0,Feature,Coefficient
12,manufacturer_ferrari,51896.818035
10,manufacturer_datsun,19207.294692
3,manufacturer_aston-martin,16180.490964
31,manufacturer_morgan,-15801.461959
39,manufacturer_tesla,14096.973132
13,manufacturer_fiat,-11166.608404
34,manufacturer_porsche,10831.909677
16,manufacturer_harley-davidson,-10102.505073
22,manufacturer_kia,-9173.323259
61,type_truck,8739.395334


##### RandomForest Regressor 

In [73]:
# Creating the RandomForestRegressor model and fitting 

rfr_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

rfr_model.fit(train_X, train_y)

In [74]:
# Calculating and R2 error 

# Calculate R² score on training data
rfr_train_r2 = rfr_model.score(train_X, train_y)

# Calculate R² score on testing data
rfr_test_r2 = rfr_model.score(test_X, test_y)

print(f"Random Forest Train R2 Score: {rfr_train_r2:.4f}")
print(f"Random Forest Test  R2 Score: {rfr_test_r2:.4f}")

Random Forest Train R2 Score: 0.9764
Random Forest Test  R2 Score: 0.8418


In [75]:
importances = pd.Series(rfr_model.feature_importances_, index=train_X.columns)
print("Top 5 important features")
importances.sort_values(ascending=False).head(5)

Top 5 important features


car_age        0.446003
odometer       0.264339
type_truck     0.042991
type_pickup    0.038481
type_sedan     0.021257
dtype: float64