In [1]:
from statsmodels.tsa.seasonal import seasonal_decompose

#import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
#from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score , GridSearchCV

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Never truncate cell contents when a DataFrame is displayed.
pd.options.display.max_colwidth = None

In [4]:
# Load the cleaned, pre-processed dataset saved earlier as df_clean.csv.
df = pd.read_csv('data/df_clean.csv')


def _to_numeric_if_possible(column):
    """Return `column` converted with pd.to_numeric, or unchanged if it cannot be converted."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # The column holds values that are not parseable as numbers: keep it as-is.
        return column


# Equivalent to `pd.to_numeric(..., errors='ignore')`, but that mode is
# deprecated since pandas 2.2; catching the exception explicitly gives the
# same result without relying on the deprecated option.
df = df.apply(_to_numeric_if_possible)

In [5]:
# One-hot encode the string/categorical columns. drop_first=True keeps k-1
# indicator columns for a column with k levels (the first level is dropped).
df2 = pd.get_dummies(data=df, drop_first=True)

In [6]:
# Split the encoded frame into predictors (every column except the target)
# and the target column itself.
X = df2.drop(columns="price")
y = df2["price"]

##### Linear Regression - forward selection of the top 5 features

In [7]:
def forward_selection_top_k(X, y, k=5, cv=5):
    """
    Greedily select up to k features by forward selection on cross-validated R^2.

    Starting from an empty set, each round fits a LinearRegression on the
    already-selected features plus one candidate, scores it with
    cross_val_score (scoring='r2'), and keeps the candidate with the highest
    mean score.

    Parameters:
        X : pd.DataFrame, numeric predictors
        y : pd.Series or np.array, target variable
        k : int, number of features to select
        cv : int, cross-validation folds

    Returns:
        selected_features : list of selected column names, in the order they
            were picked. It is shorter than k when X has fewer than k columns
            or when no remaining candidate produces a valid (non-NaN) score.
    """
    remaining_features = list(X.columns)
    selected_features = []

    while len(selected_features) < k and remaining_features:
        # (mean_score, feature) of the best valid candidate seen this round.
        best_candidate = None

        for feature in remaining_features:
            features_to_try = selected_features + [feature]
            # Use cross-validated R^2 as selection metric
            scores = cross_val_score(
                LinearRegression(), X[features_to_try], y, cv=cv, scoring='r2'
            )
            mean_score = scores.mean()

            # NaN compares False against every number, so letting it into the
            # ranking could silently select an invalid candidate. Skip it.
            if np.isnan(mean_score):
                continue

            # Tuple comparison: the higher score wins; an exact tie falls back
            # to comparing the feature names.
            candidate = (mean_score, feature)
            if best_candidate is None or candidate > best_candidate:
                best_candidate = candidate

        if best_candidate is None:
            # No remaining feature could be scored; stop with what we have.
            break

        best_feature = best_candidate[1]
        selected_features.append(best_feature)
        remaining_features.remove(best_feature)

    return selected_features

In [8]:
# Run forward selection and keep the five features it picks.
top5_features = forward_selection_top_k(X=X, y=y, k=5)
print("Top 5 selected features:", top5_features)

Top 5 selected features: ['car_age', 'odometer', 'type_truck', 'type_pickup', 'type_sedan']


In [10]:
# Refit a plain linear regression on the selected columns, using every row.
final_model = LinearRegression().fit(X[top5_features], y)

# NOTE: scored on the same rows the model was fitted on, so this is an in-sample R².
r2 = final_model.score(X[top5_features], y)


In [11]:
# Match every selected feature with the coefficient the fitted model gave it.
coef_pairs = [
    (name, weight) for name, weight in zip(top5_features, final_model.coef_)
]

# Rank by absolute coefficient size, largest first. These are raw (unscaled)
# coefficients, so their magnitudes depend on each feature's units.
coef_pairs_sorted = list(coef_pairs)
coef_pairs_sorted.sort(key=lambda pair: abs(pair[1]), reverse=True)

print("\nTop 5 Most Influential Features (sorted by impact):")
for feature, coef in coef_pairs_sorted:
    print(f"{feature}: coefficient = {coef}")

print("\nIntercept:", final_model.intercept_)
print("\n R2 :", r2)


Top 5 Most Influential Features (sorted by impact):
type_truck: coefficient = 11240.699406410857
type_pickup: coefficient = 8774.613900365743
type_sedan: coefficient = -4891.545054972943
car_age: coefficient = -757.9168886459646
odometer: coefficient = -0.08500296772393606

Intercept: 36173.51416554385

 R2 : 0.5008167896526374


##### Ridge model / Hyperparameter tuning with grid search

In [12]:
# Use the forward-selected columns as the feature matrix for the regularised models.
X_selected = X.loc[:, top5_features]

# Ridge estimator and the regularisation strengths to search over.
ridge_model = Ridge()
param_grid = {'alpha': [0.01, 0.1, 1, 10, 50, 100]}

# Show the selected feature matrix as the cell output.
X_selected

Unnamed: 0,car_age,odometer,type_truck,type_pickup,type_sedan
0,11.0,57923.0,False,True,False
1,15.0,71229.0,False,True,False
2,5.0,19160.0,False,True,False
3,8.0,41124.0,False,True,False
4,12.0,128000.0,True,False,False
...,...,...,...,...,...
365714,6.0,32226.0,False,False,True
365715,5.0,12029.0,False,False,True
365716,5.0,4174.0,False,False,False
365717,7.0,30112.0,False,False,True


In [13]:
# 5-fold cross-validated search over the Ridge alphas, ranked by R².
grid = GridSearchCV(ridge_model, param_grid, scoring='r2', cv=5)
grid.fit(X_selected, y)

In [14]:
# Report the winning alpha and its mean cross-validated R².
for label, value in (("Best alpha:", grid.best_params_), ("Best R²:", grid.best_score_)):
    print(label, value)

Best alpha: {'alpha': 0.01}
Best R²: 0.4972525074997973


In [15]:
# Coefficients of the best estimator found by the grid search.
ridge_coef = grid.best_estimator_.coef_

# One row per feature, ordered by absolute coefficient size (largest first).
coef_df = pd.DataFrame(
    {'Feature': X_selected.columns, 'Coefficient': ridge_coef}
)
coef_df = coef_df.sort_values(by='Coefficient', key=abs, ascending=False)

print(coef_df)

       Feature   Coefficient
2   type_truck  11240.694788
3  type_pickup   8774.610992
4   type_sedan  -4891.545091
0      car_age   -757.916900
1     odometer     -0.085003


##### Lasso Regression with GridSearchCV 

In [16]:
# Lasso with a raised iteration cap, tuned over a small grid of alphas.
# Note: this rebinds `param_grid` and `grid`, replacing the Ridge search objects.
lasso = Lasso(max_iter=10000)
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10]}

# 5-fold cross-validated search, ranked by R².
grid = GridSearchCV(estimator=lasso, param_grid=param_grid, scoring='r2', cv=5)
grid.fit(X_selected, y)

for label, value in (
    ("Best alpha:", grid.best_params_),
    ("Best cross-validated R²:", grid.best_score_),
):
    print(label, value)

Best alpha: {'alpha': 0.001}
Best cross-validated R²: 0.49725250620694894


In [17]:
# Coefficients of the best estimator held by `grid` (the Lasso search at this
# point), one row per feature, ordered by absolute size (largest first).
coef_lasso = pd.DataFrame(
    {'Feature': X_selected.columns, 'Coefficient': grid.best_estimator_.coef_}
)
coef_lasso = coef_lasso.sort_values(by='Coefficient', key=abs, ascending=False)

print(coef_lasso)

       Feature   Coefficient
2   type_truck  11240.684955
3  type_pickup   8774.603020
4   type_sedan  -4891.541854
0      car_age   -757.916891
1     odometer     -0.085003
