In [1]:
from statsmodels.tsa.seasonal import seasonal_decompose

import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.forecasting.stl import STLForecast


from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# No width limit on column contents, so long string cells are displayed in full.
pd.set_option("display.max_colwidth", None)

In [4]:
# Load the dataset used for modelling (path is relative to the notebook's
# working directory).
df_clean = pd.read_csv("data/df_clean.csv")

##### Random Forest Regressor

In [5]:
# Separate features and target: `price` is what the model predicts, every
# other column of df_clean is used as a feature.
# (The former `X = ""` / `y = ""` placeholders were dead assignments that were
# overwritten immediately, so they have been removed.)
X = df_clean.drop(columns='price')
y = df_clean['price']

In [6]:
# Split the feature names into numerical and categorical groups.
# Every column that is not listed as numerical is treated as categorical;
# the boolean mask keeps the columns in their original order.
num_cols = ['odometer', 'car_age']
cat_cols = X.columns[~X.columns.isin(num_cols)].tolist()
cat_cols

['manufacturer', 'condition', 'title_status', 'type']

In [7]:
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each feature so the dummies are not perfectly collinear.
# The frame is rebuilt from df_clean instead of from X itself so this cell can
# be re-run safely: once X has been encoded, the columns named in cat_cols no
# longer exist in it and a second `get_dummies(X, columns=cat_cols)` would
# raise a KeyError.
X = pd.get_dummies(df_clean.drop(columns='price'), columns=cat_cols, drop_first=True)


In [8]:
# Split into training and testing sets: 20% of the rows are held out for
# evaluation, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [9]:
# Build the random forest and fit it on the training split.
model = RandomForestRegressor(
    random_state=42,    # fixed seed so repeated runs build the same forest
    n_jobs=-1,          # use all available CPU cores
    n_estimators=300,   # number of trees in the forest
    max_depth=None,     # no explicit cap on tree depth
)

# Last expression of the cell: fit() returns the fitted estimator.
model.fit(X_train, y_train)

In [10]:
# Evaluate on the held-out test set: compute RMSE and R².
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
# RMSE is the square root of the mean squared error, so it is expressed in
# the same units as the target.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [11]:
# Report both evaluation metrics, one per line.
for label, value in (("RMSE:", rmse), ("R²:", r2)):
    print(label, value)

RMSE: 5629.668379920402
R²: 0.8418322634121502


###### Interpreting results:
###### RMSE is the typical prediction error, in dollars (the same units as price).
###### R² is the share of the variance in price that the model explains.
###### Rule of thumb for R²: ~0.6 → decent, ~0.7 → good, ~0.8 → excellent.

In [12]:
# Label each of the fitted forest's feature importances with the name of the
# training column it belongs to.
importances = pd.Series(data=model.feature_importances_, index=X_train.columns)

In [13]:
# Show the five features with the largest importance, highest first.
print("Top 5 important features")
importances.sort_values(ascending=False).iloc[:5]


Top 5 important features


car_age        0.446003
odometer       0.264339
type_truck     0.042991
type_pickup    0.038481
type_sedan     0.021257
dtype: float64