### Random Forest Regressor

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from datetime import datetime

In [3]:
df = pd.read_csv("merged_df.csv")
df.describe()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday,StateHoliday_encoded,isHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,StoreType_encoded,Assortment_encoded,PromoInterval_encoded
count,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0
mean,558.4297,3.998341,5773.819,633.1459,0.8301067,0.3815145,0.1786467,0.04516279,0.8213533,5422.021,7.336981,2008.925,0.5005638,11.64767,1007.011,1.207467,0.9351412,0.3093396
std,321.9087,1.997391,3849.926,464.4117,0.3755392,0.4857586,0.3830564,0.2836559,0.3830564,7706.918,2.674456,4.99937,0.4999999,15.32393,1005.877,1.365376,0.9938011,1.421467
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0,1900.0,0.0,0.0,0.0,0.0,0.0,-1.0
25%,280.0,2.0,3727.0,405.0,1.0,0.0,0.0,0.0,1.0,710.0,6.0,2008.0,0.0,0.0,0.0,0.0,0.0,-1.0
50%,558.0,4.0,5744.0,609.0,1.0,0.0,0.0,0.0,1.0,2325.0,8.0,2010.0,1.0,1.0,2009.0,0.0,0.0,1.0
75%,838.0,6.0,7856.0,837.0,1.0,1.0,0.0,0.0,1.0,6880.0,9.0,2011.0,1.0,22.0,2012.0,3.0,2.0,1.0
max,1115.0,7.0,41551.0,7388.0,1.0,1.0,1.0,3.0,1.0,75860.0,12.0,2015.0,1.0,50.0,2015.0,3.0,2.0,3.0


In [4]:
# Convert date fields to datetime objects
df['Date'] = pd.to_datetime(df['Date'])
df['CompetitionOpenSinceDate'] = pd.to_datetime(df['CompetitionOpenSinceDate'])

# Calculate the duration of competition being open in months
df['CompetitionDurationMonths'] = (
    (df['Date'].dt.year - df['CompetitionOpenSinceDate'].dt.year) * 12 +
    (df['Date'].dt.month - df['CompetitionOpenSinceDate'].dt.month)
)

# Impute missing dates on Promo2Date with a placeholder value, e.g., '1900-01-01'
df['Promo2Date'].fillna('1900-01-01', inplace=True)

# Replace NaNs in 'CompetitionDurationMonths' with the median duration
df['CompetitionDurationMonths'].fillna(df['CompetitionDurationMonths'].median(), inplace=True)

# Drop the original date fields and other non-numeric or unnecessary columns
df = df.drop(['StoreType', 'Assortment', 'StateHoliday', 'PromoInterval', 'CompetitionOpenSinceMonth','CompetitionOpenSinceYear', 'Promo2SinceWeek', 'Promo2SinceYear'], axis=1)

# --> impute median by store type within each year, tell why you used storetype. then once you have the months categorize them in ranges
# --> also remove the thing you did with competitionDate and promo2Date
# --> CompetitionDistanceCategory, show outliers graph

# Convert datetime columns to numeric features
df['Date_Year'] = df['Date'].dt.year
df['Date_Month'] = df['Date'].dt.month
df['Date_Day'] = df['Date'].dt.day

df['CompetitionOpenSinceDate_Year'] = df['CompetitionOpenSinceDate'].dt.year
df['CompetitionOpenSinceDate_Month'] = df['CompetitionOpenSinceDate'].dt.month
df['CompetitionOpenSinceDate_Day'] = df['CompetitionOpenSinceDate'].dt.day

df['Promo2Date'] = pd.to_datetime(df['Promo2Date'])
df['Promo2Date_Year'] = df['Promo2Date'].dt.year
df['Promo2Date_Month'] = df['Promo2Date'].dt.month
df['Promo2Date_Day'] = df['Promo2Date'].dt.day

# Drop the original datetime columns
df = df.drop(['Date', 'CompetitionOpenSinceDate', 'Promo2Date'], axis=1)

In [6]:
# Define the target variables and features
targets = ['Sales', 'Customers']
features = df.drop(targets, axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, df[targets], test_size=0.2, random_state=42)

In [10]:
# Initialize the RandomForestRegressor models
rf_sales = RandomForestRegressor(n_estimators=100, random_state=42)
rf_customers = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the RandomForestRegressor model for sales
rf_sales.fit(X_train, y_train['Sales'])

# Train the RandomForestRegressor model for customers
rf_customers.fit(X_train, y_train['Customers'])

In [None]:
# Predict on test data
sales_predictions = rf_sales.predict(X_test)
customers_predictions = rf_customers.predict(X_test)

In [None]:
# Evaluate performance for Sales
mse_sales = mean_squared_error(y_test['Sales'], sales_predictions)
rmse_sales = np.sqrt(mse_sales)
rmspe_sales = rmse_sales / np.mean(y_test['Sales']) * 100

print(f'Mean Squared Error for Sales: {mse_sales}')
print(f'Root Mean Squared Error for Sales: {rmse_sales}')
print(f'Root Mean Squared Percentage Error for Sales: {rmspe_sales}')

Mean Squared Error for Sales: 681681.3903717938
Root Mean Squared Error for Sales: 825.6399883555748
Root Mean Squared Percentage Error for Sales: 14.329931932467122


In [None]:
# Evaluate performance for Customers
mse_customers = mean_squared_error(y_test['Customers'], customers_predictions)
rmse_customers = np.sqrt(mse_customers)
rmspe_customers = rmse_customers / np.mean(y_test['Customers']) * 100

print(f'Mean Squared Error for Customers: {mse_customers}')
print(f'Root Mean Squared Error for Customers: {rmse_customers}')
print(f'Root Mean Squared Percentage Error for Customers: {rmspe_customers}')

Mean Squared Error for Customers: 5702.327715027378
Root Mean Squared Error for Customers: 75.51375844855941
Root Mean Squared Percentage Error for Customers: 11.947254980312639


In [None]:
# Get feature importances
feature_importances = rf_sales.feature_importances_
feature_importances2 = rf_customers.feature_importances_

# Create a DataFrame to display feature importances
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df2 = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances2})
importance_df2 = importance_df2.sort_values(by='Importance', ascending=False)

# Print the top features
print("Top Features:")
print(importance_df.head())
print(importance_df2.head())

Top Features:
                         Feature  Importance
2                           Open    0.459925
7            CompetitionDistance    0.100048
0                          Store    0.081398
3                          Promo    0.073256
18  CompetitionOpenSinceDate_Day    0.044212
                         Feature  Importance
2                           Open    0.380239
7            CompetitionDistance    0.149041
0                          Store    0.102593
9              StoreType_encoded    0.054498
18  CompetitionOpenSinceDate_Day    0.045410
