In [None]:
# Notebook setup: imports plus global pandas / IPython display configuration.
# NOTE(review): OneHotEncoder, pointbiserialr, chi2_contingency, LogisticRegression,
# accuracy_score, classification_report, confusion_matrix and os are not referenced
# by any cell visible in this notebook — confirm before pruning.
import pandas as pd

# Lift pandas' display truncation: no row limit, no column limit, and wide frames
# are printed on one logical line instead of being wrapped.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pointbiserialr
from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from IPython.core.interactiveshell import InteractiveShell
import os
# Echo the value of every top-level expression in a cell, not only the last one.
# This is why cells below can call .head() / .fit() mid-cell and still show output.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the prepared dataset from the working directory and preview the first five rows.
df_clean = pd.read_csv(filepath_or_buffer="df_clean.csv")
df_clean.head(n=5)

In [None]:
# Columns fed to the linear model as predictors.
x_features = [
    'Size', 'Temperature', 'Fuel_Price',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4',
    'CPI', 'Unemployment',
]

# Per-column count of infinite values in the predictor columns.
print(np.isinf(df_clean[x_features]).sum())

# Standardize the predictors and wrap the resulting array back into a labelled frame.
# NOTE(review): the scaler is fit on every row here, i.e. before the train/test split
# performed in the next cell.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_clean[x_features]), columns = x_features)
df_scaled.head(5)

In [None]:
from sklearn.linear_model import LinearRegression

# Predictors are the standardized columns; the target is Weekly_Sales as stored in df_clean.
X = df_scaled
y = df_clean['Weekly_Sales']

# Hold out 20% of the rows for evaluation, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Fit ordinary least squares on the training rows.
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Predictions for the held-out rows; consumed by the metrics and plotting cells.
y_pred = linear_reg.predict(X_test)

In [None]:
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

# Hold-out metrics for the linear regression predictions (y_pred vs. y_test).
lreg_r2 = r2_score(y_test, y_pred)
lreg_mse = mean_squared_error(y_test, y_pred)
lreg_mae = mean_absolute_error(y_test, y_pred)

print(f"MAE: {lreg_mae:.3f}")
print(f"MSE: {lreg_mse:.3f}")
print(f"R^2: {lreg_r2:.3f}")

In [None]:
# Sanity check: print the shapes of the target, the features, the predictions and the test target.
for checked in (y, X, y_pred, y_test):
    print(checked.shape)

In [None]:
import numpy as np

# Actual-vs-predicted scatter for the linear regression model.
# Relies on y_test / y_pred still holding the linear regression split; a later cell
# rebinds y_test for the tree models.

# Endpoints of the y = x reference line: the overall extremes across both series.
min_val = min(y_test.min(), y_pred.min())
max_val = max(y_test.max(), y_pred.max())
diagonal = [min_val, max_val]

# Scatter of held-out actual values against the model's predictions.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.7)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title("Linear Regression Model")

# Dashed red identity line: a point on it is a perfect prediction.
plt.plot(diagonal, diagonal, color='red', linestyle='--')
# Fixed viewing window; anything outside these bounds is not shown.
plt.xlim([-5000, 150000])
plt.ylim([0, 40000])
plt.tight_layout()
plt.show()

In [None]:
# One-hot encode the 'Type' column into 'type_*' indicator columns.
df_clean.head(5)
# This cell rebinds df_clean to a frame that no longer has 'Type', so an unguarded
# second run would raise KeyError. The guard makes re-running the cell a no-op.
if 'Type' in df_clean.columns:
    # Cast to str first so the dummy column names are built from string labels.
    df_clean['Type'] = df_clean['Type'].astype(str)
    df_clean = pd.get_dummies(df_clean, columns = ['Type'], prefix = 'type')
df_clean.head(5)
df_clean.info()

In [None]:
#random forest data
# Feature matrix: every df_clean column except the target and the other columns listed here.
excluded_columns = ['Date', 'Weekly_Sales', 'Lagged_Sales', 'Sales_Growth_Rate', 'Temperature_Zscore']
X_rf = df_clean.drop(columns=excluded_columns)
y_rf = df_clean['Weekly_Sales']

# 80/20 split with a fixed seed. This rebinds X_train / X_test / y_train / y_test,
# which until now held the linear regression split.
X_train, X_test, y_train, y_test = train_test_split(
    X_rf,
    y_rf,
    test_size=0.2,
    random_state=42,
)

In [None]:
#random forest (n_estimators = 50 max_depth = 5)
import time
from sklearn.ensemble import RandomForestRegressor

# Wall-clock timer: the reported duration covers fit, predict, scoring and printing.
start_time = time.time()
rf = RandomForestRegressor(
    n_estimators = 50,
    max_depth = 5,
    # Fixed seed: without it the bootstrap samples differ on every run, so the
    # metrics below (and the results table) would not be reproducible.
    random_state = 42,
)

rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Hold-out metrics, kept under config-specific names for the results table.
rf_r2_50_5 = r2_score(y_test, rf_pred)
rf_mae_50_5 = mean_absolute_error(y_test, rf_pred)
rf_mse_50_5 = mean_squared_error(y_test, rf_pred)
print(f"R^2: {rf_r2_50_5:.3f}")
print(f"MAE: {rf_mae_50_5:.3f}")
print(f"MSE: {rf_mse_50_5:.3f}")

end_time = time.time()
run_time = end_time - start_time
print(run_time)

In [None]:
#random forest (n_estimators = 100 max_depth = 10)
import time
from sklearn.ensemble import RandomForestRegressor

# Wall-clock timer: the reported duration covers fit, predict, scoring and printing.
start_time = time.time()
rf = RandomForestRegressor(
    n_estimators = 100,
    max_depth = 10,
    # Fixed seed: without it the bootstrap samples differ on every run, so the
    # metrics below (and the results table) would not be reproducible.
    random_state = 42,
)

rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Hold-out metrics, kept under config-specific names for the results table.
rf_r2_100_10 = r2_score(y_test, rf_pred)
rf_mae_100_10 = mean_absolute_error(y_test, rf_pred)
rf_mse_100_10 = mean_squared_error(y_test, rf_pred)
print(f"R^2: {rf_r2_100_10:.3f}")
print(f"MAE: {rf_mae_100_10:.3f}")
print(f"MSE: {rf_mse_100_10:.3f}")

end_time = time.time()
run_time = end_time - start_time
print(run_time)

In [None]:
#random forest (n_estimators = 100 max_depth = 100)
import time
from sklearn.ensemble import RandomForestRegressor

# Wall-clock timer: the reported duration covers fit, predict, scoring and printing.
start_time = time.time()
rf = RandomForestRegressor(
    n_estimators = 100,
    max_depth = 100,
    # Fixed seed: without it the bootstrap samples differ on every run, so the
    # metrics below (and the results table) would not be reproducible.
    random_state = 42,
)

rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Hold-out metrics, kept under config-specific names for the results table.
rf_r2_100_100 = r2_score(y_test, rf_pred)
rf_mae_100_100 = mean_absolute_error(y_test, rf_pred)
rf_mse_100_100 = mean_squared_error(y_test, rf_pred)
print(f"R^2: {rf_r2_100_100:.3f}")
print(f"MAE: {rf_mae_100_100:.3f}")
print(f"MSE: {rf_mse_100_100:.3f}")

end_time = time.time()
run_time = end_time - start_time
print(run_time)

In [None]:
#XGBoost Regressor (n = 50)
from xgboost import XGBRegressor
# Uses `time` and the metric functions imported by earlier cells.
# Wall-clock timer: the reported duration covers fit, predict, scoring and printing.
start_time = time.time()
xgb = XGBRegressor(n_estimators = 50)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

xgb_r2_50 = r2_score(y_test, xgb_pred)
xgb_mae_50 = mean_absolute_error(y_test, xgb_pred)
xgb_mse_50 = mean_squared_error(y_test, xgb_pred)
# For a regressor, .score() is the coefficient of determination (R^2), not a
# classification accuracy; the variable name is kept as-is, only the printed label is corrected.
XGB_accuracy_50 = xgb.score(X_test, y_test)*100
print(f"R^2: {xgb_r2_50:.3f}")
print(f"MAE: {xgb_mae_50:.3f}")
print(f"MSE: {xgb_mse_50:.3f}")
print(f"XGBoost R^2 (%) = {XGB_accuracy_50:.3f}")


end_time = time.time()
run_time = end_time - start_time
print(f"Model execution time: {run_time}")

In [None]:
#XGBoost Regressor (n = 200)
# NOTE(review): this run uses n_estimators = 200, while the result variables carry a
# `_100` suffix. The names are kept because the results-table cell reads them;
# confirm whether 100 or 200 trees was intended.
# Uses `time`, `XGBRegressor` and the metric functions imported by earlier cells.
# Wall-clock timer: the reported duration covers fit, predict, scoring and printing.
start_time = time.time()
xgb = XGBRegressor(n_estimators = 200)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

xgb_r2_100 = r2_score(y_test, xgb_pred)
xgb_mae_100 = mean_absolute_error(y_test, xgb_pred)
xgb_mse_100 = mean_squared_error(y_test, xgb_pred)
# For a regressor, .score() is the coefficient of determination (R^2), not a
# classification accuracy; the variable name is kept as-is, only the printed label is corrected.
XGB_accuracy_100 = xgb.score(X_test, y_test)*100
print(f"R^2: {xgb_r2_100:.3f}")
print(f"MAE: {xgb_mae_100:.3f}")
print(f"MSE: {xgb_mse_100:.3f}")
print(f"XGBoost R^2 (%) = {XGB_accuracy_100:.3f}")


end_time = time.time()
run_time = end_time - start_time
print(f"Model execution time: {run_time}")

In [None]:
#table of results
# One row per fitted model. Each label carries the hyperparameters used, so the three
# random forest rows and the two XGBoost rows can be told apart.
results = {
    "Model": [
        "Linear Regression",
        "Random Forest (n_estimators=50, max_depth=5)",
        "Random Forest (n_estimators=100, max_depth=10)",
        "Random Forest (n_estimators=100, max_depth=100)",
        "XGBoost (n_estimators=50)",
        # The second XGBoost run is stored under *_100 names but is fitted with 200 trees.
        "XGBoost (n_estimators=200)",
    ],
    "MSE": [lreg_mse, rf_mse_50_5, rf_mse_100_10, rf_mse_100_100, xgb_mse_50, xgb_mse_100],
    "R^2": [lreg_r2, rf_r2_50_5, rf_r2_100_10, rf_r2_100_100, xgb_r2_50, xgb_r2_100],
    # MAE is computed for every model in the cells above; appended as the last column
    # so the existing column order is unchanged.
    "MAE": [lreg_mae, rf_mae_50_5, rf_mae_100_10, rf_mae_100_100, xgb_mae_50, xgb_mae_100],
}

results_df = pd.DataFrame(results)
print(results_df)

In [None]:
# Write the current df_clean (including the type_* dummy columns created earlier)
# to CSV, without the index column.
df_clean.to_csv(path_or_buf="df_clean_forecasting.csv", index=False)