In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import plotly.express as px
import seaborn as sns
import geopandas as gpd
import shapefile as shp
from shapely.geometry import Point
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Load the reference tables and the raw train/test extracts.
# Both directories are relative to the notebook's working directory.
base_path = 'data/'
raw_path = 'dataRaw/'

holidays_events = pd.read_csv(base_path + 'holidays_events.csv')
items = pd.read_csv(base_path + 'items.csv')
oil = pd.read_csv(base_path + 'oil.csv')
# skipinitialspace=True drops the blanks that follow each delimiter in this file.
oil_INR = pd.read_csv(base_path + 'oil(INR).csv', sep=',', skipinitialspace=True)
stores = pd.read_csv(base_path + 'stores.csv')
transactions = pd.read_csv(base_path + 'transactions.csv')

test = pd.read_csv(raw_path + 'test.csv', sep=',')
test['date'] = pd.to_datetime(test['date'], format='%Y-%m-%d')

# low_memory=False parses each column in a single pass instead of in chunks,
# which avoids mixed-dtype inference on large files.
data = pd.read_csv(raw_path + 'data.csv', sep=',', low_memory=False)
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')
data['unit_sales'] = data['unit_sales'].round(2)
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# data['onpromotion']: the in-place call acts on a temporary selection, triggers
# a chained-assignment warning in recent pandas, and under Copy-on-Write leaves
# `data` untouched (missing promotion flags would silently stay NaN).
data['onpromotion'] = data['onpromotion'].fillna(False)

In [None]:
# Aggregate unit sales to one row per (store, item, year, month).
# Assumes `data` already carries 'year' and 'month' columns -- TODO confirm they
# are present in data.csv (the loading cell does not derive them from 'date').
group_keys = ['store_nbr', 'item_nbr', 'year', 'month']
monthly_sales_simplified = data.groupby(group_keys)['unit_sales'].sum().reset_index()

# groupby returns rows ordered by its keys, i.e. by store first and month last.
# A shuffle=False split on that order would hold out the highest-numbered stores
# rather than the most recent months, so order the rows by calendar period first.
monthly_sales_chrono = (
    monthly_sales_simplified
    .sort_values(['year', 'month', 'store_nbr', 'item_nbr'])
    .reset_index(drop=True)
)

# Features (X) and target (y) for the model.
# This simplistic approach uses only store number, item number, year and month;
# promotions, holidays and oil prices could be added to improve predictions.
X = monthly_sales_chrono[group_keys]
y = monthly_sales_chrono['unit_sales']

# Chronological hold-out: with shuffle=False the last 20% of the (now
# time-ordered) rows become the test set. The boundary month may be shared
# between train and test because the cut is made by row count.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Random Forest handles non-linear relationships; 100 trees is a compromise
# between accuracy and training time. random_state fixes the result across runs.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit on the earlier periods only.
rf_model.fit(X_train, y_train)

# Predict the held-out (later) periods.
y_pred = rf_model.predict(X_test)

# Root Mean Squared Error, expressed in the same units as unit_sales.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Model's RMSE: {rmse}")

In [None]:
# Compare hold-out actuals with the model's predictions.
# Each x position is one test row (a store/item/month combination), not a
# calendar month, so the axis is labelled accordingly.
fig, ax = plt.subplots(figsize=(10, 6))
# Drop the original index so actuals line up positionally with y_pred (an array).
ax.plot(y_test.reset_index(drop=True), label='Actual Sales', color='blue')
ax.plot(y_pred, label='Predicted Sales', color='red')
ax.set_title('Actual vs Predicted Sales')
ax.set_xlabel('Hold-out observation (one store/item/month row)')
ax.set_ylabel('Sales')
ax.legend()
plt.show()

In [None]:
# Stores and items seen during training; forecasts are only meaningful for these.
unique_stores = X_train['store_nbr'].unique()
unique_items = X_train['item_nbr'].unique()

# Last year present in the modelling data. This was previously referenced without
# being defined anywhere in the notebook, which fails on a fresh kernel.
max_year = int(X['year'].max())
months = range(1, 13)  # January to December

# Build the feature grid for the following year.
# For this demonstration only the first store and first item are forecast.
future_months = pd.DataFrame(
    list(product([unique_stores[0]], [unique_items[0]], [max_year + 1], months)),
    columns=['store_nbr', 'item_nbr', 'year', 'month'],
)

# NOTE(review): tree ensembles do not extrapolate beyond the feature range seen
# in training, so year = max_year + 1 is likely scored like the latest training
# year -- treat these numbers as a seasonal profile rather than a trend forecast.
future_sales_predictions = rf_model.predict(future_months)

# Plot the projected monthly sales.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(months, future_sales_predictions, label='Projected Sales', color='green')
ax.set_title(f'Projected Sales for the Year {max_year + 1}')
ax.set_xlabel('Month')
ax.set_ylabel('Sales')
ax.set_xticks(list(months))  # one tick per month
ax.legend()
plt.show()