In [3]:
pip install pandas
pip install numpy 
pip install scikit-learn



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


In [73]:
# Load the raw retail inventory data (path is relative to the notebook's working directory).
df = pd.read_csv("./Data/retail_store_inventory.csv")

In [74]:
# Preview the first rows to check the available columns and value formats.
df.head()

Unnamed: 0,Date,Store ID,Product ID,Category,Region,Inventory Level,Units Sold,Units Ordered,Demand Forecast,Price,Discount,Weather Condition,Holiday/Promotion,Competitor Pricing,Seasonality
0,2022-01-01,S001,P0001,Groceries,North,231,127,55,135.47,33.5,20,Rainy,0,29.69,Autumn
1,2022-01-01,S001,P0002,Toys,South,204,150,66,144.04,63.01,20,Sunny,0,66.16,Autumn
2,2022-01-01,S001,P0003,Toys,West,102,65,51,74.02,27.99,10,Sunny,1,31.32,Summer
3,2022-01-01,S001,P0004,Toys,North,469,61,164,62.18,32.72,10,Cloudy,1,34.74,Autumn
4,2022-01-01,S001,P0005,Electronics,East,166,14,135,9.26,73.64,0,Sunny,0,68.95,Summer


In [75]:
# Checking importance of "Store ID": one-way ANOVA of 'Units Sold' across stores.

from scipy.stats import f_oneway

stores = df['Store ID'].unique()
# One 'Units Sold' Series per store, in the order the stores first appear.
groups = [df.loc[df['Store ID'] == store_id, 'Units Sold'] for store_id in stores]

# f_oneway takes each group as a separate positional argument.
f_stat, p_value = f_oneway(*groups)
print(f"F-statistic: {f_stat}, P-value: {p_value}")

F-statistic: 2.4061067606526807, P-value: 0.04726220926004178


In [76]:
# Summary statistics of 'Units Sold' for each product.
df.groupby('Product ID')['Units Sold'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Product ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
P0001,3655.0,136.268399,109.315675,0.0,47.0,107.0,201.0,496.0
P0002,3655.0,133.468399,106.417492,0.0,47.0,106.0,194.0,486.0
P0003,3655.0,134.960055,108.380437,0.0,46.0,105.0,203.0,485.0
P0004,3655.0,135.567989,109.263223,0.0,47.0,107.0,202.0,489.0
P0005,3655.0,137.79699,106.854998,0.0,52.0,111.0,204.0,488.0
P0006,3655.0,136.013953,108.80725,0.0,48.0,106.0,201.0,489.0
P0007,3655.0,136.613133,110.505351,0.0,48.0,106.0,204.0,491.0
P0008,3655.0,133.669767,108.675121,0.0,46.0,104.0,197.5,494.0
P0009,3655.0,137.369631,110.560806,0.0,47.0,109.0,204.0,491.0
P0010,3655.0,135.832832,109.721355,0.0,49.0,105.0,200.0,488.0


In [77]:
# Checking importance of "Product ID": one-way ANOVA of 'Units Sold' across products.

from scipy.stats import f_oneway

products = df['Product ID'].unique()
# One 'Units Sold' Series per product, in the order the products first appear.
groups = [df.loc[df['Product ID'] == product_id, 'Units Sold'] for product_id in products]

# f_oneway takes each group as a separate positional argument.
f_stat, p_value = f_oneway(*groups)
print(f"F-statistic: {f_stat}, P-value: {p_value}")


F-statistic: 0.8858477931073849, P-value: 0.6013074706842993


### Feature Engineering

In [78]:
# Parse the 'Date' column and split it into year, month and day-of-month features.

df['Date'] = pd.to_datetime(df['Date'])

# Date components via the .dt accessor; columns are created in the order Year, Month, Day.
df['Year'], df['Month'], df['Day'] = (
    df['Date'].dt.year,
    df['Date'].dt.month,
    df['Date'].dt.day,
)

In [79]:
# One-hot encode the nominal columns; drop_first=True keeps k-1 indicator
# columns for a feature with k levels.

df = pd.get_dummies(
    data=df,
    columns=['Category', 'Region', 'Weather Condition', 'Store ID'],
    drop_first=True,
)

In [80]:
# Remove the raw 'Date' column (already split into Year/Month/Day) and 'Product ID'; mutates df in place.
df.drop(columns=['Date','Product ID'], inplace=True)

In [82]:
# Encode the 'Seasonality' column cyclically (sin/cos) to preserve periodicity.

# Manual mapping of each season label to a number 0-3
season_mapping = {'Spring': 0, 'Summer': 1, 'Autumn': 2, 'Winter': 3}
season_number = df['Seasonality'].map(season_mapping)

# Cyclic encoding: place the four seasons a quarter-turn apart on the unit circle
df['Seasonality_Sin'] = np.sin(2 * np.pi * season_number / 4)
df['Seasonality_Cos'] = np.cos(2 * np.pi * season_number / 4)

# Drop the original Seasonality column now that it is encoded
df.drop(columns=['Seasonality'], inplace=True)



In [94]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def train_model(df, *, target='Units Sold', test_size=0.2, random_state=42,
                n_estimators=100, max_depth=10):
    """Train a RandomForestRegressor on ``df`` and evaluate it on a held-out split.

    Every column of ``df`` other than ``target`` is used as a feature. The
    keyword defaults reproduce the values that were previously hard-coded,
    so ``train_model(df)`` behaves as before.

    NOTE(review): the feature set includes any forecast-style column still
    present in ``df`` (e.g. 'Demand Forecast') -- confirm such columns are
    really available at prediction time, otherwise the scores are optimistic.
    NOTE(review): the split is train_test_split's random shuffle, not a
    chronological split -- confirm that is intended for dated sales records.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the ``target`` column.
    target : str, default 'Units Sold'
        Name of the column to predict.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed used for both the train/test split and the forest.
    n_estimators : int, default 100
        Number of trees in the forest.
    max_depth : int or None, default 10
        Maximum depth of each tree.

    Returns
    -------
    tuple
        ``(model, metrics)`` where ``model`` is the fitted regressor and
        ``metrics`` is a dict with keys 'MAE', 'MSE', 'RMSE' and 'R2'
        computed on the test split.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    X = df.drop(columns=[target])  # Features
    y = df[target]  # Target variable

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Initialize RandomForestRegressor
    model = RandomForestRegressor(
        n_estimators=n_estimators,  # Number of trees
        max_depth=max_depth,        # Tree depth
        random_state=random_state,
        n_jobs=-1                   # Use all CPU cores
    )

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the held-out rows
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-Squared Score (R²): {r2}")

    # Return trained model and metrics
    return model, {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}

    

In [95]:
# Train on the engineered frame; keep the fitted model and its test-split metrics.
model, metrics = train_model(df)


Mean Absolute Error (MAE): 7.156549268748968
Mean Squared Error (MSE): 70.38911896882036
Root Mean Squared Error (RMSE): 8.389822344294327
R-Squared Score (R²): 0.9940548046115956
