# Problem Formulation

## Problem Description:
This project aims to develop a machine learning model that predicts daily sales
and demand for the next 28 days. The model uses historical sales data together
with external factors, including product details, promotions, seasonality,
holidays, and economic indicators. The goal is to analyse historical patterns and
generate reliable forecasts that help businesses make data-driven decisions to
reduce costs, increase efficiency, and improve customer satisfaction.

## Objectives

- Collect and preprocess historical sales and demand data.
- Identify key features that influence sales trends.
- Build, train, and optimise forecasting models to predict future sales and
  demand.
- Deploy the best-performing model to generate forecasts in real time or in
  batches.

## Data source:
Hierarchical sales data from Walmart, the world’s largest company by revenue, taken from the Kaggle M5 Forecasting – Accuracy competition dataset.



# Code setup

## Important libraries

In [1]:
import pandas as pd
import numpy as np

## Data reading

In [2]:
#sales_validation=pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv')
#sales_validation.head(3)

In [3]:
#cal=pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv')
#cal.head(3)

In [4]:
#sell_price = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sell_prices.csv")
#sell_price.head(3)

In [5]:
# Read the prepared sales table (data.csv) into `data`, the frame used by the
# later cells of this notebook.
data = pd.read_csv(filepath_or_buffer="/kaggle/input/depi-dataset/data.csv")

# Schema formatting

## sales_validation file

In [6]:
# Disabled step, kept for reference: the code is wrapped in a bare string literal,
# so nothing in it runs and the notebook only echoes the string as the cell output.
# If re-enabled, it would reshape `sales_validation` from wide to long with `melt`,
# keeping the six id columns and producing a day label `d` and a `sales` value per row.
'''
data = sales_validation.melt(
    id_vars=["id","item_id", "dept_id","cat_id","store_id","state_id"],  # columns to keep
    var_name="d",                          # new column name for day labels (d_1, d_2, ...)
    value_name="sales"                     # new column name for sales values
)
'''


'\ndata = sales_validation.melt(\n    id_vars=["id","item_id", "dept_id","cat_id","store_id","state_id"],  # columns to keep\n    var_name="d",                          # new column name for day labels (d_1, d_2, ...)\n    value_name="sales"                     # new column name for sales values\n)\n'

In [7]:
#data.drop(columns=['id'],inplace=True)

## calendar file

In [8]:
#cal["date"]= pd.to_datetime(cal["date"])

In [9]:
# Disabled step, kept for reference (bare string literal, not executed).
# If re-enabled, it would replace missing values in the four calendar event
# columns of `cal` with the placeholder label "No event".
'''
cal["event_name_1"]= cal["event_name_1"].fillna("No event")
cal["event_type_1"]= cal["event_type_1"].fillna("No event")
cal["event_name_2"]= cal["event_name_2"].fillna("No event")
cal["event_type_2"]= cal["event_type_2"].fillna("No event")
'''

'\ncal["event_name_1"]= cal["event_name_1"].fillna("No event")\ncal["event_type_1"]= cal["event_type_1"].fillna("No event")\ncal["event_name_2"]= cal["event_name_2"].fillna("No event")\ncal["event_type_2"]= cal["event_type_2"].fillna("No event")\n'

In [10]:
# Disabled step, kept for reference (bare string literal, not executed).
# If re-enabled, it would:
#   1. left-join `cal` onto `data` on the day label `d`;
#   2. collapse the per-state flags snap_CA / snap_TX / snap_WI into a single `snap`
#      column by picking, for each row, the flag that matches its `state_id`
#      (np.select is called without `default`, so a row whose state_id is none of
#      CA/TX/WI gets np.select's default value of 0);
#   3. drop the three per-state flag columns.
'''
# Merge data and cal dataframes on the 'd' column
merged_data = pd.merge(data, cal, on='d', how='left')

conditions = [
    merged_data["state_id"] == "CA",
    merged_data["state_id"] == "TX",
    merged_data["state_id"] == "WI"
]
choices= [
    merged_data["snap_CA"],
    merged_data["snap_TX"],
    merged_data["snap_WI"]
]
merged_data["snap"]= np.select(conditions, choices)

merged_data.drop(columns=['snap_CA','snap_TX','snap_WI'],inplace=True)
'''

'\n# Merge data and cal dataframes on the \'d\' column\nmerged_data = pd.merge(data, cal, on=\'d\', how=\'left\')\n\nconditions = [\n    merged_data["state_id"] == "CA",\n    merged_data["state_id"] == "TX",\n    merged_data["state_id"] == "WI"\n]\nchoices= [\n    merged_data["snap_CA"],\n    merged_data["snap_TX"],\n    merged_data["snap_WI"]\n]\nmerged_data["snap"]= np.select(conditions, choices)\n\nmerged_data.drop(columns=[\'snap_CA\',\'snap_TX\',\'snap_WI\'],inplace=True)\n'

## sell_price file


In [11]:
# Disabled step, kept for reference (bare string literal, not executed).
# If re-enabled, it would left-join `sell_price` onto `temp_data` on
# (store_id, item_id, wm_yr_wk); validate="m:1" asks pandas to raise if those keys
# are not unique on the `sell_price` side.
# NOTE(review): `temp_data` is not assigned anywhere in the visible notebook; the
# calendar-merge cell assigns `merged_data`. Confirm which name is intended before
# re-enabling, otherwise a fresh kernel would hit a NameError here.
'''
final_data = temp_data.merge(
        sell_price,
        on=["store_id", "item_id", "wm_yr_wk"],
        how="left",
        validate="m:1"  # many sales rows per unique price row is expected
    )
    '''

'\nfinal_data = temp_data.merge(\n        sell_price,\n        on=["store_id", "item_id", "wm_yr_wk"],\n        how="left",\n        validate="m:1"  # many sales rows per unique price row is expected\n    )\n    '

In [12]:
#final_data["price_in_dollars"]=final_data["sell_price"]*final_data["sales"]

In [13]:
#final_data.to_csv("/kaggle/working/data.csv")

# EDA

In [14]:
# Count the missing values in each column of the loaded table.
data.isnull().sum()

Unnamed: 0          0
item_id             0
store_id            0
d                   0
sales               0
date                0
wm_yr_wk            0
wday                0
event_name_1        0
event_type_1        0
event_name_2        0
event_type_2        0
snap                0
price_in_dollars    0
dtype: int64

In [15]:
# Show the distinct labels found in each calendar event column, names first
# and then types.
for event_column in ("event_name_1", "event_name_2", "event_type_1", "event_type_2"):
    print(data[event_column].unique())

['No event' "Mother's day" 'Halloween' 'VeteransDay' 'Thanksgiving'
 'Chanukah End' 'Christmas' 'NewYear' 'OrthodoxChristmas'
 'MartinLutherKingDay' 'SuperBowl' 'ValentinesDay' 'PresidentsDay'
 'LentStart' 'LentWeek2' 'Purim End' 'StPatricksDay' 'Easter' 'Pesach End'
 'Cinco De Mayo' 'MemorialDay' 'NBAFinalsStart' 'NBAFinalsEnd'
 'Ramadan starts' 'IndependenceDay' 'Eid al-Fitr' 'LaborDay' 'EidAlAdha'
 'ColumbusDay' "Father's day" 'OrthodoxEaster']


['No event' 'OrthodoxEaster' "Father's day" 'Cinco De Mayo' 'Easter']


['No event' 'Cultural' 'National' 'Religious' 'Sporting']


['No event' 'Religious' 'Cultural']


In [16]:
# Preview the first ten rows of the table.
data.head(n=10)

Unnamed: 0.1,Unnamed: 0,item_id,store_id,d,sales,date,wm_yr_wk,wday,event_name_1,event_type_1,event_name_2,event_type_2,snap,price_in_dollars
0,0,FOODS_1_001,CA_1,d_1,3,2011-01-29,11101,1,No event,No event,No event,No event,0,6.0
1,1,FOODS_1_001,CA_1,d_10,0,2011-02-07,11102,3,No event,No event,No event,No event,1,0.0
2,2,FOODS_1_001,CA_1,d_100,0,2011-05-08,11115,2,Mother's day,Cultural,No event,No event,1,0.0
3,3,FOODS_1_001,CA_1,d_1000,1,2013-10-24,11339,6,No event,No event,No event,No event,0,2.24
4,4,FOODS_1_001,CA_1,d_1001,0,2013-10-25,11339,7,No event,No event,No event,No event,0,0.0
5,5,FOODS_1_001,CA_1,d_1002,1,2013-10-26,11340,1,No event,No event,No event,No event,0,2.24
6,6,FOODS_1_001,CA_1,d_1003,0,2013-10-27,11340,2,No event,No event,No event,No event,0,0.0
7,7,FOODS_1_001,CA_1,d_1004,0,2013-10-28,11340,3,No event,No event,No event,No event,0,0.0
8,8,FOODS_1_001,CA_1,d_1005,0,2013-10-29,11340,4,No event,No event,No event,No event,0,0.0
9,9,FOODS_1_001,CA_1,d_1006,0,2013-10-30,11340,5,No event,No event,No event,No event,0,0.0


# Data-Cleaning

In [17]:
# Disabled step, kept for reference (bare string literal, not executed).
# If re-enabled, it would drop seven columns from `data` in place, one call each:
# sell_price, dept_id, cat_id, state_id, month, year and weekday.
# NOTE(review): none of these columns appear in the `data.isna().sum()` output, so
# they were presumably removed before data.csv was saved — confirm before re-enabling,
# since dropping a column that is absent raises a KeyError.
'''
data.drop(columns = "sell_price",inplace = True)
data.drop(columns="dept_id",inplace = True)
data.drop(columns = "cat_id",inplace = True)
data.drop(columns = "state_id",inplace = True)
data.drop(columns="month",inplace = True)
data.drop(columns="year",inplace = True)
data.drop(columns="weekday",inplace=True)
'''

'\ndata.drop(columns = "sell_price",inplace = True)\ndata.drop(columns="dept_id",inplace = True)\ndata.drop(columns = "cat_id",inplace = True)\ndata.drop(columns = "state_id",inplace = True)\ndata.drop(columns="month",inplace = True)\ndata.drop(columns="year",inplace = True)\ndata.drop(columns="weekday",inplace=True)\n'

In [18]:
# Remove every index column left behind by earlier CSV round trips. A frame saved
# with its index and read back gains an "Unnamed: 0" column, and a second round trip
# adds "Unnamed: 0.1" (visible in the `data.head(10)` output), which dropping only
# "Unnamed: 0" would leave in the data. Selecting by prefix also makes the cell safe
# to re-run: when nothing matches, an empty list is dropped and no KeyError is raised.
index_artifact_columns = [
    column for column in data.columns if str(column).startswith("Unnamed:")
]
data.drop(columns=index_artifact_columns, inplace=True)

In [19]:
# Replace any missing value, in every column, with 0.
data = data.fillna(value=0)

In [20]:
def _chronological_key(column):
    """Sort key: order the `d` labels (d_1, d_2, ...) by day number, not as text.

    pandas calls the key once per sort column; only the `d` column is converted,
    the store and item columns are returned unchanged.
    """
    if column.name == "d":
        # Plain string sorting yields d_1, d_10, d_100, d_1000, ... (as in the
        # `data.head(10)` output), which scrambles the time order of each series.
        return column.str.replace("d_", "", regex=False).astype(int)
    return column


# Order rows by store, then item, then day so that each store/item series is
# contiguous and chronological, then renumber the index to match the new order.
data.sort_values(by=["store_id", "item_id", "d"], key=_chronological_key, inplace=True)
data.reset_index(drop=True, inplace=True)

In [21]:
# Persist the cleaned frame. index=False keeps the row index out of the file;
# written with the index, the CSV gains an unnamed first column that comes back
# as "Unnamed: 0" on the next read_csv (the column this notebook drops after loading).
data.to_csv("/kaggle/working/data.csv", index=False)