# Preprocessing Data for training the model

In [2]:
from pytorch_forecasting.data import TimeSeriesDataSet
from sklearn.preprocessing import LabelEncoder

  from tqdm.autonotebook import tqdm


In [50]:
import pandas as pd

# df = pd.read_csv("../sales_data.csv")
df = pd.read_csv("./sales_data_with_dates.csv")

### Ensure correct data types


In [51]:
df["sku"] = df["sku"].astype(str)  # Ensure SKU is categorical
df["time_idx"] = df["time_idx"].astype(int)  # Ensure time_idx is an integer

### Create a mapping for continuous time_idx values

In [52]:
# unique_time_idx = sorted(df["time_idx"].unique())  # Get sorted unique time indexes
# time_idx_mapping = {old: new for new, old in enumerate(unique_time_idx)}  # Map old to new

# # Apply mapping to make time_idx continuous
# df["time_idx"] = df["time_idx"].map(time_idx_mapping)

# # # Save the processed dataset
# # df.to_csv("sales_data_processed.csv", index=False)

# # Verify if time_idx is now continuous
# print(df["time_idx"].head(20))
# print("Is time_idx monotonic increasing?", df["time_idx"].is_monotonic_increasing)
# print("Are time_idx values unique and continuous?", df["time_idx"].nunique() == len(unique_time_idx))

In [53]:
df.head()

Unnamed: 0,distributor_id,industry,sku,category,sales,avg_quarterly_sales,movement_category,quarter,year,total_quarter_sales,...,is_diwali,is_ganesh_chaturthi,is_gudi_padwa,is_eid,is_akshay_tritiya,is_dussehra_navratri,is_onam,is_christmas,time_idx,date
0,DIST036,Healthcare,MDH Garam Masala,Food,4726.09,6345.93,Fast Moving,2,2022,4726.09,...,0,0,1,0,1,0,0,0,26645,2022-04-02
1,DIST007,E-commerce,Shell Helix Ultra 4L,Automotive,4015.0,4141.9,Slow Moving,3,2023,4015.0,...,0,1,0,1,0,1,1,0,4927,2023-09-19
2,DIST012,Retail,PlayStation 5 Digital Edition,Toys,12386.14,13593.1,Medium,2,2024,12386.14,...,0,0,1,1,1,0,0,0,8683,2024-04-10
3,DIST034,E-commerce,PlayStation 5 Digital Edition,Toys,13198.1,13593.1,Medium,4,2020,13198.1,...,1,0,0,1,0,1,0,1,25185,2020-11-14
4,DIST009,Manufacturing,Britannia Good Day Cookies,Food,3148.8,4600.58,Medium,1,2023,3148.8,...,0,0,1,1,0,0,0,0,6160,2023-03-24


In [54]:
# Convert date column to datetime format (replace 'date_column' with actual name)
df["date"] = pd.to_datetime(df["date"])

# Sort data by date and assign continuous time index
df = df.sort_values(["sku", "date"]).reset_index(drop=True)
df["time_idx"] = df.groupby("sku").cumcount()

df.head()
# Save the processed dataset
# df.to_csv("sales_data_with_time_idx.csv", index=False)


Unnamed: 0,distributor_id,industry,sku,category,sales,avg_quarterly_sales,movement_category,quarter,year,total_quarter_sales,...,is_diwali,is_ganesh_chaturthi,is_gudi_padwa,is_eid,is_akshay_tritiya,is_dussehra_navratri,is_onam,is_christmas,time_idx,date
0,DIST002,Hospitality,3M Car Care Kit,Automotive,27771.08,33045.78,Slow Moving,2,2020,27771.08,...,0,0,1,1,1,0,0,0,0,2020-03-23
1,DIST010,E-commerce,3M Car Care Kit,Automotive,26185.03,33045.78,Medium,2,2020,26185.03,...,0,0,1,1,1,0,0,0,1,2020-03-24
2,DIST002,Hospitality,3M Car Care Kit,Automotive,40653.67,33045.78,Fast Moving,3,2020,40653.67,...,0,1,0,0,0,1,1,0,2,2020-08-23
3,DIST010,E-commerce,3M Car Care Kit,Automotive,47283.42,33045.78,Slow Moving,4,2020,47283.42,...,1,0,0,1,0,1,0,1,3,2020-11-13
4,DIST033,E-commerce,3M Car Care Kit,Automotive,29008.09,33045.78,Medium,4,2020,29008.09,...,1,0,0,1,0,1,0,1,4,2020-11-16


<!-- ### Ensure time_idx is sorted -->

In [55]:
df = df.sort_values(by=["time_idx", "sku"]).reset_index(drop=True)

In [56]:
df.head(20)

Unnamed: 0,distributor_id,industry,sku,category,sales,avg_quarterly_sales,movement_category,quarter,year,total_quarter_sales,...,is_diwali,is_ganesh_chaturthi,is_gudi_padwa,is_eid,is_akshay_tritiya,is_dussehra_navratri,is_onam,is_christmas,time_idx,date
0,DIST002,Hospitality,3M Car Care Kit,Automotive,27771.08,33045.78,Slow Moving,2,2020,27771.08,...,0,0,1,1,1,0,0,0,0,2020-03-23
1,DIST033,E-commerce,7UP 600ml Bottle,Beverages,980.93,1285.49,Slow Moving,1,2020,980.93,...,0,0,1,0,0,0,0,0,0,2020-03-25
2,DIST040,Retail,AccuCheck Sugar Test Strips,Health,3783.43,6422.28,Fast Moving,1,2020,3783.43,...,0,0,1,0,0,0,0,0,0,2020-03-23
3,DIST036,Healthcare,Adidas Running Shoes,Sports,1873.04,1700.72,Slow Moving,2,2020,1873.04,...,0,0,1,1,1,0,0,0,0,2020-03-24
4,DIST020,Retail,Adidas Ultraboost Sneakers,Apparel,387.15,842.34,Medium,1,2020,387.15,...,0,0,1,0,0,0,0,0,0,2020-03-23
5,DIST010,E-commerce,Ajanta Wall Clock,Home Goods,399.71,538.67,Fast Moving,2,2020,399.71,...,0,0,1,1,1,0,0,0,0,2020-03-26
6,DIST004,Healthcare,Amaron Battery 45Ah,Automotive,6240.5,15903.38,Fast Moving,1,2020,6240.5,...,0,0,1,0,0,0,0,0,0,2020-03-23
7,DIST007,E-commerce,Amul Butter 500g,Food,1876.48,1674.57,Medium,2,2020,1876.48,...,0,0,1,1,1,0,0,0,0,2020-03-27
8,DIST006,Financial Services,Asian Paints Apcolite Premium Enamel,Paint and ancillaries,7867.81,6108.35,Slow Moving,2,2020,7867.81,...,0,0,1,1,1,0,0,0,0,2020-03-23
9,DIST014,Manufacturing,Asian Paints Dynamo Adhesive,Paint and ancillaries,9545.84,5602.99,Fast Moving,2,2020,9545.84,...,0,0,1,1,1,0,0,0,0,2020-03-24


### Convert categorical columns to string type

In [57]:
categorical_cols = ["sku", "distributor_id", "industry", "category", "movement_category"]
df[categorical_cols] = df[categorical_cols].astype(str)

### Encode categorical variables

In [58]:
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}
for col, encoder in label_encoders.items():
    df[col] = encoder.transform(df[col])

### Define max encoder and prediction lengths


In [59]:
max_encoder_length = 8   # Use last 8 time points as input
max_prediction_length = 4  # Predict the next 4 time points

### Define training cutoff point


In [60]:
training_cutoff = df["time_idx"].max() - max_prediction_length

### Create TimeSeriesDataSet

In [62]:
tft_dataset = TimeSeriesDataSet(
    df,
    time_idx="time_idx",
    target="sales",
    group_ids=["sku"],  # Forecast per SKU
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    time_varying_known_reals=["quarter", "year"],  # Future known variables
    time_varying_unknown_reals=["sales", "avg_quarterly_sales", "total_quarter_sales", "prev_quarter_sales", "movement_category"],
    # time_varying_unknown_categoricals=,
    categorical_encoders={"movement_category": LabelEncoder()},
    target_normalizer=None,  # No normalization for sales
    # allow_missing_timesteps= True
)


### Display dataset structure

In [63]:
tft_dataset

TimeSeriesDataSet[length=7800](
	time_idx='time_idx',
	target='sales',
	group_ids=['sku'],
	weight=None,
	max_encoder_length=8,
	min_encoder_length=8,
	min_prediction_idx=np.int64(0),
	min_prediction_length=4,
	max_prediction_length=4,
	static_categoricals=None,
	static_reals=None,
	time_varying_known_categoricals=None,
	time_varying_known_reals=['quarter', 'year'],
	time_varying_unknown_categoricals=None,
	time_varying_unknown_reals=['sales', 'avg_quarterly_sales', 'total_quarter_sales', 'prev_quarter_sales', 'movement_category'],
	variable_groups=None,
	constant_fill_strategy=None,
	allow_missing_timesteps=False,
	lags=None,
	add_relative_time_idx=False,
	add_target_scales=False,
	add_encoder_length=False,
	target_normalizer=TorchNormalizer(method='identity', center=True, transformation=None, method_kwargs=None),
	categorical_encoders={'movement_category': LabelEncoder(), '__group_id__sku': NaNLabelEncoder(add_nan=False, warn=True)},
	scalers={'quarter': StandardScaler(), 'year': Sta