In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Load the libraries**

In [4]:
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import matplotlib.pylab as plt
import seaborn as sns
from tqdm import tqdm
import random 
import datetime
import joblib

**Load the data**

In [129]:
sales = pd.read_csv("/content/drive/My Drive/M5 Forcasting/sales_train_evaluation.csv")
price = pd.read_csv("/content/drive/My Drive/M5 Forcasting/sell_prices.csv")
cal = pd.read_csv("/content/drive/My Drive/M5 Forcasting/calendar.csv")

**Pick the random point for testing**

In [153]:
rand_data = random.choice(sales.values)
data_point = pd.DataFrame(data=rand_data.reshape(1,-1),columns=sales.columns)

In [154]:
data_point

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,...,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,FOODS_3_384_CA_4_evaluation,FOODS_3_384,FOODS_3,FOODS,CA_4,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,1,2,0,2,1,0,0,2,2,2,1,0,2,0,1,4,0,1,0,1,0,1,0,1,0,0,0,1,1,1,1,0,0,0,0,0


In [132]:
#https://www.kaggle.com/anshuls235/time-series-forecasting-eda-fe-modelling/notebook#2.-Downcasting

def downcast(df):

   """
     to downcast the data and reduce the size of data
    """

    cols = df.dtypes.index.tolist()
    types = df.dtypes.values.tolist()

    for i,t in enumerate(types):

        if 'int' in str(t):
            if df[cols[i]].min() > np.iinfo(np.int8).min and df[cols[i]].max() < np.iinfo(np.int8).max:
                df[cols[i]] = df[cols[i]].astype(np.int8)
            elif df[cols[i]].min() > np.iinfo(np.int16).min and df[cols[i]].max() < np.iinfo(np.int16).max:
                df[cols[i]] = df[cols[i]].astype(np.int16)
            elif df[cols[i]].min() > np.iinfo(np.int32).min and df[cols[i]].max() < np.iinfo(np.int32).max:
                df[cols[i]] = df[cols[i]].astype(np.int32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.int64)

        elif 'float' in str(t):
            if df[cols[i]].min() > np.finfo(np.float16).min and df[cols[i]].max() < np.finfo(np.float16).max:
                df[cols[i]] = df[cols[i]].astype(np.float16)
            elif df[cols[i]].min() > np.finfo(np.float32).min and df[cols[i]].max() < np.finfo(np.float32).max:
                df[cols[i]] = df[cols[i]].astype(np.float32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.float64)

        elif t == np.object:
            if cols[i] == 'date':
                df[cols[i]] = pd.to_datetime(df[cols[i]], format='%Y-%m-%d')
            else:
                df[cols[i]] = df[cols[i]].astype('category')

    return df  

In [138]:

def preprocessing(data):
    """
     to preprocees the data and make it to supervised machine learning data
    """

    # to downcasast the data and reduce the sizes of data
    pre_data = downcast(data)
    cal_d = downcast(cal)
    price_d = downcast(price)

    #fill sales zero in forecast values of next 28 days sales
    for day in range(1942,1970):
       pre_data['d_' + str(day)] = 0
       pre_data['d_' + str(day)] = pre_data['d_' + str(day)].astype(np.int16)

    #combine the datasets into a single supervised machine learning dataset
    pre_data = pd.melt(pre_data, id_vars=[a for a in data_point.columns if a.find("id")!=-1],
                           value_vars=[a for a in data_point.columns if a.find("d_")==0], var_name='d', value_name='sales')
  
    pre_data = pd.merge(pre_data, cal_d, on='d', how='left')
    pre_data = pd.merge(pre_data, price_d, on=['store_id','item_id','wm_yr_wk'], how='left')

    #fil the missing sell price values by mean imputaion
    pre_data["sell_price"].fillna(pre_data.groupby("id")["sell_price"].transform("mean"), inplace=True)
    
    pre_data.drop(columns=["date","weekday"], inplace=True)
    pre_data['d'] = pre_data['d'].apply(lambda a: a.split('_')[1]).astype(np.int16)

    return pre_data

In [139]:
def feature_engineering(data):

  #label encoding features

   columns = list(data.columns)
   d_types = list(data.dtypes.values)

   for a, d_t in enumerate(d_types):
     if d_t.name == 'object':
       data[columns[a]] = data[columns[a]].astype("category") 

   d_types = list(data.dtypes.values)
   for a, d_t in enumerate(d_types):
     if d_t.name == 'category':
       data[columns[a]] = data[columns[a]].cat.codes
       
  #lag features
   lags = [28, 35, 42, 49, 56, 63, 70, 77]
   for lag in tqdm(lags):
      data["lag_" + str(lag)] = data.groupby("id")["sales"].shift(lag).astype(np.float16)   

  #rolling median features
   data['lag_28'] = data.groupby('id')['sales'].transform(lambda x: x.shift(28)).astype(np.float16) 
   data['rolling_median_7'] = data.groupby('id')['lag_28'].transform(lambda x: x.rolling(7).median())
   data['rolling_median_28'] = data.groupby('id')['lag_28'].transform(lambda x: x.rolling(28).median())
   data['rolling_median_50'] = data.groupby('id')['lag_28'].transform(lambda x: x.rolling(50).median())
   
   data.drop('sales',axis=1,inplace=True)
   data = data[data['d']>1050]

   return data

Forcasting the data point with best model 

In [146]:
def final_fun(data_point):

  pre_data_point = preprocessing(data_point)
  fin_data_point = feature_engineering(pre_data_point)

  model = joblib.load("/content/drive/My Drive/M5 Forcasting/final_model")

  for d in range(1914, 1942):
      data_point['F_' + str(d)] = model.predict(a[a['d']==d])

  forecast_values = [a for a in data_point.columns if a.find("F_") == 0]

  return data_point[forecast_values]

Forcasted value

In [147]:
final_fun(data_point)

100%|██████████| 8/8 [00:00<00:00, 474.37it/s]


Unnamed: 0,F_1914,F_1915,F_1916,F_1917,F_1918,F_1919,F_1920,F_1921,F_1922,F_1923,F_1924,F_1925,F_1926,F_1927,F_1928,F_1929,F_1930,F_1931,F_1932,F_1933,F_1934,F_1935,F_1936,F_1937,F_1938,F_1939,F_1940,F_1941
0,0.464629,0.327662,0.359832,0.441222,0.428819,0.448462,0.537568,0.365309,0.356559,0.347148,0.405443,0.411749,0.462366,0.569566,0.465116,0.332329,0.354548,0.393045,0.389429,0.447126,0.762537,0.466107,0.323186,0.326531,0.421448,0.364692,0.438671,0.711514
