# **Packages**

In [1]:
import pandas as pd
import numpy as np
import pickle

from stores_module import *
from recursive_predictions import *
from sales_module import *

# **Datasets**

In [2]:
# Load the pickled `transactions` object from disk.
# NOTE(review): pickle.load executes arbitrary code from the file — only use with trusted files.
with open("AngeliqueFile.pkl", mode="rb") as transactions_file:
    transactions = pickle.load(transactions_file)

In [3]:
# Read the raw training data and convert its "date" column to datetime.
train_set = pd.read_csv("train.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

In [4]:
# Load the pickled `transac_sales` object (its "pay_day" column is used further down).
# NOTE(review): pickle.load executes arbitrary code from the file — only use with trusted files.
with open("transac_sales.pkl", mode="rb") as transac_sales_file:
    transac_sales = pickle.load(transac_sales_file)

# **Predicting transactions**

In [5]:
# Distinct store numbers, in order of first appearance in `transactions`.
stores_list = [store_nbr for store_nbr in transactions["store_nbr"].unique()]


In [6]:
# Distinct dates on or after 2017-08-01 found in `transactions`, one row per date.
test_period_dates = transactions.loc[transactions["date"] >= "2017-08-01", "date"]
stores_test = test_period_dates.to_frame().drop_duplicates(ignore_index=True)

In [7]:
# Predict the transactions of every store over the test period and gather the
# results in `stores_test` (columns: date, store_nbr, value, transactions_pred).
#
# Changes compared with the previous version of this cell:
#   * the first store is labelled with its own number instead of a hard-coded 25;
#   * per-store results are collected in a list and concatenated once, so the
#     cell gives the same result when re-run and no longer grows a frame row
#     block by row block inside the loop;
#   * the loop-invariant category list is built once.

# Category labels handed to `rename_null_cat` when present in a categorical column.
# NOTE(review): presumably the "no event / no holiday" placeholders — confirm.
no_cat_list = ["not a national event","Not a Nat holiday","Nope"]

# One result frame per store, concatenated after the loop.
store_frames = []

for store in stores_list:
    store_df = store_data(store)
    for col in store_df.columns:
        for cat in no_cat_list:
            if store_df[col].dtype == "category" and cat in store_df[col].cat.categories:
                store_df[col] = rename_null_cat(store_df,col,cat)
        # Two-category columns go through `binom_cat_bool`; columns with more
        # categories go through `my_labelEncoder`.
        if store_df[col].dtype == "category" and len(store_df[col].cat.categories) == 2:
            col_binom = binom_cat_bool(store_df)
            col_binom.binom_bool()
            store_df.loc[:,col] = col_binom.transform()

        elif store_df[col].dtype == "category" and len(store_df[col].cat.categories) > 2:
            col_Encod = my_labelEncoder()
            col_Encod.fit(store_df,col)
            store_df[col] = col_Encod.transform(store_df,col)

    # TODO: to generalise, the month (8) should be specified upstream instead of being hard-coded here.
    _,store_df = frame_time_of_interest(store_df,8)
    split_Lagg = SplitLagg(store_df)
    split_Lagg.transactions_X()
    X_df = split_Lagg.lagg_X(lags=4)
    y_df = split_Lagg.lagg_y(lags=4)
    X_train,y_train,X_test,y_test,test_y = split_train_test(X_df,y_df,date="2017-07-31")
    predictions_store, test_yPred = recurs_Lin_regr(X_train,y_train,X_test,test_y)

    # Observed values and predictions for this store, keyed by the test dates.
    store_frames.append(
        pd.DataFrame(
            {
                "date": y_test.index,
                "store_nbr": [store] * len(y_test),
                "value": y_test.values,
                "transactions_pred": predictions_store,
            }
        )
    )

if store_frames:
    stores_test = pd.concat(store_frames, ignore_index=True, axis=0)
else:
    # No store processed: keep the expected columns so later cells fail clearly.
    stores_test = pd.DataFrame(columns=["date", "store_nbr", "value", "transactions_pred"])

In [8]:
# Sanity check: column dtypes and non-null counts of the per-store prediction frame.
stores_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 810 entries, 0 to 809
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               810 non-null    datetime64[ns]
 1   store_nbr          810 non-null    int64         
 2   value              810 non-null    int64         
 3   transactions_pred  810 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 25.4 KB


# **Preprocessing**

In [9]:
# Chronological split: rows before 2017-08-01 are used for training, the rest for testing.
is_before_cutoff = train_set["date"] < "2017-08-01"
is_from_cutoff = train_set["date"] >= "2017-08-01"
train = train_set.loc[is_before_cutoff].copy()
test = train_set.loc[is_from_cutoff].copy()

In [10]:
# Attach the observed transactions to the training rows.
# The inner join keeps only (date, store_nbr) pairs present on both sides;
# validate="many_to_one" raises if a pair is duplicated in the transactions frame.
transactions_train = transactions.loc[transactions["date"] < "2017-08-01"].copy()
train = pd.merge(
    train,
    transactions_train[["date", "transactions", "store_nbr"]],
    on=["date", "store_nbr"],
    how="inner",
    validate="many_to_one",
)

In [11]:
# Attach the predicted transactions to the test rows (inner join on date and store;
# validate="many_to_one" raises if a pair is duplicated in `stores_test`).
test = pd.merge(
    test,
    stores_test[["date", "store_nbr", "transactions_pred"]],
    on=["date", "store_nbr"],
    how="inner",
    validate="many_to_one",
)

In [12]:
# In the test set the predicted transactions stand in for the "transactions" feature.
test = test.assign(transactions=test["transactions_pred"]).drop(columns="transactions_pred")

In [13]:
# Stack the training rows on top of the test rows (original index labels are kept).
sales = pd.concat((train, test), axis="index")

## **Adding events and holidays**

In [14]:
# Frame returned by the project helper `transactions_cat`; the next cell reads its
# "date", "store_nbr" and holiday/event columns.
# NOTE(review): assumed to hold one row per (date, store_nbr) — the merge below validates this.
holid_transac = transactions_cat()

In [15]:
# Holiday / event indicator columns taken from `holid_transac`.
holiday_columns = [
    "Local Holiday",
    "Regional Holiday",
    "Workday",
    "National Workday",
    "National Event",
    "National holiday",
    "National period of holiday",
    "Transfer",
]

# Inner join on date and store; raises if a (date, store_nbr) pair is duplicated on the right.
sales = pd.merge(
    sales,
    holid_transac[["date", "store_nbr", *holiday_columns]],
    on=["date", "store_nbr"],
    how="inner",
    validate="many_to_one",
)

In [16]:
# Copy the "pay_day" values into `sales` as "Payday".
# NOTE(review): if `transac_sales["pay_day"]` is a pandas Series, this assignment aligns on
# index labels, so it presumes `transac_sales` shares the index/row order of `sales` — confirm.
sales["Payday"] = transac_sales["pay_day"]

# **Predictions looping through families and stores**

In [17]:
# Distinct product families, in order of first appearance in `sales`.
items_list = [family for family in sales["family"].unique()]

In [18]:
# Display the family names.
items_list

['AUTOMOTIVE',
 'BABY CARE',
 'BEAUTY',
 'BEVERAGES',
 'BOOKS',
 'BREAD/BAKERY',
 'CELEBRATION',
 'CLEANING',
 'DAIRY',
 'DELI',
 'EGGS',
 'FROZEN FOODS',
 'GROCERY I',
 'GROCERY II',
 'HARDWARE',
 'HOME AND KITCHEN I',
 'HOME AND KITCHEN II',
 'HOME APPLIANCES',
 'HOME CARE',
 'LADIESWEAR',
 'LAWN AND GARDEN',
 'LINGERIE',
 'LIQUOR,WINE,BEER',
 'MAGAZINES',
 'MEATS',
 'PERSONAL CARE',
 'PET SUPPLIES',
 'PLAYERS AND ELECTRONICS',
 'POULTRY',
 'PREPARED FOODS',
 'PRODUCE',
 'SCHOOL AND OFFICE SUPPLIES',
 'SEAFOOD']

In [19]:
# Rename the "BREAD/BAKERY" family to "BREAD".
# NOTE(review): presumably because family names are later used in output file names,
# where "/" would be read as a path separator — confirm.
sales["family"] = sales["family"].replace("BREAD/BAKERY", "BREAD")

In [20]:
# Rebuild the family list after the rename above and display it.
items_list = [family for family in sales["family"].unique()]
items_list

['AUTOMOTIVE',
 'BABY CARE',
 'BEAUTY',
 'BEVERAGES',
 'BOOKS',
 'BREAD',
 'CELEBRATION',
 'CLEANING',
 'DAIRY',
 'DELI',
 'EGGS',
 'FROZEN FOODS',
 'GROCERY I',
 'GROCERY II',
 'HARDWARE',
 'HOME AND KITCHEN I',
 'HOME AND KITCHEN II',
 'HOME APPLIANCES',
 'HOME CARE',
 'LADIESWEAR',
 'LAWN AND GARDEN',
 'LINGERIE',
 'LIQUOR,WINE,BEER',
 'MAGAZINES',
 'MEATS',
 'PERSONAL CARE',
 'PET SUPPLIES',
 'PLAYERS AND ELECTRONICS',
 'POULTRY',
 'PREPARED FOODS',
 'PRODUCE',
 'SCHOOL AND OFFICE SUPPLIES',
 'SEAFOOD']

In [21]:
# Sanity check: columns and dtypes of the merged sales frame before modelling.
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2755104 entries, 0 to 2755103
Data columns (total 16 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   id                          int64         
 1   date                        datetime64[ns]
 2   store_nbr                   int64         
 3   family                      object        
 4   sales                       float64       
 5   onpromotion                 int64         
 6   transactions                float64       
 7   Local Holiday               int32         
 8   Regional Holiday            int32         
 9   Workday                     int32         
 10  National Workday            int32         
 11  National Event              Int64         
 12  National holiday            Int64         
 13  National period of holiday  Int64         
 14  Transfer                    int32         
 15  Payday                      int64         
dtypes: Int64(3), datet

In [22]:
# For every product family, predict the sales of each store over the test period
# and pickle the per-family result frame (date, store_nbr, value, prediction)
# to "<family>_pred.pkl".
#
# Fixes compared with the previous version of this cell:
#   * "value" now holds the observed sales (it used to hold y_test.shape[0], the row count);
#   * the pickled object is the family's prediction frame (it used to be `stores_test`,
#     the transactions frame built earlier in the notebook);
#   * the file is written once per family instead of once per store iteration;
#   * a store without test rows is skipped (`continue`) instead of aborting the
#     remaining stores of that family (`break`);
#   * per-store frames are collected in a list and concatenated once, which also
#     removes the special case for the first store.

# Number of lags used for the lagged features, for the leading rows dropped from
# `store_df`, and for the trailing columns of X passed as `test_y`.
# NOTE(review): the original used the literal 4 in all four places — confirm they are meant to stay equal.
n_lags = 4
train_end = "2017-07-31"
test_start = "2017-08-01"

for item in items_list:
    item_df = family_df(family_name=item,df=sales,month=8)
    store_frames = []  # one result frame per store for this family

    for store in stores_list:
        store_df = item_df[item_df["store_nbr"] == store].copy()
        to_be_lagged = store_df[["sales","onpromotion","transactions","Payday","weekday"]].copy()
        lagg = SplitLagg_sale(to_be_lagged,"sales")
        lagg.sales_X()
        lagg_X = lagg.lagg_X(n_lags)
        lagg_y = lagg.lagg_y(n_lags)

        # Replace the raw columns by their lagged versions; the first `n_lags`
        # rows are dropped before the lagged frames are joined column-wise.
        store_df = store_df.drop(columns=list(to_be_lagged.columns))
        store_df = store_df.iloc[n_lags:]
        store_df = pd.concat([store_df,lagg_X,lagg_y],axis=1)

        # Chronological split (label-based slicing on the frame's index).
        X = store_df.drop("var1 y(t)", axis=1)
        y = store_df[["var1 y(t)"]]
        X_train = X.loc[:train_end]
        y_train = y.loc[:train_end]
        X_test = X.loc[test_start:]
        y_test = y.loc[test_start:]

        if len(X_test) == 0:
            print(f"The store {store} doesn't have any {item}")
            continue

        # The last `n_lags` columns of X are handed to the recursive predictor as `test_y`.
        test_y = X_test.iloc[:, len(X.columns) - n_lags:]
        predictions, test_pred = recurs_Lin_regr(X_train,y_train,X_test,test_y)

        store_frames.append(
            pd.DataFrame(
                {
                    "date": X_test.index,
                    "store_nbr": [store] * len(y_test),
                    # y_test is a one-column DataFrame; take the column as a 1-D array.
                    "value": y_test["var1 y(t)"].to_numpy(),
                    "prediction": predictions,
                }
            )
        )

    if not store_frames:
        # Nothing predicted for this family: keep the expected columns, write no file.
        stores_pred_df = pd.DataFrame(columns=["date", "store_nbr", "value", "prediction"])
        continue

    stores_pred_df = pd.concat(store_frames, ignore_index=True, axis=0)
    with open(f"{item}_pred.pkl", "wb") as f:
        pickle.dump(stores_pred_df,f)
        