# **Packages**

In [308]:
import pandas as pd
import numpy as np
import pickle

from stores_module import *
from recursive_predictions import *
from sales_module import *

# **Datasets**

In [309]:
with open("AngeliqueFile.pkl", "rb") as f:
    transactions = pickle.load(f)

In [310]:
train_set = pd.read_csv("train.csv")
train_set["date"] = pd.to_datetime(train_set["date"])

In [311]:
with open("transac_sales.pkl","rb") as f:
    transac_sales = pickle.load(f)

# **Prediction transactions**

In [312]:
stores_list = list(transactions["store_nbr"].unique())


In [313]:
stores_test = transactions.loc[transactions["date"] >= "2017-08-01","date"].to_frame()
stores_test.drop_duplicates(inplace=True, ignore_index=True)

In [314]:
for store in stores_list:
    store_df = store_data(store)
    no_cat_list = ["not a national event","Not a Nat holiday","Nope"]
    for col in store_df.columns:
        for cat in no_cat_list:
            if store_df[col].dtype == "category" and cat in store_df[col].cat.categories:
                store_df[col] = rename_null_cat(store_df,col,cat)
        if store_df[col].dtype == "category" and len(store_df[col].cat.categories) == 2:
            col_binom = binom_cat_bool(store_df)
            col_binom.binom_bool()
            store_df.loc[:,col] = col_binom.transform()

        elif store_df[col].dtype == "category" and len(store_df[col].cat.categories) > 2:
            col_Encod = my_labelEncoder()
            col_Encod.fit(store_df,col)
            store_df[col] = col_Encod.transform(store_df,col)
    
    _,store_df = frame_time_of_interest(store_df,8) #To generalise we need to precise the month upstream
    split_Lagg = SplitLagg(store_df)
    split_Lagg.transactions_X()
    X_df = split_Lagg.lagg_X(lags=4)
    y_df = split_Lagg.lagg_y(lags=4)
    X_train,y_train,X_test,y_test,test_y = split_train_test(X_df,y_df,date="2017-07-31")
    predictions_store, test_yPred = recurs_Lin_regr(X_train,y_train,X_test,test_y)
    if "store_nbr" not in stores_test.columns:
        stores_test["store_nbr"] = [25] * len(y_test)
        stores_test["value"] = y_test.values
        stores_test["transactions_pred"] = predictions_store
    else:
        store_dict = {"date": y_test.index, "store_nbr": [store] * len(y_test), "value": y_test.values, "transactions_pred": predictions_store}
        store_dict = pd.DataFrame(store_dict)
        stores_test = pd.concat([stores_test, store_dict], ignore_index=True, axis=0)
    stores_test["transactions_pred"] = stores_test["transactions_pred"].astype("int")

In [315]:
stores_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 810 entries, 0 to 809
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               810 non-null    datetime64[ns]
 1   store_nbr          810 non-null    int64         
 2   value              810 non-null    int64         
 3   transactions_pred  810 non-null    int32         
dtypes: datetime64[ns](1), int32(1), int64(2)
memory usage: 22.3 KB


In [316]:
stores_test_52 = stores_test.loc[stores_test["store_nbr"] == 52]
stores_test_52

Unnamed: 0,date,store_nbr,value,transactions_pred
795,2017-08-01,52,2147,1905
796,2017-08-02,52,2499,1975
797,2017-08-03,52,2045,1783
798,2017-08-04,52,2442,2125
799,2017-08-05,52,2837,2765
800,2017-08-06,52,2711,2599
801,2017-08-07,52,2152,1829
802,2017-08-08,52,1997,1644
803,2017-08-09,52,2300,1820
804,2017-08-10,52,2165,3621


# **Preprocessing**

In [342]:
train = train_set[train_set["date"] <"2017-08-01"].copy()
test = train_set[train_set["date"] >= "2017-08-01"].copy()

In [343]:
transactions["date"].min()

Timestamp('2013-01-01 00:00:00')

In [344]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   date                        83488 non-null  datetime64[ns]
 1   store_nbr                   83488 non-null  int64         
 2   transactions                83488 non-null  int64         
 3   National Event              83488 non-null  category      
 4   National holiday            83488 non-null  category      
 5   National period of holiday  83488 non-null  category      
 6   National Workday            83488 non-null  category      
 7   city                        83488 non-null  category      
 8   state                       83488 non-null  category      
 9   type                        83488 non-null  category      
 10  cluster                     83488 non-null  int64         
 11  Local Holiday               83488 non-null  category  

In [345]:
transactions_train = transactions[transactions["date"] < "2017-08-01"].copy()
train = train.merge(transactions_train[["date","transactions","store_nbr","dcoilwtico"]], \
                    on=["date","store_nbr"], how="left", validate="many_to_one")

In [346]:
train.isna().sum()

id                   0
date                 0
store_nbr            0
family               0
sales                0
onpromotion          0
transactions    245784
dcoilwtico      245784
dtype: int64

the null values correspond to the stores that are closed (hence, no transactions are given for those dates) but we do still have the rows for sales = 0.

Next: Create a column with the median sale for that day (per item per store). 

In [347]:
train.dropna(how="any", axis=0, inplace=True)

In [348]:
stores_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 810 entries, 0 to 809
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               810 non-null    datetime64[ns]
 1   store_nbr          810 non-null    int64         
 2   value              810 non-null    int64         
 3   transactions_pred  810 non-null    int32         
dtypes: datetime64[ns](1), int32(1), int64(2)
memory usage: 22.3 KB


In [349]:
test = test.merge(stores_test[["date","store_nbr", "transactions_pred"]], \
                  on=["date","store_nbr"], how="left", validate="many_to_one")

In [350]:
test["transactions"] = test["transactions_pred"]
test.drop("transactions_pred", axis=1, inplace=True)

In [351]:
transactions.loc[transactions["date"] > "2017-07-31","dcoilwtico"]

82678    49.19
82679    49.19
82680    49.19
82681    49.19
82682    49.19
         ...  
83483    47.57
83484    47.57
83485    47.57
83486    47.57
83487    47.57
Name: dcoilwtico, Length: 810, dtype: float64

In [352]:
test = test.merge(transactions.loc[transactions["date"] > "2017-07-31",["date","store_nbr","dcoilwtico"]], on=["date","store_nbr"], how="left", validate="many_to_one")

In [353]:
test.isna().sum()

id              0
date            0
store_nbr       0
family          0
sales           0
onpromotion     0
transactions    0
dcoilwtico      0
dtype: int64

In [354]:
train.isna().sum()

id              0
date            0
store_nbr       0
family          0
sales           0
onpromotion     0
transactions    0
dcoilwtico      0
dtype: int64

In [355]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2728374 entries, 561 to 2974157
Data columns (total 8 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            int64         
 1   date          datetime64[ns]
 2   store_nbr     int64         
 3   family        object        
 4   sales         float64       
 5   onpromotion   int64         
 6   transactions  float64       
 7   dcoilwtico    float64       
dtypes: datetime64[ns](1), float64(3), int64(3), object(1)
memory usage: 187.3+ MB


In [356]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26730 entries, 0 to 26729
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            26730 non-null  int64         
 1   date          26730 non-null  datetime64[ns]
 2   store_nbr     26730 non-null  int64         
 3   family        26730 non-null  object        
 4   sales         26730 non-null  float64       
 5   onpromotion   26730 non-null  int64         
 6   transactions  26730 non-null  int32         
 7   dcoilwtico    26730 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int32(1), int64(3), object(1)
memory usage: 1.5+ MB


In [357]:
blob

NameError: name 'blob' is not defined

In [358]:
sales = pd.concat([train,test], axis=0)

In [359]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2755104 entries, 561 to 26729
Data columns (total 8 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            int64         
 1   date          datetime64[ns]
 2   store_nbr     int64         
 3   family        object        
 4   sales         float64       
 5   onpromotion   int64         
 6   transactions  float64       
 7   dcoilwtico    float64       
dtypes: datetime64[ns](1), float64(3), int64(3), object(1)
memory usage: 189.2+ MB


## **Adding events and holidays**

In [360]:
holid_transac = transactions_cat()

In [361]:
holid_transac.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   date                        83488 non-null  datetime64[ns]
 1   store_nbr                   83488 non-null  int64         
 2   transactions                83488 non-null  int64         
 3   National Event              83488 non-null  Int64         
 4   National holiday            83488 non-null  Int64         
 5   National period of holiday  83488 non-null  Int64         
 6   National Workday            83488 non-null  int32         
 7   city                        83488 non-null  Int64         
 8   state                       83488 non-null  Int64         
 9   type                        83488 non-null  Int64         
 10  cluster                     83488 non-null  int64         
 11  Local Holiday               83488 non-null  int32     

In [362]:
sales["date"].min()

Timestamp('2013-01-01 00:00:00')

In [363]:
sales = sales.merge(holid_transac[["date","store_nbr", "payday","Local Holiday","Regional Holiday","Workday","National Workday", "National Event","National holiday",\
                                   "National period of holiday","Transfer"]],\
                                    on=["date","store_nbr"], how="left", validate="many_to_one")

In [364]:
sales.isna().sum()

id                            0
date                          0
store_nbr                     0
family                        0
sales                         0
onpromotion                   0
transactions                  0
dcoilwtico                    0
payday                        0
Local Holiday                 0
Regional Holiday              0
Workday                       0
National Workday              0
National Event                0
National holiday              0
National period of holiday    0
Transfer                      0
dtype: int64

In [365]:
#
#ols_to_fill = ["payday","Local Holiday","Regional Holiday","Workday","National Workday","National Event","National holiday",\
                                                                        "National period of holiday","Transfer"]
#or date in dates_missing:
#   ref_row = sales[(sales["date"] == date)&(sales["transactions"] != 0)]
#   if not ref_row.empty:
#       ref_values = ref_row.iloc[0][cols_to_fill]
#       sales.loc[(sales["date"] == date)&(sales["transactions"] == 0), cols_to_fill] = ref_values.values

IndentationError: unexpected indent (2826566898.py, line 3)

In [366]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2755104 entries, 0 to 2755103
Data columns (total 17 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   id                          int64         
 1   date                        datetime64[ns]
 2   store_nbr                   int64         
 3   family                      object        
 4   sales                       float64       
 5   onpromotion                 int64         
 6   transactions                float64       
 7   dcoilwtico                  float64       
 8   payday                      int32         
 9   Local Holiday               int32         
 10  Regional Holiday            int32         
 11  Workday                     int32         
 12  National Workday            int32         
 13  National Event              Int64         
 14  National holiday            Int64         
 15  National period of holiday  Int64         
 16  Transfer          

In [367]:
sales.drop(sales.loc[sales["date"].isin(["2016-01-01","2016-01-03"])].index, axis=0, inplace=True)

  sales.drop(sales.loc[sales["date"].isin(["2016-01-01","2016-01-03"])].index, axis=0, inplace=True)


In [368]:
sales.isna().sum()

id                            0
date                          0
store_nbr                     0
family                        0
sales                         0
onpromotion                   0
transactions                  0
dcoilwtico                    0
payday                        0
Local Holiday                 0
Regional Holiday              0
Workday                       0
National Workday              0
National Event                0
National holiday              0
National period of holiday    0
Transfer                      0
dtype: int64

In [369]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2755104 entries, 0 to 2755103
Data columns (total 17 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   id                          int64         
 1   date                        datetime64[ns]
 2   store_nbr                   int64         
 3   family                      object        
 4   sales                       float64       
 5   onpromotion                 int64         
 6   transactions                float64       
 7   dcoilwtico                  float64       
 8   payday                      int32         
 9   Local Holiday               int32         
 10  Regional Holiday            int32         
 11  Workday                     int32         
 12  National Workday            int32         
 13  National Event              Int64         
 14  National holiday            Int64         
 15  National period of holiday  Int64         
 16  Transfer          

In [370]:
sales.isna().sum()

id                            0
date                          0
store_nbr                     0
family                        0
sales                         0
onpromotion                   0
transactions                  0
dcoilwtico                    0
payday                        0
Local Holiday                 0
Regional Holiday              0
Workday                       0
National Workday              0
National Event                0
National holiday              0
National period of holiday    0
Transfer                      0
dtype: int64

## **Adding a column median_sales**

In [371]:
sales["month"] = sales["date"].dt.month
sales["day"] = sales["date"].dt.day
grouped_sales = sales.groupby(["family","store_nbr","month","day"])["sales"].mean().reset_index(name="Avr_sales")
grouped_sales.rename(columns={"sales":"Avr_sales"}, inplace=True)
sales = sales.merge(grouped_sales, on=["family","store_nbr","month","day"], how="left", validate="many_to_one")


In [372]:
sales.drop(["month","day"], axis=1, inplace=True)

In [373]:
sales.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,transactions,dcoilwtico,payday,Local Holiday,Regional Holiday,Workday,National Workday,National Event,National holiday,National period of holiday,Transfer,Avr_sales
0,561,2013-01-01,25,AUTOMOTIVE,0.0,0,770.0,93.14,0,0,0,0,0,0,8,0,0,2.0
1,562,2013-01-01,25,BABY CARE,0.0,0,770.0,93.14,0,0,0,0,0,0,8,0,0,0.5
2,563,2013-01-01,25,BEAUTY,2.0,0,770.0,93.14,0,0,0,0,0,0,8,0,0,2.25
3,564,2013-01-01,25,BEVERAGES,810.0,0,770.0,93.14,0,0,0,0,0,0,8,0,0,2440.25
4,565,2013-01-01,25,BOOKS,0.0,0,770.0,93.14,0,0,0,0,0,0,8,0,0,0.0


# **Predictions looping through families and stores**

In [374]:
blob

NameError: name 'blob' is not defined

In [375]:
items_list = list(sales["family"].unique())

In [376]:
items_list

['AUTOMOTIVE',
 'BABY CARE',
 'BEAUTY',
 'BEVERAGES',
 'BOOKS',
 'BREAD/BAKERY',
 'CELEBRATION',
 'CLEANING',
 'DAIRY',
 'DELI',
 'EGGS',
 'FROZEN FOODS',
 'GROCERY I',
 'GROCERY II',
 'HARDWARE',
 'HOME AND KITCHEN I',
 'HOME AND KITCHEN II',
 'HOME APPLIANCES',
 'HOME CARE',
 'LADIESWEAR',
 'LAWN AND GARDEN',
 'LINGERIE',
 'LIQUOR,WINE,BEER',
 'MAGAZINES',
 'MEATS',
 'PERSONAL CARE',
 'PET SUPPLIES',
 'PLAYERS AND ELECTRONICS',
 'POULTRY',
 'PREPARED FOODS',
 'PRODUCE',
 'SCHOOL AND OFFICE SUPPLIES',
 'SEAFOOD']

In [377]:
sales.loc[sales["family"] == "BREAD/BAKERY","family"] = "BREAD"

In [378]:
items_list = list(sales["family"].unique())
items_list

['AUTOMOTIVE',
 'BABY CARE',
 'BEAUTY',
 'BEVERAGES',
 'BOOKS',
 'BREAD',
 'CELEBRATION',
 'CLEANING',
 'DAIRY',
 'DELI',
 'EGGS',
 'FROZEN FOODS',
 'GROCERY I',
 'GROCERY II',
 'HARDWARE',
 'HOME AND KITCHEN I',
 'HOME AND KITCHEN II',
 'HOME APPLIANCES',
 'HOME CARE',
 'LADIESWEAR',
 'LAWN AND GARDEN',
 'LINGERIE',
 'LIQUOR,WINE,BEER',
 'MAGAZINES',
 'MEATS',
 'PERSONAL CARE',
 'PET SUPPLIES',
 'PLAYERS AND ELECTRONICS',
 'POULTRY',
 'PREPARED FOODS',
 'PRODUCE',
 'SCHOOL AND OFFICE SUPPLIES',
 'SEAFOOD']

In [379]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2755104 entries, 0 to 2755103
Data columns (total 18 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   id                          int64         
 1   date                        datetime64[ns]
 2   store_nbr                   int64         
 3   family                      object        
 4   sales                       float64       
 5   onpromotion                 int64         
 6   transactions                float64       
 7   dcoilwtico                  float64       
 8   payday                      int32         
 9   Local Holiday               int32         
 10  Regional Holiday            int32         
 11  Workday                     int32         
 12  National Workday            int32         
 13  National Event              Int64         
 14  National holiday            Int64         
 15  National period of holiday  Int64         
 16  Transfer          

In [380]:
print(sales.isna().sum())

id                            0
date                          0
store_nbr                     0
family                        0
sales                         0
onpromotion                   0
transactions                  0
dcoilwtico                    0
payday                        0
Local Holiday                 0
Regional Holiday              0
Workday                       0
National Workday              0
National Event                0
National holiday              0
National period of holiday    0
Transfer                      0
Avr_sales                     0
dtype: int64


In [381]:
sales.loc

<pandas.core.indexing._LocIndexer at 0x195013c3a20>

In [382]:
stores_list[0]

25

In [383]:
AUTOMOTIVE_df = family_df(family_name="AUTOMOTIVE",df=sales, month=8)

In [384]:
auto_52 = AUTOMOTIVE_df.loc[AUTOMOTIVE_df["store_nbr"] == 52].copy()
auto_52.drop("store_nbr", axis=1, inplace=True)
to_be_lagged = auto_52[["sales","transactions","weekday"]].copy()
lagg = SplitLagg_sale(to_be_lagged,"sales")
lagg.sales_X()
X_lagged = lagg.lagg_X(4)
y_lagged = lagg.lagg_y(4)
cols_to_remove = list(to_be_lagged.columns)
auto_52.drop(cols_to_remove, axis=1, inplace=True)
auto_52 = pd.concat([auto_52,X_lagged,y_lagged], axis=1)
X = auto_52.drop("var1 y(t)", axis=1)
y = auto_52[["var1 y(t)"]]
X_train = X.loc[:"2017-07-31"]
y_train = y.loc[:"2017-07-31"]
X_test = X.loc["2017-08-01":]
y_test = y.loc["2017-08-01":]
y_test

Unnamed: 0_level_0,var1 y(t)
date,Unnamed: 1_level_1
2017-08-01,5.0
2017-08-02,19.0
2017-08-03,19.0
2017-08-04,9.0
2017-08-05,15.0
2017-08-06,24.0
2017-08-07,5.0
2017-08-08,8.0
2017-08-09,11.0
2017-08-10,7.0


In [385]:
aug_dates_25 = list(sales.loc[(sales["store_nbr"] == stores_list[0])&(sales["date"] >= "2017-08-01"),"date"].unique())

In [386]:
for item in items_list:
    item_df = family_df(family_name=item,df=sales,month=8)
    stores_pred_df = pd.DataFrame(aug_dates_25, columns=["date"])
    for store in stores_list:
        store_df = item_df[item_df["store_nbr"] == store].copy()
        store_df.drop("store_nbr", axis=1, inplace=True)
        to_be_lagged = store_df[["sales","transactions","weekday"]].copy()
        lagg = SplitLagg_sale(to_be_lagged,"sales")
        lagg.sales_X()
        lagg_X = lagg.lagg_X(4)
        lagg_y = lagg.lagg_y(4)
        cols_to_remove = list(to_be_lagged.columns)
        store_df = store_df.drop(cols_to_remove,axis=1)
        store_df = store_df.iloc[4:]
        store_df = pd.concat([store_df,lagg_X,lagg_y],axis=1)
        #split
        X = store_df.drop("var1 y(t)", axis=1)
        y = store_df[["var1 y(t)"]]
        X_train = X.loc[:"2017-07-31"]
        y_train = y.loc[:"2017-07-31"]
        X_test = X.loc["2017-08-01":]
        y_test = y.loc["2017-08-01":]
        index=len(X.columns) - 4 #index of the col from which we add the pred laggs
        test_y = X_test.iloc[:,index:].copy()
        if len(X_test)==0:
            print(f"The store {store} doesn't have any {item}")
            break
        elif len(X_test) < 15:
            print(f"Store {store} has missing dates for item {item}")
        predictions, test_pred = recurs_Lin_regr(X_train,y_train,X_test,test_y)
        if store == stores_list[0]:
            stores_pred_df["store_nbr"] = [store] * len(y_test)
            stores_pred_df["value"] = y_test.values
            stores_pred_df["prediction"] = predictions
        else:
            store_dict_i = {"date":X_test.index, "store_nbr":[store]*len(y_test), "value":y_test["var1 y(t)"],\
                          "prediction":predictions}
            store_dict_i = pd.DataFrame(store_dict_i)
            stores_pred_df = pd.concat([stores_pred_df, store_dict_i], ignore_index=True, axis=0)
            stores_pred_df["prediction"] = stores_pred_df["prediction"].astype("int")
    with open(f"{item}_pred.pkl", "wb") as f:
        pickle.dump(stores_pred_df,f)
        