In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
from datetime import datetime

In [6]:
# Load the training sales and discard the `id` column in a single expression,
# then preview the first rows.
df_train = pd.read_csv("data/train.csv").drop(columns=["id"])
df_train.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion
0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,2013-01-01,1,BABY CARE,0.0,0
2,2013-01-01,1,BEAUTY,0.0,0
3,2013-01-01,1,BEVERAGES,0.0,0
4,2013-01-01,1,BOOKS,0.0,0


In [7]:
# Load the store table; it is joined onto the sales rows via `store_nbr` later on.
df_stores = pd.read_csv("data/stores.csv")
df_stores.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [29]:
# Read the holiday/event calendar and parse its `date` column to datetimes so it
# can be compared against the (also parsed) sales dates.
df_holidays = pd.read_csv("data/holidays_events.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
df_holidays.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [30]:
# Distinct calendar entry types (bracket access avoids confusion with the
# built-in name `type`).
df_holidays["type"].unique()

array(['Holiday', 'Transfer', 'Additional', 'Bridge', 'Work Day', 'Event'],
      dtype=object)

In [31]:
# Attach store attributes to every sales row; the left join keeps all sales rows.
df_train_w_stores = df_train.merge(df_stores, how="left", on="store_nbr")

In [32]:
# Parse the sales dates to datetimes so they compare equal to df_holidays["date"].
df_train_w_stores["date"] = df_train_w_stores["date"].pipe(pd.to_datetime)

In [33]:
# Inspect the merged frame after the date conversion.
df_train_w_stores

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster
0,2013-01-01,1,AUTOMOTIVE,0.000,0,Quito,Pichincha,D,13
1,2013-01-01,1,BABY CARE,0.000,0,Quito,Pichincha,D,13
2,2013-01-01,1,BEAUTY,0.000,0,Quito,Pichincha,D,13
3,2013-01-01,1,BEVERAGES,0.000,0,Quito,Pichincha,D,13
4,2013-01-01,1,BOOKS,0.000,0,Quito,Pichincha,D,13
...,...,...,...,...,...,...,...,...,...
3000883,2017-08-15,9,POULTRY,438.133,0,Quito,Pichincha,B,6
3000884,2017-08-15,9,PREPARED FOODS,154.553,1,Quito,Pichincha,B,6
3000885,2017-08-15,9,PRODUCE,2419.729,148,Quito,Pichincha,B,6
3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,Quito,Pichincha,B,6


In [53]:
def get_holiday_info(row, holidays):
    """Classify one sales row as a holiday or a regular day.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``"date"``, ``"state"`` and ``"city"``. The date may be a
        ``Timestamp`` or anything ``pd.Timestamp`` accepts (e.g. an ISO string).
    holidays : pandas.DataFrame
        Calendar with the columns ``"date"``, ``"type"``, ``"locale"``,
        ``"locale_name"`` and ``"transferred"``.

    Returns
    -------
    pandas.Series
        Two entries: ``isHoliday`` (0 or 1) and ``reason`` (one of
        ``"Work Day"``, ``"Holiday"``, ``"Weekend"``, ``"Week Day"``).

    Notes
    -----
    A calendar entry applies to the row when it falls on the row's date, is not
    marked as transferred, and is either national, regional for the row's
    state, or local for the row's city. If several entries apply, only the
    first one (in ``holidays`` row order) decides the outcome.
    """
    # Normalise once so the weekday lookup below also works when the caller
    # passes the date as a string instead of a Timestamp.
    date = pd.Timestamp(row["date"])

    is_national = holidays["locale"] == "National"
    is_regional = (holidays["locale"] == "Regional") & (holidays["locale_name"] == row["state"])
    is_local = (holidays["locale"] == "Local") & (holidays["locale_name"] == row["city"])

    matches = holidays[
        (holidays["date"] == date)
        & holidays["transferred"].eq(False)
        & (is_national | is_regional | is_local)
    ]

    if len(matches) > 0:
        # "Work Day" entries are the only calendar type treated as a non-holiday.
        if matches["type"].iloc[0] == "Work Day":
            return pd.Series({'isHoliday': 0, 'reason': "Work Day"})
        return pd.Series({'isHoliday': 1, 'reason': "Holiday"})

    # No applicable calendar entry: Saturday (5) and Sunday (6) count as days off.
    if date.weekday() in (5, 6):
        return pd.Series({'isHoliday': 1, 'reason': "Weekend"})
    return pd.Series({'isHoliday': 0, 'reason': "Week Day"})
        

In [54]:
# Display the merged frame again before the holiday helper is applied to it.
df_train_w_stores

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster
0,2013-01-01,1,AUTOMOTIVE,0.000,0,Quito,Pichincha,D,13
1,2013-01-01,1,BABY CARE,0.000,0,Quito,Pichincha,D,13
2,2013-01-01,1,BEAUTY,0.000,0,Quito,Pichincha,D,13
3,2013-01-01,1,BEVERAGES,0.000,0,Quito,Pichincha,D,13
4,2013-01-01,1,BOOKS,0.000,0,Quito,Pichincha,D,13
...,...,...,...,...,...,...,...,...,...
3000883,2017-08-15,9,POULTRY,438.133,0,Quito,Pichincha,B,6
3000884,2017-08-15,9,PREPARED FOODS,154.553,1,Quito,Pichincha,B,6
3000885,2017-08-15,9,PRODUCE,2419.729,148,Quito,Pichincha,B,6
3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,Quito,Pichincha,B,6


In [55]:
# Spot-check the helper on a single row (position 78) before running it on the whole frame.
get_holiday_info(df_train_w_stores.iloc[78], df_holidays)

isHoliday          1
reason       Holiday
dtype: object

In [57]:
# Take an independent copy of the merged frame to use as input for the holiday features.
df = df_train_w_stores.copy()

In [58]:
# Row-wise apply: get_holiday_info runs once per row of df and filters df_holidays
# each time, so this is slow on large frames. Each returned Series becomes one
# row of new_cols, with the columns `isHoliday` and `reason`.
new_cols = df.apply(get_holiday_info, holidays=df_holidays, axis=1)

In [59]:
# Inspect the derived holiday columns.
new_cols

Unnamed: 0,isHoliday,reason
0,1,Holiday
1,1,Holiday
2,1,Holiday
3,1,Holiday
4,1,Holiday
...,...,...
3000883,0,Week Day
3000884,0,Week Day
3000885,0,Week Day
3000886,0,Week Day


In [60]:
# new_cols was produced by df.apply, so it shares df's index; an index-aligned
# join appends `isHoliday` and `reason` to the right of the existing columns.
df_n = df.join(new_cols)

In [62]:
# Persist the enriched frame as a pickle file (path is relative to the working directory).
df_n.to_pickle("sales_with_holiday_info.pkl")