run on virtual_env

    Data Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from datetime import datetime

In [2]:
df_train = pd.read_csv("data/train.csv")
df_train = df_train.drop(columns=["id"])
df_train.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion
0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,2013-01-01,1,BABY CARE,0.0,0
2,2013-01-01,1,BEAUTY,0.0,0
3,2013-01-01,1,BEVERAGES,0.0,0
4,2013-01-01,1,BOOKS,0.0,0


In [3]:
df_stores = pd.read_csv("data/stores.csv")
df_stores.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [4]:
df_holidays = pd.read_csv("data/holidays_events.csv")
df_holidays["date"] = pd.to_datetime(df_holidays["date"])
df_holidays.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [5]:
df_holidays.type.unique()

array(['Holiday', 'Transfer', 'Additional', 'Bridge', 'Work Day', 'Event'],
      dtype=object)

In [31]:
# Oil is an important economic indicator for countries like ecuador
df_oil = pd.read_csv("data/oil.csv")
df_oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [32]:
df_oil.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        1218 non-null   object 
 1   dcoilwtico  1175 non-null   float64
dtypes: float64(1), object(1)
memory usage: 19.2+ KB


In [33]:
df_oil = df_oil.interpolate().bfill()
df_oil.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        1218 non-null   object 
 1   dcoilwtico  1218 non-null   float64
dtypes: float64(1), object(1)
memory usage: 19.2+ KB


In [37]:
df_transactions = pd.read_csv("data/transactions.csv")
df_transactions["date"] =  pd.to_datetime(df_transactions["date"])
df_transactions

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932


    Merge

In [6]:
df_train_w_stores = pd.merge(df_train, df_stores, on="store_nbr", how="left")
df_train_w_stores["date"] = pd.to_datetime(df_train_w_stores["date"])
df_train_w_stores

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster
0,2013-01-01,1,AUTOMOTIVE,0.000,0,Quito,Pichincha,D,13
1,2013-01-01,1,BABY CARE,0.000,0,Quito,Pichincha,D,13
2,2013-01-01,1,BEAUTY,0.000,0,Quito,Pichincha,D,13
3,2013-01-01,1,BEVERAGES,0.000,0,Quito,Pichincha,D,13
4,2013-01-01,1,BOOKS,0.000,0,Quito,Pichincha,D,13
...,...,...,...,...,...,...,...,...,...
3000883,2017-08-15,9,POULTRY,438.133,0,Quito,Pichincha,B,6
3000884,2017-08-15,9,PREPARED FOODS,154.553,1,Quito,Pichincha,B,6
3000885,2017-08-15,9,PRODUCE,2419.729,148,Quito,Pichincha,B,6
3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,Quito,Pichincha,B,6


In [8]:
def get_holiday_info(row, holidays):


    tmp = holidays[(holidays["date"] == row["date"]) & (holidays["transferred"] == False) &
                   (
                       (holidays["locale"] == "National") |
                       ((holidays["locale"] == "Regional") & (holidays["locale_name"] == row["state"])) | 
                       ((holidays["locale"] == "Local") & (holidays["locale_name"] == row["city"]))
                   )
                   ]
    
    if len(tmp) > 0:
        #print(tmp.iloc[0])
        tmp = tmp.reset_index().iloc[0]
        if tmp["type"] == "Work Day":
            return pd.Series({'isHoliday': 0, 'reason': "Work Day"})
        else:
            return pd.Series({'isHoliday': 1, 'reason': "Holiday"})
    else:
        #print("Not a holiday:", row["date"])
        #print("Day num:", row["date"].weekday())
        #input_date = datetime.datetime.strptime(tmp["date"], '%Y-%m-%d')
        if row["date"].weekday() in [5, 6]:
            return pd.Series({'isHoliday': 1, 'reason': "Weekend"})
        else:
            return pd.Series({'isHoliday': 0, 'reason': "Week Day"})
        

In [7]:
df = df_train_w_stores[["date", "city", "state"]].drop_duplicates()
df

Unnamed: 0,date,city,state
0,2013-01-01,Quito,Pichincha
66,2013-01-01,Cayambe,Pichincha
99,2013-01-01,Latacunga,Cotopaxi
165,2013-01-01,Riobamba,Chimborazo
198,2013-01-01,Ibarra,Imbabura
...,...,...,...
3000129,2017-08-15,Loja,Loja
3000228,2017-08-15,Machala,El Oro
3000327,2017-08-15,Esmeraldas,Esmeraldas
3000657,2017-08-15,Manta,Manabi


In [9]:
new_cols = df.apply(get_holiday_info, holidays=df_holidays, axis=1)
new_cols

Unnamed: 0,isHoliday,reason
0,1,Holiday
66,1,Holiday
99,1,Holiday
165,1,Holiday
198,1,Holiday
...,...,...
3000129,0,Week Day
3000228,0,Week Day
3000327,0,Week Day
3000657,0,Week Day


In [10]:
df_n = pd.concat([df, new_cols], axis=1)

In [11]:
df = pd.merge(df_train_w_stores, df_n, on=["date", "city", "state"], how="left")

In [38]:
df = pd.merge(df, df_transactions, on=["date", "store_nbr"], how="left")

In [39]:
df

Unnamed: 0,date,state,city,isHoliday,reason,store_nbr,type,cluster,family,onpromotion,sales,transactions
0,2013-01-01,Pichincha,Quito,1,Holiday,1,D,13,AUTOMOTIVE,0,0.000,
1,2013-01-01,Pichincha,Quito,1,Holiday,1,D,13,BABY CARE,0,0.000,
2,2013-01-01,Pichincha,Quito,1,Holiday,1,D,13,BEAUTY,0,0.000,
3,2013-01-01,Pichincha,Quito,1,Holiday,1,D,13,BEVERAGES,0,0.000,
4,2013-01-01,Pichincha,Quito,1,Holiday,1,D,13,BOOKS,0,0.000,
...,...,...,...,...,...,...,...,...,...,...,...,...
3000883,2017-08-15,Pichincha,Quito,0,Week Day,9,B,6,POULTRY,0,438.133,2155.0
3000884,2017-08-15,Pichincha,Quito,0,Week Day,9,B,6,PREPARED FOODS,1,154.553,2155.0
3000885,2017-08-15,Pichincha,Quito,0,Week Day,9,B,6,PRODUCE,148,2419.729,2155.0
3000886,2017-08-15,Pichincha,Quito,0,Week Day,9,B,6,SCHOOL AND OFFICE SUPPLIES,8,121.000,2155.0


In [61]:
cols = ['date', 'state', 'city', 'isHoliday', 'reason', 'store_nbr', 'type', 'cluster', 'transactions', 'family',  'onpromotion', 'sales']
df = df[cols]
df.head()

Unnamed: 0,date,state,city,isHoliday,reason,store_nbr,type,cluster,transactions,family,onpromotion,sales
0,2013-01-02,Pichincha,Quito,0,Week Day,1,D,13,2111.0,AUTOMOTIVE,0,2.0
1,2013-01-02,Pichincha,Quito,0,Week Day,1,D,13,2111.0,BABY CARE,0,0.0
2,2013-01-02,Pichincha,Quito,0,Week Day,1,D,13,2111.0,BEAUTY,0,2.0
3,2013-01-02,Pichincha,Quito,0,Week Day,1,D,13,2111.0,BEVERAGES,0,1091.0
4,2013-01-02,Pichincha,Quito,0,Week Day,1,D,13,2111.0,BOOKS,0,0.0


In [62]:
df = df[df["date"]>pd.to_datetime("2013-01-01")].reset_index(drop=True)
df

Unnamed: 0,date,state,city,isHoliday,reason,store_nbr,type,cluster,transactions,family,onpromotion,sales
0,2013-01-02,Pichincha,Quito,0,Week Day,1,D,13,2111.0,AUTOMOTIVE,0,2.000
1,2013-01-02,Pichincha,Quito,0,Week Day,1,D,13,2111.0,BABY CARE,0,0.000
2,2013-01-02,Pichincha,Quito,0,Week Day,1,D,13,2111.0,BEAUTY,0,2.000
3,2013-01-02,Pichincha,Quito,0,Week Day,1,D,13,2111.0,BEVERAGES,0,1091.000
4,2013-01-02,Pichincha,Quito,0,Week Day,1,D,13,2111.0,BOOKS,0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...
2999101,2017-08-15,Pichincha,Quito,0,Week Day,9,B,6,2155.0,POULTRY,0,438.133
2999102,2017-08-15,Pichincha,Quito,0,Week Day,9,B,6,2155.0,PREPARED FOODS,1,154.553
2999103,2017-08-15,Pichincha,Quito,0,Week Day,9,B,6,2155.0,PRODUCE,148,2419.729
2999104,2017-08-15,Pichincha,Quito,0,Week Day,9,B,6,2155.0,SCHOOL AND OFFICE SUPPLIES,8,121.000


In [197]:
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2778798 entries, 0 to 55538
Data columns (total 12 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   date          2778798 non-null  datetime64[ns]
 1   state         2778798 non-null  object        
 2   city          2778798 non-null  object        
 3   isHoliday     2778798 non-null  int64         
 4   reason        2778798 non-null  object        
 5   store_nbr     2778798 non-null  int64         
 6   type          2778798 non-null  object        
 7   cluster       2778798 non-null  int64         
 8   transactions  2755071 non-null  float64       
 9   family        2778798 non-null  object        
 10  onpromotion   2778798 non-null  int64         
 11  sales         2778798 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(4), object(5)
memory usage: 275.6+ MB


In [191]:
df_empty = pd.DataFrame()
for i in df["store_nbr"].sort_values().unique():
    df_temp = df[df["store_nbr"]==i].reset_index(drop=True)
    df_temp2 = df_temp[["date","store_nbr","sales"]].groupby(by=["date","store_nbr"]).sum().reset_index()
    first_purchase_date = df_temp2[df_temp2['sales'] != 0].date.iloc[0]
    df_temp = df_temp[df_temp.date >= first_purchase_date]
    df_empty = pd.concat([df_empty, df_temp])
df = df_empty.copy()

In [199]:
df[(df["transactions"].isna()==True)&(df["isHoliday"]==0)]

Unnamed: 0,date,state,city,isHoliday,reason,store_nbr,type,cluster,transactions,family,onpromotion,sales
30162,2015-07-07,Pichincha,Quito,0,Week Day,1,D,13,,AUTOMOTIVE,0,0.000000
30163,2015-07-07,Pichincha,Quito,0,Week Day,1,D,13,,BABY CARE,0,0.000000
30164,2015-07-07,Pichincha,Quito,0,Week Day,1,D,13,,BEAUTY,0,0.000000
30165,2015-07-07,Pichincha,Quito,0,Week Day,1,D,13,,BEVERAGES,0,0.000000
30166,2015-07-07,Pichincha,Quito,0,Week Day,1,D,13,,BOOKS,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
36130,2016-01-04,Manabi,El Carmen,0,Week Day,54,C,3,,POULTRY,0,52.871998
36131,2016-01-04,Manabi,El Carmen,0,Week Day,54,C,3,,PREPARED FOODS,0,5.000000
36132,2016-01-04,Manabi,El Carmen,0,Week Day,54,C,3,,PRODUCE,0,664.128000
36133,2016-01-04,Manabi,El Carmen,0,Week Day,54,C,3,,SCHOOL AND OFFICE SUPPLIES,0,1.000000


In [12]:
df.to_pickle("data/data_prepped.pkl")

People working in public sector usually get their salaries on 15th of each month and at the end of the month. Label these days

16 Apr 2016 -> An earthquake of 7.8. People bought a lot of food and donated them. So, we can check the sales of those days.