# Imports

In [1]:
import os
import pandas as pd
import numpy as np
import math
import cmath
import matplotlib.pyplot as plt
import seaborn as sns

# Data

## Declaration

In [2]:
# Render every column and up to 300 rows whenever a DataFrame is shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = 300

In [3]:
# Supporting tables, read in the same order as they are named on the left.
transx, stores, oil, holidays = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/transactions.csv",
        "../data/stores.csv",
        "../data/oil.csv",
        "../data/holidays_events.csv",
    )
)

# Train / test tables plus the sample submission file.
training, testing, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/train.csv",
        "../data/test.csv",
        "../data/sample_submission.csv",
    )
)

## Attributes

### Transactions

In [4]:
# Preview the first rows. The earlier bare `transx.head()` was dropped: it was
# not the cell's last expression, so it produced no output and only repeated
# the work done by this `display` call.
display(transx.head())

# Structure summary: dimensions, total cell count, missing values per column, dtypes.
print(f"Shape: {transx.shape} \n")
print(f"Size: {transx.size} \n")
print(f"Nulls: {transx.isnull().sum()} \n")
print(f"Dtypes: {transx.dtypes} \n")

# Python type of the first `date` value, to see whether the column was parsed on load.
print(f"Date check: {type(transx['date'][0])}")

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


Shape: (83488, 3) 

Size: 250464 

Nulls: date            0
store_nbr       0
transactions    0
dtype: int64 

Dtypes: date            object
store_nbr        int64
transactions     int64
dtype: object 

Date check: <class 'str'>


Date will need to be converted into a datetime object. Values to be explored. 

### Stores

In [5]:
# Preview the store table, then summarise its structure.
stores_preview = stores.head()
display(stores_preview)

stores_shape, stores_size = stores.shape, stores.size
stores_null_counts = stores.isnull().sum()
stores_dtypes = stores.dtypes

print(f"Shape: {stores_shape} \n")
print(f"Size: {stores_size} \n")
print(f"Nulls: {stores_null_counts} \n")
print(f"Dtypes: {stores_dtypes} \n")

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


Shape: (54, 5) 

Size: 270 

Nulls: store_nbr    0
city         0
state        0
type         0
cluster      0
dtype: int64 

Dtypes: store_nbr     int64
city         object
state        object
type         object
cluster       int64
dtype: object 



Store number could be used here to match up to transactions. There are only 54 rows. Will be checking upon the join to transactions if any stores are omitted.  

### Oil

In [6]:
# Preview the oil table, then summarise its structure.
oil_preview = oil.head()
display(oil_preview)

oil_shape, oil_size = oil.shape, oil.size
oil_null_counts = oil.isnull().sum()
oil_dtypes = oil.dtypes

print(f"Shape: {oil_shape} \n")
print(f"Size: {oil_size} \n")
print(f"Nulls: {oil_null_counts} \n")
print(f"Dtypes: {oil_dtypes} \n")

# Python type of the first `date` value, to see whether the column was parsed on load.
first_oil_date = oil["date"][0]
print(type(first_oil_date))

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


Shape: (1218, 2) 

Size: 2436 

Nulls: date           0
dcoilwtico    43
dtype: int64 

Dtypes: date           object
dcoilwtico    float64
dtype: object 

<class 'str'>


Date will need to be converted to datetime objects. Need to make sense of the oil price column. There are also nulls. Must address what to do if dates don't line up. It's likely best to observe how much oil prices fluctuate and then extrapolate if possible. 

### Holidays

In [7]:
# Preview the first rows. The earlier bare `holidays.head()` was dropped: it was
# not the cell's last expression, so it produced no output and only repeated
# the work done by this `display` call.
display(holidays.head())

# Structure summary: dimensions, total cell count, missing values per column, dtypes.
print(f"Shape: {holidays.shape} \n")
print(f"Size: {holidays.size} \n")
print(f"Nulls: {holidays.isnull().sum()} \n")
print(f"Dtypes: {holidays.dtypes} \n")

# Python type of the first `date` value, to see whether the column was parsed on load.
print(f"Date check: {type(holidays['date'][0])}")

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


Shape: (350, 6) 

Size: 2100 

Nulls: date           0
type           0
locale         0
locale_name    0
description    0
transferred    0
dtype: int64 

Dtypes: date           object
type           object
locale         object
locale_name    object
description    object
transferred      bool
dtype: object 

Date check: <class 'str'>


Date needs to be converted to a datetime object. Holidays will be useful for understanding any anomalies our model may pick up. We will keep them in mind when interpreting inaccuracy in scores or any explanation of our model's limits. These anomalies are ordinal, as well. We can likely fine-tune a model to understand these events and treat them in a specific way that best helps us plan for our business' supply and demand.

### Training

In [8]:
# Preview the first rows. The earlier bare `training.head()` was dropped: it was
# not the cell's last expression, so it produced no output and only repeated
# the work done by this `display` call.
display(training.head())

# Structure summary: dimensions, total cell count, missing values per column, dtypes.
print(f"Shape: {training.shape} \n")
print(f"Size: {training.size} \n")
print(f"Nulls: {training.isnull().sum()} \n")
print(f"Dtypes: {training.dtypes} \n")

# Python type of the first `date` value, to see whether the column was parsed on load.
print(f"Date check: {type(training['date'][0])}")

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


Shape: (3000888, 6) 

Size: 18005328 

Nulls: id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
dtype: int64 

Dtypes: id               int64
date            object
store_nbr        int64
family          object
sales          float64
onpromotion      int64
dtype: object 

Date check: <class 'str'>


Date conversion needed here. Surprisingly no nulls. Will be joining up against `date`, after all the coincident columns have matching datetime keys.  

### Testing

In [9]:
# Preview the first rows. The earlier bare `testing.head()` was dropped: it was
# not the cell's last expression, so it produced no output and only repeated
# the work done by this `display` call.
display(testing.head())

# Structure summary: dimensions, total cell count, missing values per column, dtypes.
print(f"Shape: {testing.shape} \n")
print(f"Size: {testing.size} \n")
print(f"Nulls: {testing.isnull().sum()} \n")
print(f"Dtypes: {testing.dtypes} \n")

# Python type of the first `date` value, to see whether the column was parsed on load.
print(f"Date check: {type(testing['date'][0])}")

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


Shape: (28512, 5) 

Size: 142560 

Nulls: id             0
date           0
store_nbr      0
family         0
onpromotion    0
dtype: int64 

Dtypes: id              int64
date           object
store_nbr       int64
family         object
onpromotion     int64
dtype: object 

Date check: <class 'str'>


So the testing data omits some of the columns found in the other data (for example, there is no `sales` column). The question now is: how do we correctly apply all the data we have to work with to the testing data?

## Cleaning

### Datetime Conversion

In [10]:
## Datetime conversion function

# Every table that carries a `date` column and is explored below.
dated_dfs = [transx, oil, holidays, training]
# testing["date"] = pd.to_datetime(testing["date"]) #this is to show how we'll tag our testing df with all the pre-processing we need {A function will be the result of housing all the cleaning}


for frame in dated_dfs:
    # Parse the strings, then keep only the calendar date (plain `datetime.date` objects).
    parsed_dates = pd.to_datetime(frame["date"])
    frame["date"] = parsed_dates.dt.date
    first_value = frame["date"][0]
    print(type(first_value))

<class 'datetime.date'>
<class 'datetime.date'>
<class 'datetime.date'>
<class 'datetime.date'>


With the proper conversion of our dataframes' `"date"` column, we have been able to promote their data types into more expressive data. Pandas is now able to recognize the column with ordinality instead of solely recognizing it as a nominal data type - which it was doing previously. To the human eye, the column is implicitly understood as something with ordinality. A computer does not implicitly understand that ordinality at first glance - it can only see each value as just a string.

Furthermore, some comments were included in the above cell to reference how we will conduct a similar process on our testing data. The functions housing such will be provided at the top of the notebook.

### Value Exploration

#### Datetime Exploration

**All Dfs**

In [11]:
##Minimums and Maximums on Training Data alone (we won't be able to understand any testing data)

# Labels line up positionally with the frames in `dated_dfs`.
dated_df_names = ["Transactions:", "Oil:", "Holidays:", "Training:"]

for table_label, table in zip(dated_df_names, dated_dfs):
    earliest, latest = table["date"].min(), table["date"].max()
    print(f"Showing table for {table_label}")
    print(f"Min: {earliest}")
    print(f"Max: {latest} \n")

Showing table for Transactions:
Min: 2013-01-01
Max: 2017-08-15 

Showing table for Oil:
Min: 2013-01-01
Max: 2017-08-31 

Showing table for Holidays:
Min: 2012-03-02
Max: 2017-12-26 

Showing table for Training:
Min: 2013-01-01
Max: 2017-08-15 



We notice several years' worth of data here (which sounds great!). `holidays` records the largest spread of dates, with an absolute minimum and maximum of `2012-03-02` and `2017-12-26` respectively. The other tables (`transx`, `oil`, and `training`) all showcase a start date of `2013-01-01`. `oil` extends beyond `transx`'s and `training`'s maximum of `2017-08-15`, with an end date of `2017-08-31`. The mismatch in size will be noted and addressed as we press forward.

Additional note: We only focus on the `training` data (plus the complementary datasets we are able to leverage), as it would be mostly impractical to perform the same analysis on our `testing` data. We won't be able to understand our unknown data's date range until we begin running our model on the data. This will also prevent any leakage of data and force us to work around real-world constraints. However, the work we can accomplish on the training data might be generalized enough to consider sensible KPIs built off of our `date` column, which could be implemented.

In [12]:
#date_differences checking

# For each dated table, report the span between its first and last date
# alongside its row count.
for table_label, dated_df in zip(dated_df_names, dated_dfs):
    max_date = dated_df['date'].max()
    min_date = dated_df['date'].min()
    absolute_date_difference = (max_date - min_date).days
    table_row_count = dated_df.shape[0]

    # The labels in `dated_df_names` already end with ":", so none is added
    # here (the previous version printed "Transactions::").
    print(f"Date difference of {table_label}")
    print(f"{absolute_date_difference} days <- ({max_date} - {min_date})")
    print(f"The rows of the native table is: {table_row_count} \n \n")

#TODO: re-approach the issue and its relevance towards modeling.
# def date_subtraction(df) #not necessary I don't think for now. 

#date_counts checking

 

Date difference of Transactions::
1687 days <- (2017-08-15 - 2013-01-01)
The rows of the native table is: 83488 
 

Date difference of Oil::
1703 days <- (2017-08-31 - 2013-01-01)
The rows of the native table is: 1218 
 

Date difference of Holidays::
2125 days <- (2017-12-26 - 2012-03-02)
The rows of the native table is: 350 
 

Date difference of Training::
1687 days <- (2017-08-15 - 2013-01-01)
The rows of the native table is: 3000888 
 



We expected this data to have differences in days. So this is unsurprising. However, the days difference between the training and transactions table is effectively the same. What could this mean? 

In [13]:
#days between per df

for table_label, table in zip(dated_df_names, dated_dfs):
    print("----------")
    print(f"Editing: {table_label}")
    # Gap in whole days between each row's date and the previous row's date.
    day_gaps = table["date"].diff().dt.days
    table["days_diff"] = day_gaps
    gap_frequencies = table["days_diff"].value_counts()
    display(gap_frequencies.sort_values(ascending=False))
    

#TODO: re-approach the issue and its relevance towards modeling.
# test_df = transx.copy()

# test_df["days_diff"] = transx["date"].diff().dt.days

----------
Editing: Transactions:


days_diff
0.0    81806
1.0     1675
2.0        6
Name: count, dtype: int64

----------
Editing: Oil:


days_diff
1.0    974
3.0    243
Name: count, dtype: int64

----------
Editing: Holidays:


days_diff
1.0     133
0.0      38
2.0      33
3.0      23
5.0      17
9.0      13
11.0     11
8.0      10
13.0      8
30.0      7
7.0       7
10.0      6
35.0      6
23.0      5
20.0      5
4.0       4
12.0      4
24.0      3
22.0      2
28.0      2
21.0      2
60.0      1
42.0      1
19.0      1
16.0      1
37.0      1
18.0      1
15.0      1
38.0      1
27.0      1
56.0      1
Name: count, dtype: int64

----------
Editing: Training:


days_diff
0.0    2999204
1.0       1679
2.0          4
Name: count, dtype: int64

It makes no sense for us to currently incorporate a `days_diff` column before our merge. Some of our merging will utilize the `date` column as a coincident key. Therefore, there will be inconsistency in bringing over our respective date difference columns - especially if the choice is to potentially omit some dates or duplicate some dates for cleaner processing.

`transx` and `training` again show strong consistencies with one another through absolute date differences and through spread of `days_diff`. 

**Holiday Dates**

We will want to pay attention to the holidays present in this case. A reason being is because of the areas where `days_diff` is high. Some questions to consider:
- can we leverage any information from this table to extrapolate holiday existence?
- are there any locations inconsistent with a Favorita store? (likely to be studied in a different section and then be otherwise found in the appendix).
- how many holidays are there per year? per month? per day?
- most frequent kind of holiday

In [33]:
# display(holidays.dropna().sort_values(by="days_diff", ascending=False).head(50))
# Count each holiday description once (rarest first), then show both ends of the ranking.
description_counts = holidays["description"].value_counts(ascending=True)
display(description_counts.head(20))
display(description_counts.tail(5))

description
Traslado Fundacion de Quito                        1
Terremoto Manabi+11                                1
Mundial de futbol Brasil: Ecuador-Honduras         1
Terremoto Manabi+9                                 1
Terremoto Manabi+8                                 1
Terremoto Manabi+7                                 1
Terremoto Manabi+6                                 1
Terremoto Manabi+5                                 1
Terremoto Manabi+4                                 1
Terremoto Manabi+3                                 1
Terremoto Manabi+2                                 1
Terremoto Manabi+1                                 1
Terremoto Manabi                                   1
Recupero Puente Primer dia del ano                 1
Recupero Puente Navidad                            1
Recupero puente Navidad                            1
Recupero puente primer dia del ano                 1
Mundial de futbol Brasil: Final                    1
Mundial de futbol Brasil: Tercer y

description
Fundacion de Loja       6
Fundacion de Manta      6
Fundacion de Ibarra     7
Fundacion de Cuenca     7
Carnaval               10
Name: count, dtype: int64

The above truly isn't enough of a look to understand more about these days difference gaps. We need a closer look at the holidays table, starting with how the holidays break down by `locale`.

In [None]:
##count the number of local of holidays and the number of cities it's in. 

# Per locale: row counts first, then distinct-value counts, for the same four columns.
summarised_columns = ["description", "locale_name", "transferred", "type"]
locale_named_aggs = {f"{column}_count": (column, "count") for column in summarised_columns}
locale_named_aggs.update(
    {f"{column}_unique_count": (column, "nunique") for column in summarised_columns}
)

locale_summary = holidays.groupby(["locale"]).agg(**locale_named_aggs)
display(locale_summary)

#count the holidays per month

Unnamed: 0_level_0,description_count,locale_name_count,transferred_count,type_count,description_unique_count,locale_name_unique_count,transferred_unique_count,type_unique_count
locale,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Local,152,152,152,152,27,19,2,3
National,174,174,174,174,72,1,2,6
Regional,24,24,24,24,4,4,1,1


In [68]:
def _unique_for(locale_mask, column):
    """Distinct values of `column` among the holiday rows selected by `locale_mask`."""
    return list(holidays.loc[locale_mask, column].unique())


def _shared_between(first_names, second_names):
    """Items of `first_names` that also appear in `second_names`, in `first_names` order."""
    return [name for name in first_names if name in second_names]


# Boolean row masks, one per locale level.
local_filter = holidays["locale"] == "Local"
regional_filter = holidays["locale"] == "Regional"
national_filter = holidays["locale"] == "National"

# Distinct locale names and descriptions seen at each locale level.
local_locale_names = _unique_for(local_filter, "locale_name")
local_description_names = _unique_for(local_filter, "description")

regional_locale_names = _unique_for(regional_filter, "locale_name")
regional_description_names = _unique_for(regional_filter, "description")

national_locale_names = _unique_for(national_filter, "locale_name")
national_description_names = _unique_for(national_filter, "description")

# Pairwise overlaps between levels: do any locale names or descriptions repeat across them?
local_coincident_regional_locale_names = _shared_between(local_locale_names, regional_locale_names)
local_coincident_national_locale_names = _shared_between(local_locale_names, national_locale_names)
regional_coincident_national_locale_names = _shared_between(regional_locale_names, national_locale_names)

local_coincident_regional_description_names = _shared_between(local_description_names, regional_description_names)
local_coincident_national_description_names = _shared_between(local_description_names, national_description_names)
regional_coincident_national_description_names = _shared_between(regional_description_names, national_description_names)



# # print(regional_locale_names)
# holidays[holidays["locale"] == "Regional"].head(24)

In [69]:
# Descriptions that appear under both Local and National holidays (empty list = no overlap).
local_coincident_national_description_names

[]

In [70]:
# Descriptions that appear under both Local and Regional holidays (empty list = no overlap).
local_coincident_regional_description_names

[]

In [71]:
# Descriptions that appear under both Regional and National holidays (empty list = no overlap).
regional_coincident_national_description_names

[]

In [67]:
# Locale names that appear under both Local and Regional holidays (empty list = no overlap).
# The cell previously referenced `regional_coincident_local_locale_names`, which is never
# defined in this notebook and raises NameError on a fresh Restart & Run All.
local_coincident_regional_locale_names

[]

In [62]:
# Distinct descriptions among the Regional holiday rows.
regional_description_names

['Provincializacion de Cotopaxi',
 'Provincializacion de Imbabura',
 'Provincializacion de Santo Domingo',
 'Provincializacion Santa Elena']

In [64]:
# Distinct locale names among the Regional holiday rows.
regional_locale_names

['Cotopaxi', 'Imbabura', 'Santo Domingo de los Tsachilas', 'Santa Elena']

Quick notes:

Left 4 columns are there to showcase some split reference on things. They will obviously correspond. We use the right side of the above table to look at the amount of unique values found here. 

Some of the more interesting points to take away from here...
- Regional has a low amount of presence here. It may be worth examining it further
- most holidays are national holidays

In [None]:
# Distinct locale names / descriptions across ALL holiday rows (no locale filter applied).
# These used to be assigned to `regional_locale_names` / `regional_description_names`,
# which silently replaced the Regional-only lists built earlier with unfiltered values.
all_locale_names = list(holidays["locale_name"].unique())
all_description_names = list(holidays["description"].unique())

# Preview the Local holiday rows.
holidays[holidays["locale"] == "Local"].head(20)

Unnamed: 0,date,type,locale,locale_name,description,transferred,days_diff
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False,
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False,11.0
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False,2.0
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False,7.0
5,2012-05-12,Holiday,Local,Puyo,Cantonizacion del Puyo,False,21.0
6,2012-06-23,Holiday,Local,Guaranda,Cantonizacion de Guaranda,False,42.0
8,2012-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False,0.0
9,2012-06-25,Holiday,Local,Machala,Fundacion de Machala,False,0.0
10,2012-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False,8.0
11,2012-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False,0.0


In [43]:
# Preview the Regional holiday rows.
regional_holiday_rows = holidays.loc[holidays["locale"].eq("Regional")]
regional_holiday_rows.head(20)

Unnamed: 0,date,type,locale,locale_name,description,transferred,days_diff
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False,30.0
7,2012-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False,2.0
23,2012-11-06,Holiday,Regional,Santo Domingo de los Tsachilas,Provincializacion de Santo Domingo,False,3.0
24,2012-11-07,Holiday,Regional,Santa Elena,Provincializacion Santa Elena,False,1.0
47,2013-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False,30.0
58,2013-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False,2.0
76,2013-11-06,Holiday,Regional,Santo Domingo de los Tsachilas,Provincializacion de Santo Domingo,False,3.0
77,2013-11-07,Holiday,Regional,Santa Elena,Provincializacion Santa Elena,False,1.0
96,2014-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False,28.0
112,2014-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False,0.0


In [44]:
# Preview the National holiday rows.
national_holiday_rows = holidays.loc[holidays["locale"].eq("National")]
national_holiday_rows.head(20)

Unnamed: 0,date,type,locale,locale_name,description,transferred,days_diff
14,2012-08-10,Holiday,National,Ecuador,Primer Grito de Independencia,False,5.0
19,2012-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True,2.0
20,2012-10-12,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False,3.0
21,2012-11-02,Holiday,National,Ecuador,Dia de Difuntos,False,21.0
22,2012-11-03,Holiday,National,Ecuador,Independencia de Cuenca,False,1.0
31,2012-12-21,Additional,National,Ecuador,Navidad-4,False,13.0
33,2012-12-22,Additional,National,Ecuador,Navidad-3,False,0.0
34,2012-12-23,Additional,National,Ecuador,Navidad-2,False,1.0
35,2012-12-24,Bridge,National,Ecuador,Puente Navidad,False,1.0
36,2012-12-24,Additional,National,Ecuador,Navidad-1,False,0.0


In `holidays`, the `locale` column contains three unique nominal values (`Local`, `Regional`, and `National`) that specify the scale of the holiday. However, when counting values and unique values of the other columns against the grouped-by `locale` column, we notice there will always be an "Ecuador" value in `locale_name` corresponding to every value of "National" in the `locale` column. Therefore, we can reduce the width of the table by dropping the `locale` column, as it contains redundant information that can be implied by the `locale_name`.

The `transferred_count` on the aggregated 

There also appears to be a one-for-one correspondence of the `transferred` column which is boolean. There is a case to be made here where we may find redundant information. I believe there is more value in grouping the names of each holiday (`description`) as my hypothesis is that `transferred` denotes the amount of time the holiday appears. Not sure. There may also be some value in grouping by `description` and `transferred` to check 

In [39]:
# Most frequent (description, transferred) combinations, first without and then
# with `locale_name` in the grouping. Each entry is (grouping keys, rows to show).
for group_keys, rows_to_show in (
    (["description", "transferred"], 5),
    (["description", "locale_name", "transferred"], 20),
):
    occurrence_counts = holidays.groupby(group_keys).agg(
        transferred_count=("transferred", "count"),
    )
    ranked_counts = occurrence_counts.sort_values(by="transferred_count", ascending=False)
    display(ranked_counts.head(rows_to_show))

# display(
#     holidays
#     .groupby(["description", "locale_name", "transferred"])
#     .agg(
#         transferred_count = ("transferred", "count"), 
#     )
#     .sort_values(by="transferred_count", ascending=False)
#     .tail(10)
# )

# Same occurrence counts, ordered by the grouping keys (descending) rather than
# by count; the second pass adds `type` to the grouping.
name_sort_keys = ["description", "locale_name", "transferred"]

for group_keys in (name_sort_keys, name_sort_keys + ["type"]):
    occurrence_counts = holidays.groupby(group_keys).agg(
        transferred_count=("transferred", "count"),
    )
    ordered_counts = occurrence_counts.sort_values(by=name_sort_keys, ascending=False)
    display(ordered_counts.head(20))

Unnamed: 0_level_0,Unnamed: 1_level_0,transferred_count
description,transferred,Unnamed: 2_level_1
Carnaval,False,10
Navidad-4,False,6
Fundacion de Cuenca,False,6
Fundacion de Esmeraldas,False,6
Navidad-2,False,6


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,transferred_count
description,locale_name,transferred,Unnamed: 3_level_1
Carnaval,Ecuador,False,10
Navidad-4,Ecuador,False,6
Fundacion de Cuenca,Cuenca,False,6
Fundacion de Esmeraldas,Esmeraldas,False,6
Navidad-2,Ecuador,False,6
Navidad-1,Ecuador,False,6
Navidad+1,Ecuador,False,6
Fundacion de Ibarra,Ibarra,False,6
Navidad,Ecuador,False,6
Fundacion de Loja,Loja,False,6


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,transferred_count
description,locale_name,transferred,Unnamed: 3_level_1
Viernes Santo,Ecuador,False,5
Traslado Primer dia del ano,Ecuador,False,1
Traslado Primer Grito de Independencia,Ecuador,False,2
Traslado Independencia de Guayaquil,Ecuador,False,3
Traslado Fundacion de Quito,Quito,False,1
Traslado Fundacion de Guayaquil,Guayaquil,False,1
Traslado Batalla de Pichincha,Ecuador,False,2
Terremoto Manabi+9,Ecuador,False,1
Terremoto Manabi+8,Ecuador,False,1
Terremoto Manabi+7,Ecuador,False,1


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,transferred_count
description,locale_name,transferred,type,Unnamed: 4_level_1
Viernes Santo,Ecuador,False,Holiday,5
Traslado Primer dia del ano,Ecuador,False,Transfer,1
Traslado Primer Grito de Independencia,Ecuador,False,Transfer,2
Traslado Independencia de Guayaquil,Ecuador,False,Transfer,3
Traslado Fundacion de Quito,Quito,False,Transfer,1
Traslado Fundacion de Guayaquil,Guayaquil,False,Transfer,1
Traslado Batalla de Pichincha,Ecuador,False,Transfer,2
Terremoto Manabi+9,Ecuador,False,Event,1
Terremoto Manabi+8,Ecuador,False,Event,1
Terremoto Manabi+7,Ecuador,False,Event,1


In [121]:
# TODO: count the unique holidays per year. For now this cell only previews the
# first rows of the table.

holidays.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred,days_diff
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False,
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False,30.0
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False,11.0
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False,2.0
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False,7.0


### Null Handling

In [None]:
def data_cleaning(df):
    """Apply the shared cleaning steps to a table and return it.

    Only the datetime conversion is implemented so far; feature reduction and
    null handling are still placeholders.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a `date` column that `pd.to_datetime` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. It is modified in place and also
        returned, so the result can be assigned or chained.

    Raises
    ------
    KeyError
        If `df` has no `date` column.
    """
    # Datetime Conversions
    # NOTE(review): the exploration cells store plain `datetime.date` objects via
    # `.dt.date`, while this keeps pandas datetime64 values. Confirm which key
    # type the merges should use before joining on `date`.
    df["date"] = pd.to_datetime(df["date"])

    # Feature Reduction
    # TODO: drop the `locale` column

    # Null Handling
    # TODO: not implemented yet

    return df

## Merges

### Pre-merge Analysis

In [None]:
#Holiday merge might need something clever like "is_holiday" as a column. We may have to also see how many cities apply to one unique holiday which can gauge the weight of a holiday's importance.

### Merge and Review

In [None]:
#value count the new dates' instances

#collect the omitted dates from each table from the final table

# Plotting and Feature Exploration

### Sales Aggregations per category

### Geographical Plotting of Store Locations

In [None]:
### 

# Modeling

## Modeling

**Model pre-processing: transformation**

**Modeling**

## Plotting & Analysis

I might make a trackable hyperparameter tuning table. So take all the hyperparameters and showcase the progress of work. 

## Conclusion

# Appendix

## Individual Plot Date Diff Exploration

I will be making a matplotlib subplot comparison of plots to understand the date difference sparsity within the columns. Not sure if the plot will have major impactful value, but will otherwise explore it nonetheless.

Two plots come to mind:

- histogram with bin counts of the date-diff presence in each
- sorted by date barchart directly indicating where the largest days diff happen to be
- 