In [None]:
#https://stackoverflow.com/questions/40536560/ipython-and-jupyter-autocomplete-not-working
%config Completer.use_jedi = False

import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb

from IPython.display import display, Markdown, HTML, Image

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one. The exploration cells below rely on this: they list several bare
# expressions (e.g. `.head()` followed by `.describe()`) and expect each one
# to be rendered.
InteractiveShell.ast_node_interactivity = 'all'

### Data Exploration

1. **calendar.csv**: contains information about the dates on which the products are sold.
2. **sales_train_validation.csv**: contains the historical daily unit sales data per product and store [d_1 - d_1913]
3. **sell_prices.csv**: contains information about the price of the products sold per store and date.

In [None]:
DATA_PATH = './data'

# Read the three input tables and the submission template from DATA_PATH,
# in the same order as they are unpacked on the left-hand side.
calendar, sales_tv, sell_prices, ss = (
    pd.read_csv(f'{DATA_PATH}/{csv_name}')
    for csv_name in (
        'calendar.csv',
        'sales_train_validation.csv',
        'sell_prices.csv',
        'sample_submission.csv',
    )
)

# Quick size check of the three input tables.
calendar.shape, sales_tv.shape, sell_prices.shape

In [None]:
# Overview of the calendar table: column dtypes / non-null counts,
# the first five rows, and summary statistics.
calendar.info()
calendar.head(n=5)
calendar.describe()

In [None]:
# Overview of the sales table: column dtypes / memory usage,
# the first five rows, and summary statistics.
sales_tv.info()
sales_tv.head(n=5)
sales_tv.describe()

In [None]:
# Overview of the price table: column dtypes / non-null counts,
# the first five rows, and summary statistics.
sell_prices.info()
sell_prices.head(n=5)
sell_prices.describe()

### What are we trying to predict?
We need to forecast sales over a 28-day horizon. For each item, the model should **predict its sales (number of units sold per day) for the next 28 days**.

1. The rows each represent a specific item. This id tells us the item type, state, and store. We don't know what these items are exactly.
2. The columns must be filled with the predicted values (a 28-day-ahead forecast).

In [None]:
# Preview the first five rows of the sample submission to see the expected output layout.
ss.head(n=5)

### Merge calendar

In [None]:
# Compare table sizes, then look at the span of dates the calendar covers.
calendar.shape, sales_tv.shape
# NOTE(review): `date` has not been parsed to datetime at this point, so
# min/max compare the raw column values as loaded from the CSV.
calendar['date'].min(), calendar['date'].max()
calendar.head(n=5)

In [None]:
# Number of day labels (`d` column) present in the calendar.
calendar['d'].shape

In [None]:
# Lookup table from day label (`d`, used as the index) to a parsed
# datetime `date`, built from the two relevant calendar columns.
date_df = (
    calendar.loc[:, ['date', 'd']]
    .set_index('d', drop=True)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
date_df.info()

In [None]:
# Columns holding the per-day unit sales all start with the `d_` prefix.
day_cols = [name for name in sales_tv.columns if name.startswith('d_')]
# Sanity checks: which categories exist, and is `id` usable as a unique key?
sales_tv['cat_id'].unique()
sales_tv['id'].is_unique
# Keep only the category and the daily sales, indexed by item id.
df1 = sales_tv[['id', 'cat_id', *day_cols]].set_index('id', drop=True)
df1.shape
df1

### Quick visualizations

In [None]:
# Mean of every daily sales column within each category, transposed so that
# each row is a day label and each column is a category.
g1 = df1.groupby('cat_id').mean().transpose()
g1.head(n=5)

In [None]:
len(g1)
# Index-on-index left join: attach the date belonging to each day label,
# then replace the day-label index with that date.
df_plot = (
    g1.merge(date_df, how='left', left_index=True, right_index=True)
    .set_index('date', drop=True)
)
df_plot.shape
df_plot.head(n=5)

In [None]:
# Bucket the date-indexed rows into weekly bins ('W' frequency) and take
# the mean of each category within every bin.
weekly_bins = pd.Grouper(freq='W')
df_weekly = df_plot.groupby(weekly_bins).mean()
df_weekly

In [None]:
# Plot the weekly mean sales of each category in its own side-by-side panel.
cols = df_weekly.columns.to_list()
# One panel per column of df_weekly rather than a hard-coded 3, so the cell
# keeps working if the set of categories changes.
fig, axs = plt.subplots(nrows=1, ncols=len(cols), figsize=(9, 5))
date = df_weekly.index
# plt.subplots returns a bare Axes (not an array) when there is a single
# panel; np.atleast_1d makes the iteration uniform in both cases.
for ax, cols_name in zip(np.atleast_1d(axs), cols):
    ax.plot(date, df_weekly[cols_name], label=cols_name)
    ax.set_title(cols_name)
    ax.set_xlabel('week')
    ax.set_ylabel('mean units sold')
    # A single plt.legend() call only annotates the current (last) axes,
    # leaving the other panels without a legend, so add one per panel.
    ax.legend()
# Keep tick labels and titles of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()