In [None]:
# silence warnings
import warnings
warnings.filterwarnings("ignore")

# imports time series
import pandas as pd
import numpy as np

# plots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# pip install calmap
#import calmap

# para calcular métricas del modelo
from sklearn.metrics import mean_squared_error

# misc
import os
import datetime
import itertools
import pickle
from datetime import timedelta

# apartado ts
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose

import xgboost as xgb

In [None]:
# Load the raw daily sales sample; `ruta` is the data directory reused by later cells.
ruta = '../data/'
df_forecast = pd.read_csv(f'{ruta}daily_sales_sample.csv')

In [None]:
# Preview the first five rows of the freshly loaded frame.
df_forecast.head(n=5)

In [None]:
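# Conversión de la variable 'date' a tipo fecha (actualmente desactivada):
# 'date' se elimina más abajo y la fecha se reconstruye a partir de 'yearweek' como 'week_day'.
#df_forecast["date"] = pd.to_datetime(df_forecast["date"], format = "%Y-%m-%d")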

In [None]:
# Split 'yearweek' into its year (first four characters) and week (remaining
# characters) parts, then build 'week_day' as the date of weekday 1 of that week.
yearweek_text = df_forecast['yearweek'].astype(str)
df_forecast['year'] = yearweek_text.str.slice(stop=4)
df_forecast['week'] = yearweek_text.str.slice(start=4)

# '%U' = week of the year, '%w' = weekday number; the appended '-1' selects weekday 1.
week_labels = df_forecast['year'] + df_forecast['week'] + '-1'
df_forecast['week_day'] = pd.to_datetime(week_labels, format='%Y%U-%w')

In [None]:
# Remove the columns that are not used by the model.
unused_columns = ['d', 'date', 'weekend', 'weekday_int', 'yearweek']
df_forecast = df_forecast.drop(columns=unused_columns)

In [None]:
# Load the test dataset (semicolon-separated file).
df_test = pd.read_csv(f'{ruta}df_test.csv', sep=';')

In [None]:
# Cast 'year' and 'week' to strings, as in the forecast frame, so they can be
# concatenated into a parseable week label.
df_test = df_test.astype({'year': str, 'week': str})

# Same construction as for the forecast frame: weekday 1 of the given year/week.
test_week_labels = df_test['year'] + df_test['week'] + '-1'
df_test['week_day'] = pd.to_datetime(test_week_labels, format='%Y%U-%w')

In [None]:
# Stack the test rows below the historical rows and rebuild a 0..n-1 index.
frames_to_stack = [df_forecast, df_test]
df_forecast = pd.concat(frames_to_stack, axis=0, ignore_index=True)

In [None]:
# Inspect the last five rows, i.e. the rows appended from the test set.
df_forecast.tail(n=5)

# Asignación de valores numéricos a los nulos:

In [None]:
# Number of missing values per column.
df_forecast.isna().sum()

In [None]:
def _fill_price_gaps(prices):
    """Fill gaps in one id's price series: backward fill first, then forward fill."""
    return prices.bfill().ffill()


# Fill missing 'sell_price' values independently within each 'id' group.
prices_by_id = df_forecast.groupby(['id'])['sell_price']
df_forecast['sell_price'] = prices_by_id.transform(_fill_price_gaps)

In [None]:
# Replace the NaN values of 'qty_sold' with zero; those are the values to be predicted.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: an in-place method on a chained selection operates on a temporary object
# under pandas Copy-on-Write and would leave df_forecast unchanged (the warning
# about this is hidden because warnings are globally silenced in this notebook).
df_forecast['qty_sold'] = df_forecast['qty_sold'].fillna(0)

# Data Transformation: creación de diccionarios.

In [None]:
# Store names in code order: the position in this list (1-based) is the integer code.
store_names = [
    'South_End',
    'Roxbury',
    'Back_Bay',
    'Greenwich_Village',
    'Harlem',
    'Tribeca',
    'Brooklyn',
    'Midtown_Village',
    'Yorktown',
    'Queen_Village',
]
store_dict = {name: code for code, name in enumerate(store_names, start=1)}

# Names that are missing from the mapping come out as NaN.
df_forecast["store_encode"] = df_forecast["store"].map(store_dict)

In [None]:
# Department labels in code order (1-based position = integer code).
# NOTE(review): the 'ACCESORIES' spelling presumably mirrors the raw data - keep as is.
department_names = [
    'HOME_&_GARDEN_1',
    'HOME_&_GARDEN_2',
    'ACCESORIES_1',
    'ACCESORIES_2',
    'SUPERMARKET_1',
    'SUPERMARKET_2',
    'SUPERMARKET_3',
]
department_dict = {name: code for code, name in enumerate(department_names, start=1)}

# Labels that are missing from the mapping come out as NaN.
df_forecast["department_encode"] = df_forecast["department"].map(department_dict)

In [None]:
# Category labels in code order (1-based position = integer code).
category_names = ['SUPERMARKET', 'HOME_&_GARDEN', 'ACCESORIES']
category_dict = {name: code for code, name in enumerate(category_names, start=1)}

# Labels that are missing from the mapping come out as NaN.
df_forecast["category_encode"] = df_forecast["category"].map(category_dict)

In [None]:
# Region names in code order (1-based position = integer code).
region_names = ['New York', 'Boston', 'Philadelphia']
region_dict = {name: code for code, name in enumerate(region_names, start=1)}

# Names that are missing from the mapping come out as NaN.
df_forecast["region_encode"] = df_forecast["region"].map(region_dict)

In [None]:
# Store codes in encoding order (1-based position = integer code).
store_codes = [
    'NYC_1',
    'NYC_2',
    'NYC_3',
    'NYC_4',
    'BOS_1',
    'BOS_2',
    'BOS_3',
    'PHI_1',
    'PHI_2',
    'PHI_3',
]
store_code_dict = {label: code for code, label in enumerate(store_codes, start=1)}

# Codes that are missing from the mapping come out as NaN.
df_forecast["store_code_encode"] = df_forecast["store_code"].map(store_code_dict)

In [None]:
# Encode each item by the integer value of the last three characters of its name.
# NOTE(review): presumably the suffix is only unique within a department - confirm.
item_suffix = df_forecast['item'].str.slice(start=-3)
df_forecast['item_encoded'] = item_suffix.astype(int)

In [None]:
# 'year' and 'week' were handled as strings to build the date; store them as integers.
df_forecast = df_forecast.astype({'year': int, 'week': int})

In [None]:
# Show the first five rows (the default) to check the new encoded columns.
df_forecast.head()

In [None]:
# Drop the original text columns now that each one has an encoded counterpart.
encoded_source_columns = [
    'category',
    'department',
    'region',
    'store',
    'store_code',
    'item',
]
df_forecast = df_forecast.drop(columns=encoded_source_columns)

In [None]:
# First five rows after removing the text columns.
df_forecast.head(n=5)

In [None]:
# Expose the derived 'week_day' column under the name 'date'.
df_forecast = df_forecast.rename(columns={'week_day': 'date'})

In [None]:
# Print the column dtypes and non-null counts of the frame at this point.
df_forecast.info()

In [None]:
# Cast the numeric columns to 32-bit types in a single pass instead of one
# copy-pasted cell per column. The integer casts raise if a column still
# contains NaN (for example a label that was absent from its mapping dict).
column_dtypes = {
    'sell_price': 'float32',
    'qty_sold': 'float32',
    'holiday': 'int32',
    'store_encode': 'int32',
    'department_encode': 'int32',
    'category_encode': 'int32',
    'region_encode': 'int32',
    'store_code_encode': 'int32',
}
df_forecast = df_forecast.astype(column_dtypes)

## Guardamos el archivo final de Forecasting

In [None]:
# Write the final forecasting dataset next to the input files, without the index column.
output_path = f'{ruta}df_forecast.csv'
df_forecast.to_csv(output_path, index=False)