<img src = 'fotos/logo_dani.jpeg'>

In [1]:
import pandas as pd
 import numpy as np
import os
from datetime import datetime
from statsmodels.tsa.statespace.sarimax import SARIMAX 
import itertools
import pickle
from IPython.display import clear_output

import warnings
warnings.filterwarnings('ignore')

In [2]:
def rmse(real, predicted):
    """Return the root-mean-square error between observed and predicted values.

    Both inputs are flattened to 1-D before subtracting. Without this, an
    ``(n, 1)`` column (e.g. ``DataFrame.values`` of a single-column frame)
    compared with an ``(n,)`` vector is broadcast by NumPy to an ``(n, n)``
    matrix and the resulting score is silently wrong.

    Parameters
    ----------
    real, predicted : array-like
        Observed and predicted values. They are compared position by
        position (pandas index labels are not used for alignment).

    Returns
    -------
    numpy.float64
        ``sqrt(mean((real - predicted) ** 2))``.
    """
    real_flat = np.asarray(real).ravel()
    predicted_flat = np.asarray(predicted).ravel()
    return np.sqrt(((real_flat - predicted_flat) ** 2).mean())

## Ficheros y rutas de entrada/salida 

In [3]:
# Name of the input CSV file.
file1_in = 'consumo_final.csv'
# Input folder; the output folder is the same location.
dir_in = dir_out = '../../datos/datos_desarrollo'

## Carga de datos 

In [7]:
# Read the ';'-separated input file, lower-case every header, rename the two
# period columns and parse `date` as a datetime (format YYYY-MM-DD).
df_consumo = (
    pd.read_csv(os.path.join(dir_in, file1_in), sep = ';')
    .rename(columns = str.lower)
    .rename(columns = {'fecha_inicio': 'mes_inicio_temp', 'fecha_fin': 'mes_fin_temp'})
    .assign(date = lambda tabla: pd.to_datetime(tabla['date'], format = '%Y-%m-%d'))
)

In [8]:
# Keep only the columns used for modelling.
precio_model = df_consumo[['ccaa', 'producto', 'volumen_miles_de_kg', 'valor_miles_de_€', 'precio_medio_kg', 'date']]

# Nested mapping {producto: {ccaa: frame}}; every region found in the data is
# paired with every product, and each frame drops the two key columns.
cmpl_model_dict = {
    product: {
        comunidad: precio_model.loc[
            (precio_model['producto'] == product) & (precio_model['ccaa'] == comunidad)
        ].drop(columns = ['producto', 'ccaa'])
        for comunidad in precio_model['ccaa'].unique()
    }
    for product in precio_model['producto'].unique()
}

In [9]:
# Small sample: two regions and two products only.
es_ccaa_prueba = df_consumo['ccaa'].isin(['Andalucia', 'Aragon'])
es_producto_prueba = df_consumo['producto'].isin(['Patatas', 'Mango'])
df_prueba = df_consumo[es_ccaa_prueba & es_producto_prueba]
prueba_model = df_prueba[['ccaa', 'producto', 'volumen_miles_de_kg', 'valor_miles_de_€', 'precio_medio_kg', 'date']]

# Same nested layout as the full dictionary: {producto: {ccaa: frame}}.
sample_model_dict = {
    product: {
        comunidad: prueba_model.loc[
            (prueba_model['producto'] == product) & (prueba_model['ccaa'] == comunidad)
        ].drop(columns = ['producto', 'ccaa'])
        for comunidad in prueba_model['ccaa'].unique()
    }
    for product in prueba_model['producto'].unique()
}

In [10]:
# Candidate values (0, 1, 2) for each of the three order terms.
p = d = q = range(3)
# Every (p, d, q) combination for the non-seasonal order.
pdq = [orden for orden in itertools.product(p, d, q)]
# The same combinations with a fixed seasonal period of 12 appended.
seasonal_pdq = [(*orden, 12) for orden in pdq]

In [11]:
def best_sarima(variable, model_dict, cutoff = '2020-02-01'):
    """Grid-search a SARIMAX model for every (producto, comunidad) series and forecast the hold-out period.

    For each series in ``model_dict`` the observations up to and including
    ``cutoff`` are used for training and the later ones for testing. Every
    combination of the module-level ``pdq`` and ``seasonal_pdq`` grids is
    fitted on the training part; the combination with the lowest finite RMSE
    on the test part is kept and its forecast (with confidence interval) is
    appended to the output.

    Parameters
    ----------
    variable : str
        Name of the column to model.
    model_dict : dict
        Nested mapping ``{producto: {comunidad: DataFrame}}`` where each
        frame holds at least the columns ``variable`` and ``'date'``.
    cutoff : str or datetime-like, default '2020-02-01'
        Last date that belongs to the training set.

    Returns
    -------
    predicciones : pandas.DataFrame
        For every series: one anchor row holding the last training
        observation (repeated as prediction, lower and upper bound) followed
        by the forecast rows. Columns are ``ccaa``, ``producto``,
        ``<variable>_predicted``, ``lower <variable>``, ``upper <variable>``.
    min_error_df : pandas.DataFrame
        One row per series with columns ``comunidad``, ``producto`` and
        ``error`` (best RMSE, or NaN when no candidate could be fitted).
    """
    cutoff_ts = pd.Timestamp(cutoff)
    predicted_col = str(variable) + '_predicted'
    lower_col = 'lower ' + variable
    upper_col = 'upper ' + variable

    # Collected pieces; concatenated once at the end instead of inside the loop.
    bloques = []
    errores = []

    for vuelta_general, (producto, v) in enumerate(model_dict.items(), start = 1):
        for n_comunidad, (comunidad, datos) in enumerate(v.items(), start = 1):
            timeseries_data = datos[[variable, 'date']].copy()
            timeseries_data.index = pd.DatetimeIndex(timeseries_data['date'])
            timeseries_data = timeseries_data.drop('date', axis = 1).sort_index()

            train = timeseries_data.loc[timeseries_data.index <= cutoff_ts]
            test = timeseries_data.loc[timeseries_data.index > cutoff_ts]
            # 1-D view of the hold-out values: comparing the (n, 1) frame values
            # with the (n,) forecast would broadcast to an (n, n) matrix.
            real_test = test[variable].values

            best_error = np.inf
            best_result = None
            combinacion = 1
            for param in pdq:
                for seasonal_param in seasonal_pdq:
                    print('Tienes', len(model_dict), 'productos y vas por el', vuelta_general, '. Este producto tiene', 
                          len(v), 'comunidades y vas por la', n_comunidad, 'Hay un total de', len(pdq) * len(seasonal_pdq), 
                          'combinaciones y vas por la', combinacion, '. La variable es', variable)
                    try:
                        print(param, seasonal_param)
                        mod = SARIMAX(train, 
                            order = param,
                            seasonal_order = seasonal_param, 
                            enforce_stationarity = False,
                            enforce_invertibility = False)
                        result = mod.fit(disp = False)
                        ypred = result.get_forecast(steps = len(test)).predicted_mean.values
                        error = rmse(real_test, ypred)
                    except (IndexError, ValueError, np.linalg.LinAlgError):
                        # A candidate that cannot be fitted is simply not eligible;
                        # it must not abort the whole search.
                        error = np.nan
                    # Only finite scores compete, so a NaN can never be selected.
                    if np.isfinite(error) and error < best_error:
                        best_error, best_result = error, result
                    # Advance the counter and clear the output for failed fits too.
                    combinacion += 1
                    clear_output(wait = True)

            if best_result is None:
                # No candidate produced a usable score: record it and move on.
                errores.append({'comunidad': comunidad, 'producto': producto, 'error': np.nan})
                continue
            errores.append({'comunidad': comunidad, 'producto': producto, 'error': best_error})

            # Reuse the winning fit: it was built with exactly the settings that
            # were scored, so no second (and differently configured) fit is needed.
            final_ypred = best_result.get_forecast(steps = len(test))
            ypred_df = pd.DataFrame(final_ypred.predicted_mean).rename(columns = {'predicted_mean': predicted_col})
            pred_ci_df = pd.concat([ypred_df, final_ypred.conf_int()], axis = 1)
            pred_ci_df = pred_ci_df.assign(ccaa = comunidad, producto = producto)

            # Anchor row: last training observation repeated as prediction and
            # bounds. Taken from the end of `train` so a series without an
            # observation exactly at the cutoff does not raise KeyError.
            real_value = train[variable].iloc[-1]
            new_row = pd.DataFrame({'ccaa': comunidad, 'producto': producto, predicted_col: real_value, 
                                    lower_col: real_value, upper_col: real_value}, 
                                   index = [train.index[-1]])
            bloques.extend([new_row, pred_ci_df])

    min_error_df = pd.DataFrame(errores, columns = ['comunidad', 'producto', 'error'])
    if bloques:
        predicciones = pd.concat(bloques, axis = 0)
    else:
        predicciones = pd.DataFrame(columns = ['ccaa', 'producto'])
    return predicciones, min_error_df

In [9]:
# Model the volume column for every product/region and keep both returned
# frames as pickle files in the working directory.
df_peque, error_peque = best_sarima('volumen_miles_de_kg', cmpl_model_dict)

for ruta, objeto in (('df_peque.pkl', df_peque), ('error_peque.pkl', error_peque)):
    with open(ruta, 'wb') as fp:
        pickle.dump(objeto, fp)

Tienes 50 productos y vas por el 50 . Este producto tiene 18 comunidades y vas por la 18 Hay un total de 729 combinaciones y vas por la 649 . La variable es volumen_miles_de_kg


In [12]:
# Reload the volume results saved earlier.
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
with open('df_peque.pkl', 'rb') as fp_df, open('error_peque.pkl', 'rb') as fp_error:
    df_peque = pickle.load(fp_df)
    error_peque = pickle.load(fp_error)

In [None]:
# Model the value column, place its results side by side with the volume
# results and save the combined frames as pickle files.
df_peque2, error_peque2 = best_sarima('valor_miles_de_€', cmpl_model_dict)

predicciones_juntas = [df_peque, df_peque2]
errores_juntos = [error_peque, error_peque2]
df_medi = pd.concat(predicciones_juntas, axis = 1)
error_medi = pd.concat(errores_juntos, axis = 1)

for ruta, objeto in (('df_medi.pkl', df_medi), ('error_medi.pkl', error_medi)):
    with open(ruta, 'wb') as fp:
        pickle.dump(objeto, fp)

In [16]:
# Reload the combined volume + value results saved earlier.
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
with open('df_medi.pkl', 'rb') as fp_df, open('error_medi.pkl', 'rb') as fp_error:
    df_medi = pickle.load(fp_df)
    error_medi = pickle.load(fp_error)

In [14]:
# Model the average-price column, append its results to the previous
# combination and save the full frames as pickle files.
df_peque3, error_peque3 = best_sarima('precio_medio_kg', cmpl_model_dict)

predicciones_juntas = [df_medi, df_peque3]
errores_juntos = [error_medi, error_peque3]
df_gran = pd.concat(predicciones_juntas, axis = 1)
error_gran = pd.concat(errores_juntos, axis = 1)

for ruta, objeto in (('df_gran.pkl', df_gran), ('error_gran.pkl', error_gran)):
    with open(ruta, 'wb') as fp:
        pickle.dump(objeto, fp)

In [21]:
# Reload the results for all three variables saved earlier.
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
with open('df_gran.pkl', 'rb') as fp_df, open('error_gran.pkl', 'rb') as fp_error:
    df_gran = pickle.load(fp_df)
    error_gran = pickle.load(fp_error)

In [41]:
# Drop columns whose values duplicate an earlier column (done on the transpose,
# where columns become rows), keep everything from 'producto' onwards and
# export the result as a ';'-separated CSV.
# NOTE(review): the label slice discards any column placed before 'producto' — confirm this is intended.
columnas_unicas = df_gran.T.drop_duplicates().T
df_final = columnas_unicas.loc[:, 'producto':]
ruta_salida = os.path.join(dir_out, 'predicciones_consumo.csv')
df_final.to_csv(ruta_salida, sep = ';')