# TFM: Aplicación de *Machine Learning* para la Gestión de Inventarios
> Erendira Teresa Navarro García

## Cross Validation 
Para la elección del modelo

In [3]:
# Python 3 environment Google Colab
import pandas as pd
import os
import csv
import datetime as datetime
import pywt
import json 
import sklearn
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib as mpl
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pylab as plab
from datetime import datetime
from dateutil.parser import parse
from pandas.plotting import lag_plot
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from tabulate import tabulate


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error, mean_squared_log_error, median_absolute_error, r2_score, make_scorer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import sklearn.metrics as metrics
from sklearn.model_selection import TimeSeriesSplit
from sklearn.neighbors import KNeighborsRegressor

In [5]:
# Lift pandas' display limits so frames are rendered with every column and every row.
for _display_option in ('display.max_columns', 'display.max_rows'):
  pd.set_option(_display_option, None)

### Lectura de datos

In [6]:
# Location of the exported data set and the date format of its 'fecha' column.
DATA_PATH = '/content/drive/MyDrive/Files_TFM/export_data_tfm.csv'
DATE_FORMAT = '%Y-%m-%d'

# Kept so that any other cell that still calls `dateparse` keeps working.
dateparse = lambda x: datetime.strptime(x, DATE_FORMAT)

# `sku` is read as text because it is later compared against str(i).
df_tfm = pd.read_csv(DATA_PATH, dtype={'sku': str})
# Parse the dates in one vectorised call instead of read_csv's `date_parser`
# argument, which is deprecated in recent pandas and parses row by row.
df_tfm['fecha'] = pd.to_datetime(df_tfm['fecha'], format=DATE_FORMAT)
# Chronological order with the date as index.
df_tfm = df_tfm.sort_values(by='fecha').set_index('fecha')

### Preparación de los datos

In [7]:
# Función para creación de nuevas variables - lag de 1 a 7 y diferencia en ventas del día anterior y del día previo
def features(dataset_t, columns_y, n_lags=7, diff_periods=(1, 2)):
  """Add lag and difference features of the target column.

  For every ``i`` in ``1..n_lags`` a column ``lag<i>`` holding the target
  shifted by ``i`` rows is added; for every ``i`` that is also listed in
  ``diff_periods`` a column ``diff<i>`` holding the ``i``-row difference of the
  target is added. Rows containing any missing value are then dropped.

  Parameters
  ----------
  dataset_t : pandas.DataFrame
      Input data. It is NOT modified; the features are added to a copy.
  columns_y : str
      Name of the target column the features are derived from.
  n_lags : int, optional
      Number of lag columns to create (default 7).
  diff_periods : iterable of int, optional
      Periods for which a difference column is created (default ``(1, 2)``).
      Periods greater than ``n_lags`` are ignored.

  Returns
  -------
  pandas.DataFrame
      A new frame with the original columns plus the generated features,
      without rows that contain NaN.
  """
  # Work on a copy: the caller usually passes a slice of a larger frame, and
  # writing into it would mutate the caller's data and raise
  # SettingWithCopyWarning.
  dataset_t = dataset_t.copy()
  target = dataset_t.loc[:, columns_y]
  for i in range(1, n_lags + 1):
    dataset_t['lag' + str(i)] = target.shift(i)
    if i in diff_periods:
      dataset_t['diff' + str(i)] = target.diff(i)
  # The first rows have no history to shift from, so they contain NaN.
  return dataset_t.dropna()

# Separar dataset y crear nuevas variables
def split_dataset(dataset, split_per, column_y):
  """Split a data set in order and build the features of each part.

  The rows are split without shuffling, ``split_per`` being passed to
  ``train_test_split`` as ``test_size``. ``features`` is then applied to the
  train part and to the test part separately, and each result is divided into
  predictors (everything except ``column_y``) and target (``column_y``).

  Returns
  -------
  tuple
      ``(X_train, X_test, y_train, y_test)``.
  """
  raw_train, raw_test = train_test_split(dataset, test_size=split_per, shuffle=False)
  # Features are computed per partition, after the split.
  train_feat = features(raw_train, column_y)
  test_feat = features(raw_test, column_y)
  X_train, y_train = train_feat.drop(columns=column_y), train_feat.loc[:, column_y]
  X_test, y_test = test_feat.drop(columns=column_y), test_feat.loc[:, column_y]
  return X_train, X_test, y_train, y_test

### Modelos

In [8]:
# Variables a utilizar en los modelos
# Columns passed to the models unscaled.
columns_ok = ["bolOpen", "promo"]
# Generated lag/diff columns, which are min-max scaled before fitting.
columns_to_scale = [f"lag{i}" for i in range(1, 8)] + [f"diff{i}" for i in (1, 2)]
y_to_scale = ["udsVentaT"]
# Candidate models as (short name, estimator) pairs.
models = [
  ('RF', RandomForestRegressor(random_state=0)),
  ('SVR', SVR()),
  ('NN', MLPRegressor(random_state=0, max_iter=2000)),
]

In [13]:
def cv_analisis(y_target, skus=range(1, 51), test_size=0.3, n_splits=8):
  """Compare the candidate models per SKU with time-series cross-validation.

  For every SKU the rows of the module-level ``df_tfm`` are selected, split in
  order with ``split_dataset`` and min-max scaled; every estimator of the
  module-level ``models`` list is then scored with ``cross_val_score``
  (``scoring="r2"``) over a ``TimeSeriesSplit``. Only the train partition is
  used here.

  Parameters
  ----------
  y_target : str
      Name of the target column.
  skus : iterable, optional
      SKU identifiers to evaluate; each one is matched against
      ``df_tfm["sku"]`` as ``str(sku)``. Defaults to 1..50.
  test_size : float, optional
      Fraction handed to ``split_dataset`` as test size (default 0.3).
  n_splits : int, optional
      Number of ``TimeSeriesSplit`` folds (default 8).

  Returns
  -------
  tuple of pandas.DataFrame
      ``df_cv_modelos`` with one row per (sku, model) and the columns
      ``sku``, ``model``, ``mean_cv``, ``std_cv``; ``df_resultados`` with one
      row per SKU and the columns ``sku``, ``best_model``, ``best_cv_mean``.
  """
  cv_columns = ["sku", "model", "mean_cv", "std_cv"]
  best_columns = ["sku", "best_model", "best_cv_mean"]
  # The splitter does not depend on the SKU or the model, so build it once.
  tscv = TimeSeriesSplit(n_splits=n_splits)
  # Results are collected in lists and assembled once at the end:
  # DataFrame.append was removed in pandas 2.0 and re-copied the frame on
  # every iteration.
  cv_frames = []
  best_rows = []
  for sku in skus:
    # Data of the current SKU, without the columns that are not used as predictors.
    dataset_sku = df_tfm[df_tfm["sku"] == str(sku)].drop(columns=["udsStock", "label", "bolHoliday"])
    # Only the train partition takes part in the cross-validation.
    X_train, _, y_train, _ = split_dataset(dataset_sku, test_size, y_target)
    mm_train_cv = MinMaxScaler(feature_range=(0, 1))
    mm_ytrain_cv = MinMaxScaler(feature_range=(0, 1))
    # NOTE(review): both scalers are fitted on the whole train partition before
    # the folds are created, so every validation fold influences the scaling.
    # Consider moving the scaling into a Pipeline - confirm whether intended.
    y_scaled = mm_ytrain_cv.fit_transform(y_train.reset_index()[[y_target]]).ravel()
    X_flat = X_train.reset_index()
    X_scaled = mm_train_cv.fit_transform(X_flat[columns_to_scale])
    # Scaled lag/diff columns followed by the unscaled columns.
    X_matrix = np.concatenate([X_scaled, X_flat[columns_ok].values], axis=1)
    rows = []
    for name, model in models:
      scores = cross_val_score(model, X_matrix, y_scaled, cv=tscv, scoring="r2")
      rows.append({"sku": str(sku), "model": name,
                   "mean_cv": scores.mean(), "std_cv": scores.std()})
    df_cv = pd.DataFrame(rows, columns=cv_columns)
    cv_frames.append(df_cv)
    # First model reaching the highest mean score.
    best = df_cv.loc[df_cv["mean_cv"].idxmax()]
    best_rows.append({"sku": sku, "best_model": best["model"],
                      "best_cv_mean": best["mean_cv"]})
  if cv_frames:
    df_cv_modelos = pd.concat(cv_frames, ignore_index=True)
  else:
    df_cv_modelos = pd.DataFrame([], columns=cv_columns)
  df_resultados = pd.DataFrame(best_rows, columns=best_columns)
  return df_cv_modelos, df_resultados

In [14]:
# Run the cross-validation (every model, every SKU) on the target column.
df_cv, df_result = cv_analisis(y_target="udsVentaT")

In [15]:
# Number of SKUs for which each model scored best, and its share in percent.
cv_resumen = (
    df_result.groupby("best_model")["sku"]
    .count()
    .reset_index()
    .assign(per=lambda counts: 100 * counts["sku"] / counts["sku"].sum())
)
print(tabulate(cv_resumen.set_index("best_model"), headers=cv_resumen.columns, tablefmt='latex'))

\begin{tabular}{lrr}
\hline
 best\_model   &   sku &   per \\
\hline
 NN           &    14 &    28 \\
 RF           &    36 &    72 \\
\hline
\end{tabular}


In [16]:
# LaTeX table with the cross-validation scores of the "RF" model, indexed by SKU.
print(tabulate(
    df_cv.loc[df_cv["model"].eq("RF")].set_index("sku"),
    headers=df_cv.columns,
    tablefmt='latex',
))

\begin{tabular}{rlrr}
\hline
   sku & model   &   mean\_cv &    std\_cv \\
\hline
     1 & RF      &  0.932223 & 0.0617921 \\
     2 & RF      &  0.875731 & 0.0658859 \\
     3 & RF      &  0.836148 & 0.203283  \\
     4 & RF      &  0.892016 & 0.120825  \\
     5 & RF      &  0.929246 & 0.0619969 \\
     6 & RF      &  0.889253 & 0.0971438 \\
     7 & RF      &  0.894692 & 0.145432  \\
     8 & RF      &  0.932069 & 0.0600548 \\
     9 & RF      &  0.927128 & 0.0760093 \\
    10 & RF      &  0.90503  & 0.0991373 \\
    11 & RF      &  0.928058 & 0.0612708 \\
    12 & RF      &  0.93155  & 0.0536939 \\
    13 & RF      &  0.926514 & 0.0776869 \\
    14 & RF      &  0.922361 & 0.102774  \\
    15 & RF      &  0.870836 & 0.11996   \\
    16 & RF      &  0.930176 & 0.0615033 \\
    17 & RF      &  0.940783 & 0.0536499 \\
    18 & RF      &  0.939066 & 0.056004  \\
    19 & RF      &  0.921971 & 0.0658504 \\
    20 & RF      &  0.890643 & 0.134221  \\
    21 & RF      &  0.935876 & 0.06808

In [17]:
# LaTeX table with the cross-validation scores of the "NN" model, indexed by SKU.
print(tabulate(
    df_cv.loc[df_cv["model"].eq("NN")].set_index("sku"),
    headers=df_cv.columns,
    tablefmt='latex',
))

\begin{tabular}{rlrr}
\hline
   sku & model   &   mean\_cv &    std\_cv \\
\hline
     1 & NN      &  0.908663 & 0.0962707 \\
     2 & NN      &  0.75421  & 0.33352   \\
     3 & NN      &  0.643769 & 0.68998   \\
     4 & NN      &  0.884613 & 0.154002  \\
     5 & NN      &  0.923556 & 0.0790675 \\
     6 & NN      &  0.908002 & 0.113443  \\
     7 & NN      &  0.824936 & 0.373629  \\
     8 & NN      &  0.933607 & 0.0752581 \\
     9 & NN      &  0.902373 & 0.125716  \\
    10 & NN      &  0.840848 & 0.215739  \\
    11 & NN      &  0.907152 & 0.104634  \\
    12 & NN      &  0.96739  & 0.0300188 \\
    13 & NN      &  0.877358 & 0.185135  \\
    14 & NN      &  0.944644 & 0.0486788 \\
    15 & NN      &  0.74985  & 0.385393  \\
    16 & NN      &  0.899858 & 0.154506  \\
    17 & NN      &  0.93528  & 0.0511386 \\
    18 & NN      &  0.962152 & 0.0411246 \\
    19 & NN      &  0.848627 & 0.235157  \\
    20 & NN      &  0.772477 & 0.42236   \\
    21 & NN      &  0.723438 & 0.62418

In [18]:
# LaTeX table with the cross-validation scores of the "SVR" model, indexed by SKU.
print(tabulate(
    df_cv.loc[df_cv["model"].eq("SVR")].set_index("sku"),
    headers=df_cv.columns,
    tablefmt='latex',
))

\begin{tabular}{rlrr}
\hline
   sku & model   &   mean\_cv &    std\_cv \\
\hline
     1 & SVR     &  0.832733 & 0.138646  \\
     2 & SVR     &  0.753355 & 0.154309  \\
     3 & SVR     &  0.746426 & 0.249016  \\
     4 & SVR     &  0.811515 & 0.145059  \\
     5 & SVR     &  0.849153 & 0.101559  \\
     6 & SVR     &  0.795521 & 0.135691  \\
     7 & SVR     &  0.848609 & 0.165218  \\
     8 & SVR     &  0.816395 & 0.173238  \\
     9 & SVR     &  0.855234 & 0.0884667 \\
    10 & SVR     &  0.792116 & 0.123172  \\
    11 & SVR     &  0.845881 & 0.0939758 \\
    12 & SVR     &  0.85293  & 0.0825983 \\
    13 & SVR     &  0.845457 & 0.111642  \\
    14 & SVR     &  0.852046 & 0.141302  \\
    15 & SVR     &  0.724708 & 0.209709  \\
    16 & SVR     &  0.821777 & 0.0914042 \\
    17 & SVR     &  0.787016 & 0.106329  \\
    18 & SVR     &  0.858598 & 0.134343  \\
    19 & SVR     &  0.829264 & 0.103413  \\
    20 & SVR     &  0.755463 & 0.218987  \\
    21 & SVR     &  0.709207 & 0.44820