# TFM: Aplicación de *Machine Learning* para la Gestión de Inventarios
> Erendira Teresa Navarro García

**Análisis de correlación**

In [None]:
# Python 3 environment Google Colab
import pandas as pd
import os
import csv
import datetime as datetime
import json 
import sklearn
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib as mpl
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pylab as plab
from datetime import datetime
from dateutil.parser import parse
from pandas.plotting import lag_plot
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from tabulate import tabulate
from pickle import dump
from scipy.stats import f_oneway, chi2_contingency


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



In [None]:
# Lift pandas' display truncation for both columns and rows so that
# frames are rendered in full.
for display_option in ('display.max_columns', 'display.max_rows'):
  pd.set_option(display_option, None)

In [None]:
# Make the project folder the working directory so the relative file
# names used by the following cells resolve against it, then echo it.
project_dir = "/content/drive/MyDrive/Files_TFM/"
os.chdir(project_dir)
print(os.getcwd())

/content/drive/MyDrive/Files_TFM


## Lectura de datos

In [None]:
def dateparse(x):
  """Parse a 'YYYY-MM-DD' string into a datetime.

  The CSV load below no longer uses this helper; it stays defined so that
  any other cell referring to `dateparse` keeps working.
  """
  return datetime.strptime(x, '%Y-%m-%d')

# Read `sku` as text, then parse `fecha` explicitly. This replaces the
# `parse_dates=` + `date_parser=` combination of read_csv, whose
# `date_parser` argument is deprecated in pandas 2.x; pd.to_datetime with an
# explicit format yields the same datetime64 column on old and new versions.
df_tfm = pd.read_csv('export_data_tfm.csv', dtype={'sku': str})
df_tfm['fecha'] = pd.to_datetime(df_tfm['fecha'], format='%Y-%m-%d')
# Prepare data
# '%w' is the weekday as a decimal number with Sunday = 0 ... Saturday = 6.
df_tfm['weekday'] = df_tfm.fecha.dt.strftime('%w').astype(int)
# Order chronologically and index by date.
df_tfm = df_tfm.sort_values(by='fecha').set_index('fecha')

In [None]:
# Summarise the loaded frame: index range, columns, non-null counts and dtypes.
df_tfm.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 51100 entries, 2019-01-02 to 2021-10-19
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   sku         51100 non-null  object 
 1   bolOpen     51100 non-null  int64  
 2   bolHoliday  51100 non-null  int64  
 3   udsVenta    51100 non-null  float64
 4   udsVentaO   51100 non-null  float64
 5   udsStock    32900 non-null  float64
 6   promo       51100 non-null  int64  
 7   udsVentaT   51100 non-null  float64
 8   label       51100 non-null  int64  
 9   weekday     51100 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 4.3+ MB


### Preparación de los datos

In [None]:
# Feature builder: lagged copies of the target (1..7 by default) and its
# differences with respect to the previous one and two rows.
def features(dataset, columns_y, n_lags=7, diff_periods=(1, 2)):
  """Return a copy of `dataset` extended with lag and difference columns.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Input frame; it is not modified (the work is done on a copy).
  columns_y : str
      Name of the column the lags and differences are computed from.
  n_lags : int, default 7
      Columns 'lag1' .. 'lag<n_lags>' are added, each one being the target
      shifted down by that many rows.
  diff_periods : iterable of int, default (1, 2)
      For every period p a column 'diff<p>' is added holding
      target - target shifted by p rows.

  Returns
  -------
  pandas.DataFrame
      The extended copy with every row containing a missing value removed.
      Note that `dropna()` looks at ALL columns, so rows with NaN in columns
      that were already present in `dataset` are discarded too, not only the
      leading rows made incomplete by the shifts.
  """
  dataset_t = dataset.copy()
  target = dataset_t[columns_y]
  diff_periods = tuple(diff_periods)
  for i in range(1, n_lags + 1):
    dataset_t['lag' + str(i)] = target.shift(i)
    # Keep each diff column right after the lag of the same period.
    if i in diff_periods:
      dataset_t['diff' + str(i)] = target.diff(i)
  # Difference periods beyond the requested lags are appended at the end.
  for i in diff_periods:
    if i > n_lags:
      dataset_t['diff' + str(i)] = target.diff(i)
  # Drop rows with missing values
  dataset_t = dataset_t.dropna()
  return dataset_t

In [None]:
# Per-SKU association analysis:
#   * one-way ANOVA of udsVentaT across the levels of each categorical feature,
#   * chi-square test of independence for selected pairs of categorical features,
#   * Pearson correlation between udsVentaT and its lag/diff features
#     (heatmap saved to 'corr_sku_<i>.png').
# Results are gathered in plain lists and turned into DataFrames once at the
# end: DataFrame.append was removed in pandas 2.0 and growing a frame inside a
# loop copies it on every iteration.
anova_records = []
chisq_records = []
corr_frames = []
# Pairs for the chi-square test are read position-wise: listx[j] vs listy[j].
listx = ['promo', 'promo', 'bolOpen','bolHoliday','bolHoliday','bolHoliday']
listy = ['weekday', 'bolOpen', 'weekday','promo','weekday', 'bolOpen']
columns_corr = ['udsVentaT'] + ['lag'+str(i) for i in range(1,8)] + ['diff'+str(i) for i in [1,2]]
for i in range(1,51):
  # `sku` was read as text, hence the comparison against str(i).
  df = df_tfm[df_tfm["sku"] == str(i)]
  dataset = features(df, 'udsVentaT')
  for k in ["promo","weekday","bolOpen","bolHoliday"]:
    # One list of sales values per level of the categorical feature k.
    cat_vs_num = dataset.groupby(k)['udsVentaT'].apply(list)
    if len(cat_vs_num) < 2:
      # ANOVA is undefined with a single group (e.g. a SKU that never had a
      # promo). Record NaN explicitly instead of calling f_oneway, which
      # returned nan with a RuntimeWarning here and rejects fewer than two
      # samples in recent SciPy releases.
      anova_p = np.nan
    else:
      anova_p = f_oneway(*cat_vs_num)[1]
    print(str(i),k,'P-Value for Anova is: ', anova_p)
    k_vs = k + ' vs udsVentaT'
    anova_records.append({"sku": str(i), "prueba": k_vs, "pValue": anova_p})
  for x, y in zip(listx, listy):
    Crosst = pd.crosstab(index=dataset[x],columns=dataset[y])
    print(Crosst)
    ChiSqR = chi2_contingency(Crosst)
    print(str(i),x,y,'The P-Value of the ChiSq Test is:', ChiSqR[1])
    x_vs_y = x + ' vs ' + y
    chisq_records.append({"sku": str(i), "prueba": x_vs_y, "pValue": ChiSqR[1]})
  # Pearson correlation
  cor = dataset[columns_corr].corr()
  fig, ax = plt.subplots(figsize=(12,10))
  sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
  ax.set_title("SKU" + str(i))
  fig.savefig("corr_sku_" + str(i) + ".png", dpi=300, bbox_inches='tight')
  # Close the figure so 50 heatmaps do not pile up in memory.
  plt.close(fig)
  cor["sku"] = str(i)
  # reset_index keeps the row variable name as a regular 'index' column.
  corr_frames.append(cor.reset_index())

results_anova = pd.DataFrame(anova_records, columns=["sku", "prueba", "pValue"])
results_chisq = pd.DataFrame(chisq_records, columns=["sku", "prueba", "pValue"])
results_corr = pd.concat(corr_frames, ignore_index=True)

1 promo P-Value for Anova is:  0.8993999844488696
1 weekday P-Value for Anova is:  9.150991617656698e-33
1 bolOpen P-Value for Anova is:  3.7839531734113903e-41
1 bolHoliday P-Value for Anova is:  5.4234292635997765e-43
weekday   0   1   2   3   4   5   6
promo                              
0        54  56  56  57  54  53  54
1        40  38  38  37  40  41  40
1 promo weekday The P-Value of the ChiSq Test is: 0.9969876259484729
bolOpen   0    1
promo           
0        52  332
1        51  223
1 promo bolOpen The P-Value of the ChiSq Test is: 0.09770312630882257
weekday   0   1   2   3   4   5   6
bolOpen                            
0        85   4   1   3   1   6   3
1         9  90  93  91  93  88  91
1 bolOpen weekday The P-Value of the ChiSq Test is: 1.9189480866985953e-97
promo         0    1
bolHoliday          
0           331  219
1            53   55
1 bolHoliday promo The P-Value of the ChiSq Test is: 0.04193924740884467
weekday      0   1   2   3   4   5   6
bolHoliday    


invalid value encountered in double_scalars



7 promo P-Value for Anova is:  nan
7 weekday P-Value for Anova is:  3.0275216474212584e-57
7 bolOpen P-Value for Anova is:  6.3446620086665995e-59
7 bolHoliday P-Value for Anova is:  1.343408476957277e-65
weekday   0   1   2   3   4   5   6
promo                              
0        94  94  94  94  94  94  94
7 promo weekday The P-Value of the ChiSq Test is: 1.0
bolOpen   0    1
promo           
0        99  559
7 promo bolOpen The P-Value of the ChiSq Test is: 1.0
weekday   0   1   2   3   4   5   6
bolOpen                            
0        83   2   2   4   2   4   2
1        11  92  92  90  92  90  92
7 bolOpen weekday The P-Value of the ChiSq Test is: 2.290398241366141e-96
promo         0
bolHoliday     
0           548
1           110
7 bolHoliday promo The P-Value of the ChiSq Test is: 1.0
weekday      0   1   2   3   4   5   6
bolHoliday                            
0            6  90  92  89  92  88  91
1           88   4   2   5   2   6   3
7 bolHoliday weekday The P-Value 


invalid value encountered in double_scalars


invalid value encountered in double_scalars



13 promo P-Value for Anova is:  nan
13 weekday P-Value for Anova is:  7.394971026012023e-33
13 bolOpen P-Value for Anova is:  7.373921575491074e-40
13 bolHoliday P-Value for Anova is:  5.7087249509574314e-40
weekday   0   1   2   3   4   5   6
promo                              
0        94  94  94  94  94  94  94
13 promo weekday The P-Value of the ChiSq Test is: 1.0
bolOpen    0    1
promo            
0        100  558
13 promo bolOpen The P-Value of the ChiSq Test is: 1.0
weekday   0   1   2   3   4   5   6
bolOpen                            
0        86   1   2   4   2   4   1
1         8  93  92  90  92  90  93
13 bolOpen weekday The P-Value of the ChiSq Test is: 5.973094632322571e-104
promo         0
bolHoliday     
0           549
1           109
13 bolHoliday promo The P-Value of the ChiSq Test is: 1.0
weekday      0   1   2   3   4   5   6
bolHoliday                            
0            6  88  92  90  91  90  92
1           88   6   2   4   3   4   2
13 bolHoliday weekday 


invalid value encountered in double_scalars



26 promo P-Value for Anova is:  1.3530688470737282e-07
26 weekday P-Value for Anova is:  3.319527196806458e-13
26 bolOpen P-Value for Anova is:  3.407244104753256e-17
26 bolHoliday P-Value for Anova is:  1.600651923650527e-17
weekday   0   1   2   3   4   5   6
promo                              
0        54  56  56  57  54  53  54
1        40  38  38  37  40  41  40
26 promo weekday The P-Value of the ChiSq Test is: 0.9969876259484729
bolOpen   0    1
promo           
0        52  332
1        46  228
26 promo bolOpen The P-Value of the ChiSq Test is: 0.2973739556492217
weekday   0   1   2   3   4   5   6
bolOpen                            
0        85   2   3   2   0   5   1
1         9  92  91  92  94  89  93
26 bolOpen weekday The P-Value of the ChiSq Test is: 1.085410499195868e-103
promo         0    1
bolHoliday          
0           329  221
1            55   53
26 bolHoliday promo The P-Value of the ChiSq Test is: 0.10802897774540106
weekday      0   1   2   3   4   5   6
bolHo


invalid value encountered in double_scalars



31 promo P-Value for Anova is:  0.4453786348354444
31 weekday P-Value for Anova is:  4.352780585502679e-30
31 bolOpen P-Value for Anova is:  1.3585816226229863e-34
31 bolHoliday P-Value for Anova is:  4.665222120030123e-36
weekday   0   1   2   3   4   5   6
promo                              
0        21  22  23  23  20  20  21
1        73  72  71  71  74  74  73
31 promo weekday The P-Value of the ChiSq Test is: 0.9966101430769535
bolOpen   0    1
promo           
0        18  132
1        82  426
31 promo bolOpen The P-Value of the ChiSq Test is: 0.26609479410152015
weekday   0   1   2   3   4   5   6
bolOpen                            
0        86   1   2   4   2   4   1
1         8  93  92  90  92  90  93
31 bolOpen weekday The P-Value of the ChiSq Test is: 5.973094632322571e-104
promo         0    1
bolHoliday          
0           134  415
1            16   93
31 bolHoliday promo The P-Value of the ChiSq Test is: 0.0369215098796428
weekday      0   1   2   3   4   5   6
bolHolid


invalid value encountered in double_scalars



44 promo P-Value for Anova is:  nan
44 weekday P-Value for Anova is:  3.2098733979411234e-19
44 bolOpen P-Value for Anova is:  1.474970796310664e-23
44 bolHoliday P-Value for Anova is:  2.2038295059691008e-23
weekday   0   1   2   3   4   5   6
promo                              
0        94  94  94  94  94  94  94
44 promo weekday The P-Value of the ChiSq Test is: 1.0
bolOpen    0    1
promo            
0        103  555
44 promo bolOpen The P-Value of the ChiSq Test is: 1.0
weekday   0   1   2   3   4   5   6
bolOpen                            
0        85   4   1   3   1   6   3
1         9  90  93  91  93  88  91
44 bolOpen weekday The P-Value of the ChiSq Test is: 1.9189480866985953e-97
promo         0
bolHoliday     
0           550
1           108
44 bolHoliday promo The P-Value of the ChiSq Test is: 1.0
weekday      0   1   2   3   4   5   6
bolHoliday                            
0            7  89  92  91  93  88  90
1           87   5   2   3   1   6   4
44 bolHoliday weekday

In [None]:
# Preview the first rows of the chi-square results (one row per SKU and feature pair).
results_chisq.head()

Unnamed: 0,sku,prueba,pValue
0,1,promo vs weekday,0.9969876
1,1,promo vs bolOpen,0.09770313
2,1,bolOpen vs weekday,1.918948e-97
3,1,bolHoliday vs promo,0.04193925
4,1,bolHoliday vs weekday,3.255755e-97


In [None]:
# Persist the stacked per-SKU correlation matrices; the row index is not written.
results_corr.to_csv('tablas_correlaciones_por_sku.csv', index=False)

In [None]:
# Persist the ANOVA p-values per SKU; the row index is not written.
results_anova.to_csv('resultados_anova_por_sku.csv', index=False)

In [None]:
# Persist the chi-square p-values per SKU; the row index is not written.
results_chisq.to_csv('resultados_chisq_por_sku.csv', index=False)

In [None]:
# Stack the chi-square rows and then the ANOVA rows of SKU "3" into a single
# table. pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# the original row labels are kept, as append did.
res = pd.concat([results_chisq[results_chisq["sku"]=="3"], results_anova[results_anova["sku"]=="3"]])

In [None]:
# Emit `res` as a LaTeX tabular; `sku` is moved to the index so it becomes the
# leading column of the printed table.
print(tabulate(res.set_index("sku"), headers=res.columns, tablefmt='latex'))

\begin{tabular}{rlr}
\hline
   sku & prueba                  &       pValue \\
\hline
     3 & promo vs weekday        & 0.996988     \\
     3 & promo vs bolOpen        & 0.206157     \\
     3 & bolOpen vs weekday      & 9.99946e-98  \\
     3 & bolHoliday vs promo     & 0.0399127    \\
     3 & bolHoliday vs weekday   & 1.17444e-97  \\
     3 & bolHoliday vs bolOpen   & 3.03246e-119 \\
     3 & promo vs udsVentaT      & 0.12388      \\
     3 & weekday vs udsVentaT    & 7.92651e-24  \\
     3 & bolOpen vs udsVentaT    & 5.22379e-25  \\
     3 & bolHoliday vs udsVentaT & 4.67007e-26  \\
\hline
\end{tabular}


In [None]:
# Same summary table for SKU "1": chi-square rows followed by ANOVA rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# original row labels are kept, as append did.
res = pd.concat([results_chisq[results_chisq["sku"]=="1"], results_anova[results_anova["sku"]=="1"]])
# `sku` is moved to the index so it becomes the leading column of the LaTeX table.
print(tabulate(res.set_index("sku"), headers=res.columns, tablefmt='latex'))

\begin{tabular}{rlr}
\hline
   sku & prueba                  &       pValue \\
\hline
     1 & promo vs weekday        & 0.996988     \\
     1 & promo vs bolOpen        & 0.0977031    \\
     1 & bolOpen vs weekday      & 1.91895e-97  \\
     1 & bolHoliday vs promo     & 0.0419392    \\
     1 & bolHoliday vs weekday   & 3.25576e-97  \\
     1 & bolHoliday vs bolOpen   & 1.73182e-126 \\
     1 & promo vs udsVentaT      & 0.8994       \\
     1 & weekday vs udsVentaT    & 9.15099e-33  \\
     1 & bolOpen vs udsVentaT    & 3.78395e-41  \\
     1 & bolHoliday vs udsVentaT & 5.42343e-43  \\
\hline
\end{tabular}
