# Aleatoriedad en asignación de dificultades

In [1]:
import numpy as np
import pandas as pd
import os
from statsmodels.formula.api import ols
import statsmodels.api as sm
from scipy import stats

In [2]:
def _leer_liga(datadir, liga, column_condition):
    """Read ``<liga>.xlsx`` from *datadir* and trim it to the columns of interest.

    Returns a tuple ``(df, columns_filter)`` where ``columns_filter`` lists the
    column names accepted by *column_condition* and ``df`` holds 'equipo',
    'Torneo' and those columns, in that order.
    """
    df = pd.read_excel(os.path.join(datadir, liga + '.xlsx'))
    columns_filter = [col for col in df.columns if column_condition(col)]
    return df[['equipo', 'Torneo'] + columns_filter], columns_filter


def LeerBasesRegresion(datadir, ligas, column_condition):
    """Load one trimmed DataFrame per league workbook.

    Parameters
    ----------
    datadir : str
        Directory that contains one ``<liga>.xlsx`` file per league.
    ligas : sequence of str
        League names (file names without extension). The first one must
        exist; its matching columns are the ones returned.
    column_condition : callable
        Predicate applied to each column name; columns for which it returns
        a truthy value are kept next to 'equipo' and 'Torneo'.

    Returns
    -------
    (list of DataFrame, list of str)
        The DataFrames read, in the order of *ligas*, and the column names of
        the FIRST workbook that satisfied *column_condition*.

    Raises
    ------
    FileNotFoundError
        If the workbook of the first league is missing.
    KeyError
        If any workbook read lacks the 'equipo' or 'Torneo' column.
    """
    df, columns_filter = _leer_liga(datadir, ligas[0], column_condition)
    dfs = [df]
    try:
        for liga in ligas[1:]:
            dfs.append(_leer_liga(datadir, liga, column_condition)[0])
    except FileNotFoundError:
        # Reading stops at the first league without a workbook (e.g. a pooled
        # pseudo-league such as 'All' placed last in the list). Stopping,
        # rather than skipping, keeps dfs[i] aligned with ligas[i].
        # Any other error (bad columns, unreadable file) now propagates
        # instead of silently truncating the result.
        pass
    return dfs, columns_filter

In [3]:
# def FiltrarRegulares(df, defin, c, h):
#     if defin == 'elo':
#         col = 'categoria_%s_c%s' % (defin, c)
#         df_filter = df[df[col] == 'regular']
#     elif defin == 'general':
#         col = 'categoria_%s_h%s_c%s' % (defin, h, c)
#         df_filter = df[df[col] == 'regular']
#     else:
#         col1 = 'categoria_local_h%s_c%s' % (h,c)
#         col2 = 'categoria_visita_h%s_c%s' % (h,c)
#         df_filter = df[(df[col1] == 'regular') & 
#                        (df[col2] == 'regular')]
#     return df_filter
def FiltrarRegulares(df, defin, c, h, filt):
    """Return the rows of *df* whose difficulty category is allowed by *filt*.

    Allowed categories:
      * ``filt == 'regular'`` -> only 'regular';
      * ``filt == ""``        -> 'facil', 'regular' and 'dificil';
      * any other value       -> the three categories minus *filt*.

    Category columns inspected:
      * ``defin == 'elo'``     -> ``categoria_elo_c<c>``;
      * ``defin == 'general'`` -> ``categoria_general_h<h>_c<c>``;
      * otherwise              -> both ``categoria_local_h<h>_c<c>`` and
        ``categoria_visita_h<h>_c<c>`` must hold an allowed category.

    *c* and *h* are interpolated verbatim into the column names.
    """
    todas = ['facil', 'regular', 'dificil']
    if filt == 'regular':
        permitidas = ['regular']
    elif filt == "":
        permitidas = todas
    else:
        permitidas = [cat for cat in todas if cat != filt]

    if defin == 'elo':
        columnas = ['categoria_%s_c%s' % (defin, c)]
    elif defin == 'general':
        columnas = ['categoria_%s_h%s_c%s' % (defin, h, c)]
    else:
        columnas = ['categoria_local_h%s_c%s' % (h, c),
                    'categoria_visita_h%s_c%s' % (h, c)]

    # A row is kept only when every inspected column holds an allowed category.
    mascara = df[columnas[0]].isin(permitidas)
    for columna in columnas[1:]:
        mascara = mascara & df[columna].isin(permitidas)
    return df[mascara]

In [4]:
def SignificanciaVariablesManual(coef, se, pval, replace_spaces = True):
    """Format a coefficient as ``coef (se)`` followed by significance stars.

    Parameters
    ----------
    coef, se : float
        Estimate and standard error; both are shown with three decimals.
    pval : float
        p-value; it is rounded to three decimals BEFORE being compared with
        the 0.01 / 0.05 / 0.1 thresholds.
    replace_spaces : bool, default True
        When True, the '_' padding characters are turned into spaces.

    Returns
    -------
    str
        Fixed-width text: positive coefficients get one leading pad character
        (to line up with the minus sign of negative ones) and the star suffix
        is always three characters wide ('***', '**_', '*__' or '___').
    """
    # The sign must be tested on the number: once formatted, `coef` is a
    # string and `str > int` raises TypeError in Python 3.
    es_positivo = coef > 0
    coef_txt = "%.3f" % (np.round(coef, decimals = 3))
    se_txt = "%.3f" % (np.round(se, decimals = 3))
    output = coef_txt + ' (' + se_txt + ')'
    if es_positivo:
        output = '_' + output

    pval = np.round(pval, 3)
    if pval < 0.01:
        estrellas = '***'
    elif pval < 0.05:
        estrellas = '**_'
    elif pval < 0.1:
        estrellas = '*__'
    else:
        estrellas = '___'
    output = output + estrellas

    if replace_spaces:
        output = output.replace("_", " ")
    return output

In [5]:
def _resumen_anova(dfs, ligas, columnas, defin, filtro):
    """Run one ANOVA per (league, indicator) and collect the formatted p-values.

    Returns a dict with key 'Indicador' (the list *columnas*) plus one key per
    league holding a list of strings ``'<p-value> (N = <rows used>)'``.
    """
    resumen = {'Indicador': columnas}
    df_todas = None  # pooled data, built once and only if 'All' is requested
    for i, liga in enumerate(ligas):
        if liga != 'All':
            df_liga = dfs[i]
        else:
            if df_todas is None:
                df_todas = pd.concat(dfs, ignore_index = True)
            df_liga = df_todas
        resultados = []
        for var in columnas:
            # Unless the caller asked for 'regular', the category passed to
            # FiltrarRegulares is the one named by the indicator itself.
            if filtro != 'regular':
                filt = 'facil' if 'facil' in var else 'dificil'
            else:
                filt = filtro
            # h and c are parsed out of the column name: the text between
            # '_h' and '_p', and the text after '_c' (ELO columns have no h).
            # NOTE(review): presumably names end in '_h<h>_p<p>_c<c>' -- confirm.
            if defin != 'elo':
                aux = var.split('_h')[1].split('_p')
                h, c = aux[0], aux[1].split('_c')[1]
            else:
                h, c = None, var.split('_c')[1]
            df_anova = FiltrarRegulares(df_liga, defin, c, h, filt)
            model = ols(var + ' ~ C(equipo)', data = df_anova).fit()
            aov_table = sm.stats.anova_lm(model, typ=2)
            # First row of the ANOVA table; .iloc avoids the deprecated
            # positional use of [] on a label-indexed Series.
            p_valor = np.round(aov_table['PR(>F)'].iloc[0], decimals = 3)
            resultados.append('%s (N = %s)' % (p_valor, df_anova.shape[0]))
        resumen[liga] = resultados
    return resumen


def ANOVA1ViaIndicadores(datadir, ligas, defin, filtro):
    """Summarise one-way ANOVAs of difficulty indicators on team.

    For every league in *ligas* and every indicator column whose name
    contains 'perc', the model ``<indicator> ~ C(equipo)`` is fitted on the
    rows kept by FiltrarRegulares and the p-value of the first row of the
    type-2 ANOVA table is reported together with the number of rows used.

    Parameters
    ----------
    datadir : str
        Directory with one ``<liga>.xlsx`` workbook per league.
    ligas : sequence of str
        League names; the special name 'All' stands for the concatenation of
        every workbook that was read.
    defin : str
        Substring used to pick indicator columns. With 'lv', columns whose
        name contains 'local' or 'visita' are selected as well.
    filtro : str
        'regular' keeps only rows categorised as 'regular'; with any other
        value the category named by each indicator ('facil' / 'dificil') is
        handed to FiltrarRegulares instead.

    Returns
    -------
    list of DataFrame
        Two tables -- indicators containing 'facil', then those containing
        'dificil' -- with a prettified 'Indicador' column and one column per
        league.
    """
    dict_replaces = {'_mean': '',
                     '_':' ',
                     'dificil':'Difícil',
                     'dificl' : 'Difícil',
                     'facil': 'Fácil',
                     'regular' : 'Regular',
                     'prom': 'Promedio',
                     'posicion': 'Posición',
                     'primer' : 'Primer',
                     'ronda' : 'Ronda',
                     'gral': 'Gral.',
                     'elo' : 'ELO',
                     'categoria': 'Categoría',
                     'local': 'Local',
                     'visita': 'Visita',
                     'general' : 'Gral.',
                     'corr' : 'Correg.',
                     'perc' : 'Frac.',
                     ' h' : ' Ant. ',
                     ' p': ' Primeros ',
                     ' c': ' Corte ',
                     'lv' : 'LV',
                     'Primer Fácil C': 'Primer Fácil LV C',
                     'Primer Difícil C': 'Primer Difícil LV C'}
    if defin != 'lv':
        def column_condition(x):
            return defin in x
    else:
        def column_condition(x):
            return defin in x or 'local' in x or 'visita' in x
    dfs, columns_filter = LeerBasesRegresion(datadir, ligas, column_condition)
    columns_facil = [i for i in columns_filter if 'facil' in i and 'perc' in i]
    columns_dificil = [i for i in columns_filter if 'dificil' in i and 'perc' in i]

    dfs_resumen = []
    # Same procedure for both indicator families: 'facil' first, then 'dificil'.
    for columnas in (columns_facil, columns_dificil):
        df_resumen = pd.DataFrame(
            data = _resumen_anova(dfs, ligas, columnas, defin, filtro))
        df_resumen['Indicador'] = df_resumen['Indicador'].replace(
            to_replace = dict_replaces, regex=True)
        dfs_resumen.append(df_resumen)
    return dfs_resumen

## Aleatoriedad para indicadores según puntaje ELO

In [6]:
# Row filter shared by every analysis below. With 'regular',
# ANOVA1ViaIndicadores passes it straight to FiltrarRegulares, which keeps only
# rows whose category column(s) equal 'regular'. It is also interpolated into
# the names of the exported workbooks.
filtro = 'regular'

In [7]:
%%time
# Input workbooks and output folder, both relative to the parent directory.
datadir = os.path.join(os.path.pardir, 'datos', 'regresiones')
ligas = ['Alemania','Espana','Francia', 'Inglaterra','Italia', 'All']
outputdir = os.path.join(os.path.pardir,'resultados','aleatoriedad-inicio')
dfs_resumen_elo = ANOVA1ViaIndicadores(datadir,
                                       ligas,
                                       defin = 'elo',
                                       filtro = filtro)

# One sheet per summary table. The context manager saves and closes the
# workbook; ExcelWriter.save() no longer exists in pandas >= 2.0.
outputfile = 'aleatoriedad-elo-%s.xlsx' % filtro
with pd.ExcelWriter(os.path.join(outputdir, outputfile), engine='xlsxwriter') as writer:
    for df, sheet in zip(dfs_resumen_elo, ['Facil','Dificil']):
        df.to_excel(writer, sheet_name = sheet, index = False)

CPU times: user 11.8 s, sys: 159 ms, total: 11.9 s
Wall time: 5.94 s


In [8]:
# Display the first summary table ('facil' indicators) for the ELO definition.
dfs_resumen_elo[0]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Fácil ELO p4 c3,0.282 (N = 180),0.028 (N = 210),0.028 (N = 140),0.135 (N = 354),0.902 (N = 208),0.009 (N = 1092)
1,Frac. Fácil ELO p5 c3,0.03 (N = 180),0.161 (N = 210),0.089 (N = 140),0.22 (N = 354),0.953 (N = 208),0.025 (N = 1092)
2,Frac. Fácil ELO p6 c3,0.031 (N = 180),0.064 (N = 210),0.111 (N = 140),0.318 (N = 354),0.866 (N = 208),0.019 (N = 1092)
3,Frac. Fácil ELO p7 c3,0.005 (N = 180),0.065 (N = 210),0.113 (N = 140),0.864 (N = 354),0.524 (N = 208),0.01 (N = 1092)
4,Frac. Fácil ELO p4 c4,0.326 (N = 150),0.094 (N = 180),0.326 (N = 120),0.576 (N = 304),0.351 (N = 178),0.07 (N = 932)
5,Frac. Fácil ELO p5 c4,0.009 (N = 150),0.342 (N = 180),0.559 (N = 120),0.519 (N = 304),0.374 (N = 178),0.027 (N = 932)
6,Frac. Fácil ELO p6 c4,0.035 (N = 150),0.092 (N = 180),0.87 (N = 120),0.728 (N = 304),0.458 (N = 178),0.041 (N = 932)
7,Frac. Fácil ELO p7 c4,0.0 (N = 150),0.072 (N = 180),0.671 (N = 120),0.758 (N = 304),0.325 (N = 178),0.001 (N = 932)
8,Frac. Fácil ELO p4 c5,0.58 (N = 120),0.049 (N = 150),0.855 (N = 100),0.496 (N = 254),0.235 (N = 148),0.191 (N = 772)
9,Frac. Fácil ELO p5 c5,0.126 (N = 120),0.061 (N = 150),0.594 (N = 100),0.582 (N = 254),0.195 (N = 148),0.051 (N = 772)


In [9]:
# Display the second summary table ('dificil' indicators) for the ELO definition.
dfs_resumen_elo[1]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Difícil ELO p4 c3,0.998 (N = 180),0.207 (N = 210),0.472 (N = 140),0.926 (N = 354),0.159 (N = 208),0.834 (N = 1092)
1,Frac. Difícil ELO p5 c3,0.933 (N = 180),0.03 (N = 210),0.596 (N = 140),0.933 (N = 354),0.044 (N = 208),0.356 (N = 1092)
2,Frac. Difícil ELO p6 c3,0.806 (N = 180),0.252 (N = 210),0.379 (N = 140),0.74 (N = 354),0.34 (N = 208),0.537 (N = 1092)
3,Frac. Difícil ELO p7 c3,0.724 (N = 180),0.143 (N = 210),0.502 (N = 140),0.572 (N = 354),0.564 (N = 208),0.445 (N = 1092)
4,Frac. Difícil ELO p4 c4,0.757 (N = 150),0.023 (N = 180),0.015 (N = 120),0.991 (N = 304),0.215 (N = 178),0.352 (N = 932)
5,Frac. Difícil ELO p5 c4,0.192 (N = 150),0.013 (N = 180),0.162 (N = 120),0.939 (N = 304),0.091 (N = 178),0.073 (N = 932)
6,Frac. Difícil ELO p6 c4,0.408 (N = 150),0.068 (N = 180),0.082 (N = 120),0.715 (N = 304),0.419 (N = 178),0.182 (N = 932)
7,Frac. Difícil ELO p7 c4,0.186 (N = 150),0.044 (N = 180),0.32 (N = 120),0.855 (N = 304),0.697 (N = 178),0.325 (N = 932)
8,Frac. Difícil ELO p4 c5,0.403 (N = 120),0.004 (N = 150),0.289 (N = 100),0.633 (N = 254),0.166 (N = 148),0.071 (N = 772)
9,Frac. Difícil ELO p5 c5,0.072 (N = 120),0.025 (N = 150),0.596 (N = 100),0.755 (N = 254),0.103 (N = 148),0.088 (N = 772)


## Aleatoriedad para indicadores según definición general

In [10]:
# Same analysis with the 'general' definition; reuses `outputdir` and `filtro`
# defined in earlier cells.
datadir = os.path.join(os.path.pardir, 'datos', 'regresiones')
ligas = ['Alemania','Espana','Francia', 'Inglaterra','Italia', 'All']
dfs_resumen_general = ANOVA1ViaIndicadores(datadir,
                                           ligas,
                                           defin = 'general',
                                           filtro = filtro)
# The context manager saves and closes the workbook; ExcelWriter.save() no
# longer exists in pandas >= 2.0.
outputfile = 'aleatoriedad-general-%s.xlsx' % filtro
with pd.ExcelWriter(os.path.join(outputdir, outputfile), engine='xlsxwriter') as writer:
    for df, sheet in zip(dfs_resumen_general, ['Facil','Dificil']):
        df.to_excel(writer, sheet_name = sheet, index = False)

In [11]:
# Display the first summary table ('facil' indicators) for the general definition.
dfs_resumen_general[0]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Fácil Gral. h1 p4 c3,0.679 (N = 157),0.628 (N = 180),0.518 (N = 120),0.406 (N = 305),0.161 (N = 174),0.506 (N = 936)
1,Frac. Fácil Gral. h1 p5 c3,0.199 (N = 157),0.486 (N = 180),0.471 (N = 120),0.139 (N = 305),0.153 (N = 174),0.11 (N = 936)
2,Frac. Fácil Gral. h1 p6 c3,0.472 (N = 157),0.467 (N = 180),0.74 (N = 120),0.365 (N = 305),0.349 (N = 174),0.519 (N = 936)
3,Frac. Fácil Gral. h1 p7 c3,0.418 (N = 157),0.272 (N = 180),0.601 (N = 120),0.615 (N = 305),0.349 (N = 174),0.473 (N = 936)
4,Frac. Fácil Gral. h1 p4 c4,0.248 (N = 127),0.356 (N = 150),0.427 (N = 100),0.47 (N = 255),0.511 (N = 144),0.299 (N = 776)
5,Frac. Fácil Gral. h1 p5 c4,0.009 (N = 127),0.387 (N = 150),0.552 (N = 100),0.355 (N = 255),0.257 (N = 144),0.081 (N = 776)
6,Frac. Fácil Gral. h1 p6 c4,0.054 (N = 127),0.379 (N = 150),0.866 (N = 100),0.697 (N = 255),0.432 (N = 144),0.479 (N = 776)
7,Frac. Fácil Gral. h1 p7 c4,0.138 (N = 127),0.367 (N = 150),0.866 (N = 100),0.825 (N = 255),0.338 (N = 144),0.592 (N = 776)
8,Frac. Fácil Gral. h1 p4 c5,0.891 (N = 97),0.402 (N = 120),0.382 (N = 80),0.347 (N = 205),0.226 (N = 114),0.391 (N = 616)
9,Frac. Fácil Gral. h1 p5 c5,0.71 (N = 97),0.253 (N = 120),0.563 (N = 80),0.61 (N = 205),0.118 (N = 114),0.413 (N = 616)


In [12]:
# Display the second summary table ('dificil' indicators) for the general definition.
dfs_resumen_general[1]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Difícil Gral. h1 p4 c3,0.926 (N = 157),0.204 (N = 180),0.687 (N = 120),0.867 (N = 305),0.817 (N = 174),0.865 (N = 936)
1,Frac. Difícil Gral. h1 p5 c3,0.556 (N = 157),0.025 (N = 180),0.792 (N = 120),0.799 (N = 305),0.157 (N = 174),0.23 (N = 936)
2,Frac. Difícil Gral. h1 p6 c3,0.312 (N = 157),0.204 (N = 180),0.807 (N = 120),0.574 (N = 305),0.556 (N = 174),0.422 (N = 936)
3,Frac. Difícil Gral. h1 p7 c3,0.52 (N = 157),0.149 (N = 180),0.646 (N = 120),0.568 (N = 305),0.589 (N = 174),0.288 (N = 936)
4,Frac. Difícil Gral. h1 p4 c4,0.88 (N = 127),0.159 (N = 150),0.613 (N = 100),0.851 (N = 255),0.72 (N = 144),0.738 (N = 776)
5,Frac. Difícil Gral. h1 p5 c4,0.617 (N = 127),0.366 (N = 150),0.559 (N = 100),0.685 (N = 255),0.407 (N = 144),0.426 (N = 776)
6,Frac. Difícil Gral. h1 p6 c4,0.948 (N = 127),0.482 (N = 150),0.561 (N = 100),0.44 (N = 255),0.34 (N = 144),0.604 (N = 776)
7,Frac. Difícil Gral. h1 p7 c4,0.638 (N = 127),0.318 (N = 150),0.618 (N = 100),0.607 (N = 255),0.159 (N = 144),0.389 (N = 776)
8,Frac. Difícil Gral. h1 p4 c5,0.732 (N = 97),0.003 (N = 120),0.875 (N = 80),0.26 (N = 205),0.533 (N = 114),0.177 (N = 616)
9,Frac. Difícil Gral. h1 p5 c5,0.472 (N = 97),0.013 (N = 120),0.91 (N = 80),0.191 (N = 205),0.4 (N = 114),0.086 (N = 616)


In [13]:
# Same analysis with the 'lv' definition (columns containing 'lv', 'local' or
# 'visita'); reuses `outputdir` and `filtro` defined in earlier cells.
datadir = os.path.join(os.path.pardir, 'datos', 'regresiones')
ligas = ['Alemania','Espana','Francia', 'Inglaterra','Italia', 'All']
dfs_resumen_lv = ANOVA1ViaIndicadores(datadir,
                                      ligas,
                                      defin = 'lv',
                                      filtro = filtro)

# The context manager saves and closes the workbook; ExcelWriter.save() no
# longer exists in pandas >= 2.0.
outputfile = 'aleatoriedad-localvisita-%s.xlsx' % filtro
with pd.ExcelWriter(os.path.join(outputdir, outputfile), engine='xlsxwriter') as writer:
    for df, sheet in zip(dfs_resumen_lv, ['Facil','Dificil']):
        df.to_excel(writer, sheet_name = sheet, index = False)

In [14]:
# Display the first summary table ('facil' indicators) for the 'lv' definition.
dfs_resumen_lv[0]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Fácil LV h1 p4 c3,0.199 (N = 109),0.587 (N = 140),0.196 (N = 89),0.479 (N = 234),0.396 (N = 134),0.283 (N = 706)
1,Frac. Fácil LV h1 p5 c3,0.144 (N = 109),0.587 (N = 140),0.149 (N = 89),0.315 (N = 234),0.727 (N = 134),0.276 (N = 706)
2,Frac. Fácil LV h1 p6 c3,0.478 (N = 109),0.328 (N = 140),0.169 (N = 89),0.417 (N = 234),0.658 (N = 134),0.41 (N = 706)
3,Frac. Fácil LV h1 p7 c3,0.341 (N = 109),0.366 (N = 140),0.222 (N = 89),0.774 (N = 234),0.622 (N = 134),0.525 (N = 706)
4,Frac. Fácil LV h1 p4 c4,0.049 (N = 75),0.1 (N = 95),0.321 (N = 58),0.843 (N = 165),0.279 (N = 88),0.06 (N = 481)
5,Frac. Fácil LV h1 p5 c4,0.064 (N = 75),0.056 (N = 95),0.213 (N = 58),0.716 (N = 165),0.507 (N = 88),0.063 (N = 481)
6,Frac. Fácil LV h1 p6 c4,0.032 (N = 75),0.049 (N = 95),0.306 (N = 58),0.594 (N = 165),0.378 (N = 88),0.049 (N = 481)
7,Frac. Fácil LV h1 p7 c4,0.002 (N = 75),0.051 (N = 95),0.271 (N = 58),0.733 (N = 165),0.624 (N = 88),0.043 (N = 481)
8,Frac. Fácil LV h1 p4 c5,0.415 (N = 51),0.099 (N = 62),0.329 (N = 40),0.768 (N = 103),0.495 (N = 56),0.19 (N = 312)
9,Frac. Fácil LV h1 p5 c5,0.469 (N = 51),0.306 (N = 62),0.19 (N = 40),0.601 (N = 103),0.445 (N = 56),0.167 (N = 312)


In [15]:
# Display the second summary table ('dificil' indicators) for the 'lv' definition.
dfs_resumen_lv[1]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Difícil LV h1 p4 c3,0.275 (N = 109),0.564 (N = 140),0.558 (N = 89),0.398 (N = 234),0.837 (N = 134),0.43 (N = 706)
1,Frac. Difícil LV h1 p5 c3,0.682 (N = 109),0.187 (N = 140),0.626 (N = 89),0.112 (N = 234),0.396 (N = 134),0.135 (N = 706)
2,Frac. Difícil LV h1 p6 c3,0.938 (N = 109),0.286 (N = 140),0.687 (N = 89),0.165 (N = 234),0.472 (N = 134),0.457 (N = 706)
3,Frac. Difícil LV h1 p7 c3,0.842 (N = 109),0.206 (N = 140),0.56 (N = 89),0.276 (N = 234),0.706 (N = 134),0.477 (N = 706)
4,Frac. Difícil LV h1 p4 c4,0.848 (N = 75),0.266 (N = 95),0.414 (N = 58),0.184 (N = 165),0.526 (N = 88),0.162 (N = 481)
5,Frac. Difícil LV h1 p5 c4,0.72 (N = 75),0.332 (N = 95),0.233 (N = 58),0.165 (N = 165),0.379 (N = 88),0.081 (N = 481)
6,Frac. Difícil LV h1 p6 c4,0.898 (N = 75),0.414 (N = 95),0.273 (N = 58),0.355 (N = 165),0.73 (N = 88),0.401 (N = 481)
7,Frac. Difícil LV h1 p7 c4,0.861 (N = 75),0.318 (N = 95),0.263 (N = 58),0.532 (N = 165),0.909 (N = 88),0.438 (N = 481)
8,Frac. Difícil LV h1 p4 c5,0.801 (N = 51),0.8 (N = 62),0.316 (N = 40),0.01 (N = 103),0.345 (N = 56),0.156 (N = 312)
9,Frac. Difícil LV h1 p5 c5,0.876 (N = 51),0.674 (N = 62),0.068 (N = 40),0.021 (N = 103),0.325 (N = 56),0.08 (N = 312)
