# Aleatoriedad en asignación de dificultades

In [1]:
import numpy as np
import pandas as pd
import os
from statsmodels.formula.api import ols
import statsmodels.api as sm
from scipy import stats

In [2]:
def LeerBasesRegresion(datadir, ligas, column_condition):
    dfs = []
    # Primer caso:
    archivo = ligas[0] + '.xlsx'
    df = pd.read_excel(os.path.join(datadir, archivo))
    columns_filter = [i for i in df.columns if column_condition(i)]
    columns_to_keep = ['equipo','Torneo'] + columns_filter
    df = df[columns_to_keep]
    dfs.append(df)
    try:
        for liga in ligas[1:]:
            archivo = liga + '.xlsx'
            df = pd.read_excel(os.path.join(datadir, archivo))
            columns_to_keep = ['equipo','Torneo'] + [i for i in df.columns if column_condition(i)]
            df = df[columns_to_keep]
            dfs.append(df)
    except:
        pass
    return dfs, columns_filter

In [3]:
# def FiltrarRegulares(df, defin, c, h):
#     if defin == 'elo':
#         col = 'categoria_%s_c%s' % (defin, c)
#         df_filter = df[df[col] == 'regular']
#     elif defin == 'general':
#         col = 'categoria_%s_h%s_c%s' % (defin, h, c)
#         df_filter = df[df[col] == 'regular']
#     else:
#         col1 = 'categoria_local_h%s_c%s' % (h,c)
#         col2 = 'categoria_visita_h%s_c%s' % (h,c)
#         df_filter = df[(df[col1] == 'regular') & 
#                        (df[col2] == 'regular')]
#     return df_filter
def FiltrarRegulares(df, defin, c, h, filt):
    cats = ['facil','regular','dificil']
    if filt not in  ['regular', ""]:
        cats = [c for c in cats if c != filt]
    elif filt == 'regular':
        cats = [filt]
    else:
        pass
    if defin == 'elo':
        col = 'categoria_%s_c%s' % (defin, c)
        df_filter = df[df[col].isin(cats)]
    elif defin == 'general':
        col = 'categoria_%s_h%s_c%s' % (defin, h, c)
        df_filter = df[df[col].isin(cats)]
    else:
        col1 = 'categoria_local_h%s_c%s' % (h,c)
        col2 = 'categoria_visita_h%s_c%s' % (h,c)
        df_filter = df[(df[col1].isin(cats)) & 
                       (df[col2].isin(cats))]
    return df_filter

In [4]:
def SignificanciaVariablesManual(coef, se, pval, replace_spaces = True):
    coef = "%.3f" % (np.round(coef, decimals = 3))
    se = "%.3f" % (np.round(se, decimals = 3))
    if coef > 0:
        output = '_' + str(coef) + ' (' + str(se) + ')'                    
    else:
        output = str(coef) + ' (' + str(se) + ')'
    pval = np.round(pval, 3)
    if pval < 0.01:
        output = output + '***'
    elif pval < 0.05 and pval >= 0.01:
        output = output + '**_'
    elif pval < 0.1 and pval >=0.05:
        output = output + '*__'
    else:
        output = output + '___'
    if replace_spaces:
        output = output.replace("_"," ")
    return output

In [5]:
def ANOVA1ViaIndicadores(datadir, ligas, defin, filtro):
    dict_replaces = {'_mean': '',
                     '_':' ',
                     'dificil':'Difícil',
                     'dificl' : 'Difícil',
                     'facil': 'Fácil',
                     'regular' : 'Regular',
                     'prom': 'Promedio',
                     'posicion': 'Posición',
                     'primer' : 'Primer',
                     'ronda' : 'Ronda',
                     'gral': 'Gral.',
                     'elo' : 'ELO',
                     'categoria': 'Categoría',
                     'local': 'Local',
                     'visita': 'Visita',
                     'general' : 'Gral.',
                     'corr' : 'Correg.',
                     'perc' : 'Frac.',
                     ' h' : ' Ant. ',
                     ' p': ' Primeros ',
                     ' c': ' Corte ',
                     'lv' : 'LV',
                     'Primer Fácil C': 'Primer Fácil LV C',
                     'Primer Difícil C': 'Primer Difícil LV C'}
    if defin != 'lv':
        dfs, columns_filter = LeerBasesRegresion(datadir,
                                                 ligas,
                                                 lambda x: defin in x)
    else:
        dfs, columns_filter = LeerBasesRegresion(datadir,
                                                 ligas,
                                                 lambda x: defin in x or 'local' in x or 'visita' in x)        
    columns_facil = [i for i in columns_filter if 'facil' in i and 'perc' in i]
    columns_dificil = [i for i in columns_filter if 'dificil' in i and 'perc' in i]
    dictresumen_l = {'Indicador' : columns_facil}
    dfs_resumen = []
    # Columnas de facil
    for i in range(len(ligas)):
        liga = ligas[i]
        if liga != 'All':
            df_liga = dfs[i]
        else:
            df_liga = pd.concat(dfs, ignore_index = True)
        resultados = []
        for var in columns_facil:
            if filtro != 'regular':
                if 'facil' in var:
                    filt = 'facil'
                else:
                    filt = 'dificil'
            else:
                filt = filtro
            if defin != 'elo':
                aux = var.split('_h')[1].split('_p')
                h, c = aux[0], aux[1].split('_c')[1]
            else:
                h, c = None, var.split('_c')[1]
            formula = var + ' ~ C(equipo)'
            df_anova = FiltrarRegulares(df_liga, defin, c, h, filt)
            n_obs = df_anova.shape[0]
            model = ols(formula,
                        data =  df_anova).fit()
            aov_table = sm.stats.anova_lm(model, typ=2)
            out = '%s (N = %s)' % (np.round(aov_table['PR(>F)'][0], decimals = 3), n_obs)
            resultados.append(out)
        dictresumen_l[liga] = resultados
    df_facil = pd.DataFrame(data = dictresumen_l)
    df_facil['Indicador'] = df_facil['Indicador'].replace(to_replace = dict_replaces, regex=True)
    dfs_resumen.append(df_facil)
    dictresumen_v = {'Indicador' : columns_dificil}
    for i in range(len(ligas)):
        liga = ligas[i]
        liga = ligas[i]
        if liga != 'All':
            df_liga = dfs[i]
        else:
            df_liga = pd.concat(dfs, ignore_index = True)
        resultados = []
        for var in columns_dificil:
            if filtro != 'regular':
                if 'facil' in var:
                    filt = 'facil'
                else:
                    filt = 'dificil'
            else:
                filt = filtro
            if defin != 'elo':
                aux = var.split('_h')[1].split('_p')
                h, c = aux[0], aux[1].split('_c')[1]
            else:
                h, c = None, var.split('_c')[1]
            formula = var + ' ~ C(equipo)'
            df_anova = FiltrarRegulares(df_liga, defin, c, h, filt)
            n_obs = df_anova.shape[0]
            model = ols(formula,
                        data = df_anova).fit()
            aov_table = sm.stats.anova_lm(model, typ=2)
            out = '%s (N = %s)' % (np.round(aov_table['PR(>F)'][0], decimals = 3), n_obs)
            resultados.append(out)
        dictresumen_v[liga] = resultados
    df_dificil = pd.DataFrame(data = dictresumen_v)
    df_dificil['Indicador'] = df_dificil['Indicador'].replace(to_replace = dict_replaces, regex=True)
    dfs_resumen.append(df_dificil)
    return dfs_resumen

## Aleatoriedad para indicadores según puntaje ELO

In [6]:
filtro = 'regular'

In [7]:
%%time
datadir = os.path.join(os.path.pardir, 'datos', 'regresiones')
ligas = ['Alemania','Espana','Francia', 'Inglaterra','Italia', 'All']
outputdir = os.path.join(os.path.pardir,'resultados','aleatoriedad-inicio')
dfs_resumen_elo = ANOVA1ViaIndicadores(datadir,
                                       ligas,
                                       defin = 'elo',
                                       filtro = filtro)

outputfile = 'aleatoriedad-elo-%s.xlsx' % filtro
writer = pd.ExcelWriter(os.path.join(outputdir, outputfile), engine='xlsxwriter')
for df, sheet in zip(dfs_resumen_elo, ['Facil','Dificil']):
    df.to_excel(writer, sheet_name = sheet, index = False)
writer.save()

CPU times: user 14.1 s, sys: 201 ms, total: 14.3 s
Wall time: 8.9 s


In [8]:
dfs_resumen_elo[0]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Fácil ELO p4 c3,0.072 (N = 225),0.04 (N = 255),0.035 (N = 170),0.175 (N = 429),0.371 (N = 253),0.002 (N = 1332)
1,Frac. Fácil ELO p5 c3,0.005 (N = 225),0.328 (N = 255),0.076 (N = 170),0.448 (N = 429),0.494 (N = 253),0.015 (N = 1332)
2,Frac. Fácil ELO p6 c3,0.019 (N = 225),0.088 (N = 255),0.12 (N = 170),0.597 (N = 429),0.506 (N = 253),0.026 (N = 1332)
3,Frac. Fácil ELO p7 c3,0.001 (N = 225),0.134 (N = 255),0.103 (N = 170),0.955 (N = 429),0.506 (N = 253),0.018 (N = 1332)
4,Frac. Fácil ELO p4 c4,0.092 (N = 210),0.166 (N = 240),0.419 (N = 160),0.828 (N = 404),0.105 (N = 238),0.075 (N = 1252)
5,Frac. Fácil ELO p5 c4,0.001 (N = 210),0.619 (N = 240),0.72 (N = 160),0.955 (N = 404),0.077 (N = 238),0.058 (N = 1252)
6,Frac. Fácil ELO p6 c4,0.011 (N = 210),0.24 (N = 240),0.923 (N = 160),0.994 (N = 404),0.101 (N = 238),0.121 (N = 1252)
7,Frac. Fácil ELO p7 c4,0.0 (N = 210),0.228 (N = 240),0.682 (N = 160),0.986 (N = 404),0.296 (N = 238),0.014 (N = 1252)
8,Frac. Fácil ELO p4 c5,0.381 (N = 195),0.057 (N = 225),0.629 (N = 150),0.614 (N = 379),0.155 (N = 223),0.131 (N = 1172)
9,Frac. Fácil ELO p5 c5,0.042 (N = 195),0.086 (N = 225),0.631 (N = 150),0.922 (N = 379),0.038 (N = 223),0.064 (N = 1172)


In [9]:
dfs_resumen_elo[1]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Difícil ELO p4 c3,0.951 (N = 225),0.082 (N = 255),0.366 (N = 170),0.953 (N = 429),0.445 (N = 253),0.792 (N = 1332)
1,Frac. Difícil ELO p5 c3,0.58 (N = 225),0.036 (N = 255),0.439 (N = 170),0.405 (N = 429),0.068 (N = 253),0.072 (N = 1332)
2,Frac. Difícil ELO p6 c3,0.628 (N = 225),0.041 (N = 255),0.152 (N = 170),0.136 (N = 429),0.353 (N = 253),0.044 (N = 1332)
3,Frac. Difícil ELO p7 c3,0.773 (N = 225),0.023 (N = 255),0.177 (N = 170),0.161 (N = 429),0.424 (N = 253),0.071 (N = 1332)
4,Frac. Difícil ELO p4 c4,0.536 (N = 210),0.043 (N = 240),0.1 (N = 160),0.948 (N = 404),0.732 (N = 238),0.603 (N = 1252)
5,Frac. Difícil ELO p5 c4,0.122 (N = 210),0.056 (N = 240),0.137 (N = 160),0.43 (N = 404),0.153 (N = 238),0.033 (N = 1252)
6,Frac. Difícil ELO p6 c4,0.104 (N = 210),0.013 (N = 240),0.134 (N = 160),0.202 (N = 404),0.649 (N = 238),0.027 (N = 1252)
7,Frac. Difícil ELO p7 c4,0.171 (N = 210),0.024 (N = 240),0.446 (N = 160),0.253 (N = 404),0.776 (N = 238),0.112 (N = 1252)
8,Frac. Difícil ELO p4 c5,0.45 (N = 195),0.032 (N = 225),0.505 (N = 150),0.717 (N = 379),0.761 (N = 223),0.537 (N = 1172)
9,Frac. Difícil ELO p5 c5,0.218 (N = 195),0.149 (N = 225),0.42 (N = 150),0.425 (N = 379),0.394 (N = 223),0.229 (N = 1172)


## Aleatoriedad para indicadores según definición general

In [10]:
datadir = os.path.join(os.path.pardir, 'datos', 'regresiones')
ligas = ['Alemania','Espana','Francia', 'Inglaterra','Italia', 'All']
dfs_resumen_general = ANOVA1ViaIndicadores(datadir,
                                       ligas,
                                       defin = 'general')

outputfile = 'aleatoriedad-general-%s.xlsx' % filtro
writer = pd.ExcelWriter(os.path.join(outputdir, outputfile), engine='xlsxwriter')
for df, sheet in zip(dfs_resumen_general, ['Facil','Dificil']):
    df.to_excel(writer, sheet_name = sheet, index = False)
writer.save()

In [11]:
dfs_resumen_general[0]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Fácil Gral. h1 p4 c3,0.427 (N = 202),0.432 (N = 225),0.37 (N = 150),0.708 (N = 380),0.399 (N = 219),0.533 (N = 1176)
1,Frac. Fácil Gral. h1 p5 c3,0.143 (N = 202),0.238 (N = 225),0.377 (N = 150),0.767 (N = 380),0.215 (N = 219),0.243 (N = 1176)
2,Frac. Fácil Gral. h1 p6 c3,0.453 (N = 202),0.156 (N = 225),0.79 (N = 150),0.972 (N = 380),0.432 (N = 219),0.789 (N = 1176)
3,Frac. Fácil Gral. h1 p7 c3,0.358 (N = 202),0.12 (N = 225),0.657 (N = 150),0.984 (N = 380),0.582 (N = 219),0.757 (N = 1176)
4,Frac. Fácil Gral. h1 p4 c4,0.101 (N = 187),0.443 (N = 210),0.619 (N = 140),0.465 (N = 355),0.418 (N = 204),0.323 (N = 1096)
5,Frac. Fácil Gral. h1 p5 c4,0.01 (N = 187),0.316 (N = 210),0.652 (N = 140),0.693 (N = 355),0.083 (N = 204),0.117 (N = 1096)
6,Frac. Fácil Gral. h1 p6 c4,0.136 (N = 187),0.283 (N = 210),0.924 (N = 140),0.943 (N = 355),0.2 (N = 204),0.646 (N = 1096)
7,Frac. Fácil Gral. h1 p7 c4,0.048 (N = 187),0.344 (N = 210),0.81 (N = 140),0.978 (N = 355),0.474 (N = 204),0.71 (N = 1096)
8,Frac. Fácil Gral. h1 p4 c5,0.638 (N = 172),0.429 (N = 195),0.852 (N = 130),0.261 (N = 330),0.299 (N = 189),0.383 (N = 1016)
9,Frac. Fácil Gral. h1 p5 c5,0.384 (N = 172),0.088 (N = 195),0.908 (N = 130),0.853 (N = 330),0.017 (N = 189),0.233 (N = 1016)


In [12]:
dfs_resumen_general[1]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Difícil Gral. h1 p4 c3,0.736 (N = 225),0.068 (N = 255),0.526 (N = 170),0.984 (N = 429),0.389 (N = 253),0.743 (N = 1332)
1,Frac. Difícil Gral. h1 p5 c3,0.488 (N = 225),0.037 (N = 255),0.726 (N = 170),0.44 (N = 429),0.048 (N = 253),0.105 (N = 1332)
2,Frac. Difícil Gral. h1 p6 c3,0.429 (N = 225),0.037 (N = 255),0.693 (N = 170),0.18 (N = 429),0.156 (N = 253),0.072 (N = 1332)
3,Frac. Difícil Gral. h1 p7 c3,0.628 (N = 225),0.101 (N = 255),0.746 (N = 170),0.069 (N = 429),0.079 (N = 253),0.063 (N = 1332)
4,Frac. Difícil Gral. h1 p4 c4,0.452 (N = 210),0.157 (N = 240),0.455 (N = 160),0.993 (N = 404),0.86 (N = 238),0.898 (N = 1252)
5,Frac. Difícil Gral. h1 p5 c4,0.172 (N = 210),0.305 (N = 240),0.535 (N = 160),0.555 (N = 404),0.567 (N = 238),0.356 (N = 1252)
6,Frac. Difícil Gral. h1 p6 c4,0.422 (N = 210),0.176 (N = 240),0.549 (N = 160),0.449 (N = 404),0.742 (N = 238),0.437 (N = 1252)
7,Frac. Difícil Gral. h1 p7 c4,0.627 (N = 210),0.295 (N = 240),0.789 (N = 160),0.275 (N = 404),0.386 (N = 238),0.416 (N = 1252)
8,Frac. Difícil Gral. h1 p4 c5,0.429 (N = 195),0.008 (N = 225),0.833 (N = 150),0.805 (N = 379),0.803 (N = 223),0.607 (N = 1172)
9,Frac. Difícil Gral. h1 p5 c5,0.248 (N = 195),0.044 (N = 225),0.66 (N = 150),0.329 (N = 379),0.44 (N = 223),0.15 (N = 1172)


In [13]:
datadir = os.path.join(os.path.pardir, 'datos', 'regresiones')
ligas = ['Alemania','Espana','Francia', 'Inglaterra','Italia', 'All']
dfs_resumen_lv = ANOVA1ViaIndicadores(datadir,
                                       ligas,
                                       defin = 'lv')

outputfile = 'aleatoriedad-localvisita-%s.xlsx' % filtro
writer = pd.ExcelWriter(os.path.join(outputdir, outputfile), engine='xlsxwriter')
for df, sheet in zip(dfs_resumen_lv, ['Facil','Dificil']):
    df.to_excel(writer, sheet_name = sheet, index = False)
writer.save()

In [14]:
dfs_resumen_lv[0]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Fácil LV h1 p4 c3,0.031 (N = 174),0.628 (N = 195),0.112 (N = 130),0.112 (N = 333),0.728 (N = 191),0.067 (N = 1023)
1,Frac. Fácil LV h1 p5 c3,0.021 (N = 174),0.524 (N = 195),0.03 (N = 130),0.545 (N = 333),0.791 (N = 191),0.096 (N = 1023)
2,Frac. Fácil LV h1 p6 c3,0.343 (N = 174),0.347 (N = 195),0.164 (N = 130),0.449 (N = 333),0.723 (N = 191),0.35 (N = 1023)
3,Frac. Fácil LV h1 p7 c3,0.326 (N = 174),0.326 (N = 195),0.077 (N = 130),0.697 (N = 333),0.743 (N = 191),0.373 (N = 1023)
4,Frac. Fácil LV h1 p4 c4,0.316 (N = 150),0.295 (N = 173),0.281 (N = 111),0.442 (N = 291),0.569 (N = 165),0.184 (N = 890)
5,Frac. Fácil LV h1 p5 c4,0.361 (N = 150),0.141 (N = 173),0.023 (N = 111),0.602 (N = 291),0.638 (N = 165),0.092 (N = 890)
6,Frac. Fácil LV h1 p6 c4,0.492 (N = 150),0.189 (N = 173),0.299 (N = 111),0.493 (N = 291),0.706 (N = 165),0.419 (N = 890)
7,Frac. Fácil LV h1 p7 c4,0.186 (N = 150),0.036 (N = 173),0.133 (N = 111),0.582 (N = 291),0.974 (N = 165),0.221 (N = 890)
8,Frac. Fácil LV h1 p4 c5,0.637 (N = 129),0.12 (N = 154),0.355 (N = 97),0.233 (N = 254),0.578 (N = 148),0.165 (N = 782)
9,Frac. Fácil LV h1 p5 c5,0.579 (N = 129),0.107 (N = 154),0.029 (N = 97),0.447 (N = 254),0.419 (N = 148),0.058 (N = 782)


In [15]:
dfs_resumen_lv[1]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Difícil LV h1 p4 c3,0.415 (N = 204),0.683 (N = 242),0.492 (N = 157),0.944 (N = 401),0.389 (N = 241),0.862 (N = 1245)
1,Frac. Difícil LV h1 p5 c3,0.871 (N = 204),0.32 (N = 242),0.446 (N = 157),0.844 (N = 401),0.027 (N = 241),0.488 (N = 1245)
2,Frac. Difícil LV h1 p6 c3,0.899 (N = 204),0.158 (N = 242),0.619 (N = 157),0.547 (N = 401),0.072 (N = 241),0.431 (N = 1245)
3,Frac. Difícil LV h1 p7 c3,0.968 (N = 204),0.434 (N = 242),0.801 (N = 157),0.384 (N = 401),0.028 (N = 241),0.602 (N = 1245)
4,Frac. Difícil LV h1 p4 c4,0.906 (N = 187),0.034 (N = 218),0.649 (N = 145),0.864 (N = 370),0.812 (N = 219),0.733 (N = 1139)
5,Frac. Difícil LV h1 p5 c4,0.525 (N = 187),0.07 (N = 218),0.455 (N = 145),0.432 (N = 370),0.274 (N = 219),0.13 (N = 1139)
6,Frac. Difícil LV h1 p6 c4,0.616 (N = 187),0.13 (N = 218),0.35 (N = 145),0.421 (N = 370),0.763 (N = 219),0.314 (N = 1139)
7,Frac. Difícil LV h1 p7 c4,0.605 (N = 187),0.145 (N = 218),0.606 (N = 145),0.469 (N = 370),0.419 (N = 219),0.348 (N = 1139)
8,Frac. Difícil LV h1 p4 c5,0.878 (N = 170),0.446 (N = 197),0.531 (N = 131),0.823 (N = 338),0.586 (N = 203),0.866 (N = 1039)
9,Frac. Difícil LV h1 p5 c5,0.817 (N = 170),0.493 (N = 197),0.242 (N = 131),0.321 (N = 338),0.185 (N = 203),0.283 (N = 1039)
