# Aleatoriedad en asignación de dificultades

In [1]:
import numpy as np
import pandas as pd
import os
from statsmodels.formula.api import ols
import statsmodels.api as sm
from scipy import stats

In [2]:
def LeerBasesRegresion(datadir, ligas, column_condition):
    """Read one Excel workbook per league, keeping only the columns of interest.

    For every name in ``ligas`` the file ``<datadir>/<name>.xlsx`` is read and
    reduced to ``'equipo'``, ``'Torneo'`` and every column for which
    ``column_condition(column_name)`` is truthy.

    Parameters
    ----------
    datadir : str
        Directory that contains the ``.xlsx`` files.
    ligas : sequence of str
        Workbook names without extension. The first one must exist. Reading
        stops at the first later name whose workbook is missing, so the
        returned list stays aligned with the leading entries of ``ligas``.
    column_condition : callable
        Predicate applied to each column name.

    Returns
    -------
    (list of DataFrame, list of str)
        The filtered frames in reading order, and the column names selected
        by ``column_condition`` in the FIRST workbook.

    Raises
    ------
    IndexError
        If ``ligas`` is empty.
    FileNotFoundError
        If the first workbook is missing.
    """
    def _leer(liga):
        # Load a single workbook and project it onto the wanted columns.
        df = pd.read_excel(os.path.join(datadir, liga + '.xlsx'))
        seleccion = [col for col in df.columns if column_condition(col)]
        return df[['equipo', 'Torneo'] + seleccion], seleccion

    # The first workbook defines the column list handed back to the caller.
    primera, columns_filter = _leer(ligas[0])
    dfs = [primera]
    for liga in ligas[1:]:
        try:
            df, _ = _leer(liga)
        except FileNotFoundError:
            # Only a missing workbook ends the reading early (e.g. a pseudo
            # league name without a file of its own). Any other error, such as
            # an unreadable file or missing 'equipo'/'Torneo' columns, is a
            # real problem and is allowed to propagate instead of being hidden.
            break
        dfs.append(df)
    return dfs, columns_filter

In [3]:
def FiltrarRegulares(df, defin, c, h, filt):
    """Keep the rows of ``df`` whose category column(s) hold an accepted value.

    Accepted categories, out of ``'facil'``, ``'regular'``, ``'dificil'``:
      * ``filt == ''``        -> all three;
      * ``filt == 'regular'`` -> only ``'regular'``;
      * anything else         -> every category different from ``filt``.

    Column(s) inspected:
      * ``defin == 'elo'``     -> ``categoria_elo_c<c>``;
      * ``defin == 'general'`` -> ``categoria_general_h<h>_c<c>``;
      * otherwise              -> both ``categoria_local_h<h>_c<c>`` and
        ``categoria_visita_h<h>_c<c>`` must hold an accepted value.

    Returns the filtered DataFrame (original index preserved).
    """
    todas = ['facil', 'regular', 'dificil']
    if filt == '':
        aceptadas = todas
    elif filt == 'regular':
        aceptadas = [filt]
    else:
        aceptadas = [cat for cat in todas if cat != filt]

    if defin == 'elo':
        mascara = df['categoria_%s_c%s' % (defin, c)].isin(aceptadas)
    elif defin == 'general':
        mascara = df['categoria_%s_h%s_c%s' % (defin, h, c)].isin(aceptadas)
    else:
        en_local = df['categoria_local_h%s_c%s' % (h,c)].isin(aceptadas)
        en_visita = df['categoria_visita_h%s_c%s' % (h,c)].isin(aceptadas)
        mascara = en_local & en_visita
    return df[mascara]

In [4]:
def SignificanciaVariablesManual(coef, se, pval, replace_spaces = True):
    """Format a coefficient as ``'coef (se)'`` followed by significance stars.

    Parameters
    ----------
    coef, se : float
        Coefficient and standard error; both are rendered with 3 decimals.
    pval : float
        p-value, rounded to 3 decimals before being compared with the
        0.01 / 0.05 / 0.1 thresholds.
    replace_spaces : bool, default True
        When True every ``'_'`` placeholder in the result becomes a space.

    Returns
    -------
    str
        Text such as ``' 0.250 (0.100)*  '``. A leading placeholder stands in
        for the minus sign when the rendered coefficient has none, and the
        star suffix is always three characters wide, so entries line up.
    """
    coef_txt = "%.3f" % (np.round(coef, decimals = 3))
    se_txt = "%.3f" % (np.round(se, decimals = 3))
    output = coef_txt + ' (' + se_txt + ')'
    # The sign must be decided from the rendered text: comparing the formatted
    # string against 0 (as the previous version did) raises TypeError.
    if not coef_txt.startswith('-'):
        output = '_' + output
    pval = np.round(pval, 3)
    if pval < 0.01:
        estrellas = '***'
    elif pval < 0.05:
        estrellas = '**_'
    elif pval < 0.1:
        estrellas = '*__'
    else:
        # Also reached for NaN p-values, where every comparison is False.
        estrellas = '___'
    output = output + estrellas
    if replace_spaces:
        output = output.replace("_"," ")
    return output

In [5]:
def _ResumenAnovaPorLiga(dfs, ligas, defin, filtro, columnas, sin_filtro_si_vacio):
    """Build the {'Indicador': ..., <liga>: [...]} summary for one set of columns.

    For every league and every indicator in ``columnas`` a one-way ANOVA of the
    indicator on ``C(equipo)`` is fitted; each entry is the text
    ``'<p-value rounded to 3 decimals> (N = <rows used>)'``.

    ``sin_filtro_si_vacio`` selects what happens when ``filtro`` is empty:
    True uses an unfiltered copy of the league data, False still calls
    ``FiltrarRegulares``.
    """
    resumen = {'Indicador' : columnas}
    for i, liga in enumerate(ligas):
        # 'All' is a pseudo-league: it pools every frame that was read.
        if liga != 'All':
            df_liga = dfs[i]
        else:
            df_liga = pd.concat(dfs, ignore_index = True)
        resultados = []
        for var in columnas:
            # Unless the caller asked for 'regular', the category passed to
            # FiltrarRegulares is the one named in the indicator itself.
            if filtro != 'regular':
                filt = 'facil' if 'facil' in var else 'dificil'
            else:
                filt = filtro
            # Recover the h / c suffixes encoded in the column name.
            if defin != 'elo':
                aux = var.split('_h')[1].split('_p')
                h, c = aux[0], aux[1].split('_c')[1]
            else:
                h, c = None, var.split('_c')[1]
            if sin_filtro_si_vacio and len(filtro) == 0:
                df_anova = df_liga.copy()
            else:
                df_anova = FiltrarRegulares(df_liga, defin, c, h, filt)
            n_obs = df_anova.shape[0]
            model = ols(var + ' ~ C(equipo)', data = df_anova).fit()
            aov_table = sm.stats.anova_lm(model, typ=2)
            # .iloc[0]: the table is indexed by term name, so a bare [0] relied
            # on the deprecated positional fallback of Series.__getitem__.
            p_valor = aov_table['PR(>F)'].iloc[0]
            resultados.append('%s (N = %s)' % (np.round(p_valor, decimals = 3), n_obs))
        resumen[liga] = resultados
    return resumen


def ANOVA1ViaIndicadores(datadir, ligas, defin, filtro):
    """One-way ANOVA (indicator ~ team) for every 'facil'/'dificil' fraction column.

    Parameters
    ----------
    datadir : str
        Directory with one ``<liga>.xlsx`` workbook per league.
    ligas : list of str
        League names; the entry ``'All'`` is computed on the concatenation of
        every frame that was read instead of on a workbook of its own.
    defin : str
        Difficulty definition. Columns whose name contains ``defin`` are read;
        for ``'lv'`` columns containing ``'local'`` or ``'visita'`` are read too.
    filtro : str
        ``'regular'`` is forwarded unchanged to ``FiltrarRegulares``; for any
        other value the category named in the indicator is forwarded instead.

    Returns
    -------
    list of DataFrame
        ``[df_facil, df_dificil]``: one row per indicator (columns containing
        both ``'perc'`` and ``'facil'`` / ``'dificil'``), one column per league,
        each cell ``'<p-value> (N = <rows>)'``. The ``'Indicador'`` labels are
        prettified with the replacement table below.

    NOTE(review): with an empty ``filtro`` the 'facil' table uses unfiltered
    data while the 'dificil' table is still passed through
    ``FiltrarRegulares``. That asymmetry comes from the original code and is
    kept as-is so existing results do not change; confirm it is intended.
    """
    dict_replaces = {'_mean': '',
                     '_':' ',
                     'dificil':'Difícil',
                     'dificl' : 'Difícil',
                     'facil': 'Fácil',
                     'regular' : 'Regular',
                     'prom': 'Promedio',
                     'posicion': 'Posición',
                     'primer' : 'Primer',
                     'ronda' : 'Ronda',
                     'gral': 'Gral.',
                     'elo' : 'ELO',
                     'categoria': 'Categoría',
                     'local': 'Local',
                     'visita': 'Visita',
                     'general' : 'Gral.',
                     'corr' : 'Correg.',
                     'perc' : 'Frac.',
                     ' h' : ' Ant. ',
                     ' p': ' Primeros ',
                     ' c': ' Corte ',
                     'lv' : 'LV',
                     'Primer Fácil C': 'Primer Fácil LV C',
                     'Primer Difícil C': 'Primer Difícil LV C'}
    if defin != 'lv':
        dfs, columns_filter = LeerBasesRegresion(datadir,
                                                 ligas,
                                                 lambda x: defin in x)
    else:
        dfs, columns_filter = LeerBasesRegresion(datadir,
                                                 ligas,
                                                 lambda x: defin in x or 'local' in x or 'visita' in x)
    columns_facil = [i for i in columns_filter if 'facil' in i and 'perc' in i]
    columns_dificil = [i for i in columns_filter if 'dificil' in i and 'perc' in i]

    dfs_resumen = []
    # (columns, skip filtering when filtro is empty?) for the two output tables.
    for columnas, sin_filtro_si_vacio in ((columns_facil, True),
                                          (columns_dificil, False)):
        resumen = _ResumenAnovaPorLiga(dfs, ligas, defin, filtro,
                                       columnas, sin_filtro_si_vacio)
        df_resumen = pd.DataFrame(data = resumen)
        df_resumen['Indicador'] = df_resumen['Indicador'].replace(to_replace = dict_replaces, regex=True)
        dfs_resumen.append(df_resumen)
    return dfs_resumen

## Aleatoriedad para indicadores según puntaje ELO

In [6]:
# Filter label forwarded to ANOVA1ViaIndicadores and interpolated into the
# output file names of the cells below. 'regular' is passed through to
# FiltrarRegulares unchanged; with the empty string the 'Facil' table is
# computed on unfiltered data.
filtro = ''

In [7]:
%%time
datadir = os.path.join(os.path.pardir, 'datos', 'regresiones')
ligas = ['Alemania','Espana','Francia', 'Inglaterra','Italia', 'All']
outputdir = os.path.join(os.path.pardir,'resultados','aleatoriedad-inicio')
dfs_resumen_elo = ANOVA1ViaIndicadores(datadir,
                                       ligas,
                                       defin = 'elo',
                                       filtro = filtro)

outputfile = 'aleatoriedad-elo-%s.xlsx' % filtro
writer = pd.ExcelWriter(os.path.join(outputdir, outputfile), engine='xlsxwriter')
for df, sheet in zip(dfs_resumen_elo, ['Facil','Dificil']):
    df.to_excel(writer, sheet_name = sheet, index = False)
writer.save()

CPU times: user 16.8 s, sys: 182 ms, total: 17 s
Wall time: 7.28 s


In [8]:
# First table of the ELO run (the one written to the 'Facil' sheet).
dfs_resumen_elo[0]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Fácil ELO p3 c3,0.238 (N = 270),0.265 (N = 300),0.424 (N = 200),0.08 (N = 504),0.628 (N = 298),0.134 (N = 1572)
1,Frac. Fácil ELO p4 c3,0.121 (N = 270),0.105 (N = 300),0.341 (N = 200),0.532 (N = 504),0.646 (N = 298),0.149 (N = 1572)
2,Frac. Fácil ELO p5 c3,0.067 (N = 270),0.264 (N = 300),0.381 (N = 200),0.743 (N = 504),0.664 (N = 298),0.277 (N = 1572)
3,Frac. Fácil ELO p6 c3,0.24 (N = 270),0.015 (N = 300),0.593 (N = 200),0.79 (N = 504),0.844 (N = 298),0.33 (N = 1572)
4,Frac. Fácil ELO p3 c4,0.398 (N = 270),0.495 (N = 300),0.541 (N = 200),0.237 (N = 504),0.087 (N = 298),0.177 (N = 1572)
5,Frac. Fácil ELO p4 c4,0.309 (N = 270),0.215 (N = 300),0.537 (N = 200),0.793 (N = 504),0.055 (N = 298),0.196 (N = 1572)
6,Frac. Fácil ELO p5 c4,0.095 (N = 270),0.234 (N = 300),0.488 (N = 200),0.946 (N = 504),0.113 (N = 298),0.192 (N = 1572)
7,Frac. Fácil ELO p6 c4,0.406 (N = 270),0.026 (N = 300),0.759 (N = 200),0.989 (N = 504),0.129 (N = 298),0.259 (N = 1572)
8,Frac. Fácil ELO p3 c5,0.222 (N = 270),0.164 (N = 300),0.472 (N = 200),0.405 (N = 504),0.229 (N = 298),0.124 (N = 1572)
9,Frac. Fácil ELO p4 c5,0.365 (N = 270),0.012 (N = 300),0.507 (N = 200),0.632 (N = 504),0.19 (N = 298),0.071 (N = 1572)


In [9]:
# Second table of the ELO run (the one written to the 'Dificil' sheet).
dfs_resumen_elo[1]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Difícil ELO p3 c3,0.848 (N = 225),0.037 (N = 255),0.592 (N = 170),0.647 (N = 429),0.742 (N = 253),0.595 (N = 1332)
1,Frac. Difícil ELO p4 c3,0.951 (N = 225),0.082 (N = 255),0.366 (N = 170),0.953 (N = 429),0.445 (N = 253),0.792 (N = 1332)
2,Frac. Difícil ELO p5 c3,0.58 (N = 225),0.036 (N = 255),0.439 (N = 170),0.405 (N = 429),0.068 (N = 253),0.072 (N = 1332)
3,Frac. Difícil ELO p6 c3,0.628 (N = 225),0.041 (N = 255),0.152 (N = 170),0.136 (N = 429),0.353 (N = 253),0.044 (N = 1332)
4,Frac. Difícil ELO p3 c4,0.067 (N = 210),0.044 (N = 240),0.235 (N = 160),0.763 (N = 404),0.938 (N = 238),0.345 (N = 1252)
5,Frac. Difícil ELO p4 c4,0.536 (N = 210),0.043 (N = 240),0.1 (N = 160),0.948 (N = 404),0.732 (N = 238),0.603 (N = 1252)
6,Frac. Difícil ELO p5 c4,0.122 (N = 210),0.056 (N = 240),0.137 (N = 160),0.43 (N = 404),0.153 (N = 238),0.033 (N = 1252)
7,Frac. Difícil ELO p6 c4,0.104 (N = 210),0.013 (N = 240),0.134 (N = 160),0.202 (N = 404),0.649 (N = 238),0.027 (N = 1252)
8,Frac. Difícil ELO p3 c5,0.09 (N = 195),0.066 (N = 225),0.611 (N = 150),0.316 (N = 379),0.877 (N = 223),0.298 (N = 1172)
9,Frac. Difícil ELO p4 c5,0.45 (N = 195),0.032 (N = 225),0.505 (N = 150),0.717 (N = 379),0.761 (N = 223),0.537 (N = 1172)


## Aleatoriedad para indicadores según definición general

In [10]:
# ANOVA of the indicators built with the 'general' difficulty definition.
# Relies on `filtro` and `outputdir` defined in earlier cells.
datadir = os.path.join(os.path.pardir, 'datos', 'regresiones')
ligas = ['Alemania','Espana','Francia', 'Inglaterra','Italia', 'All']
dfs_resumen_general = ANOVA1ViaIndicadores(datadir,
                                           ligas,
                                           defin = 'general',
                                           filtro = filtro)
outputfile = 'aleatoriedad-general-%s.xlsx' % filtro
# Context manager instead of writer.save(): save() was removed in pandas 2.0,
# and the with-block also closes the file if writing a sheet fails.
with pd.ExcelWriter(os.path.join(outputdir, outputfile), engine='xlsxwriter') as writer:
    for df, sheet in zip(dfs_resumen_general, ['Facil','Dificil']):
        df.to_excel(writer, sheet_name = sheet, index = False)

In [11]:
# First table of the 'general' run (the one written to the 'Facil' sheet).
dfs_resumen_general[0]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Fácil Gral. h1 p3 c3,0.785 (N = 270),0.759 (N = 300),0.131 (N = 200),0.583 (N = 504),0.549 (N = 298),0.651 (N = 1572)
1,Frac. Fácil Gral. h1 p4 c3,0.603 (N = 270),0.562 (N = 300),0.403 (N = 200),0.695 (N = 504),0.481 (N = 298),0.686 (N = 1572)
2,Frac. Fácil Gral. h1 p5 c3,0.248 (N = 270),0.341 (N = 300),0.619 (N = 200),0.876 (N = 504),0.306 (N = 298),0.57 (N = 1572)
3,Frac. Fácil Gral. h1 p6 c3,0.563 (N = 270),0.198 (N = 300),0.871 (N = 200),0.986 (N = 504),0.412 (N = 298),0.874 (N = 1572)
4,Frac. Fácil Gral. h1 p3 c4,0.657 (N = 270),0.576 (N = 300),0.219 (N = 200),0.631 (N = 504),0.398 (N = 298),0.549 (N = 1572)
5,Frac. Fácil Gral. h1 p4 c4,0.3 (N = 270),0.417 (N = 300),0.674 (N = 200),0.801 (N = 504),0.3 (N = 298),0.609 (N = 1572)
6,Frac. Fácil Gral. h1 p5 c4,0.182 (N = 270),0.22 (N = 300),0.695 (N = 200),0.871 (N = 504),0.081 (N = 298),0.342 (N = 1572)
7,Frac. Fácil Gral. h1 p6 c4,0.704 (N = 270),0.054 (N = 300),0.869 (N = 200),0.981 (N = 504),0.171 (N = 298),0.675 (N = 1572)
8,Frac. Fácil Gral. h1 p3 c5,0.577 (N = 270),0.188 (N = 300),0.292 (N = 200),0.33 (N = 504),0.342 (N = 298),0.219 (N = 1572)
9,Frac. Fácil Gral. h1 p4 c5,0.73 (N = 270),0.042 (N = 300),0.679 (N = 200),0.421 (N = 504),0.149 (N = 298),0.21 (N = 1572)


In [12]:
# Second table of the 'general' run (the one written to the 'Dificil' sheet).
dfs_resumen_general[1]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Difícil Gral. h1 p3 c3,0.77 (N = 225),0.008 (N = 255),0.458 (N = 170),0.772 (N = 427),0.743 (N = 253),0.426 (N = 1330)
1,Frac. Difícil Gral. h1 p4 c3,0.736 (N = 225),0.068 (N = 255),0.526 (N = 170),0.981 (N = 427),0.389 (N = 253),0.711 (N = 1330)
2,Frac. Difícil Gral. h1 p5 c3,0.488 (N = 225),0.037 (N = 255),0.726 (N = 170),0.426 (N = 427),0.048 (N = 253),0.096 (N = 1330)
3,Frac. Difícil Gral. h1 p6 c3,0.429 (N = 225),0.037 (N = 255),0.693 (N = 170),0.193 (N = 427),0.156 (N = 253),0.072 (N = 1330)
4,Frac. Difícil Gral. h1 p3 c4,0.288 (N = 210),0.181 (N = 240),0.429 (N = 160),0.942 (N = 402),0.95 (N = 238),0.828 (N = 1250)
5,Frac. Difícil Gral. h1 p4 c4,0.452 (N = 210),0.157 (N = 240),0.455 (N = 160),0.995 (N = 402),0.86 (N = 238),0.9 (N = 1250)
6,Frac. Difícil Gral. h1 p5 c4,0.172 (N = 210),0.305 (N = 240),0.535 (N = 160),0.639 (N = 402),0.567 (N = 238),0.387 (N = 1250)
7,Frac. Difícil Gral. h1 p6 c4,0.422 (N = 210),0.176 (N = 240),0.549 (N = 160),0.517 (N = 402),0.742 (N = 238),0.467 (N = 1250)
8,Frac. Difícil Gral. h1 p3 c5,0.473 (N = 195),0.036 (N = 225),0.641 (N = 150),0.699 (N = 377),0.825 (N = 223),0.536 (N = 1170)
9,Frac. Difícil Gral. h1 p4 c5,0.429 (N = 195),0.008 (N = 225),0.833 (N = 150),0.846 (N = 377),0.803 (N = 223),0.627 (N = 1170)


In [13]:
# ANOVA of the indicators built with the 'lv' definition (columns containing
# 'lv', 'local' or 'visita' are read). Relies on `filtro` and `outputdir`
# defined in earlier cells.
datadir = os.path.join(os.path.pardir, 'datos', 'regresiones')
ligas = ['Alemania','Espana','Francia', 'Inglaterra','Italia', 'All']
dfs_resumen_lv = ANOVA1ViaIndicadores(datadir,
                                      ligas,
                                      defin = 'lv',
                                      filtro = filtro)

outputfile = 'aleatoriedad-localvisita-%s.xlsx' % filtro
# Context manager instead of writer.save(): save() was removed in pandas 2.0,
# and the with-block also closes the file if writing a sheet fails.
with pd.ExcelWriter(os.path.join(outputdir, outputfile), engine='xlsxwriter') as writer:
    for df, sheet in zip(dfs_resumen_lv, ['Facil','Dificil']):
        df.to_excel(writer, sheet_name = sheet, index = False)

In [14]:
# First table of the 'lv' run (the one written to the 'Facil' sheet).
dfs_resumen_lv[0]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Fácil LV h1 p3 c3,0.05 (N = 270),0.833 (N = 300),0.015 (N = 200),0.164 (N = 504),0.611 (N = 298),0.106 (N = 1572)
1,Frac. Fácil LV h1 p4 c3,0.014 (N = 270),0.324 (N = 300),0.111 (N = 200),0.095 (N = 504),0.811 (N = 298),0.035 (N = 1572)
2,Frac. Fácil LV h1 p5 c3,0.061 (N = 270),0.131 (N = 300),0.217 (N = 200),0.303 (N = 504),0.705 (N = 298),0.087 (N = 1572)
3,Frac. Fácil LV h1 p6 c3,0.254 (N = 270),0.095 (N = 300),0.444 (N = 200),0.254 (N = 504),0.598 (N = 298),0.154 (N = 1572)
4,Frac. Fácil LV h1 p3 c4,0.666 (N = 270),0.532 (N = 300),0.244 (N = 200),0.14 (N = 504),0.483 (N = 298),0.337 (N = 1572)
5,Frac. Fácil LV h1 p4 c4,0.37 (N = 270),0.131 (N = 300),0.389 (N = 200),0.049 (N = 504),0.782 (N = 298),0.144 (N = 1572)
6,Frac. Fácil LV h1 p5 c4,0.521 (N = 270),0.044 (N = 300),0.183 (N = 200),0.094 (N = 504),0.612 (N = 298),0.081 (N = 1572)
7,Frac. Fácil LV h1 p6 c4,0.653 (N = 270),0.033 (N = 300),0.488 (N = 200),0.203 (N = 504),0.545 (N = 298),0.195 (N = 1572)
8,Frac. Fácil LV h1 p3 c5,0.764 (N = 270),0.682 (N = 300),0.326 (N = 200),0.147 (N = 504),0.402 (N = 298),0.456 (N = 1572)
9,Frac. Fácil LV h1 p4 c5,0.726 (N = 270),0.147 (N = 300),0.43 (N = 200),0.141 (N = 504),0.447 (N = 298),0.244 (N = 1572)


In [15]:
# Second table of the 'lv' run (the one written to the 'Dificil' sheet).
dfs_resumen_lv[1]

Unnamed: 0,Indicador,Alemania,Espana,Francia,Inglaterra,Italia,All
0,Frac. Difícil LV h1 p3 c3,0.521 (N = 204),0.533 (N = 242),0.684 (N = 157),0.952 (N = 399),0.764 (N = 241),0.947 (N = 1243)
1,Frac. Difícil LV h1 p4 c3,0.415 (N = 204),0.683 (N = 242),0.492 (N = 157),0.947 (N = 399),0.389 (N = 241),0.854 (N = 1243)
2,Frac. Difícil LV h1 p5 c3,0.871 (N = 204),0.32 (N = 242),0.446 (N = 157),0.844 (N = 399),0.027 (N = 241),0.472 (N = 1243)
3,Frac. Difícil LV h1 p6 c3,0.899 (N = 204),0.158 (N = 242),0.619 (N = 157),0.604 (N = 399),0.072 (N = 241),0.454 (N = 1243)
4,Frac. Difícil LV h1 p3 c4,0.859 (N = 187),0.097 (N = 218),0.769 (N = 145),0.935 (N = 368),0.945 (N = 219),0.93 (N = 1137)
5,Frac. Difícil LV h1 p4 c4,0.906 (N = 187),0.034 (N = 218),0.649 (N = 145),0.868 (N = 368),0.812 (N = 219),0.725 (N = 1137)
6,Frac. Difícil LV h1 p5 c4,0.525 (N = 187),0.07 (N = 218),0.455 (N = 145),0.473 (N = 368),0.274 (N = 219),0.136 (N = 1137)
7,Frac. Difícil LV h1 p6 c4,0.616 (N = 187),0.13 (N = 218),0.35 (N = 145),0.491 (N = 368),0.763 (N = 219),0.341 (N = 1137)
8,Frac. Difícil LV h1 p3 c5,0.792 (N = 170),0.598 (N = 197),0.485 (N = 131),0.981 (N = 336),0.702 (N = 203),0.964 (N = 1037)
9,Frac. Difícil LV h1 p4 c5,0.878 (N = 170),0.446 (N = 197),0.531 (N = 131),0.826 (N = 336),0.586 (N = 203),0.862 (N = 1037)


In [16]:
# NOTE(review): scratch check that an empty string has length 0; presumably
# left over from verifying the `len(filtro) > 0` guard in ANOVA1ViaIndicadores.
# Candidate for removal from the final notebook.
len("")

0