# Concordancia definiciones dificultad

In [1]:
import numpy as np
import pandas as pd
import os
import datetime
from time import time
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats

In [2]:
def _ordenar_categorias(df, columnas):
    # Cast each 'categoria' column to a categorical whose levels are ordered
    # regular, facil, dificil. The result is assigned back because the
    # `inplace` argument of `reorder_categories` no longer exists in pandas 2.
    for col in columnas:
        if 'categoria' in col:
            df[col] = df[col].astype('category').cat.reorder_categories(
                ['regular', 'facil', 'dificil'])
    return df


def LeerBasesRegresion(datadir, ligas, column_condition):
    """Load one Excel workbook per league and prepare its category columns.

    Parameters
    ----------
    datadir : str
        Directory that holds the ``<liga>.xlsx`` files.
    ligas : list of str
        League names; each one is read from ``datadir/<liga>.xlsx``.
    column_condition : callable
        Predicate applied to every column name of the first workbook; the
        columns for which it returns a truthy value are the regressors.

    Returns
    -------
    tuple
        ``(dfs, columns_filter, dependants)`` where ``dfs`` is the list of
        loaded DataFrames (in the order of ``ligas``), ``columns_filter`` the
        selected regressor columns and ``dependants`` the columns whose name
        contains ``'position'`` or ``'points'``.

    Notes
    -----
    Only the first league's frame is trimmed to
    ``['equipo', 'Torneo'] + dependants + columns_filter``; later frames keep
    all their columns. If a later league cannot be read or prepared, a
    ``RuntimeWarning`` is emitted and loading stops there, so ``dfs`` may be
    shorter than ``ligas``.
    """
    import warnings

    # The first workbook defines which columns are regressors / outcomes.
    archivo = ligas[0] + '.xlsx'
    df = pd.read_excel(os.path.join(datadir, archivo))
    columns_filter = [i for i in df.columns if column_condition(i)]
    dependants = [i for i in df.columns if 'position' in i or 'points' in i]
    columns_to_keep = ['equipo','Torneo'] + dependants + columns_filter
    # Copy so the column assignments below do not write into a slice of the
    # frame returned by read_excel.
    df = df[columns_to_keep].copy()
    dfs = [_ordenar_categorias(df, columns_filter)]

    for liga in ligas[1:]:
        archivo = liga + '.xlsx'
        try:
            df = pd.read_excel(os.path.join(datadir, archivo))
            df = _ordenar_categorias(df, columns_filter)
        except (OSError, KeyError, ValueError) as err:
            # Keep the best-effort behaviour (return what was loaded so far)
            # but make the failure visible instead of swallowing it.
            warnings.warn(
                "Could not load league %r from %r (%s: %s); "
                "stopping after %d league(s)."
                % (liga, archivo, type(err).__name__, err, len(dfs)),
                RuntimeWarning,
                stacklevel=2)
            break
        dfs.append(df)
    return dfs, columns_filter, dependants

In [3]:
def significancia_variables(mod,var,replace_spaces = True, std_err = True, stars = True):
    """Format one coefficient of a fitted model as a table cell.

    The cell is the coefficient with three decimals, optionally followed by
    its standard error in parentheses and by significance stars. ``~`` is
    used as a padding placeholder (one in front of positive coefficients,
    and to fill the star field up to three characters); when
    ``replace_spaces`` is true every ``~`` is turned into a blank.

    Parameters
    ----------
    mod : object exposing ``params``, ``bse`` and ``pvalues`` indexable by ``var``
    var : key of the coefficient to format
    replace_spaces : bool, replace the ``~`` placeholders with spaces
    std_err : bool, append `` (se)``
    stars : bool, append the three-character significance field

    Returns
    -------
    str
    """
    estimate = mod.params[var]
    coef_txt = "%.3f" % np.round(estimate, decimals = 3)
    se_txt = "%.3f" % np.round(mod.bse[var], decimals = 3)

    # Positive estimates get a leading placeholder so they line up with the
    # minus sign of negative ones.
    pieces = ['~' + coef_txt if estimate > 0 else coef_txt]

    if std_err:
        pieces.append(' (' + se_txt + ')')

    if stars:
        # The p-value is rounded to three decimals *before* it is compared
        # against the thresholds.
        p_value = np.round(mod.pvalues[var], 3)
        if p_value < 0.01:
            marker = '***'
        elif p_value < 0.05:
            marker = '**~'
        elif p_value < 0.1:
            marker = '*~~'
        else:
            marker = '~~~'
        pieces.append(marker)

    cell = ''.join(pieces)
    return cell.replace("~", " ") if replace_spaces else cell

In [4]:
def RegresionesConcordancia(datadir, ligas, outputdir, outputfile , column_condition):
    """Run the concordance regressions and export one summary sheet per outcome.

    For every dependent variable returned by ``LeerBasesRegresion`` and every
    league, one OLS model ``y ~ C(var) + equipo`` is fitted per selected
    category column, with standard errors clustered on ``equipo``. The
    ``facil`` and ``dificil`` coefficients are formatted with
    ``significancia_variables`` (coefficient plus stars, no standard error).

    Parameters
    ----------
    datadir : str
        Directory holding the per-league ``.xlsx`` inputs.
    ligas : list of str
        League names; each becomes a column of every summary table.
    outputdir : str
        Directory where the workbook is written (created if missing).
    outputfile : str
        File name of the output workbook.
    column_condition : callable
        Predicate on column names that selects the category regressors.

    Returns
    -------
    list of pandas.DataFrame
        One summary table per dependent variable, in the same order as the
        sheets written to the workbook.

    Raises
    ------
    IndexError
        If fewer DataFrames than leagues were loaded.
    """
    # Patterns are applied as regular expressions to the row labels.
    dict_replaces = {'_mean': '',
                     '_':' ',
                     'T.' : '',
                     'dificil':'Difícil',
                     'dificl' : 'Difícil',
                     'facil': 'Fácil',
                     'regular' : 'Regular',
                     'prom': 'Promedio',
                     'posicion': 'Posición',
                     'primer' : 'Primer',
                     'ronda' : 'Ronda',
                     'gral': 'Gral.',
                     'elo' : 'ELO',
                     'categoria': 'Categoría',
                     'local': 'Local',
                     'visita': 'Visita',
                     'general' : 'Gral.',
                     'corr' : 'Correg.',
                     'perc' : 'Frac.',
                     ' h' : ' Ant. ',
                     ' p': ' Primeros ',
                     ' c': ' Corte ',
                     'lv' : 'LV',
                     'Primer Fácil C': 'Primer Fácil LV C',
                     'Primer Difícil C': 'Primer Difícil LV C'}
    dfs, columns_filter, dependants = LeerBasesRegresion(datadir,
                                             ligas,
                                             column_condition)
    if len(dfs) < len(ligas):
        # Fail up front with a clear message instead of part-way through the
        # regressions when dfs[i] runs past the end.
        raise IndexError('Only %d of %d leagues could be loaded from %r'
                         % (len(dfs), len(ligas), datadir))

    # Row labels do not depend on the outcome, so build them once.
    categories = []
    for var in columns_filter:
        categories.append('%s[T.facil]' % var)
        categories.append('%s[T.dificil]' % var)

    dfs_resumen = []
    for y in dependants:
        dictresumen_l = {'Categoria' : list(categories)}
        for liga, datos in zip(ligas, dfs):
            resultados = []
            for var in columns_filter:
                formula = '%s ~ C(%s) + equipo' % (y, var)
                # 'equipo' is both a regressor and the clustering group.
                mod = ols(formula = formula, data = datos).fit(cov_type = 'cluster',
                                                               cov_kwds={'groups': datos['equipo']})
                # Same order as the row labels: facil first, then dificil.
                for nivel in ('facil', 'dificil'):
                    resultados.append(significancia_variables(mod,
                                                              'C(%s)[T.%s]' % (var, nivel),
                                                              replace_spaces=True,
                                                              std_err = False,
                                                              stars = True))
            dictresumen_l[liga] = resultados
        df = pd.DataFrame(data = dictresumen_l)
        df['Categoria'] = df['Categoria'].replace(to_replace = dict_replaces, regex=True)
        dfs_resumen.append(df)

    # Write one sheet per dependent variable. The context manager saves and
    # closes the workbook; ExcelWriter.save() no longer exists in pandas 2.
    if outputdir:
        os.makedirs(outputdir, exist_ok=True)
    with pd.ExcelWriter(os.path.join(outputdir, outputfile), engine='xlsxwriter') as writer:
        for df, sheet in zip(dfs_resumen, dependants):
            df.to_excel(writer, sheet_name = sheet, index = False)
    return dfs_resumen

## Concordancia definición ELO

In [5]:
# ELO-based difficulty definition: inputs are read from ../datos/regresiones
# and the summary workbook is written to ../resultados/concordancia-dificultad.
datadir = os.path.join(os.pardir, 'datos', 'regresiones')
outputdir = os.path.join(os.pardir, 'resultados', 'concordancia-dificultad')
outputfile = 'concordancia-elo.xlsx'
ligas = ['Alemania', 'Espana', 'Francia', 'Inglaterra', 'Italia']
# Regressors: every column whose name contains both 'categoria' and 'elo'.
dfs_resumen_elo = RegresionesConcordancia(
    datadir=datadir,
    ligas=ligas,
    outputdir=outputdir,
    outputfile=outputfile,
    column_condition=lambda columna: 'elo' in columna and 'categoria' in columna,
)

## Concordancia definición General

In [6]:
# "General" difficulty definition: inputs are read from ../datos/regresiones
# and the summary workbook is written to ../resultados/concordancia-dificultad.
datadir = os.path.join(os.pardir, 'datos', 'regresiones')
outputdir = os.path.join(os.pardir, 'resultados', 'concordancia-dificultad')
outputfile = 'concordancia-general.xlsx'
ligas = ['Alemania', 'Espana', 'Francia', 'Inglaterra', 'Italia']
# Regressors: every column whose name contains both 'categoria' and 'general'.
dfs_resumen_general = RegresionesConcordancia(
    datadir=datadir,
    ligas=ligas,
    outputdir=outputdir,
    outputfile=outputfile,
    column_condition=lambda columna: 'general' in columna and 'categoria' in columna,
)

## Concordancia definición Local

In [7]:
# "Local" difficulty definition: inputs are read from ../datos/regresiones
# and the summary workbook is written to ../resultados/concordancia-dificultad.
datadir = os.path.join(os.pardir, 'datos', 'regresiones')
outputdir = os.path.join(os.pardir, 'resultados', 'concordancia-dificultad')
outputfile = 'concordancia-local.xlsx'
ligas = ['Alemania', 'Espana', 'Francia', 'Inglaterra', 'Italia']
# Regressors: every column whose name contains both 'categoria' and 'local'.
dfs_resumen_local = RegresionesConcordancia(
    datadir=datadir,
    ligas=ligas,
    outputdir=outputdir,
    outputfile=outputfile,
    column_condition=lambda columna: 'local' in columna and 'categoria' in columna,
)

## Concordancia definición Visita

In [8]:
# "Visita" difficulty definition: inputs are read from ../datos/regresiones
# and the summary workbook is written to ../resultados/concordancia-dificultad.
datadir = os.path.join(os.pardir, 'datos', 'regresiones')
outputdir = os.path.join(os.pardir, 'resultados', 'concordancia-dificultad')
outputfile = 'concordancia-visita.xlsx'
ligas = ['Alemania', 'Espana', 'Francia', 'Inglaterra', 'Italia']
# Regressors: every column whose name contains both 'categoria' and 'visita'.
dfs_resumen_visita = RegresionesConcordancia(
    datadir=datadir,
    ligas=ligas,
    outputdir=outputdir,
    outputfile=outputfile,
    column_condition=lambda columna: 'visita' in columna and 'categoria' in columna,
)