# Regresiones 1 Variable

In [1]:
import numpy as np
import pandas as pd
import os
import datetime
from time import time
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats

import matplotlib
import matplotlib.pyplot as plt

## Familias de indicadores
- ``elo``: definición ELO
- ``general``: definición general
- ``lv``: definición LV

Se reportarán los valores (y la significancia) del coeficiente de regresión asociado al indicador de interés.

In [2]:
def _categorizar(df, columnas):
    """Turn every column of ``columnas`` whose name contains 'categoria' into a categorical.

    The categories are reordered to ['regular', 'facil', 'dificil'].  The
    result is assigned back to the frame instead of relying on
    ``reorder_categories(inplace=True)``, which recent pandas releases no
    longer accept.
    """
    for col in columnas:
        if 'categoria' in col:
            df[col] = (df[col].astype('category')
                              .cat.reorder_categories(['regular', 'facil', 'dificil']))
    return df


def LeerBasesRegresion(datadir, ligas, column_condition):
    """Read one regression workbook per league.

    Each league ``liga`` is read from ``<datadir>/<liga>.xlsx``.

    Parameters
    ----------
    datadir : str
        Folder containing the workbooks.
    ligas : list of str
        League names; the first one must exist and defines the column lists.
    column_condition : callable
        Predicate applied to each column name of the first workbook.

    Returns
    -------
    dfs : list of DataFrame
        One frame per league that could be read, in the order of ``ligas``.
        Only the first frame is trimmed to 'equipo', 'Torneo', the dependent
        columns and the filtered columns; later frames keep all their columns.
    columns_filter : list of str
        Columns of the first workbook accepted by ``column_condition``.
    dependants : list of str
        Columns of the first workbook whose name contains 'position' or 'points'.

    Notes
    -----
    Reading stops at the first league (after the first) whose file does not
    exist, so a pseudo-league without a workbook (e.g. a trailing 'All') can be
    listed.  Stopping, rather than skipping, keeps ``dfs[i]`` aligned with
    ``ligas[i]``.  Any other error (missing column, unexpected category
    labels, ...) is no longer swallowed and propagates to the caller.
    """
    # The first league defines which columns are used by every caller.
    df = pd.read_excel(os.path.join(datadir, ligas[0] + '.xlsx'))
    columns_filter = [i for i in df.columns if column_condition(i)]
    dependants = [i for i in df.columns if 'position' in i or 'points' in i]
    columns_to_keep = ['equipo', 'Torneo'] + dependants + columns_filter
    # Copy so the categorical assignment writes to an independent frame, not a slice.
    df = df[columns_to_keep].copy()
    dfs = [_categorizar(df, columns_filter)]

    for liga in ligas[1:]:
        try:
            df = pd.read_excel(os.path.join(datadir, liga + '.xlsx'))
        except FileNotFoundError:
            # Best effort: leagues without a workbook end the list.
            break
        dfs.append(_categorizar(df, columns_filter))
    return dfs, columns_filter, dependants

In [3]:
def FiltrarRegulares(df, defin, c, h):
    if defin == 'elo':
        col = 'categoria_%s_c%s' % (defin, c)
        df_filter = df[df[col] == 'regular']
    elif defin == 'general':
        col = 'categoria_%s_h%s_c%s' % (defin, h, c)
        df_filter = df[df[col] == 'regular']
    else:
        col1 = 'categoria_local_h%s_c%s' % (h,c)
        col2 = 'categoria_visita_h%s_c%s' % (h,c)
        df_filter = df[(df[col1] == 'regular') & 
                       (df[col2] == 'regular')]
    return df_filter

In [4]:
def significancia_variables(mod, var, replace_spaces=True, std_err=True):
    """Format one coefficient of a fitted model as text with significance stars.

    Parameters
    ----------
    mod : object
        Fitted model exposing ``params``, ``bse`` and ``pvalues`` indexable by
        variable name.
    var : str
        Name of the coefficient to report.
    replace_spaces : bool, default True
        If True, the '~' padding placeholders are turned into spaces.
    std_err : bool, default True
        If True, the standard error is appended in parentheses.

    Returns
    -------
    str
        ``<coef>[ (<se>)]<stars>`` with three decimals.  Positive coefficients
        get one leading pad character (where a minus sign would go) and the
        star field is always padded to three characters, so values line up in
        a column.  Stars: '***' for p < 0.01, '**' for p < 0.05, '*' for
        p < 0.1.

    Notes
    -----
    The thresholds are tested on the unrounded p-value.  Rounding to three
    decimals first would push values such as 0.0096 up to 0.010 and drop a
    star.
    """
    coef = "%.3f" % (np.round(mod.params[var], decimals=3))
    # '~' stands in for the sign position of positive numbers.
    output = '~' + coef if mod.params[var] > 0 else coef
    if std_err:
        se = "%.3f" % (np.round(mod.bse[var], decimals=3))
        output = output + ' (' + se + ')'

    pval = mod.pvalues[var]
    if pval < 0.01:
        output = output + '***'
    elif pval < 0.05:
        output = output + '**~'
    elif pval < 0.1:
        output = output + '*~~'
    else:
        output = output + '~~~'

    if replace_spaces:
        output = output.replace("~", " ")
    return output

In [5]:
def _parse_indicador(var, defin):
    """Split an indicator column name into the values after ``_h``, ``_p`` and ``_c``.

    Returns ``(h, r, c)`` as strings.  ``h`` is None for the 'elo' definition
    (its names carry no ``_h`` part) and ``r`` is None when an 'elo' name has
    no ``_p`` part.
    """
    if defin != 'elo':
        aux = var.split('_h')[1].split('_p')
        h = aux[0]
        partes = aux[1].split('_c')
        r, c = partes[0], partes[1]
    else:
        h = None
        c = var.split('_c')[1]
        r = var.rsplit('_p', 1)[1].split('_c')[0] if '_p' in var else None
    return h, r, c


def _formulas_1var(y, var, r, ratio_lv):
    """Build the four formulas (base, team FE, tournament FE, both) for one indicator."""
    if (ratio_lv or '%s' in y) and r is None:
        raise ValueError("cannot read the '_p' value from indicator %r" % var)
    dependiente = y % r if '%s' in y else y
    ratio = ' + ratio_lv_p%s' % r if ratio_lv else ''
    efectos = ['', ' + equipo', ' + Torneo', ' + equipo + Torneo']
    return [dependiente + ' ~ ' + var + ef + ratio for ef in efectos]


def ConstructorTablaCoeficientes1VarLR(datadir,
                                       ligas,
                                       outputdir,
                                       outputfile,
                                       ratio_lv,
                                       defin):
    """Fit the one-indicator regressions and write one workbook per league.

    For every league, dependent variable and 'perc' indicator column, four
    OLS models are fitted on the rows that ``FiltrarRegulares`` keeps: base,
    with 'equipo' effects, with 'Torneo' effects, and with both.  Standard
    errors are clustered by 'equipo'.  Each cell of the resulting table is
    the indicator coefficient formatted by ``significancia_variables``
    (without standard error).

    Parameters
    ----------
    datadir : str
        Folder with the league workbooks (see ``LeerBasesRegresion``).
    ligas : list of str
        League names; 'All' means the concatenation of every frame read.
    outputdir : str
        Folder for the result workbooks; created if it does not exist.
    outputfile : str
        File name template with one '%s', filled with the lower-cased league.
    ratio_lv : bool
        If True, ``ratio_lv_p<r>`` is added as a control, where ``r`` is read
        from the ``_p`` part of the indicator name.
    defin : str
        Indicator family: 'elo', 'general' or 'lv'.

    Returns
    -------
    list of DataFrame
        The coefficient tables (one per dependent variable) of the LAST league
        processed; tables of the other leagues are only written to disk.
    """
    columnas = ['Base', 'EF Equipo', 'EF Torneo', 'EF Equipo y Torneo']

    if defin != 'lv':
        def column_condition(x):
            return defin in x
    else:
        def column_condition(x):
            return defin in x or 'local' in x or 'visita' in x

    dfs, columns_filter, dependants = LeerBasesRegresion(datadir, ligas, column_condition)
    columns_filter = [c for c in columns_filter if 'perc' in c]

    # Human-readable row labels; replacements are applied in dictionary order.
    columns_filter_title = list(columns_filter)
    dict_replaces = {'_mean': '',
                     '_':' ',
                     'dificil':'Difícil',
                     'dificl' : 'Difícil',
                     'facil': 'Fácil',
                     'regular' : 'Regular',
                     'prom': 'Promedio',
                     'posicion': 'Posición',
                     'primer' : 'Primer',
                     'ronda' : 'Ronda',
                     'gral': 'Gral.',
                     'categoria': 'Categoría',
                     'local': 'Local',
                     'visita': 'Visita',
                     'general' : 'Gral.',
                     'corr' : 'Correg.',
                     'perc' : 'Frac. ',
                     ' p': ' Primeros ',
                     ' c': ' Corte ',
                     ' h' : ' Ant. ',
                     'lv' : ' LV',
                     'elo' : 'ELO'}
    for key, value in dict_replaces.items():
        columns_filter_title = [i.replace(key, value) for i in columns_filter_title]

    os.makedirs(outputdir, exist_ok=True)

    dfs_coefs = []
    for l, liga in enumerate(ligas):
        if liga != 'All':
            df_liga = dfs[l]
        else:
            # Frames may have different column sets; the column order is irrelevant
            # for formula-based fits, so skip sorting (and its FutureWarning).
            df_liga = pd.concat(dfs, ignore_index=True, sort=False)
        outputfile_ = outputfile % liga.lower()

        dfs_coefs = []
        for y in dependants:
            coefs = [[] for _ in columnas]
            for var in columns_filter:
                # The sample only depends on the indicator, not on the formula.
                h, r, c = _parse_indicador(var, defin)
                df_reg = FiltrarRegulares(df_liga, defin, c, h)
                for i, formula_ in enumerate(_formulas_1var(y, var, r, ratio_lv)):
                    mod = ols(formula=formula_, data=df_reg).fit(
                        cov_type='cluster',
                        cov_kwds={'groups': df_reg['equipo']})
                    coefs[i].append(significancia_variables(mod, var,
                                                            replace_spaces=True,
                                                            std_err=False))
            df_coef = pd.DataFrame(dict(zip(columnas, coefs)),
                                   columns=columnas,
                                   index=pd.Index(columns_filter_title, name='Indicador'))
            dfs_coefs.append(df_coef)

        # The context manager saves and closes the workbook (ExcelWriter.save()
        # is not available in recent pandas).
        with pd.ExcelWriter(os.path.join(outputdir, outputfile_), engine='xlsxwriter') as writer:
            for df_coef, sheet in zip(dfs_coefs, dependants):
                df_coef.to_excel(writer, sheet_name=sheet)
    return dfs_coefs

In [6]:
def ConstructorCoeficientes1VarLR(df_reg,
                                  variable,
                                  formula,
                                  conf = 0.05,
                                  ratio_lv = True,
                                  rs = (4, 5, 6, 7),
                                  ls = (3, 4, 5)):
    """Build the data frame of coefficients and confidence bounds used for plotting.

    One OLS model (standard errors clustered by 'equipo') is fitted for every
    pair ``(r, l)`` in ``rs`` x ``ls``.

    Parameters
    ----------
    df_reg : DataFrame
        Regression sample; must contain 'equipo' and every column named in the
        generated formulas.
    variable : str
        Indicator name template with two '%s', filled with ``(r, l)``.
    formula : str
        Formula template.  With one '%s' it receives the indicator name; with
        more than one it receives ``(r, indicator name)``.
    conf : float, default 0.05
        ``alpha`` passed to ``conf_int``.
    ratio_lv : bool, default True
        If True, ``' + ratio_lv_p<r>'`` is appended to the formula.
    rs, ls : iterable, optional
        Grids of ``r`` and ``l`` values; the defaults are the values that were
        previously hard-coded.

    Returns
    -------
    DataFrame
        Columns 'LB', 'Beta', 'UB', indexed by the labels ``$r = .., l = ..$``.
    """
    lleva_r = formula.count('%s') > 1
    grupos = df_reg['equipo']

    etiquetas, coefs, lb, ub = [], [], [], []
    for r in rs:
        for l in ls:
            var = variable % (r, l)
            formula_ = formula % ((r, var) if lleva_r else var)
            if ratio_lv:
                formula_ = formula_ + ' + ' + 'ratio_lv_p%s' % r
            mod = ols(formula=formula_, data=df_reg).fit(cov_type='cluster',
                                                         cov_kwds={'groups': grupos})
            ci = mod.conf_int(alpha=conf)
            etiquetas.append("$r = %s, l = %s$" % (r, l))
            coefs.append(mod.params[var])
            lb.append(ci.loc[var, 0])
            ub.append(ci.loc[var, 1])
    return pd.DataFrame(data={'LB': lb, 'Beta': coefs, 'UB': ub}, index=etiquetas)

In [7]:
def GraficoCoeficientes1VarLR(datadir,
                              ligas,
                              variable,
                              plotdir,
                              plotname,
                              conf,
                              ratio_lv,
                              column_condition):
    """Plot the indicator coefficient and its confidence band, one figure per league and outcome.

    Each figure has four stacked panels (base, 'equipo' effects, 'Torneo'
    effects, both).  The coefficients come from
    ``ConstructorCoeficientes1VarLR``.

    Parameters
    ----------
    datadir : str
        Folder with the league workbooks (see ``LeerBasesRegresion``).
    ligas : list of str
        League names; 'All' means the concatenation of every frame read, as
        in ``ConstructorTablaCoeficientes1VarLR``.
    variable : str
        Indicator name template with two '%s' (filled by the helper).
    plotdir : str
        Folder for the figures; created if it does not exist.
    plotname : str
        File name template with two '%s': lower-cased league and dependent variable.
    conf : float
        ``alpha`` of the confidence interval.
    ratio_lv : bool
        Forwarded to ``ConstructorCoeficientes1VarLR``, which appends the
        ``ratio_lv_p<r>`` control itself.
        NOTE(review): the first frame returned by ``LeerBasesRegresion`` only
        keeps columns accepted by ``column_condition``, so the ratio columns
        must pass that predicate when this is True. Confirm against the data.
    column_condition : callable
        Column-name predicate forwarded to ``LeerBasesRegresion``.
    """
    dictreplaces = {'perc': 'Frac. partidos', 'facil': 'fáciles', 'dificil': 'dificiles','_':' ', 'h': 'Ant. '}
    # round() guards against float truncation (e.g. 92.999... becoming 92).
    cint = str(int(round((1 - conf) * 100))) + '%'
    dfs, columns_filter, dependants = LeerBasesRegresion(datadir, ligas, column_condition)
    textformulas = ['Base', 'EF Equipo', 'EF Torneo', 'EF Equipo y Torneo']

    # The indicator part of the title does not depend on league or outcome.
    indicador = variable.split('_p')[0]
    for word, replace in dictreplaces.items():
        indicador = indicador.replace(word, replace)

    os.makedirs(plotdir, exist_ok=True)

    for y in dependants:
        # The ratio control is NOT added here: it depends on r, which only the
        # helper iterates over.
        formulas = [y + ' ~ %s',
                    y + ' ~ %s + equipo',
                    y + ' ~ %s + Torneo',
                    y + ' ~ %s + equipo + Torneo']
        for l, nombre_liga in enumerate(ligas):
            if nombre_liga != 'All':
                df_reg = dfs[l]
            else:
                df_reg = pd.concat(dfs, ignore_index=True, sort=False)
            plotname_ = plotname % (nombre_liga.lower(), y)
            title = '$y_{it}$: ' + y + ', $x_{it}$: ' + indicador + ' - %s' % nombre_liga

            fig, axs = plt.subplots(figsize=(5, 10), nrows=len(formulas), sharex=True)
            plt.suptitle(title)
            for i, (formula, textformula) in enumerate(zip(formulas, textformulas)):
                ax = axs[i]
                df_coef = ConstructorCoeficientes1VarLR(df_reg, variable, formula, conf, ratio_lv)
                x = df_coef.index.tolist()
                ax.plot(x, df_coef.Beta.tolist(), color='g', label='Beta')
                ax.fill_between(x, df_coef.LB.tolist(), df_coef.UB.tolist(),
                                color='grey', label='I.C. %s' % cint)
                ax.axhline(y=0, color='k')
                ax.set_ylabel(r'$\beta$')
                ax.legend()
                ax.set_title(textformula)
                if i == len(formulas) - 1:
                    ax.set_xlabel('Valores de $r$ y $l$')
            fig.tight_layout()
            fig.subplots_adjust(top=0.92)
            plt.xticks(rotation=90)
            fig.savefig(os.path.join(plotdir, '%s' % plotname_), bbox_inches="tight", dpi=250)
            # Release the figure so memory does not grow across iterations.
            plt.close(fig)

## Regresiones 1 Var ELO

In [8]:
# Input folder, output folder and league list; later cells reuse these globals.
datadir = os.path.join(os.path.pardir, 'datos', 'regresiones')
outputdir = os.path.join(os.path.pardir,'resultados','regresiones-1var-regulares')
# 'All' is handled by ConstructorTablaCoeficientes1VarLR as the concatenation of the leagues read.
ligas = ['Alemania','Espana','Francia', 'Inglaterra','Italia', 'All']
# The '%s' is filled with the lower-cased league name.
outputfile = '%s-elo-1var.xlsx'
# Coefficient tables for the ELO indicators, without the ratio_lv control.
dfs_resumen = ConstructorTablaCoeficientes1VarLR(datadir,
                                                 ligas,
                                                 outputdir,
                                                 outputfile,
                                                 ratio_lv = False,
                                                 defin = 'elo')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




## Gráficos 1 Var ELO

In [None]:
# ELO definition, easy ('facil') indicator.
# Relies on datadir and ligas defined in an earlier cell.
plotdir = os.path.join(os.path.pardir,'resultados','regresiones-1var-graficos')
# Placeholders: league name, then dependent variable.
plotname = '%s-elo-1var-facil-%s.png'
# Placeholders are filled with (r, l) by ConstructorCoeficientes1VarLR.
variable = 'perc_facil_elo_p%s_c%s'
GraficoCoeficientes1VarLR(datadir = datadir,
                          ligas = ligas,
                          variable = variable,
                          plotdir = plotdir,
                          plotname = plotname,
                          conf = 0.05,
                          ratio_lv = False,
                          column_condition = lambda x: 'perc' in x and 'elo' in x and 'facil' in x)

In [None]:
# ELO definition, hard ('dificil') indicator.
# Relies on datadir and ligas defined in an earlier cell.
plotdir = os.path.join(os.path.pardir,'resultados','regresiones-1var-graficos')
# Placeholders: league name, then dependent variable.
plotname = '%s-elo-1var-dificil-%s.png'
# Placeholders are filled with (r, l) by ConstructorCoeficientes1VarLR.
variable = 'perc_dificil_elo_p%s_c%s'
GraficoCoeficientes1VarLR(datadir = datadir,
                          ligas = ligas,
                          variable = variable,
                          plotdir = plotdir,
                          plotname = plotname,
                          conf = 0.05,
                          ratio_lv = False,
                          column_condition = lambda x: 'perc' in x and 'elo' in x and 'dificil' in x)

## Regresiones 1 Var General

In [9]:
# Coefficient tables for the 'general' indicators, without the ratio_lv control.
# Reuses datadir, ligas and outputdir from an earlier cell; '%s' is the lower-cased league name.
outputfile = '%s-general-reg-1var.xlsx'
dfs_resumen = ConstructorTablaCoeficientes1VarLR(datadir,
                                                 ligas,
                                                 outputdir,
                                                 outputfile,
                                                 ratio_lv = False,
                                                 defin = 'general')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




## Gráficos 1 Var General

In [None]:
%%time
# 'general' definition, easy ('facil') indicator: one set of plots per h value.
plotdir = os.path.join(os.path.pardir,'resultados','regresiones-1var-graficos')
# Full range of h values, currently disabled in favour of [4,5]:
# for h in [1,2,3,4,5]:
for h in [4,5]:
    # h is fixed here; the remaining '%s' are filled later (league/outcome, and r/l).
    plotname = '%s-general_h' + str(h) + '-1var-facil-%s.png'
    variable = 'perc_facil_general_h' + str(h) + '_p%s_c%s'
    GraficoCoeficientes1VarLR(datadir = datadir,
                              ligas = ligas,
                              variable = variable,
                              plotdir = plotdir,
                              plotname = plotname,
                              conf = 0.05,
                              ratio_lv = False,
                              column_condition = lambda x: 'perc' in x and 'general' in x and 'facil' in x)

In [None]:
%%time
# 'general' definition, hard ('dificil') indicator: one set of plots per h value.
# Reuses plotdir, datadir and ligas from earlier cells.
# Full range of h values, currently disabled in favour of [4,5]:
# for h in [1,2,3,4,5]:
for h in [4,5]:
    # h is fixed here; the remaining '%s' are filled later (league/outcome, and r/l).
    plotname = '%s-general_h' + str(h) + '-1var-dificil-%s.png'
    variable = 'perc_dificil_general_h' + str(h) + '_p%s_c%s'
    GraficoCoeficientes1VarLR(datadir = datadir,
                              ligas = ligas,
                              variable = variable,
                              plotdir = plotdir,
                              plotname = plotname,
                              conf = 0.05,
                              ratio_lv = False,
                              column_condition = lambda x: 'perc' in x and 'general' in x and 'dificil' in x)

## Regresiones 1 Var LV

In [10]:
# Coefficient tables for the 'lv' indicators, without the ratio_lv control.
# Reuses datadir, ligas and outputdir from an earlier cell; '%s' is the lower-cased league name.
outputfile = '%s-localvisita-reg-1var.xlsx'
dfs_resumen = ConstructorTablaCoeficientes1VarLR(datadir,
                                                 ligas,
                                                 outputdir,
                                                 outputfile,
                                                 ratio_lv = False,
                                                 defin = 'lv')

  return np.sqrt(np.diag(self.cov_params()))
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [None]:
%%time
# This cell rebinds the shared globals: ligas no longer contains 'All', and
# outputdir points to a different folder (it is not used by this cell).
plotdir = os.path.join(os.path.pardir,'resultados','regresiones-1var-graficos')
datadir = os.path.join(os.path.pardir, 'datos', 'regresiones')
outputdir = os.path.join(os.path.pardir,'resultados','regresiones-1var')
ligas = ['Alemania','Espana','Francia', 'Inglaterra','Italia']
# 'lv' definition, easy ('facil') indicator: one set of plots per h value.
for h in [1,2,3,4,5]:
    # h is fixed here; the remaining '%s' are filled later (league/outcome, and r/l).
    plotname = '%s-localvisita_h' + str(h) + '-1var-facil-%s.png'
    variable = 'perc_facil_lv_h' + str(h) + '_p%s_c%s'
    GraficoCoeficientes1VarLR(datadir = datadir,
                              ligas = ligas,
                              variable = variable,
                              plotdir = plotdir,
                              plotname = plotname,
                              conf = 0.05,
                              ratio_lv = False,
                              column_condition = lambda x: 'perc' in x and 'lv' in x and 'facil' in x)

In [None]:
%%time
# 'lv' definition, hard ('dificil') indicator: one set of plots per h value.
# Reuses plotdir, datadir and ligas from the previous cell.
for h in [1,2,3,4,5]:
    # h is fixed here; the remaining '%s' are filled later (league/outcome, and r/l).
    plotname = '%s-localvisita_h' + str(h) + '-1var-dificil-%s.png'
    variable = 'perc_dificil_lv_h' + str(h) + '_p%s_c%s'
    GraficoCoeficientes1VarLR(datadir = datadir,
                              ligas = ligas,
                              variable = variable,
                              plotdir = plotdir,
                              plotname = plotname,
                              conf = 0.05,
                              ratio_lv = False,
                              column_condition = lambda x: 'perc' in x and 'lv' in x and 'dificil' in x)