In [1]:
# Librerías de siempre
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
import datetime
from time import time
import statsmodels.api as sm
import itertools
from random import shuffle
from statsmodels.formula.api import ols
from scipy import stats
import statsmodels.formula.api as smf
import warnings
warnings.filterwarnings('ignore')

In [2]:
def significancia_variables(mod,var,replace_spaces = True, std_err = True):
    """Render one coefficient of a fitted model as text with significance stars.

    Parameters
    ----------
    mod : object
        Anything exposing ``params``, ``bse`` and ``pvalues`` that can be
        indexed with ``var``.
    var : hashable
        Key of the coefficient to format.
    replace_spaces : bool, default True
        When True, every ``'~'`` placeholder in the result is turned into a
        plain space before returning.
    std_err : bool, default True
        When True, the standard error is appended as `` (se)``.

    Returns
    -------
    str
        ``[~]coef[ (se)]`` followed by a three-character marker:
        ``***`` (p < 0.01), ``**~`` (p < 0.05), ``*~~`` (p < 0.1) or ``~~~``.
    """
    beta = mod.params[var]
    texto_beta = "%.3f" % np.round(beta, decimals = 3)
    # Looked up unconditionally, exactly like the original, so a missing key
    # fails the same way regardless of `std_err`.
    texto_se = "%.3f" % np.round(mod.bse[var], decimals = 3)

    # Positive values get a leading '~' (presumably a pad matching the width
    # of the minus sign on negative values).
    partes = ['~' + texto_beta if beta > 0 else texto_beta]
    if std_err:
        partes.append(' (' + texto_se + ')')

    # Thresholds are applied to the p-value rounded to three decimals.
    p_redondeado = np.round(mod.pvalues[var], 3)
    for umbral, marca in ((0.01, '***'), (0.05, '**~'), (0.1, '*~~')):
        if p_redondeado < umbral:
            partes.append(marca)
            break
    else:
        partes.append('~~~')

    resultado = ''.join(partes)
    return resultado.replace("~", " ") if replace_spaces else resultado

In [3]:
def significancia_oddsratio(mod,var,replace_spaces = True, std_err = True):
    """Render one coefficient as a percentage change in odds, with stars.

    The coefficient ``b`` is reported as ``(exp(b) - 1) * 100`` followed by a
    percent sign.

    Parameters
    ----------
    mod : object
        Anything exposing ``params``, ``bse`` and ``pvalues`` that can be
        indexed with ``var``.
    var : hashable
        Key of the coefficient to format.
    replace_spaces : bool, default True
        When True, every ``'~'`` placeholder in the result is turned into a
        plain space before returning.
    std_err : bool, default True
        When True, ``mod.bse[var]`` is appended as `` (se)``.
        NOTE(review): this is the standard error of the raw coefficient, not
        of the transformed percentage - confirm that is what is wanted.

    Returns
    -------
    str
        ``[~]value%[ (se)]`` followed by a three-character marker:
        ``***`` (p < 0.01), ``**~`` (p < 0.05), ``*~~`` (p < 0.1) or ``~~~``.
    """
    cambio_pct = (np.exp(mod.params[var]) - 1)*100
    texto_valor = "%.3f" % np.round(cambio_pct, decimals = 3)
    texto_se = "%.3f" % np.round(mod.bse[var], decimals = 3)

    # Positive values are prefixed with the '~' placeholder.
    prefijo = '~' if cambio_pct > 0 else ''
    salida = prefijo + texto_valor + "%"
    if std_err:
        salida += ' (' + texto_se + ')'

    # Thresholds are applied to the p-value rounded to three decimals.
    p_redondeado = np.round(mod.pvalues[var], 3)
    if p_redondeado < 0.01:
        marca = '***'
    elif p_redondeado < 0.05:
        marca = '**~'
    elif p_redondeado < 0.1:
        marca = '*~~'
    else:
        marca = '~~~'
    salida += marca

    if not replace_spaces:
        return salida
    return salida.replace("~", " ")

In [4]:
def _categorias_por_rol(df_reg, excluir, col_equipo, sufijo):
    """Return the team-category table keyed for one side of a match.

    Columns whose name contains ``excluir`` are dropped, ``equipo`` is renamed
    to ``col_equipo`` and every remaining non-key column gets ``sufijo``
    appended to its name.
    """
    otras = [c for c in df_reg.columns
             if excluir not in c and c not in ('equipo', 'Torneo')]
    tabla = df_reg[['Torneo', 'equipo'] + otras].copy()
    tabla.columns = ['Torneo', col_equipo] + [c + sufijo for c in otras]
    return tabla


def ArmarDatosLogit(datadir, regdir, liga):
    """Build the match-level dataset used by the logit models for one league.

    Reads ``<liga>.xlsx`` from ``datadir`` (matches) and from ``regdir``
    (per-team, per-tournament category columns), derives the match outcome
    and attaches the categories of the home and away teams.

    Parameters
    ----------
    datadir : str
        Directory holding the match workbook. It must contain the columns
        ``Torneo``, ``Local``, ``Visita``, ``goles L`` and ``goles V``.
    regdir : str
        Directory holding the category workbook. It must contain ``equipo``,
        ``Torneo`` and one or more columns whose name contains ``categoria``.
    liga : str
        League name; ``.xlsx`` is appended to form the file name.

    Returns
    -------
    pandas.DataFrame
        ``Torneo``, ``Local``, ``Visita``, every ``categoria*`` column
        suffixed ``_l`` (home team) or ``_v`` (away team), and ``outcome``
        (``'local'``, ``'visita'`` or ``'empate'``).

    Raises
    ------
    ValueError
        If a category column does not contain exactly the labels
        ``regular``, ``facil`` and ``dificil`` (raised by
        ``reorder_categories``).
    """
    arch_liga = liga + '.xlsx'
    df = pd.read_excel(os.path.join(datadir, arch_liga))
    df_reg = pd.read_excel(os.path.join(regdir, arch_liga))

    # .copy() so the column assignments below act on independent frames
    # instead of on slices of the frames returned by read_excel.
    df = df[['Torneo', 'Local', 'Visita', 'goles L', 'goles V']].copy()
    cols_categoria = [i for i in df_reg.columns if 'categoria' in i]
    df_reg = df_reg[['equipo', 'Torneo'] + cols_categoria].copy()

    # 'regular' goes first so that it is the first (reference) level of the
    # categorical. The result is assigned back because the `inplace` argument
    # of reorder_categories no longer exists in pandas >= 2.0.
    for col in cols_categoria:
        df_reg[col] = (df_reg[col]
                       .astype('category')
                       .cat.reorder_categories(['regular', 'facil', 'dificil']))

    conditions = [
        (df['goles L'] > df['goles V']),
        (df['goles L'] < df['goles V'])]
    choices = ['local', 'visita']
    df['outcome'] = np.select(conditions, choices, default = 'empate')

    # Home side keeps every column except the '*visita*' ones; the away side
    # keeps every column except the '*local*' ones.
    df_reg_l = _categorias_por_rol(df_reg, 'visita', 'Local', '_l')
    df_reg_v = _categorias_por_rol(df_reg, 'local', 'Visita', '_v')

    df_logit = df.merge(df_reg_l, how = 'left', on = ['Torneo', 'Local'])
    df_logit = df_logit.merge(df_reg_v, how = 'left', on = ['Torneo', 'Visita'])

    cols_salida = (['Torneo', 'Local', 'Visita']
                   + [i for i in df_logit.columns if 'categoria' in i]
                   + ['outcome'])
    return df_logit[cols_salida]

In [5]:
def LogitProbGanarGeneral(df_logit, f_sig):
    """Fit win-probability logits using the *general* team categories.

    For each ``c`` in 3, 4, 5 every match is viewed twice: from the home team
    (``rol = 'local'``, win = outcome ``'local'``) and from the away team
    (``rol = 'visita'``, win = outcome ``'visita'``). A view is kept only when
    the focal team's ``categoria_general_c<c>`` is ``'regular'``; the rival's
    category becomes ``categoria_rival``. The model
    ``outcome ~ Equipo + Rival + Torneo + rol*categoria_rival`` is then fitted.

    Parameters
    ----------
    df_logit : pandas.DataFrame
        Frame with ``Torneo``, ``Local``, ``Visita``, ``outcome`` and the
        columns ``categoria_general_c<c>_l`` / ``categoria_general_c<c>_v``
        for ``c`` in 3, 4, 5 (the layout produced by ``ArmarDatosLogit``).
    f_sig : callable
        Called as ``f_sig(mod, var, replace_spaces=False, std_err=False)``;
        its return value fills one table cell.

    Returns
    -------
    pandas.DataFrame
        Column ``Variable`` (readable term names) plus one column
        ``'l = <c>'`` per fitted model.
    """
    dict_vars = {'rol[T.local]':'Localía',
                 'categoria_rival[T.facil]':'Rival fácil',
                 'categoria_rival[T.dificil]':'Rival difícil',
                 'rol[T.local]:categoria_rival[T.facil]':'Localía * Rival fácil',
                 'rol[T.local]:categoria_rival[T.dificil]':'Localía * Rival difícil'}
    cat_general = 'categoria_general_c%s_%s'
    formula = 'outcome ~ Equipo + Rival + Torneo + rol*categoria_rival'

    def _perspectiva(rol, col_equipo, col_rival, cat_equipo, cat_rival):
        """Matches seen from one side, limited to focal teams rated 'regular'."""
        # .copy() so the assignments below never write into a slice of df_logit.
        vista = df_logit.loc[df_logit[cat_equipo] == 'regular',
                             ['Torneo', col_equipo, col_rival, cat_rival, 'outcome']].copy()
        vista = vista.rename(columns={col_equipo: 'Equipo',
                                      col_rival: 'Rival',
                                      cat_rival: 'categoria_rival'})
        vista['rol'] = rol
        # The outcome labels ('local' / 'visita') coincide with the role
        # names, so a win is simply outcome == rol.
        vista['outcome'] = (vista['outcome'] == rol).astype(int)
        columnas = ['Torneo', 'Equipo', 'Rival', 'rol', 'categoria_rival', 'outcome']
        return vista[columnas].reset_index(drop=True)

    # Built as a dict and converted once, so the result no longer depends on
    # the first loop value being the only one below 4.
    resultados = {'Variable': list(dict_vars.values())}
    for c in [3,4,5]:
        col_l = cat_general % (c, 'l')
        col_v = cat_general % (c, 'v')
        df_logit_l = _perspectiva('local', 'Local', 'Visita', col_l, col_v)
        df_logit_v = _perspectiva('visita', 'Visita', 'Local', col_v, col_l)
        df_logit_filt = pd.concat([df_logit_l, df_logit_v]).reset_index(drop=True)

        # 'visita' first makes it the reference level, which yields the
        # 'rol[T.local]' terms looked up below. Assigned back because the
        # `inplace` argument was removed from reorder_categories in pandas 2.0.
        df_logit_filt['rol'] = (df_logit_filt['rol']
                                .astype('category')
                                .cat.reorder_categories(['visita', 'local']))

        mod = smf.logit(formula = formula, data = df_logit_filt).fit(disp=False)
        resultados['l = %s' % c] = [
            f_sig(mod, var, replace_spaces = False, std_err = False)
            for var in dict_vars
        ]
    return pd.DataFrame(resultados)

In [6]:
def LogitProbGanarLV(df_logit, f_sig):
    """Fit win-probability logits using the home/away-specific categories.

    For each ``c`` in 3, 4, 5 every match is viewed twice: from the home team
    (``rol = 'local'``, win = outcome ``'local'``) and from the away team
    (``rol = 'visita'``, win = outcome ``'visita'``). The home view is kept
    when ``categoria_local_c<c>_l`` is ``'regular'`` and takes
    ``categoria_visita_c<c>_v`` as ``categoria_rival``; the away view is kept
    when ``categoria_visita_c<c>_v`` is ``'regular'`` and takes
    ``categoria_local_c<c>_l`` as ``categoria_rival``. The model
    ``outcome ~ Equipo + Rival + Torneo + rol*categoria_rival`` is then fitted.

    Parameters
    ----------
    df_logit : pandas.DataFrame
        Frame with ``Torneo``, ``Local``, ``Visita``, ``outcome`` and the
        category columns named above for ``c`` in 3, 4, 5.
    f_sig : callable
        Called as ``f_sig(mod, var, replace_spaces=False, std_err=False)``;
        its return value fills one table cell.

    Returns
    -------
    pandas.DataFrame
        Column ``Variable`` (readable term names) plus one column
        ``'l = <c>'`` per fitted model.
    """
    dict_vars = {'rol[T.local]':'Localía',
                 'categoria_rival[T.facil]':'Rival fácil',
                 'categoria_rival[T.dificil]':'Rival difícil',
                 'rol[T.local]:categoria_rival[T.facil]':'Localía * Rival fácil',
                 'rol[T.local]:categoria_rival[T.dificil]':'Localía * Rival difícil'}
    cat_local = 'categoria_local_c%s_l'
    cat_visita = 'categoria_visita_c%s_v'
    formula = 'outcome ~ Equipo + Rival + Torneo + rol*categoria_rival'

    def _perspectiva(rol, col_equipo, col_rival, cat_equipo, cat_rival):
        """Matches seen from one side, limited to focal teams rated 'regular'."""
        # .copy() so the assignments below never write into a slice of df_logit.
        vista = df_logit.loc[df_logit[cat_equipo] == 'regular',
                             ['Torneo', col_equipo, col_rival, cat_rival, 'outcome']].copy()
        vista = vista.rename(columns={col_equipo: 'Equipo',
                                      col_rival: 'Rival',
                                      cat_rival: 'categoria_rival'})
        vista['rol'] = rol
        # The outcome labels ('local' / 'visita') coincide with the role
        # names, so a win is simply outcome == rol.
        vista['outcome'] = (vista['outcome'] == rol).astype(int)
        columnas = ['Torneo', 'Equipo', 'Rival', 'rol', 'categoria_rival', 'outcome']
        return vista[columnas].reset_index(drop=True)

    # Built as a dict and converted once, so the result no longer depends on
    # the first loop value being the only one below 4.
    resultados = {'Variable': list(dict_vars.values())}
    for c in [3,4,5]:
        col_l = cat_local % c
        col_v = cat_visita % c
        df_logit_l = _perspectiva('local', 'Local', 'Visita', col_l, col_v)
        df_logit_v = _perspectiva('visita', 'Visita', 'Local', col_v, col_l)
        df_logit_filt = pd.concat([df_logit_l, df_logit_v]).reset_index(drop=True)

        # 'visita' first makes it the reference level, which yields the
        # 'rol[T.local]' terms looked up below. Assigned back because the
        # `inplace` argument was removed from reorder_categories in pandas 2.0.
        df_logit_filt['rol'] = (df_logit_filt['rol']
                                .astype('category')
                                .cat.reorder_categories(['visita', 'local']))

        mod = smf.logit(formula = formula, data = df_logit_filt).fit(disp=False)
        resultados['l = %s' % c] = [
            f_sig(mod, var, replace_spaces = False, std_err = False)
            for var in dict_vars
        ]
    return pd.DataFrame(resultados)

In [7]:
def LogitsLigas(datadir, regdir, ligas = ['Alemania','Chile','Espana','Francia','Inglaterra','Italia'], f_sig = None):
    """Print the LaTeX result tables of both logit specifications per league.

    Parameters
    ----------
    datadir : str
        Directory with the match workbooks, forwarded to ``ArmarDatosLogit``.
    regdir : str
        Directory with the category workbooks, forwarded to ``ArmarDatosLogit``.
    ligas : list of str
        League names to process. The default list is only iterated, never
        mutated.
    f_sig : callable, optional
        Cell formatter forwarded to ``LogitProbGanarGeneral`` and
        ``LogitProbGanarLV``, which both require it. Previously it was not
        passed at all, so every call raised ``TypeError``. When omitted,
        ``significancia_variables`` is used.
        NOTE(review): confirm this default; the driver cell of the notebook
        uses ``significancia_oddsratio``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    if f_sig is None:
        # Resolved at call time so the definition does not depend on cell order.
        f_sig = significancia_variables
    for liga in ligas:
        print('Liga: %s' % liga)
        print('Definición general')
        df_logit = ArmarDatosLogit(datadir, regdir, liga)
        print(LogitProbGanarGeneral(df_logit, f_sig).to_latex(index=False).replace('\\textasciitilde','~'))
        print('Definición LV')
        print(LogitProbGanarLV(df_logit, f_sig).to_latex(index=False).replace('\\textasciitilde','~'))

In [8]:
# Input locations, expressed relative to the parent of the working directory.
datadir = os.path.join(os.path.pardir,'Datos','Datos simulacion')
regdir = os.path.join(os.path.pardir,'Datos','Bases construidas','Bases main','Bases efecto')
ligas = ['Alemania','Chile','Espana','Francia','Inglaterra','Italia']
for liga in ligas:
    print('Liga: %s' % liga)
    # The table printed below comes from LogitProbGanarLV, so it is labelled
    # as the LV definition (the label used to read 'Definición general').
    print('Definición LV')
    df_logit = ArmarDatosLogit(datadir, regdir, liga)
    # to_latex escapes '~' as '\textasciitilde '; restore the raw '~' here.
    print(LogitProbGanarLV(df_logit, f_sig = significancia_oddsratio).to_latex(index=False).replace('\\textasciitilde ','~'))

Liga: Alemania
Definición general
\begin{tabular}{llll}
\toprule
                Variable &         l = 3 &        l = 4 &         l = 5 \\
\midrule
                 Localía &  ~116.712\%*** &  ~96.311\%*** &  ~101.805\%*** \\
             Rival fácil &   ~23.481\%*~~ &   -1.043\%~~~ &    ~9.081\%~~~ \\
           Rival difícil &   -25.268\%*~~ &  -15.538\%~~~ &   -10.986\%~~~ \\
   Localía * Rival fácil &    -7.380\%~~~ &  ~10.902\%~~~ &    -4.936\%~~~ \\
 Localía * Rival difícil &   ~22.011\%~~~ &   ~4.444\%~~~ &    ~1.179\%~~~ \\
\bottomrule
\end{tabular}

Liga: Chile
Definición general
\begin{tabular}{llll}
\toprule
                Variable &         l = 3 &         l = 4 &         l = 5 \\
\midrule
                 Localía &  ~137.912\%*** &  ~145.530\%*** &  ~148.509\%*** \\
             Rival fácil &    -3.695\%~~~ &    -2.089\%~~~ &    -1.750\%~~~ \\
           Rival difícil &   ~10.694\%~~~ &   ~23.389\%~~~ &   ~12.576\%~~~ \\
   Localía * Rival fácil &   ~22.773\%~~~ &    ~8.