In [1]:
# Librerías de siempre
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

In [2]:
def formatear_base(df):
    equipos = df['Local'].value_counts().index.tolist()
    equipos = pd.DataFrame(equipos, columns=['equipo'])
    equipos['i'] = equipos.index
    dict_equipos = dict(zip(equipos['i'].tolist(), equipos['equipo'].tolist()))
    df = pd.merge(df, equipos, left_on='Local', right_on='equipo', how='left')
    df = df.rename(columns = {'i': 'i_local'}).drop('equipo', 1)
    df = pd.merge(df, equipos, left_on='Visita', right_on='equipo', how='left')
    df = df.rename(columns = {'i': 'i_visita'}).drop('equipo', 1)
    return df

In [3]:
def tabla_final_torneo(db):
    torneo = db['Torneo'].drop_duplicates().tolist()[0]
    df = db.copy().reset_index(drop=True)
    df = formatear_base(df)
    tabla = df[['Local','i_local']].drop_duplicates()
    tabla = tabla.set_index(['i_local'])
    tabla.columns = ['equipo']
    conditions = [
            (df['goles L'] > df['goles V']),
            (df['goles L'] < df['goles V'])]
    choices = ['local', 'visita']
    df = df.join(pd.get_dummies(np.select(conditions, choices, default = 'empate')))
    ghome = df.groupby('i_local')
    gaway = df.groupby('i_visita')
    df_home = pd.DataFrame({'wins_h': ghome['local'].sum(),
                            'draws_h': ghome['empate'].sum(),
                            'losses_h': ghome['visita'].sum(),
                            'gf_h': ghome['goles L'].sum(),
                            'ga_h': ghome['goles V'].sum(),
                            'gd_h': ghome['goles L'].sum() - ghome['goles V'].sum(),
                            'promoccup': ghome['Occupation'].mean()})
    df_away = pd.DataFrame({'wins_a': gaway['visita'].sum(),
                            'draws_a': gaway['empate'].sum(),
                            'losses_a': gaway['local'].sum(),
                            'gf_a': gaway['goles V'].sum(),
                            'ga_a': gaway['goles L'].sum(),
                            'gd_a': gaway['goles V'].sum() - gaway['goles L'].sum()})
    tabla = tabla.join(df_home, how='left').join(df_away,how = 'left').fillna(0)
    tabla['wins'] = tabla.wins_h + tabla.wins_a
    tabla['draws'] = tabla.draws_h + tabla.draws_a
    tabla['losses'] = tabla.losses_h + tabla.losses_a
    tabla['gf'] = tabla.gf_h + tabla.gf_a
    tabla['ga'] = tabla.ga_h + tabla.ga_a
    tabla['gd'] = tabla.gd_h + tabla.gd_a
    tabla['points'] = (tabla['wins']*3 + tabla['draws']).astype(int)
    tabla = tabla.sort_values(by=['points','gd'], ascending = False).reset_index(drop=True)
    tabla['position'] = (tabla.index + 1).astype(int)
    tabla['Torneo'] = torneo
    return tabla[['equipo','Torneo','position','points', 'promoccup']]

In [4]:
def ConstruirDatosPreTratamiento(regdir,
                                 ligasdir,
                                 ligas = ['Alemania','Espana','Francia','Inglaterra','Italia']):
    dfs_pretrat = []
    for liga in ligas:
        archivo = liga + '.xlsx'
        df = pd.read_excel(os.path.join(regdir, archivo))
        torneos = df['Torneo'].drop_duplicates().tolist()
        df_ant = pd.read_excel(os.path.join(ligasdir, archivo))
        torneos_ant = df_ant['Torneo'].drop_duplicates().tolist()[:-1]
        dicttorant = dict(zip(torneos,torneos_ant))
        df['Torneo ant'] = [dicttorant[t] for t in df['Torneo'].tolist()]
        
        df_ant['Occupation'] = np.where(df_ant['Capacity'] > 0,
                                        np.minimum(df_ant['Attendance']/df_ant['Capacity'], 1),
                                        np.nan) 
        dfs_tablas = []
        for torneo in torneos_ant:
            test = tabla_final_torneo(df_ant[df_ant['Torneo'] == torneo].reset_index(drop=True))
            dfs_tablas.append(test)
        df_tablas_ant = pd.concat(dfs_tablas, ignore_index = True)
        df_tablas_ant.columns = ['equipo','Torneo ant','position_ant','points_ant', 'promoccup_ant']
        df = df.merge(df_tablas_ant,
              how = 'left',
              left_on = ['equipo','Torneo ant'],
              right_on = ['equipo', 'Torneo ant'])
        df = df[['equipo','Torneo'] + [i for i in df.columns if 'perc_' in i and 'corr' not in i] + [i for i in df.columns if '_ant' in i]]
        df = df.dropna().reset_index(drop=True)
        dfs_pretrat.append(df)
    return dfs_pretrat

In [5]:
regdir = os.path.join(os.path.pardir,
                      'datos',
                      'regresiones')
ligasdir = os.path.join(os.path.pardir,
                        'datos',
                        'ligas')
dfs_pretrat = ConstruirDatosPreTratamiento(regdir, ligasdir)

In [6]:
dfs_pretrat[0].columns.tolist()

['equipo',
 'Torneo',
 'perc_facil_elo_p4_c3',
 'perc_dificil_elo_p4_c3',
 'perc_facil_elo_p5_c3',
 'perc_dificil_elo_p5_c3',
 'perc_facil_elo_p6_c3',
 'perc_dificil_elo_p6_c3',
 'perc_facil_elo_p7_c3',
 'perc_dificil_elo_p7_c3',
 'perc_facil_elo_p4_c4',
 'perc_dificil_elo_p4_c4',
 'perc_facil_elo_p5_c4',
 'perc_dificil_elo_p5_c4',
 'perc_facil_elo_p6_c4',
 'perc_dificil_elo_p6_c4',
 'perc_facil_elo_p7_c4',
 'perc_dificil_elo_p7_c4',
 'perc_facil_elo_p4_c5',
 'perc_dificil_elo_p4_c5',
 'perc_facil_elo_p5_c5',
 'perc_dificil_elo_p5_c5',
 'perc_facil_elo_p6_c5',
 'perc_dificil_elo_p6_c5',
 'perc_facil_elo_p7_c5',
 'perc_dificil_elo_p7_c5',
 'perc_facil_general_h1_p4_c3',
 'perc_dificil_general_h1_p4_c3',
 'perc_facil_lv_h1_p4_c3',
 'perc_dificil_lv_h1_p4_c3',
 'perc_facil_general_h1_p5_c3',
 'perc_dificil_general_h1_p5_c3',
 'perc_facil_lv_h1_p5_c3',
 'perc_dificil_lv_h1_p5_c3',
 'perc_facil_general_h1_p6_c3',
 'perc_dificil_general_h1_p6_c3',
 'perc_facil_lv_h1_p6_c3',
 'perc_dificil_lv

In [19]:
tablas_text = """
"""

# Y = ['position_ant', 'points_ant', 'promoccup_ant']
# Y_print = ['Posición anterior', 'Puntos anterior', 'Prom. ocupación anterior']
Y = ['position_ant']
Y_print = ['Posición anterior']
X = ['perc_facil_general_h%s_p%s_c%s',
     'perc_dificil_general_h%s_p%s_c%s',
     'perc_facil_lv_h%s_p%s_c%s',
     'perc_dificil_lv_h%s_p%s_c%s',
     'perc_facil_elo_p%s_c%s',
     'perc_dificil_elo_p%s_c%s']
X_print = ['Frac. Fácil General',
           'Frac. Difícil General',
           'Frac. Fácil LV',
           'Frac. Difícil LV',
           'Frac. Fácil ELO',
           'Frac. Difícil ELO']
X_sheet = ['FFG',
           'FDG',
           'FFLV',
           'FDLV',
           'FFELO',
           'FDELO']
ligas = ['Alemania','Espana','Francia','Inglaterra','Italia']

dfs_res = []
sheets = []
for h in range(1,6):
    for k in range(len(Y)):
        y = Y[k]
        means = []
        for j in range(len(X)):
            agregar = True
            x = X[j]
            if 'h' in x:
                sheets.append(X_sheet[j] + ' %s anterior' % h)
            else:
                sheets.append(X_sheet[j])
            for i in range(len(ligas)):
                resultados = []
                liga = ligas[i]
                df = dfs_pretrat[i]
                rl = []
                for r in [4,5,6,7]:
                    for l in [3,4,5]:
                        if h > 1 and 'h' not in x:
                            agregar = False
                            pass
                        elif h < 2 and 'h' not in x:
                            x_ = x % (r,l)
                            rl.append("$r = %s, l = %s$" % (r,l))
                        else:
                            x_ = x % (h,r,l)
                            rl.append("$h = %s, r = %s, l = %s$" % (h,r,l))
                        formula = '%s ~ C(%s)' % (y,x_)
                        model = ols(formula, data = df).fit()
                        aov_table = sm.stats.anova_lm(model, typ=2)
                        resultados.append(np.round(aov_table['PR(>F)'][0], decimals = 3))
            if agregar:
                if i < 1:
                    df_res = pd.DataFrame({liga : resultados}, index = rl)
                else:
                    df_res[liga] = resultados
                tablas_text = tablas_text + df_res.to_latex()
                dfs_res.append(df_res)
                print('----------------------------------------------------------------------------------------')
                print('Análisis y = %s, x = %s para %s torneos anteriores' % (Y_print[k], X_print[j], h))
                print(df_res.to_latex())
outputdir = os.path.join(os.path.pardir, 'resultados','pretratamiento')
writer = pd.ExcelWriter(os.path.join(outputdir,'posicion-anterior.xlsx'), engine='xlsxwriter')
for df, sheet in zip(dfs_res, sheets):
    df.to_excel(writer, sheet_name = sheet)
writer.save()
#             means.append(df_res.mean().tolist())
#         df_means = pd.DataFrame(data = means, columns = ligas, index = X_print)
#         print('---------------------------------PROMEDIOS------------------------------------------')
#         print(df_means.to_latex())
# with open(os.path.join(os.path.pardir,'Tablas', 'tablaspretreatment.txt'),'w') as tf:
#     tf.write(tablas_text)

----------------------------------------------------------------------------------------
Análisis y = Posición anterior, x = Frac. Fácil General para 1 torneos anteriores
\begin{tabular}{lrrrrr}
\toprule
{} &  Alemania &  Espana &  Francia &  Inglaterra &  Italia \\
\midrule
\$h = 3, r = 4, l = 3\$ &     0.376 &   0.288 &    0.424 &       0.005 &   0.033 \\
\$h = 3, r = 4, l = 4\$ &     0.152 &   0.082 &    0.182 &       0.005 &   0.040 \\
\$h = 3, r = 4, l = 5\$ &     0.052 &   0.013 &    0.019 &       0.016 &   0.004 \\
\$h = 3, r = 5, l = 3\$ &     0.239 &   0.071 &    0.476 &       0.004 &   0.002 \\
\$h = 3, r = 5, l = 4\$ &     0.140 &   0.002 &    0.310 &       0.009 &   0.003 \\
\$h = 3, r = 5, l = 5\$ &     0.044 &   0.003 &    0.065 &       0.010 &   0.001 \\
\$h = 3, r = 6, l = 3\$ &     0.245 &   0.204 &    0.872 &       0.004 &   0.056 \\
\$h = 3, r = 6, l = 4\$ &     0.090 &   0.030 &    0.312 &       0.081 &   0.001 \\
\$h = 3, r = 6, l = 5\$ &     0.200 &   0.006 &    0

KeyboardInterrupt: 

In [14]:
len(dfs_res)

66

In [46]:
tablas_text = """
"""

Y = ['position_ant', 'points_ant', 'promoccup_ant']
Y_print = ['Posición anterior', 'Puntos anterior', 'Prom. ocupación anterior']
X = ['perc_facil_elo_p%s_c%s',
     'perc_dificil_elo_p%s_c%s']
X_print = ['Frac. Fácil ELO',
           'Frac. Difícil ELO']
ligas = ['Alemania','Espana','Francia','Inglaterra','Italia']


for k in range(len(Y)):
    y = Y[k]
    means = []
    for j in range(len(X)):
        x = X[j]
        print('----------------------------------------------------------------------------------------')
        print('Análisis y = %s, x = %s para %s' % (Y_print[k], X_print[j]))
        for i in range(len(ligas)):
            resultados = []
            liga = ligas[i]
            df = dfs_pretrat[i]
            rl = []
            for r in [4,5,6,7]:
                for l in [3,4,5]:
                    x_ = x % (r,l)
                    rl.append("$r = %s, l = %s$" % (r,l))
                    formula = '%s ~ C(%s)' % (y,x_)
                    model = ols(formula, data = df).fit()
                    aov_table = sm.stats.anova_lm(model, typ=2)
                    resultados.append(np.round(aov_table['PR(>F)'][0], decimals = 3))
            if i < 1:
                df_res = pd.DataFrame({liga : resultados}, index = rl)
            else:
                df_res[liga] = resultados
        tablas_text = tablas_text + df_res.to_latex()
        print(df_res.to_latex())
        means.append(df_res.mean().tolist())
    df_means = pd.DataFrame(data = means, columns = ligas, index = X_print)
    print('---------------------------------PROMEDIOS------------------------------------------')
    print(df_means.to_latex())

----------------------------------------------------------------------------------------
Análisis y = Posición anterior, x = Frac. Fácil ELO para 5 torneos anteriores
\begin{tabular}{lrrrrr}
\toprule
{} &  Alemania &  Espana &  Francia &  Inglaterra &  Italia \\
\midrule
\$r = 4, l = 3\$ &     0.450 &   0.080 &    0.443 &       0.571 &   0.007 \\
\$r = 4, l = 4\$ &     0.116 &   0.441 &    0.176 &       0.833 &   0.009 \\
\$r = 4, l = 5\$ &     0.223 &   0.391 &    0.235 &       0.352 &   0.003 \\
\$r = 5, l = 3\$ &     0.495 &   0.095 &    0.255 &       0.851 &   0.009 \\
\$r = 5, l = 4\$ &     0.802 &   0.562 &    0.193 &       0.828 &   0.017 \\
\$r = 5, l = 5\$ &     0.901 &   0.424 &    0.468 &       0.794 &   0.000 \\
\$r = 6, l = 3\$ &     0.742 &   0.614 &    0.858 &       0.605 &   0.086 \\
\$r = 6, l = 4\$ &     0.532 &   0.573 &    0.317 &       0.562 &   0.027 \\
\$r = 6, l = 5\$ &     0.602 &   0.449 &    0.191 &       0.869 &   0.000 \\
\$r = 7, l = 3\$ &     0.744 &   0.

In [None]:
df_res.mean()

In [2]:
regdir = os.path.join(os.path.pardir,
                      'Datos',
                      'Bases construidas',
                      'Bases main',
                      'Bases efecto')

In [4]:
df_reg = pd.read_excel(os.path.join(regdir, 'Inglaterra.xlsx'))

In [10]:
variables =  + ['perc_dificil_general_p%s_c%s' % (r,l) for r in range(4,8) for l in range(3,6)]

In [15]:
indices = ["$r = %s, l = %s$" % (r,l) for r in range(4,8) for l in range(3,6)]

In [25]:
df_facil = df_reg[['perc_dificil_general_p%s_c%s' % (r,l) for r in range(4,8) for l in range(3,6)]].describe().round(decimals = 3).T

In [26]:
df_facil.index = indices

In [27]:
print(df_facil.drop(['count'], axis = 1).to_latex())

\begin{tabular}{lrrrrrrr}
\toprule
{} &   mean &    std &  min &    25\% &    50\% &    75\% &    max \\
\midrule
\$r = 4, l = 3\$ &  0.149 &  0.164 &  0.0 &  0.000 &  0.250 &  0.250 &  0.750 \\
\$r = 4, l = 4\$ &  0.198 &  0.180 &  0.0 &  0.000 &  0.250 &  0.250 &  0.750 \\
\$r = 4, l = 5\$ &  0.248 &  0.195 &  0.0 &  0.000 &  0.250 &  0.250 &  1.000 \\
\$r = 5, l = 3\$ &  0.149 &  0.141 &  0.0 &  0.000 &  0.200 &  0.200 &  0.600 \\
\$r = 5, l = 4\$ &  0.198 &  0.152 &  0.0 &  0.000 &  0.200 &  0.200 &  0.600 \\
\$r = 5, l = 5\$ &  0.248 &  0.171 &  0.0 &  0.200 &  0.200 &  0.400 &  0.800 \\
\$r = 6, l = 3\$ &  0.149 &  0.124 &  0.0 &  0.000 &  0.167 &  0.167 &  0.500 \\
\$r = 6, l = 4\$ &  0.198 &  0.137 &  0.0 &  0.167 &  0.167 &  0.333 &  0.667 \\
\$r = 6, l = 5\$ &  0.248 &  0.152 &  0.0 &  0.167 &  0.167 &  0.333 &  0.667 \\
\$r = 7, l = 3\$ &  0.149 &  0.112 &  0.0 &  0.107 &  0.143 &  0.286 &  0.429 \\
\$r = 7, l = 4\$ &  0.198 &  0.124 &  0.0 &  0.143 &  0.143 &  0.286 &  0.57