# Análisis de Futbolistas

In [1]:
from blackops.utils.catalog import start_spark_session
import pyspark.sql.functions as f
import pandas as pd
import scipy.stats as ss
from typing import Iterable
import numpy as np
import plotly.express as px

In [2]:
def entropy(ser: Iterable, normalize: bool = False) -> float:
    """
    Computes the (base 2) Shannon entropy of the values in `ser`.

    Parameters
    ----------
    ser : np.ndarray, list, tuple or pd.Series
        Observed values. Missing values are counted as a category of their
        own (`value_counts(dropna=False)`).
    normalize : bool, default False
        If True, divide the entropy by `log2(k)`, where `k` is the number of
        categories reported by `value_counts`, so the result lies in [0, 1].
        When there is a single category the normalized entropy is 0.0.

    Returns
    -------
    float
        The (optionally normalized) entropy, always >= 0.

    Raises
    ------
    TypeError
        If `ser` is not one of the accepted array-like types.
    AssertionError
        If `ser` is empty.
    """
    if not isinstance(ser, (np.ndarray, list, tuple, pd.Series)):
        raise TypeError(
            f"Wrong type for `ser` argument: {type(ser)}. It must be an array-like object"
        )
    if not isinstance(ser, pd.Series):
        ser = pd.Series(ser)
    assert len(ser) > 0, "`ser` array must contain at least one element"
    pi = ser.value_counts(normalize=True, dropna=False).to_numpy()
    n_categories = len(pi)
    # Categorical data reports unused categories with frequency 0; they
    # contribute nothing to the entropy (0 * log 0 := 0), so drop them instead
    # of relying on `np.log2(..., where=...)`, which leaves those slots
    # uninitialised when no `out` array is given.
    pi = pi[pi > 0]
    # `max` turns the `-0.0` obtained for a constant input into `0.0`.
    result = max(0.0, float(-np.sum(pi * np.log2(pi))))
    if normalize:
        if n_categories < 2:
            # log2(1) == 0: avoid 0/0; a constant variable has no uncertainty.
            return 0.0
        return result / float(np.log2(n_categories))
    return result


def information_gain(
    df: pd.DataFrame, X: str, Y: str, normalize: bool = False
) -> float:
    """
    The information gain (aka mutual information) in variable `Y` given `X`.

    Formulas
    --------
    IG(Y,X) = H(Y) - H(Y|X)

    H(Y|X) = sum_{x} ( p(x) * H(Y|X=x) )

    Parameters
    ----------
    df : pd.DataFrame
        Data containing both columns.
    X : str
        Name of the conditioning column. Missing values form their own group.
    Y : str
        Name of the column whose entropy is being reduced.
    normalize : bool, default False
        If True, return IG(Y,X) / H(Y). When H(Y) is 0 (constant `Y`) there is
        nothing to explain and 0.0 is returned.

    Returns
    -------
    float
        The (optionally normalized) information gain.
    """

    def weighted_entropy(group: pd.Series, n: int) -> float:
        # The weight (relative frequency) of this group, p(x)
        weight = len(group) / n
        return entropy(group) * weight

    y_values = df[Y]
    data_size = df.shape[0]
    HY = entropy(y_values)
    # dropna=False: rows with a missing X must form their own group. Otherwise
    # they are counted in `data_size` and in H(Y) but not in H(Y|X), so the
    # weights p(x) would not add up to 1 and the gain would be overestimated.
    HY_X = (
        df.groupby(X, dropna=False)[Y]
        .agg(weighted_entropy, n=data_size)
        .sum()
    )
    result = HY - HY_X
    if normalize:
        if HY == 0:
            return 0.0
        return float(result / HY)
    return float(result)

In [3]:
# Create the Spark session through the project helper and turn Spark logging off.
spark = start_spark_session()
spark.sparkContext.setLogLevel("OFF")

24/10/29 18:11:31 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.1.35 instead (on interface enp3s0)
24/10/29 18:11:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/dadiego/.ivy2/cache
The jars for the packages stored in: /home/dadiego/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
io.delta#delta-sharing-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-178d3031-e97c-45da-9b40-4fc3f5278878;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/dadiego/projects/ESIC/esic-bigdata-iv-blackops/.venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found io.delta#delta-spark_2.12;3.2.0 in central
	found io.delta#delta-storage;3.2.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found io.delta#delta-sharing-spark_2.12;3.2.0 in central
	found io.delta#delta-sharing-client_2.12;1.0.5 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found commons-logging#commons-logging;1.2 in central
	found commons-codec#commons-codec;1.11 in central
:: resolution report :: resolve 186ms :: artifacts dl 8ms
	:: modules in use:
	commons-codec#commons-codec;1.11 from central in [default]
	commons-logging#commons-logging;1.2 from central in [default]
	io.delta#delta-sharing-client_2.12;1.0.5 from central in [default]
	io.delta#delta-sharing-spark_2.12;3.2.0 from central in [default]
	io.delta#delta-spark_2.12;3.2.0 from central in [default]
	io.delta#delta-storage;3.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	

In [4]:
# Load the football statistics as a Spark DataFrame (path is relative to the working directory).
df = spark.read.parquet("data/football-stats.parquet")

In [5]:
# Preview the first five rows.
display(df.limit(5))

Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,Min,90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90
1,Max Aarons,eng ENG,DF,Bournemouth,eng Premier League,23,2000,20,13,1237,13.7,0,1,1,0,0,0,1,0,0.0,0.0,0.8,0.9,22,43,26,0.0,0.07,0.07,0.0,0.07,0.0,0.06,0.06,0.0,0.06
2,Brenden Aaronson,us USA,"MF,FW",Union Berlin,de Bundesliga,22,2000,30,14,1267,14.1,2,2,4,2,0,0,3,1,2.0,2.0,1.9,3.8,37,56,91,0.14,0.14,0.28,0.14,0.28,0.14,0.13,0.27,0.14,0.27
3,Paxten Aaronson,us USA,MF,Eint Frankfurt,de Bundesliga,19,2003,7,1,101,1.1,0,1,1,0,0,0,0,0,0.1,0.1,0.1,0.2,2,5,7,0.0,0.89,0.89,0.0,0.89,0.11,0.07,0.19,0.11,0.19
4,Keyliane Abdallah,fr FRA,FW,Marseille,fr Ligue 1,17,2006,1,0,4,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Yunis Abdelhamid,ma MAR,DF,Reims,fr Ligue 1,35,1987,31,31,2781,30.9,4,0,4,3,1,1,5,0,3.4,2.6,0.3,2.9,36,137,9,0.13,0.0,0.13,0.1,0.1,0.11,0.01,0.12,0.09,0.09


In [6]:
# Summary statistics per column: count, mean, stddev, min, quartiles and max.
df.summary()

                                                                                

summary,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,Min,90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90
count,2852.0,2852,2849,2852,2852,2852,2848.0,2848.0,2852.0,2852.0,2852.0,2852.0,2852.0,2852.0,2852.0,2852.0,2852.0,2852.0,2852.0,2852.0,2851.0,2851.0,2851.0,2851.0,2851.0,2851.0,2851.0,2852.0,2852.0,2852.0,2852.0,2852.0,2851.0,2851.0,2851.0,2851.0,2851.0
mean,1426.5,,,,,,25.06074438202247,1997.600070224719,18.96984572230014,13.514726507713885,1213.3015427769983,13.482328190743328,1.720196353436185,1.2219495091164094,2.9421458625525947,1.5655680224403927,0.1546283309957924,0.1931977559607293,2.654978962131837,0.1199158485273492,1.7464047702560477,1.5925990880392786,1.2315327955103437,2.823219922834094,21.482637670992634,46.889161697649946,46.42511399508944,0.1217776998597475,0.093295932678822,0.2149158485273491,0.1141409537166899,0.2072931276297334,0.1323325149070497,0.0990073658365487,0.2314380918975794,0.1241669589617674,0.2233321641529282
stddev,823.4458087816101,,,,,,4.590520738869418,4.596293614784638,11.517930248018404,11.251226699349408,961.5840664011922,10.685044789842047,3.203531717458709,1.9653706653549,4.6157681215122,2.8403527811457163,0.7158105505726141,0.8117317697150929,2.7261609370111213,0.3548475645096901,2.896131154882146,2.518002120921979,1.7095741956450154,3.817546876565479,26.885500498228616,51.81476784373843,60.407205222143446,0.2841381876259927,0.3662376221425277,0.4729886173369889,0.2778017856437242,0.4679836907528918,0.2078263350381571,0.2368249711564835,0.3347072582270988,0.1984823970519442,0.3277105828616009
min,1.0,Aaron Cresswell,al ALB,DF,Alavés,de Bundesliga,15.0,1982.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,713.0,,,,,,22.0,1994.0,8.0,2.0,296.0,3.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.2,2.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.02,0.01,0.05,0.02,0.05
50%,1426.0,,,,,,25.0,1998.0,20.0,12.0,1113.0,12.4,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.7,0.7,0.6,1.4,12.0,31.0,21.0,0.0,0.0,0.09,0.0,0.09,0.06,0.06,0.14,0.06,0.14
75%,2139.0,,,,,,28.0,2001.0,29.0,23.0,1987.0,22.1,2.0,2.0,4.0,2.0,0.0,0.0,4.0,0.0,2.0,1.9,1.7,3.7,30.0,73.0,71.0,0.15,0.12,0.29,0.14,0.28,0.18,0.13,0.33,0.17,0.31
max,2852.0,Łukasz Skorupski,zw ZIM,"MF,FW",Wolves,it Serie A,40.0,2008.0,38.0,38.0,3420.0,38.0,36.0,14.0,44.0,31.0,10.0,10.0,17.0,3.0,30.6,26.7,11.8,33.0,218.0,392.0,508.0,6.0,11.25,11.25,6.0,11.25,4.55,6.85,6.85,4.55,6.85


# Correlaciones

Determinamos en primer lugar el coeficiente de correlación lineal por pares de variables

In [7]:
# Pairwise correlation matrix, computed through the pandas-on-Spark API and
# then collected into a local pandas DataFrame for display and plotting.
corr = df.pandas_api().corr().to_pandas()
display(corr)

                                                                                

Unnamed: 0,Rk,Age,Born,MP,Starts,Min,90s,Gls,Ast,G+A,...,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90
Rk,1.0,0.026363,-0.028722,0.039735,0.038177,0.042819,0.04279,0.019696,0.026871,0.025111,...,0.020534,-0.017966,-0.001729,0.018503,-0.003183,0.021603,0.007228,0.018767,0.020394,0.017815
Age,0.026363,1.0,-0.994687,0.237786,0.254977,0.254521,0.25457,0.065507,0.080411,0.079707,...,-0.006841,-0.015298,-0.0159,-0.020151,-0.02385,-0.023493,-0.003033,-0.016812,-0.043027,-0.028505
Born,-0.028722,-0.994687,1.0,-0.24157,-0.257952,-0.257613,-0.257657,-0.067403,-0.082989,-0.082121,...,0.005478,0.016779,0.016264,0.019003,0.024361,0.023133,0.004514,0.017607,0.043148,0.029632
MP,0.039735,0.237786,-0.24157,1.0,0.897578,0.917165,0.91717,0.44569,0.507686,0.525497,...,0.090087,0.015292,0.065781,0.07339,0.055403,0.088399,-0.006495,0.049967,0.066778,0.035616
Starts,0.038177,0.254977,-0.257952,0.897578,1.0,0.994046,0.994041,0.412909,0.483572,0.492479,...,0.024232,-0.007325,0.008778,0.006979,-0.001649,-0.021422,-0.039129,-0.041342,-0.043987,-0.055075
Min,0.042819,0.254521,-0.257613,0.917165,0.994046,1.0,0.999997,0.407147,0.475767,0.485156,...,0.025963,-0.008837,0.008627,0.009289,-0.001476,-0.016815,-0.041278,-0.04002,-0.038844,-0.053536
90s,0.04279,0.25457,-0.257657,0.91717,0.994041,0.999997,1.0,0.407126,0.475743,0.485131,...,0.025972,-0.008785,0.008672,0.009291,-0.001434,-0.016851,-0.041248,-0.040022,-0.038887,-0.053541
Gls,0.019696,0.065507,-0.067403,0.44569,0.412909,0.407147,0.407126,1.0,0.570193,0.936826,...,0.463267,0.080318,0.34042,0.414697,0.308922,0.465908,0.100499,0.36024,0.409608,0.321392
Ast,0.026871,0.080411,-0.082989,0.507686,0.483572,0.475767,0.475743,0.570193,1.0,0.821532,...,0.1893,0.228638,0.290753,0.160927,0.274473,0.218911,0.198383,0.276207,0.186466,0.256562
G+A,0.025111,0.079707,-0.082121,0.525497,0.492479,0.485156,0.485131,0.936826,0.821532,1.0,...,0.402129,0.153097,0.360067,0.356338,0.331273,0.416578,0.154223,0.367635,0.363687,0.332307


Podemos visualizarlo utilizando la función `imshow` de Plotly

In [8]:
# Heatmap of the correlation matrix.
px.imshow(corr, width=700)

Ahora, vamos a visualizar un scatter plot de diferentes pares de variables numéricas, para identificar posibles correlaciones, lineales o no.

In [9]:
# Scatter-plot matrix of Age, Born, MP and Starts, coloured by the `Gls` column.
# NOTE(review): df.toPandas() collects the whole Spark DataFrame to the driver and
# is called again by the next scatter-matrix cell; consider collecting once and reusing it.
fig = px.scatter_matrix(
    data_frame=df.toPandas(),
    dimensions=[
        "Age",
        "Born",
        "MP",
        "Starts"
    ],
    color="Gls"
)
# Hide the diagonal panels (each variable plotted against itself).
fig.update_traces(diagonal_visible=False)

Naturalmente, Starts (número de partidos de titular) debe ser menor o igual que MP (número de partidos jugados).

In [10]:
# Scatter-plot matrix of Starts, Min and CrdY, coloured by the `Gls` column.
fig = px.scatter_matrix(
    data_frame=df.toPandas(),
    dimensions=[
        "Starts",
        "Min",
        "CrdY",
    ],
    color="Gls"
)
# Hide the diagonal panels (each variable plotted against itself).
fig.update_traces(diagonal_visible=False)

In [11]:
def bucketize(col, bins):
    """
    Discretise a Spark column into left-closed interval labels.

    `col` is handed to a pandas UDF that applies `pd.cut(..., right=False)`
    with the given `bins` and returns the resulting interval as a string
    (e.g. "[20.0, 25.0)").

    Parameters
    ----------
    col
        The column (or column name) passed to the UDF.
    bins
        Forwarded unchanged to `pd.cut`: either a sequence of bin edges or an
        integer number of bins.

    Returns
    -------
    The string-typed column expression produced by calling the UDF on `col`.

    Notes
    -----
    NOTE(review): with an integer `bins`, `pd.cut` derives the edges from the
    values it receives. If Spark evaluates the UDF on several batches, the
    edges may differ between batches. Confirm, or pass explicit edges.
    """

    @f.pandas_udf(returnType="string")
    def _bucketize(values: pd.Series) -> pd.Series:
        # right=False -> intervals are closed on the left, open on the right.
        intervals = pd.cut(values, bins=bins, right=False)
        return intervals.astype("string")

    bucketed = _bucketize(col)
    return bucketed

In [12]:
# Discretise the numeric columns into interval labels, keep Squad and Pos as they
# are, and collect the result into a local pandas DataFrame.
# List-valued `bins` are explicit edges; integer `bins` are a number of bins.
df_trans = df.select(
    bucketize("Age", bins=[15, 20, 25, 30, 35, 40, 45, 50]).alias("Age_bucket"),
    bucketize("Born", bins=[1980, 1990, 2000, 2010]).alias("Born_bucket"),
    bucketize("MP", bins=10).alias("MP_bucket"),
    bucketize("Min", bins=100).alias("Min_bucket"),
    bucketize("PrgC", bins=20).alias("PrgC_bucket"),
    bucketize("PrgP", bins=20).alias("PrgP_bucket"),
    bucketize("Gls", bins=[0, 5, 10, 15, 20, 25, 30, 40]).alias("Gls_bucket"),
    "Squad",
    "Pos",
).toPandas()

display(df_trans)

                                                                                

Unnamed: 0,Age_bucket,Born_bucket,MP_bucket,Min_bucket,PrgC_bucket,PrgP_bucket,Gls_bucket,Squad,Pos
0,"[20.0, 25.0)","[2000.0, 2010.0)","[19.5, 23.2)","[1231.84, 1266.03)","[21.8, 32.7)","[39.2, 58.8)","[0, 5)",Bournemouth,DF
1,"[20.0, 25.0)","[2000.0, 2010.0)","[26.9, 30.6)","[1266.03, 1300.22)","[32.7, 43.6)","[39.2, 58.8)","[0, 5)",Union Berlin,"MF,FW"
2,"[15.0, 20.0)","[2000.0, 2010.0)","[4.7, 8.4)","[69.38, 103.57)","[0.0, 10.9)","[0.0, 19.6)","[0, 5)",Eint Frankfurt,MF
3,"[15.0, 20.0)","[2000.0, 2010.0)","[1.0, 4.7)","[1.0, 35.19)","[0.0, 10.9)","[0.0, 19.6)","[0, 5)",Marseille,FW
4,"[35.0, 40.0)","[1980.0, 1990.0)","[30.6, 34.3)","[2770.39, 2804.58)","[32.7, 43.6)","[117.6, 137.2)","[0, 5)",Reims,DF
...,...,...,...,...,...,...,...,...,...
2847,"[15.0, 20.0)","[2000.0, 2010.0)","[4.7, 8.4)","[137.76, 171.95)","[0.0, 10.9)","[0.0, 19.6)","[0, 5)",Bayern Munich,"FW,MF"
2848,"[20.0, 25.0)","[1990.0, 2000.0)","[34.3, 38.037)","[3078.1, 3112.29)","[87.2, 98.1)","[333.2, 352.8)","[5, 10)",Arsenal,MF
2849,"[30.0, 35.0)","[1990.0, 2000.0)","[19.5, 23.2)","[1197.65, 1231.84)","[0.0, 10.9)","[0.0, 19.6)","[5, 10)",Hellas Verona,FW
2850,"[30.0, 35.0)","[1990.0, 2000.0)","[15.8, 19.5)","[1231.84, 1266.03)","[0.0, 10.9)","[0.0, 19.6)","[0, 5)",Monza,FW


In [13]:
# Contingency table (raw counts) of age bucket vs. birth-decade bucket.
contingency = pd.crosstab(
    df_trans.Age_bucket,
    df_trans.Born_bucket,
    normalize=False,
    dropna=True,
)

display(contingency)

Born_bucket,"[1980.0, 1990.0)","[1990.0, 2000.0)","[2000.0, 2010.0)"
Age_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"[15.0, 20.0)",0,0,350
"[20.0, 25.0)",0,280,716
"[25.0, 30.0)",0,986,0
"[30.0, 35.0)",65,379,0
"[35.0, 40.0)",69,0,0
"[40.0, 45.0)",3,0,0


In [14]:
# Degree of association between Age_bucket and Born_bucket, using scipy's
# default method (Cramér's V according to the scipy documentation).
ss.contingency.association(contingency)

0.7905984918514753

In [15]:
# Information gain of Born_bucket given Age_bucket, as a fraction of H(Born_bucket).
information_gain(df=df_trans, X="Age_bucket", Y="Born_bucket", normalize=True)

0.6759704517318766

In [16]:
# Information gain of Squad given Age_bucket, as a fraction of H(Squad).
information_gain(df=df_trans, X="Age_bucket", Y="Squad", normalize=True)

0.02357667692967669

In [17]:
# Degree of association between Squad and Age_bucket, computed from their
# contingency table of raw counts.
ss.contingency.association(
    pd.crosstab(
        df_trans.Squad,
        df_trans.Age_bucket,
        normalize=False,
        dropna=True,
    )
)

0.20536794837501007

In [18]:
# Contingency table (raw counts) of position vs. goals bucket.
# The bucket labels are strings, so the columns come out in lexicographic
# order ("[10, 15)" before "[5, 10)"), not in numeric order.
pd.crosstab(
    df_trans.Pos,
    df_trans.Gls_bucket,
    normalize=False,
    dropna=True,
)

Gls_bucket,"[0, 5)","[10, 15)","[15, 20)","[20, 25)","[25, 30)","[30, 40)","[5, 10)"
Pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DF,839,1,0,0,0,0,8
"DF,FW",47,0,0,0,0,0,0
"DF,MF",105,0,0,0,0,0,3
FW,257,37,20,5,3,1,80
"FW,DF",26,0,0,0,0,0,2
"FW,MF",240,16,5,1,0,0,58
GK,203,0,0,0,0,0,0
MF,521,3,1,0,0,0,44
"MF,DF",78,0,0,0,0,0,3
"MF,FW",204,6,1,0,0,0,34


In [19]:
# Degree of association between Pos and Gls_bucket, computed from their
# contingency table of raw counts.
ss.contingency.association(
    pd.crosstab(
        df_trans.Pos,
        df_trans.Gls_bucket,
        normalize=False,
        dropna=True,
    )
)

0.17421573518742584

In [20]:
# Information gain of Gls_bucket given Pos, as a fraction of H(Gls_bucket).
information_gain(df=df_trans, X="Pos", Y="Gls_bucket", normalize=True)

0.1865239649660588

In [21]:
# Heatmap of the Gls_bucket (rows) vs. Pos (columns) contingency table.
px.imshow(
    pd.crosstab(
        df_trans.Gls_bucket,
        df_trans.Pos,
        normalize=False,
        dropna=True,
    )
)

# Feature Engineering

* Número de partidos no jugados de titular