In [1]:
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from mplsoccer import VerticalPitch, Pitch
from highlight_text import ax_text, fig_text
import matplotlib.pyplot as plt

from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
m1 = pd.read_csv("WhoScored_ENG-Premier League.csv")
# m2 = pd.read_csv("WhoScored_ESP-La Liga.csv")
# m3 = pd.read_csv("WhoScored_FRA-Ligue 1.csv")
# m4 = pd.read_csv("WhoScored_GER-Bundesliga.csv")
# m5 = pd.read_csv("WhoScored_ITA-Serie A.csv")


In [3]:
matches = pd.concat([m1], axis = 0)

In [4]:
matches = matches[["game","game_id","type","outcome_type","team","fecha","player", "x", "y","minute"]]
matches = matches.sort_values(by=["fecha","game_id","minute"],ignore_index=True)

In [5]:
matches.shape

(7583712, 10)

In [6]:
example_match = matches[matches["game_id"] == 1729483] # 1729340  1729483 615168 410988

In [7]:
print(example_match["type"].unique())
print(example_match["outcome_type"].unique())

['FormationSet' 'Start' 'Pass' 'End' 'BallTouch' 'Foul' 'Aerial'
 'Interception' 'Dispossessed' 'Tackle' 'TakeOn' 'Clearance'
 'CornerAwarded' 'Claim' 'BlockedPass' 'Challenge' 'BallRecovery'
 'SavedShot' 'Save' 'KeeperPickup' 'OffsidePass' 'OffsideProvoked'
 'OffsideGiven' 'MissedShots' 'KeeperSweeper' 'ShieldBallOpp'
 'SubstitutionOff' 'SubstitutionOn' 'Goal' 'Punch' 'FormationChange'
 'CrossNotClaimed' 'Card']
['Successful' 'Unsuccessful']


In [16]:
def get_features_team_temporal(df_matches, match, n_matches, bins_x, bins_y, time_division): # time_division : divisor de 90 (usar por ejemplo 3,5,15,30,45)
    
    divisiones = list(range(0,90+1,time_division))
    # Agregar tiempo extra
    divisiones[-1] += 15
    ## Features elegidos
    features = ['Pass', 'TakeOn', 'Tackle', 'BallRecovery'
                'Interception', 'BlockedPass', 'Clearance', 'CornerAwarded', 'Aerial', 'Foul',
                'Dispossessed', 'BallTouch', 'ShieldBallOpp', 'Challenge', 'MissedShots',
                'SavedShot', 'Save', 'KeeperPickup', 'Goal', 'KeeperSweeper', 'Card', 'Smother',
                'OffsideProvoked', 'OffsidePass', 'OffsideGiven', 'Punch', 'Error']
    ##
    date = match["fecha"].unique()[0]

    df_matches_limited = df_matches[df_matches["fecha"] < date]
    str_teams = match["team"].unique()
    str_team1 = str_teams[0]
    str_team2 = str_teams[1]

    team1_match_ids = df_matches_limited[df_matches_limited["team"] == str_team1].game_id.unique()[-n_matches:]
    team1_events = df_matches_limited[(df_matches_limited["team"] == str_team1) & (df_matches_limited["game_id"].isin(team1_match_ids))]

    team2_match_ids = df_matches_limited[df_matches_limited["team"] == str_team2].game_id.unique()[-n_matches:]
    team2_events = df_matches_limited[(df_matches_limited["team"] == str_team2) & (df_matches_limited["game_id"].isin(team2_match_ids))]


    dict_team = {"Team" : str_team1}
    for fea in features:
        for div in divisiones[:-1]:
            pitch = Pitch(pitch_type='opta', pitch_color='white',
                            line_color='black', line_zorder=2)
            player_events_time = team1_events[(team1_events["minute"]>div) & (team1_events["minute"]<=div+time_division)]

            event_succesful = player_events_time[(player_events_time["type"] == fea) & (
                player_events_time["outcome_type"] == 'Successful')]
            event_unsuccessful = player_events_time[(player_events_time["type"] == fea) & (
                player_events_time["outcome_type"] == 'Unsuccessful')]

            bin_statistic_succesful = pitch.bin_statistic(
                event_succesful.x, event_succesful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

            bin_statistic_unsuccesful = pitch.bin_statistic(
                event_unsuccessful.x, event_unsuccessful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

            bins_successful = bin_statistic_succesful["statistic"].flatten()
            bins_unsuccessful = bin_statistic_unsuccesful["statistic"].flatten()

            for i in range(len(bins_successful)):
                dict_team[f"{fea}_Successful_bin_{i}_time_{div}-{div+time_division}"] = bins_successful[i]
                dict_team[f"{fea}_Unsuccessful_bin_{i}_time_{div}-{div+time_division}"] = bins_unsuccessful[i]

        # Agregar la cantidad de juegos 
        dict_team["Number_matches"] = len(player_events_time["game"].unique())
    
    df_team1 = pd.DataFrame([dict_team])

    dict_team = {"Team" : str_team2}
    for fea in features:
        for div in divisiones[:-1]:
            pitch = Pitch(pitch_type='opta', pitch_color='white',
                            line_color='black', line_zorder=2)
            player_events_time = team2_events[(team2_events["minute"]>div) & (team2_events["minute"]<=div+time_division)]

            event_succesful = player_events_time[(player_events_time["type"] == fea) & (
                player_events_time["outcome_type"] == 'Successful')]
            event_unsuccessful = player_events_time[(player_events_time["type"] == fea) & (
                player_events_time["outcome_type"] == 'Unsuccessful')]

            bin_statistic_succesful = pitch.bin_statistic(
                event_succesful.x, event_succesful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

            bin_statistic_unsuccesful = pitch.bin_statistic(
                event_unsuccessful.x, event_unsuccessful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

            bins_successful = bin_statistic_succesful["statistic"].flatten()
            bins_unsuccessful = bin_statistic_unsuccesful["statistic"].flatten()

            for i in range(len(bins_successful)):
                dict_team[f"{fea}_Successful_bin_{i}_time_{div}-{div+time_division}"] = bins_successful[i]
                dict_team[f"{fea}_Unsuccessful_bin_{i}_time_{div}-{div+time_division}"] = bins_unsuccessful[i]

        # Agregar la cantidad de juegos 
        dict_team["Number_matches"] = len(player_events_time["game"].unique())
        
    df_team2 = pd.DataFrame([dict_team])

    # Agregar fecha
    df_team1["Date"] = date
    df_team2["Date"] = date
    
    # Agregar id

    df_team1["game_id"] = match.game_id.unique()[0]
    df_team2["game_id"] = match.game_id.unique()[0]

    # Agregar resultado goles del equipo final
    df_team1["Goals"] = len(match[(match["type"] == "Goal") & (match["team"] == str_team1)])
    df_team2["Goals"] = len(match[(match["type"] == "Goal") & (match["team"] == str_team2)])

    columnas_a_mover = ["Team","Date","Goals","Number_matches","game_id"]

    # Obtén una lista de todas las columnas en el DataFrame
    columnas = df_team1.columns.tolist()

    # Mueve las columnas deseadas al principio
    for columna in reversed(columnas_a_mover):
        columnas.insert(0, columnas.pop(columnas.index(columna)))
    # Reordena las columnas del DataFrame
    df_team1 = df_team1[columnas]
    df_team2 = df_team2[columnas]

    return df_team1, df_team1

df_all_players_team1, df_all_players_team2 = get_features_team_temporal(matches, example_match, n_matches = 15, bins_x = 2, bins_y = 2, time_division=15) #usar divisor de 90

In [17]:
df_all_players_team1

Unnamed: 0,Team,Date,Goals,Number_matches,game_id,Pass_Successful_bin_0_time_0-15,Pass_Unsuccessful_bin_0_time_0-15,Pass_Successful_bin_1_time_0-15,Pass_Unsuccessful_bin_1_time_0-15,Pass_Successful_bin_2_time_0-15,...,Error_Successful_bin_3_time_60-75,Error_Unsuccessful_bin_3_time_60-75,Error_Successful_bin_0_time_75-90,Error_Unsuccessful_bin_0_time_75-90,Error_Successful_bin_1_time_75-90,Error_Unsuccessful_bin_1_time_75-90,Error_Successful_bin_2_time_75-90,Error_Unsuccessful_bin_2_time_75-90,Error_Successful_bin_3_time_75-90,Error_Unsuccessful_bin_3_time_75-90
0,Everton,2024-02-19,1,15,1729483,170.0,69.0,119.0,53.0,152.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
df_all_players_team2

Unnamed: 0,Team,Date,Goals,Number_matches,game_id,Pass_Successful_bin_0_time_0-15,Pass_Unsuccessful_bin_0_time_0-15,Pass_Successful_bin_1_time_0-15,Pass_Unsuccessful_bin_1_time_0-15,Pass_Successful_bin_2_time_0-15,...,Error_Successful_bin_3_time_60-75,Error_Unsuccessful_bin_3_time_60-75,Error_Successful_bin_0_time_75-90,Error_Unsuccessful_bin_0_time_75-90,Error_Successful_bin_1_time_75-90,Error_Unsuccessful_bin_1_time_75-90,Error_Successful_bin_2_time_75-90,Error_Unsuccessful_bin_2_time_75-90,Error_Successful_bin_3_time_75-90,Error_Unsuccessful_bin_3_time_75-90
0,Everton,2024-02-19,1,15,1729483,170.0,69.0,119.0,53.0,152.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
n_matches = 15
bins_x = 12
bins_y = 8
time_division = 45

n_workers = 12
matches_id = matches["game_id"].unique()[:]
resultado = []


# #En serie 
for id in tqdm(matches_id):
    data = get_features_team_temporal(matches, matches[matches["game_id"] == id], n_matches, bins_x, bins_y, time_division)
    resultado.append(data)

#En paralelo
# resultado = Parallel(n_jobs=n_workers, verbose=10)(
    # delayed(get_features_individuales_temporal)(matches[matches["fecha"]<=matches[matches["game_id"] == id]["fecha"].unique()[0]], matches[matches["game_id"] == id], n_matches, bins_x, bins_y, time_division) for id in tqdm(matches_id))

dfs_list = []
for df in resultado:
    dfs_list.append(df[0])
    dfs_list.append(df[1])
df = pd.concat(dfs_list, axis = 0)

df.to_csv(f"WhoScoredNumMatches{n_matches}Spatial{bins_x}x{bins_y}TimeDiv{time_division}.csv")

 47%|████▋     | 2263/4810 [3:24:05<3:49:42,  5.41s/it]


MemoryError: Unable to allocate 53.8 MiB for an array with shape (2, 3527973) and data type float64