# Data cleaning

In [1]:
import sys
sys.path.append('utils/')
import pandas as pd
import numpy as np
from utils import Metrica_IO as mio
from scipy.spatial import ConvexHull
from tqdm import trange

Import the datasets:

In [2]:
DATA_DIR = 'data/'
game_id = 2
home_xy, away_xy, events = mio.read_match_data(DATA_DIR, game_id)

Reading team: home
Reading team: away


Consider the dataset *events*. The condition `Type`=*SET PIECE* identifies a moment in which the game restart after a stop (kick-in after the ball exits from the pitch, or the kick-off after a goal). Thus, we seek to remove all the frames corresponding to "dead" moments preceding a set piece.

In [3]:
# Indexes of the rows where the set pieces start
index_list = events.loc[events['Type'] == 'SET PIECE'].index.to_list()
# We need of course the indexes of the events preceding the set pieces!
index_list.extend([index_list[i] -1 for i in range(1, len(index_list))])
sorted_index_list = sorted(index_list)[1:]

We now define an empty list that will store the frames to be discarded:

In [4]:
frames_to_remove = []
for i in sorted_index_list[::2]:
    frames_to_remove.extend(list(range(events['End Frame'][i], events['Start Frame'][i+1])))
print("Frames to remove (dead moments before SET PIECES)-->",len(frames_to_remove))

Frames to remove (dead moments before SET PIECES)--> 42074


Now we discard the frames associated to dead moments between an infraction and the moment in which the ref extracts the card

*Note*: the reasoning is exactly the same done for the set pieces!

In [5]:
card_list = events.loc[events['Type'] == 'CARD'].index.to_list()
card_list.extend([card_list[i] -1 for i in range(0, len(card_list))])
sorted_card_list = sorted(card_list)[:]

In [6]:
for i in sorted_card_list[::2]:
    frames_to_remove.extend(list(range(events['End Frame'][i], events['Start Frame'][i+1])))
print("Frames to remove (dead moments before SET PIECES and before CARDS)-->",len(frames_to_remove))

Frames to remove (dead moments before SET PIECES and before CARDS)--> 44508


Trick to remove potential duplicates:

In [8]:
frames_to_remove = list(set(frames_to_remove))
len(frames_to_remove)

44508

We also remove the frames preceding the effective beginning of the match:

In [3]:
# Start from the kick off
home_xy=home_xy[50:]
away_xy=away_xy[50:]

Now that we cleaned up the frames, we can convert the coordinates to the same reference system of the pitch:

In [4]:
home_xy=mio.to_metric_coordinates(home_xy)
away_xy=mio.to_metric_coordinates(away_xy)
events=mio.to_metric_coordinates(events)

In [5]:
home_xy.to_csv("data/home_xy.csv",index=False)

In [6]:
away_xy.to_csv("data/away_xy.csv",index=False)

# Convex Hulls

For each dataframe we compute the area of the convex hull identified by the players of each team

In [12]:
hulls_df=pd.DataFrame(columns=['Period','Frame','Time [s]','HomeHull','AwayHull'])
for frame in trange(home_xy.shape[0],desc="Convex Hulls estimation:"):
    time=home_xy.iloc[frame,:]['Time [s]']
    period=home_xy.iloc[frame,:]['Period']
    home_data=home_xy.iloc[frame,:]
    away_data=away_xy.iloc[frame,:]
    home_data=home_data.dropna()
    away_data=away_data.dropna()
    ball=np.array(home_data[-2:])
    home_data= home_data[4:-2] #exclude both the goalkeeper and the ball
    away_data= away_data[4:-2] #exclude both the goalkeeper and the ball
    #--------------------------------------------------------------------
    # divide x and y
    home_data_x=home_data[home_data.index.str.contains('_x')]
    home_data_y=home_data[home_data.index.str.contains('_y')]
    away_data_x=away_data[away_data.index.str.contains('_x')]
    away_data_y=away_data[away_data.index.str.contains('_y')]
    #--------------------------------------------------------------------
    # Coordinates
    home_pts= np.array([[x,y] for x,y in zip(home_data_x,home_data_y)])
    away_pts= np.array([[x,y] for x,y in zip(away_data_x,away_data_y)])
    # Compute the convex hulls
    home_hull=ConvexHull(home_pts)
    away_hull=ConvexHull(away_pts)
    # Compute the area of the convex hulls
    home_area=home_hull.volume
    away_area=away_hull.volume
    
    hulls_df.loc[frame]=[period,frame,time,home_area,away_area]
    
hulls_df.to_csv('data/hulls_df_matchday2.csv',index=False)

Convex Hulls estimation:: 100%|██████████| 96598/96598 [01:58<00:00, 812.78it/s] 


Code to make a gif for a range of frames:

# Grouping frames to reduce dimensionality

In [26]:
hulls_df.head(10)

Unnamed: 0,Period,Frame,Time [s],HomeHull,AwayHull
0,1.0,0.0,2.04,580.876874,782.477901
1,1.0,1.0,2.08,581.11677,782.804969
2,1.0,2.0,2.12,581.348315,783.177048
3,1.0,3.0,2.16,581.702889,783.361684
4,1.0,4.0,2.2,582.074936,783.558528
5,1.0,5.0,2.24,582.443186,783.839906
6,1.0,6.0,2.28,582.874884,784.179156
7,1.0,7.0,2.32,583.252112,784.663064
8,1.0,8.0,2.36,583.638331,785.343671
9,1.0,9.0,2.4,584.099468,786.090308


In [27]:
hulls_df.shape

(96598, 5)

We group the frames belonging to the same second

In [28]:
hulls_df["Time [s]"]=np.floor(hulls_df["Time [s]"])

In [29]:
hulls_df_compact=hulls_df.loc[:,["Period","Time [s]","HomeHull","AwayHull"]].groupby("Time [s]").mean().reset_index()
hulls_df_compact.head(10)

Unnamed: 0,Time [s],Period,HomeHull,AwayHull
0,2.0,1.0,586.134796,789.332167
1,3.0,1.0,610.435976,831.996092
2,4.0,1.0,642.984305,917.031595
3,5.0,1.0,651.063432,972.12781
4,6.0,1.0,667.510469,1036.001978
5,7.0,1.0,669.921617,1087.138392
6,8.0,1.0,657.239068,1174.089487
7,9.0,1.0,658.414797,1298.943735
8,10.0,1.0,672.590132,1420.388919
9,11.0,1.0,696.095962,1519.930379


After some analysis, we decided to group the frames every 2 seconds:

In [30]:
hulls_df_reduced= pd.DataFrame(columns=['Time [s]','Period','HomeHull','AwayHull'])
for row in range(0,hulls_df_compact.shape[0],2):
    time= hulls_df_compact.loc[row,"Time [s]"]
    period= hulls_df_compact.loc[row,"Period"]
    home= hulls_df_compact.loc[row:row+1,"HomeHull"].mean()
    away= hulls_df_compact.loc[row:row+1,"AwayHull"].mean()
    hulls_df_reduced.loc[row]=[time,period,home,away]

In [31]:
hulls_df_reduced.head()

Unnamed: 0,Time [s],Period,HomeHull,AwayHull
0,2.0,1.0,598.285386,810.66413
2,4.0,1.0,647.023869,944.579702
4,6.0,1.0,668.716043,1061.570185
6,8.0,1.0,657.826932,1236.516611
8,10.0,1.0,684.343047,1470.159649


In [32]:
hulls_df_reduced.to_csv("data/hulls_df_matchday2_reduced.csv",index=False)