## Metrica Data Processing

In [None]:
import os

# Move the working directory one level up so that the relative paths used in
# the following cells ("data/...", "animations/...") resolve against the
# parent of the directory the notebook was started in.
wd = os.path.normpath(os.getcwd() + '/..')
os.chdir(wd)
os.getcwd()  # shown as the cell output to confirm the new working directory

In [None]:
%load_ext autoreload
%autoreload 2

import json
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import torch
from matplotlib import animation
from tqdm import tqdm

from dataset import SoccerDataset
from datatools.metrica_helper import MetricaHelper
from datatools.trace_animator import TraceAnimator
from datatools.trace_helper import TraceHelper
from models import load_model

### Parsing Metrica Sample Game 3 Data

In [None]:
# Parse the Sample Game 3 metadata XML and keep its root element, which the
# following cells walk to extract player and phase information.
tree = ET.parse("data/metrica_traces/Sample_Game_3/Sample_Game_3_metadata.xml")
root = tree.getroot()
# Show the tags of the root's first two children.
root[0].tag, root[1].tag

In [None]:
# Collect one [shirt number, player code, position] record per <Player>
# element found anywhere below the metadata root.
player_records = []

for player in root.iter("Player"):
    # Player code = last character of the team ID + zero-padded shirt number.
    team_code = player.get("teamId")[-1]
    squad_num = int(player.findtext("ShirtNumber"))
    player_code = f"{team_code}{squad_num:02d}"

    # Reset for every player: without this, a player lacking a "position_type"
    # parameter would silently inherit the previous player's position (or
    # raise NameError if it happened to be the first player).
    position = None
    for param in player.iter("ProviderParameter"):
        if param.findtext("Name") == "position_type":
            position = param.findtext("Value")

    player_records.append([squad_num, player_code, position])

# Index by shirt number so that later cells can look players up with .at[].
player_records = pd.DataFrame(player_records, columns=["squad_num", "code", "position"]).set_index("squad_num")
player_records

In [None]:
# Build one record per child element of root[1]: its frame range, its session
# and the player codes (plus goalkeeper codes) tracked during that phase.
header = ["phase", "session", "start_frame", "end_frame", "player_codes", "gk_codes"]
phase_records = []

for i, data_spec in enumerate(root[1]):
    start_frame, end_frame = (int(data_spec.get(key)) for key in ("startFrame", "endFrame"))
    # Only the first phase is assigned to session 1; every later one to session 2.
    session = 2 if i > 0 else 1

    player_codes, gk_codes = [], []
    for player_xy in data_spec[1]:
        # The shirt number is what remains of the channel ID after dropping
        # its first 6 and last 2 characters.
        squad_num = int(player_xy[0].get("playerChannelId")[6:-2])
        player_code = player_records.at[squad_num, "code"]
        position = player_records.at[squad_num, "position"]

        player_codes.append(player_code)
        if position == "Goalkeeper":
            gk_codes.append(player_code)

    # Reorder the codes: element 10 first, then elements 0-9, then the last
    # element, then elements 11 up to (but excluding) the last.
    # NOTE(review): presumably this matches the column order of the tracking
    # text file for this game - confirm against the data.
    player_codes = (
        player_codes[10:11]
        + player_codes[:10]
        + player_codes[-1:]
        + player_codes[11:-1]
    )
    phase_records.append([i + 1, session, start_frame, end_frame, player_codes, gk_codes])

# Phases are numbered from 1 and used as the index.
phase_records = pd.DataFrame(phase_records, columns=header).set_index("phase")
phase_records

In [None]:
# Target layout: three time columns followed by an (x, y) column pair for
# every player code in player_records and for the ball.
time_cols = ["frame", "session", "time"]
xy_cols = np.array([[f"{p}_x", f"{p}_y"] for p in player_records["code"].tolist() + ["ball"]]).flatten().tolist()

# The tracking file is read as headerless, ";"-separated text.
traces_txt = pd.read_csv("data/metrica_traces/Sample_Game_3/Sample_Game_3_tracking.txt", sep=";", header=None)
# Empty frame with one row per row of the text file; it is filled phase by phase below.
traces = pd.DataFrame(index=traces_txt.index, columns=time_cols + xy_cols)

for phase in tqdm(phase_records.index):
    # Start/end frames are shifted by one to obtain row labels of traces_txt
    # (presumably because frame numbers are 1-based - confirm). Label-based
    # .loc slicing below includes both end points.
    i0 = phase_records.at[phase, "start_frame"] - 1
    i1 = phase_records.at[phase, "end_frame"] - 1
    player_codes = phase_records.at[phase, "player_codes"]

    # Label the raw columns with this phase's player codes; the number of
    # codes must equal the number of columns in the text file.
    phase_traces = traces_txt.loc[i0:i1]
    phase_traces.columns = player_codes
    # The leftmost column holds "<frame>:<first player's xy>" ...
    leftmost = phase_traces[player_codes[0]].str.split(":", expand=True)
    leftmost.columns = ["frame", player_codes[0]]
    # ... and the rightmost column holds "<last player's xy>:<ball xy>".
    rightmost = phase_traces[player_codes[-1]].str.split(":", expand=True)
    rightmost.columns = [player_codes[-1], "ball"]
    phase_traces = pd.concat([leftmost, phase_traces[player_codes[1:-1]], rightmost], axis=1)

    traces.loc[phase_traces.index, "frame"] = phase_traces["frame"].astype(int)
    traces.loc[phase_traces.index, "session"] = phase_records.at[phase, "session"]

    # Every remaining column is an "x,y" string; split it into two float columns.
    for p in phase_traces.columns[1:]:
        xy = phase_traces[p].str.split(",", expand=True).astype(float).values
        traces.loc[phase_traces.index, [f"{p}_x", f"{p}_y"]] = xy

# Each frame advances the time by 0.04 (presumably seconds, i.e. 25 frames per second - confirm).
traces["time"] = (traces["frame"] * 0.04).astype(float).round(2)
traces

In [None]:
# Persist the assembled traces. The row index is not written, so "frame" is
# the first column of the file. (The path has no placeholders, so it is a
# plain string rather than an f-string.)
traces.to_csv("data/metrica_traces/Sample_Game_3/Sample_Game_3_RawTrackingData.csv", index=False)

### Processing Metrica Data

In [None]:
match_id = 3

# The events file follows the same naming scheme for every sample game, so it
# is loaded before branching on the tracking-data format.
event_file = f"data/metrica_traces/Sample_Game_{match_id}/Sample_Game_{match_id}_RawEventsData.csv"
events = pd.read_csv(event_file)

if match_id > 2:  # match_id == 3
    # Single tracking CSV; its first column is used as the index.
    trace_file = f"data/metrica_traces/Sample_Game_{match_id}/Sample_Game_{match_id}_RawTrackingData.csv"
    traces = pd.read_csv(trace_file, index_col=0)
    helper = MetricaHelper(traces_from_txt=traces, events=events)
else:
    # Games 1 and 2 come as separate home/away CSVs with a three-row header.
    team1_file = f"data/metrica_traces/Sample_Game_{match_id}/Sample_Game_{match_id}_RawTrackingData_Home_Team.csv"
    team2_file = f"data/metrica_traces/Sample_Game_{match_id}/Sample_Game_{match_id}_RawTrackingData_Away_Team.csv"
    team1_traces, team2_traces = (pd.read_csv(f, header=[0, 1, 2]) for f in (team1_file, team2_file))
    helper = MetricaHelper(team1_traces, team2_traces, events=events)

helper.traces

In [None]:
# Let the helper build its phase records, then display them.
helper.generate_phase_records()
helper.phase_records

In [None]:
# Preprocessing pipeline. Each step is a MetricaHelper method implemented
# outside this notebook and is applied to the helper in the order listed
# (presumably later steps rely on the results of earlier ones - confirm
# before reordering).
helper.downsample_to_10fps()
helper.split_into_episodes()
helper.calc_running_features(remove_outliers=True, smoothing=True)
helper.find_gt_player_poss()
helper.find_gt_team_poss()
# The event player IDs are corrected for Sample Game 3 only.
if match_id == 3:
    helper.correct_event_player_ids()

helper.traces

In [None]:
# Save the processed traces and events under a name derived from the match
# ID, without writing the row index.
helper.traces.to_csv(f"data/metrica_traces/match{match_id}_.csv", index=False)
helper.events.to_csv(f"data/metrica_events/match{match_id}_.csv", index=False)

### Visualization for Metrica Data

##### Animating Trajectories

In [None]:
# Work on the processed traces held by the helper and display them.
traces = helper.traces
traces

In [None]:
# Row range for an optional subset; only used by the commented-out slice below.
i0 = 0
i1 = 2000
# traces = helper.traces[i0:i1]

# Build the animator for the current traces with episode and event display
# switched on and three annotation columns, then create the animation.
animator = TraceAnimator(
    trace_dict={"main": traces},
    show_episodes=True,
    show_events=True,
    annot_cols=["team_poss", "player_poss", "event_type"]
)
anim = animator.run()

# First and last timestamps formatted as "MM.SS"; they are only needed for
# the alternative (commented-out) output path below.
t0 = traces["time"].iloc[0]
t1 = traces["time"].iloc[-1]
t0_str = f"{int(t0 // 60):02d}.{int(t0 % 60):02d}"
t1_str = f"{int(t1 // 60):02d}.{int(t1 % 60):02d}"

path = f"animations/metrica_match{match_id}_test.mp4"
# path = f"animations/metrica_match{match_id}_{t0_str}-{t1_str}.mp4"
# Written through ffmpeg at 10 frames per second.
writer = animation.FFMpegWriter(fps=10)
anim.save(path, writer=writer)

##### Animating Feature Plots

In [None]:
session = 1
# Restrict the traces to the chosen session before plotting.
traces = helper.traces[helper.traces["session"] == session]
anim = TraceHelper.plot_speeds_and_accels(traces, helper.team1_players)
writer = animation.FFMpegWriter(fps=5)

# In this cell the flag only selects the name of the output file.
smoothing = True
path = (
    f"animations/feature_plots/metrica_match{match_id}_s{session}_smooth.mp4"
    if smoothing
    else f"animations/feature_plots/metrica_match{match_id}_s{session}_noisy.mp4"
)

anim.save(path, writer=writer)

### Validation-Test Split of Metrica Sample Game 3 Data

In [None]:
# Reload the processed match-3 traces and events; the first line of each
# file is used as the header.
traces = pd.read_csv("data/metrica_traces/match3_.csv", header=0)
events = pd.read_csv("data/metrica_events/match3_.csv", header=0)
events

In [None]:
# Session 1 becomes the validation split; it is written without the row index.
traces_valid = traces.loc[traces["session"].eq(1)]
traces_valid.to_csv("data/metrica_traces/match3_valid_.csv", index=False)

# Session 2 becomes the test split.
traces_test = traces.loc[traces["session"].eq(2)]
traces_test.to_csv("data/metrica_traces/match3_test_.csv", index=False)

In [None]:
# Split the events by session: session 1 is the validation part and session 2
# the test part.
events_valid = events[events["session"] == 1]
events_test = events[events["session"] == 2]
# DataFrame.to_csv returns None when it is given a path, so its result must
# not be assigned back: doing so would replace both DataFrames with None.
events_valid.to_csv("data/metrica_events/match3_valid_.csv", index=False)
events_test.to_csv("data/metrica_events/match3_test_.csv", index=False)

### Testing SoccerDataset-GK Based on Metrica Data

In [None]:
# Directory holding the processed trace files. It is named data_dir rather
# than dir so that the builtin dir() stays usable in the rest of the session.
data_dir = "data/metrica_traces"
train_files = ["match1.csv", "match2.csv", "match3_valid.csv"]
test_files = ["match3_test.csv"]
# Prefix every file name with the data directory.
train_paths = [f"{data_dir}/{f}" for f in train_files]
test_paths = [f"{data_dir}/{f}" for f in test_files]
train_paths, test_paths

In [None]:
# Build the dataset from the test file(s) only, with goalkeeper targets, in
# non-training mode and with pitch flipping enabled; show its length.
dataset = SoccerDataset(test_paths, target_type="gk", train=False, flip_pitch=True)
len(dataset)

In [None]:
# Fetch one sample; it unpacks into an input tensor and a target tensor.
idx = 3
input_tensor, target_tensor = dataset[idx]
input_tensor.shape, target_tensor.shape

In [None]:
# Input columns: players 02-11 of teams A and B, one column per entry of
# dataset.feature_types (player-major order).
team1_cols = [f"A{i:02d}{s}" for i in np.arange(2, 12) for s in dataset.feature_types]
team2_cols = [f"B{i:02d}{s}" for i in np.arange(2, 12) for s in dataset.feature_types]
# Target columns: x/y of A01 and B01 (presumably the two goalkeepers, given
# that the dataset was built with target_type="gk" - confirm).
cols = [*team1_cols, *team2_cols, "A01_x", "A01_y", "B01_x", "B01_y"]

# Put input and target side by side so that one row describes one time step.
sample = np.concatenate([input_tensor, target_tensor], axis=1)
traces = pd.DataFrame(sample, columns=cols)
# Time stamps start at 0.1 and advance by 0.1 per row over the dataset window.
traces["time"] = np.arange(1, dataset.ws + 1) * 0.1
traces.head()

In [None]:
# Animate the reconstructed sample and save it through ffmpeg at 10 frames
# per second; the sample index is part of the file name.
anim = TraceHelper.plot_scene(traces)
writer = animation.FFMpegWriter(fps=10)
path = f"animations/sample_metrica_gk_{idx}.mp4"
anim.save(path, writer=writer)

### Testing SoccerDataset-GK-Macro Based on Metrica Data

In [None]:
# Directory holding the processed trace files. It is named data_dir rather
# than dir so that the builtin dir() stays usable in the rest of the session.
data_dir = "data/metrica_traces"
# NOTE(review): the GK test section of this notebook lists "match3_valid.csv"
# as the third training file, while this cell uses "match3_train.csv" -
# confirm which file name is intended.
train_files = ["match1.csv", "match2.csv", "match3_train.csv"]
test_files = ["match3_test.csv"]
# Prefix every file name with the data directory.
train_paths = [f"{data_dir}/{f}" for f in train_files]
test_paths = [f"{data_dir}/{f}" for f in test_files]
train_paths, test_paths

In [None]:
# Build the dataset from the test file(s) with goalkeeper targets and
# macro_type="team_poss", then show the shapes of its input, macro and
# target data.
dataset = SoccerDataset(test_paths, target_type="gk", macro_type="team_poss", train=False, flip_pitch=True)
dataset.input_data.shape, dataset.macro_data.shape, dataset.target_data.shape

In [None]:
# Fetch one sample; here it unpacks into input, macro and target tensors.
idx = 3
input_tensor, macro_tensor, target_tensor = dataset[idx]
input_tensor.shape, macro_tensor.shape, target_tensor.shape