In [1]:
# Copyright 2018 Esref Ozdemir
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Train/Test Set Construction
In this document, we construct training and test sets from already computed feature sets. The sets are computed according to the following directory layout:

```
data
├── test
├── test_events
├── test_feature
├── test_hasball
├── train
├── train_events
├── train_feature
└── train_hasball
```

When `dataset = 'test'`, events are read from `test_events`, features from `test_feature`, and hasball data from `test_hasball`; the resulting test dataset is written to `test`. The same logic applies when `dataset = 'train'`.

In [1]:
%matplotlib inline
from os import listdir
from os.path import join
import pickle
import numpy as np
import pandas as pd
import multiprocessing
from collections import Counter
from random import shuffle

from sklearn.metrics import f1_score, confusion_matrix
from utils import plot_hbar_nameval, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier


# Turn on pandas' optional accelerated compute backends (bottleneck, numexpr).
pd.options.compute.use_bottleneck = True
pd.options.compute.use_numexpr = True

### Event IDs

In [2]:
# Event id -> display name for the classes used in this notebook.
event_names = {
    0: 'Null',
    60: 'Corner',
    62: 'Freekick',
    80: 'Goal',
    93: 'Penalty',
}

# Persist the mapping to disk as a pickle file.
with open('../data/event_names.pkl', 'wb') as names_file:
    pickle.dump(event_names, names_file)

**Null event (0)** represents all event categories, apart from the ones we are interested in, **that occur while the game is stopped**. For example, if the events we want to predict are possession, corner, penalty, freekick, and goal, then the null class covers events such as throw-in, out, and goal-kick. Note that in the dataset constructed below, possession frames are also labeled with id 0.

### Complete Event List

In [4]:
# Load the complete catalogue of event definitions. It is bound to its own name
# so that the id -> name dict `event_names` is not replaced by a DataFrame.
event_definitions = pd.read_csv('../doc/event_definitions_en.csv')
print('Number of events: {}'.format(len(event_definitions)))

Number of events: 38


## Event Data

In [4]:
# Load the raw event log of a single training match and preview it.
event_df = pd.read_csv('../data/train_events/20165_event.csv')
display(event_df.head())

# Sorted list of the distinct event ids that occur in this match.
event_ids = np.unique(event_df['eventId'].values)
print('Event ids: {}'.format(event_ids))
print('Size: {}'.format(event_ids.size))

# Show the rows with event id 93 (penalty).
event_df.loc[event_df['eventId'] == 93]

Unnamed: 0,teamId,eventId,jersey,half,minute,second,location,bodyPart,postLocation,custom
0,11,2,23,1,0,0,-1,-1,-1,-1
1,11,2,53,1,0,1,-1,-1,-1,-1
2,68,20,77,1,0,20,6,-1,-1,-1
3,11,21,11,1,0,20,6,-1,-1,-1
4,11,62,10,1,1,23,2,-1,-1,-1


Event ids: [ 2  4 10 11 12 20 21 30 31 32 40 41 60 62 65 66 70 71 72 80 81 90 91 92 93
 97 98]
Size: 27


Unnamed: 0,teamId,eventId,jersey,half,minute,second,location,bodyPart,postLocation,custom
478,11,93,20,1,30,30,-1,-1,-1,1
479,68,93,3,1,30,44,-1,-1,-1,0
491,11,93,5,1,31,38,-1,-1,-1,2
492,68,93,83,1,31,42,-1,-1,-1,3


## Dataset Construction
In this section, we construct a combined dataset that pairs event ids (taken from the event data) with the corresponding feature rows (taken from the feature data).

### Parameters

#### Intervals
When obtaining feature data for a given event, we get all the feature rows in a predefined time interval for that particular event type. The main rationale behind this is that events we try to predict spread over time.

* $+$: More efficient data usage.
* $-$: In the end, time intervals are yet another hyperparameter that needs to be optimized in order to obtain an optimal model.
* $-$: Arbitrary initial values may be totally different than reality.
* $-$: Too large intervals would lead to **greater label noise**.

Choose the dataset to construct

In [5]:
# Which split to build: 'test' or 'train'. Selects the ../data/<dataset>_* input
# directories and the ../data/<dataset> output directory.
dataset = 'test'

In [6]:
import re


# Collection window per event id: (span before the event, span after the event).
# get_event_seconds() collects frames in [event - before, event + after].
event_intervals = {
    60: (2, 0),    # corner
    61: (1, 2),    # out
    62: (2, 0),    # freekick
    63: (2, 1),    # indirect freekick
    64: (1, 0),    # throw-in
    65: (2, 0),    # offside
    80: (0, 15),   # goal
    93: (0, 15),   # penalty
    98: (0, 2),    # injury
}

# Event ids that are kept as their own class in the dataset.
predict_event_ids = {60, 62, 80, 93}
# Event ids that are collected and later relabeled to the null class (0).
other_event_ids = {61, 64, 65, 98}

# ratio of event_count/other_count
EVENT_TO_OTHER_RATIO = 8
# ratio of (other_count + event_count)/possession_count
EVENT_TO_POSSESSION_RATIO = 8

# file based constants (all derived from the selected `dataset`)
event_dir     = '../data/{}_events'.format(dataset)
# Matches only names of the form "<digits>_event.csv". The dot is escaped and
# the end is anchored because `match()` anchors the start of the string only.
event_regex   = re.compile(r'\d+_event\.csv\Z')
feature_dir   = '../data/{}_feature'.format(dataset)
hasball_dir   = '../data/{}_hasball'.format(dataset)

### Utility Functions
Generic utility functions used throughout the construction.

In [7]:
from utils import hms_to_sec, hms_to_sec_vec, separate_home_away


def get_event_seconds(feature_df, hasball_df, hms, span):
    """
    Returns all frames from feature_df within time limits
    [hms - span[0], hms + span[1]] at which the game is stopped.

    A frame counts as "game stopped" when the hasball row with the same
    (half, minute, second) has teamPoss == -1. If several hasball rows share
    a timestamp, only the first one is considered. Frames without a matching
    hasball row are discarded.

    Parameters
    ----------
    feature_df: `pandas.DataFrame` containing the feature data.
    hasball_df: `pandas.DataFrame` containing the hasball data.
    hms: (half, minute, second) triple indicating when the event happened.
    span: (past_limit, future_limit) pair.

    Returns
    -------
    `pandas.DataFrame` holding the selected rows of feature_df, in their
    original order and with their original index labels.
    """
    time_keys = ['half', 'minute', 'second']

    sec = hms_to_sec(hms)
    begin_sec = sec - span[0]
    end_sec = sec + span[1]
    hms_vec = hms_to_sec_vec(feature_df[time_keys].values)

    # frames inside the requested time window
    sec_mask = (hms_vec >= begin_sec) & (hms_vec <= end_sec)
    window_df = feature_df[sec_mask]
    if window_df.empty:
        return window_df

    # One hasball row per timestamp (the first one), then a single left merge
    # instead of scanning hasball_df once per feature row. With unique keys on
    # the right, the merge yields exactly one row per window row, in the same
    # order, so the resulting mask can be applied positionally.
    first_hasball = hasball_df.drop_duplicates(subset=time_keys, keep='first')
    matched = window_df[time_keys].merge(
        first_hasball[time_keys + ['teamPoss']],
        on=time_keys,
        how='left',
    )

    # Unmatched timestamps have teamPoss == NaN and are therefore dropped too.
    stopped_mask = (matched['teamPoss'] == -1).values
    return window_df[stopped_mask]

### Construction Related Functions
These are functions that are heavily coupled with the construction code, and are mainly intended for local code reuse and code readability.

In [8]:
def collect_event_features(all_events_df, events,
                           feature_df, hasball_df, max_samples=None):
    """
    Collects the feature frames that belong to the given events.

    For every event row, the frames returned by get_event_seconds for the
    window event_intervals[eid] are labeled with the row's eventId (inserted
    as the first column), stripped of the half/minute/second columns and
    appended to the samples in all_events_df.

    Parameters
    ----------
    all_events_df: `pandas.DataFrame` with the samples collected so far.
        It is not modified; the extended frame is returned.
    events: dict mapping event id -> `pandas.DataFrame` of event rows.
    feature_df: `pandas.DataFrame` containing the feature data.
    hasball_df: `pandas.DataFrame` containing the hasball data.
    max_samples: optional sample budget. Collection stops right after the
        first event row that brings the number of collected samples to
        max_samples or more. The check runs after a row has been collected,
        so the total may exceed max_samples.

    Returns
    -------
    (num_samples, all_events_df): number of samples collected by this call
    and the extended samples frame.
    """
    num_samples = 0
    collected = []
    budget_reached = False

    for eid, event_df in events.items():
        span = event_intervals[eid]
        for _, row in event_df.iterrows():
            # get all the features from the time interval
            features = get_event_seconds(
                feature_df,
                hasball_df,
                row[['half', 'minute', 'second']],
                span
            )
            num_samples += len(features)
            features.insert(0, 'eventId', row['eventId'])
            collected.append(
                features.drop(['half', 'minute', 'second'], axis=1)
            )

            if max_samples is not None and num_samples >= max_samples:
                budget_reached = True
                break
        if budget_reached:
            break

    if collected:
        # Concatenate once instead of growing the frame per event row
        # (DataFrame.append is gone in pandas 2.x and was quadratic anyway).
        # A completely empty placeholder such as pd.DataFrame() is left out
        # so that it cannot influence the resulting column dtypes.
        if all_events_df.shape != (0, 0):
            collected.insert(0, all_events_df)
        all_events_df = pd.concat(collected, ignore_index=True)

    return num_samples, all_events_df

### Construction
Here we construct the combined dataset from all match data we have in the given data directories.

In [9]:
# Seconds added to the event timestamp of goals / penalties before their
# frames are collected.
_GOAL_FRAME_DELAY = 5
_PENALTY_FRAME_DELAY = 10


def _possession_samples(feature_df, possession_mask, match_id):
    """
    Returns the shuffled feature rows selected by possession_mask, labeled
    with eventId 0 and without the half/minute/second columns.

    Best effort: if the rows cannot be selected or prepared, a message is
    printed and None is returned so that the match still contributes its
    event samples.
    """
    try:
        # NOTE(review): possession_mask is indexed like hasball_df and pandas
        # aligns it with feature_df by index label. This assumes both frames
        # are row-aligned -- TODO confirm, since feature_df has had its first
        # row dropped and has been re-indexed.
        samples = feature_df[possession_mask].sample(frac=1)
        samples.insert(0, 'eventId', 0)
        return samples.drop(['half', 'minute', 'second'], axis=1)
    except Exception as error:
        print('Skipping possession samples for {}: {!r}'.format(match_id, error))
        return None


def construct(event_file):
    """
    Builds the labeled sample frame for a single match.

    Parameters
    ----------
    event_file: name of an event csv inside event_dir, of the form
        '<match id>_event.csv'. The matching '<match id>_feature.csv' and
        '<match id>_hasball.csv' are read from feature_dir and hasball_dir.

    Returns
    -------
    `pandas.DataFrame` whose first column is 'eventId' followed by the feature
    columns (half/minute/second removed), or None when the feature or hasball
    file of the match does not exist.

    The frame contains, in this order:
      1. frames of the events in predict_event_ids,
      2. frames of the events in other_event_ids, limited to roughly
         1/EVENT_TO_OTHER_RATIO of the number of frames from step 1 and
         relabeled to eventId 0,
      3. all frames in which the home team (teamPoss == 1) and then the away
         team (teamPoss == 0) has the ball, shuffled and labeled eventId 0.
    """
    match_id = event_file.split('_')[0]

    # get event data
    event_df = pd.read_csv(join(event_dir, event_file))

    # get corresponding feature data
    try:
        feature_df = pd.read_csv(join(feature_dir, match_id + '_feature.csv'))
    except FileNotFoundError:
        print('No feature data for {}'.format(match_id))
        return None
    # The first feature row is discarded and the index renumbered from 0.
    feature_df = feature_df.drop(0).reset_index(drop=True)

    # get corresponding hasball data
    try:
        hasball_df = pd.read_csv(join(hasball_dir, match_id + '_hasball.csv'))
    except FileNotFoundError:
        print('No hasball data for {}'.format(match_id))
        return None

    # get events we are interested in
    predict_events = {eid: event_df[event_df['eventId'] == eid]
                      for eid in predict_event_ids}

    # special treatment for events that need it
    ## goals: keep custom == 0 only and start collecting frames
    ## _GOAL_FRAME_DELAY seconds after the event
    goals = predict_events[80]
    predict_events[80] = goals[goals['custom'] == 0].assign(
        second=lambda rows: rows['second'] + _GOAL_FRAME_DELAY
    )
    ## penalties: use multiple custom values (0 and 1) to get as many frames
    ## as possible and start _PENALTY_FRAME_DELAY seconds after the event
    penalties = predict_events[93]
    predict_events[93] = penalties[penalties['custom'].isin([0, 1])].assign(
        second=lambda rows: rows['second'] + _PENALTY_FRAME_DELAY
    )

    # get "other" events, shuffled so that the sample budget below does not
    # always pick the earliest occurrences
    other_events = {}
    for eid in other_event_ids:
        rows = event_df[event_df['eventId'] == eid]
        other_events[eid] = rows if rows.empty else rows.sample(frac=1)

    # collect feature data corresponding to specified events
    num_event_samples, all_events_df = collect_event_features(
        pd.DataFrame(),
        predict_events,
        feature_df,
        hasball_df,
    )

    # collect feature data corresponding to "other" events
    _, all_events_df = collect_event_features(
        all_events_df,
        other_events,
        feature_df,
        hasball_df,
        max_samples=int(num_event_samples//EVENT_TO_OTHER_RATIO)
    )

    # collect possession samples for the home team, then for the away team
    # NOTE: every possession frame is kept (sample(frac=1) only shuffles);
    # EVENT_TO_POSSESSION_RATIO is not applied here.
    home_features = _possession_samples(
        feature_df, hasball_df['teamPoss'] == 1, match_id)
    away_features = _possession_samples(
        feature_df, hasball_df['teamPoss'] == 0, match_id)

    # Concatenate once; skip failed parts and completely empty placeholders.
    parts = [part for part in (all_events_df, home_features, away_features)
             if part is not None and part.shape != (0, 0)]
    all_events_df = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

    # "other" events only serve as additional null samples
    if 'eventId' in all_events_df.columns:
        other_mask = all_events_df['eventId'].isin(other_event_ids)
        all_events_df.loc[other_mask, 'eventId'] = 0

    return all_events_df

We speed up the computation by using all available CPU cores via the `multiprocessing` module.

In [10]:
# Only files named like "<match id>_event.csv" are processed.
event_csv_files = [f for f in listdir(event_dir) if event_regex.match(f)]
shuffle(event_csv_files)
print(len(event_csv_files))

# The context manager shuts the worker processes down once every match has
# been processed, so no worker pool is left running in the kernel.
with multiprocessing.Pool() as pool:
    match_frames = pool.map(construct, event_csv_files)

# construct() returns None for matches whose feature/hasball data is missing.
df = pd.concat([frame for frame in match_frames if frame is not None])
df = df.sort_values('eventId').reset_index(drop=True)

df.shape

20




(61638, 45)

In [11]:
# Class distribution of the constructed dataset: column 0 holds `eventId`.
Counter(df.values[:, 0])

Counter({0.0: 60007, 60.0: 439, 62.0: 290, 80.0: 712, 93.0: 190})

In [12]:
# Preview the first rows, then report the frame's dimensions.
# Note: the column count includes the `eventId` label column.
display(df.head())
print('n_samples\t= {}\nn_features\t= {}'.format(df.shape[0], df.shape[1]))

Unnamed: 0,eventId,awayAvgX,awayAvgY,awayConvexCenterX,awayConvexCenterY,awayConvexClosestDistance,awayConvexFarDistance,awayConvexMaxSpeed,awayConvexMaxX,awayConvexMaxY,...,playerConvexMaxX,playerConvexMaxY,playerConvexMinX,playerConvexMinY,playerDenseClusterDensity,playerSparseClusterDensity,playerVerticalLinearity,refSpeed,refX,refY
0,0,77.793636,31.352727,80.976,28.052,8.634157,28.883452,4.502085,90.46,53.27,...,90.46,57.89,50.05,0.77,0.517354,0.468609,2.532833,2.959187,78.4,38.13
1,0,63.196364,29.464545,63.275,32.475,14.847453,34.510567,6.067635,76.35,56.13,...,76.35,66.2,36.97,0.42,0.45882,0.348719,3.28084,3.948721,59.61,34.85
2,0,54.965,27.724,51.8875,31.405,13.872879,25.026157,2.854645,65.59,45.27,...,65.59,51.4,27.59,9.66,0.698171,0.440077,3.587116,0.96026,52.38,38.89
3,0,28.19,39.857,28.285714,37.308571,18.798221,32.27596,5.808244,51.91,62.24,...,51.91,62.24,0.21,18.98,0.513904,0.493153,1.182147,2.147859,23.42,40.64
4,0,62.302,23.14,63.983333,24.126667,8.855736,24.49528,1.697587,73.53,42.73,...,74.17,46.95,47.18,5.17,0.858259,0.382574,2.110818,1.889047,58.85,21.98


n_samples	= 61638
n_features	= 45


## Exporting

In [17]:
# Write the combined samples to ../data/<dataset>/all_<dataset>.csv without the row index.
df.to_csv('../data/{dataset}/all_{dataset}.csv'.format(dataset=dataset), index=False)