In [1]:
# Copyright 2018 Esref Ozdemir
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Feature Construction
In this notebook we will construct features from raw match data (`<match_id>_raw.txt` files).

## Raw Data File Format
Each raw match **.txt** file has the following structure:
```
<is_converted> <home_left>
<match_id>\t<timestamp>\t<half>\t<minute>\t<second>\t<player_list>\n
```
where a ```<player_list>``` has the following structure:
```
<player_1> <player_2> ... <player_N>
```
and ```<player_i>``` has the following structure:
```
<type_i>,<id_i>,<jersey_i>,<x_i>,<y_i>
```
for $i \in \{1, 2, \dots, N\}$.

In this format,
* ```<is_converted>``` denotes whether the match is converted so that home team is always on the left, regardless of the match half. ```<home_left>``` denotes whether home team has started the match on the left half ($x \in [0, 52.5]$).

  * If ```<is_converted>``` is **true**, then ```<home_left>``` is not used. Home team coordinates are always on the left.
  * If ```<is_converted>``` is **false**, then the match should be manually converted so that home team is always on the left regardless of the match half. ```<home_left>``` should be used during this conversion.
  
* ```<type_i>``` (the player type) is a key of the following mapping:
```python
    {
        0 : 'home_player',
        1 : 'away_player',
        2 : 'referee',
        3 : 'home_goalkeeper',
        4 : 'away_goalkeeper'
    }
```

In [99]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from scipy.spatial import ConvexHull
from scipy.spatial.qhull import QhullError
from sklearn.cluster import KMeans

from utils import (group_indices, inner_dist, gini_impurity,parse_raw_file,
                   raw_to_sec_df, remove_missing_raw_rows, hms_to_sec, dist)


# Turn on the pandas 'compute.use_bottleneck' and 'compute.use_numexpr'
# options (optional accelerated backends for numeric operations).
pd.set_option('compute.use_bottleneck', True)
pd.set_option('compute.use_numexpr', True)

In [100]:
data_dir = '../data/train_raw'
# os.listdir returns entries in arbitrary order; sort them so that the "first"
# raw file is the same on every machine and on every run of the notebook.
raw_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.txt'))
raw_file = os.path.join(data_dir, raw_files[0])
print(raw_file)
# parse_raw_file is a project helper from utils; the preview below shows the
# columns of the frame it returns.
df = parse_raw_file(raw_file)
df.head(10)

../data/train_raw/20125_raw.txt


Unnamed: 0,match_id,timestamp,half,minute,second,player_type,player_id,x,y
0,20125,68471899,1,0,0,2,1446,53.0,41.43
1,20125,68471899,1,0,0,0,1263,36.9,16.46
2,20125,68471899,1,0,0,0,1341,49.66,39.12
3,20125,68471899,1,0,0,0,1230,67.3,11.44
4,20125,68471899,1,0,0,0,1415,58.01,29.93
5,20125,68471899,1,0,0,1,1489,57.74,40.26
6,20125,68471899,1,0,0,4,1490,98.02,34.41
7,20125,68471899,1,0,0,0,1491,54.23,52.23
8,20125,68471899,1,0,0,0,1241,43.2,11.49
9,20125,68471899,1,0,0,0,1466,71.87,38.45


## Convert 100ms DataFrame to Second DataFrame
We will classify snapshots per second. Therefore, we use only the first snapshot out of 10 snapshots for a given second.

In [23]:
# Downsample the raw frame with the project helper raw_to_sec_df and compare
# the shapes before and after.
sec_df = raw_to_sec_df(df)
print('Raw dataframe shape: {}'.format(df.shape))
print('Second dataframe shape: {}'.format(sec_df.shape))

Raw dataframe shape: (1298887, 9)
Second dataframe shape: (129904, 9)


## Computing Features For a Single Match
In this section, we compute several features for a whole match. Each feature is computed for each second of the game. Thus, we iterate over the seconds of a match and compute each feature per second.

### Definitions

In [13]:
# Names of every feature column computed per second.
# Prefixes: 'home' / 'away' = one team, 'player' = both teams together,
# 'ref' = referee. compute_features uses sorted(features) as column order,
# so the order inside this list does not affect the output layout.
features = [
    'homeAvgX', 'awayAvgX',
    'homeAvgY', 'awayAvgY',
    'refX',
    'refY',
    'refSpeed',
    'playerConvexMaxX',            'homeConvexMaxX',            'awayConvexMaxX',
    'playerConvexMaxY',            'homeConvexMaxY',            'awayConvexMaxY',
    'playerConvexMinX',            'homeConvexMinX',            'awayConvexMinX',
    'playerConvexMinY',            'homeConvexMinY',            'awayConvexMinY',
    'playerConvexCenterX',         'homeConvexCenterX',         'awayConvexCenterX',
    'playerConvexCenterY',         'homeConvexCenterY',         'awayConvexCenterY',
    'playerConvexMaxSpeed',        'homeConvexMaxSpeed',        'awayConvexMaxSpeed',
    'playerConvexFarDistance',     'homeConvexFarDistance',     'awayConvexFarDistance',
    'playerConvexClosestDistance', 'homeConvexClosestDistance', 'awayConvexClosestDistance', 
    'homeInnerDistance',           'awayInnerDistance',
    'playerDenseClusterDensity',   'homeDenseClusterDensity',   'awayDenseClusterDensity',
    'playerSparseClusterDensity',  'homeSparseClusterDensity',  'awaySparseClusterDensity',
    'playerVerticalLinearity',
    'maxClusterImpurity',
]
# Sanity check: number of feature columns.
print(len(features))

44


### Computation Functions

In [60]:
def speed(x0, y0, x1, y1, t):
    '''
    Return the average speed of a move between (x0, y0) and (x1, y1) that
    took t time units, i.e. dist() of the two points divided by t.

    t must be strictly positive; this is enforced with an assertion.
    '''
    assert t > 0
    travelled = dist(x0, y0, x1, y1)
    return travelled / t

In [93]:
def avg_min_max_stats(df, prefix, out_df, i):
    '''
    Store the mean x and mean y of df in row i of out_df, under the columns
    prefix + 'AvgX' and prefix + 'AvgY'.

    Despite its name, only the averages are computed here.
    '''
    for coord, column_suffix in (('x', 'AvgX'), ('y', 'AvgY')):
        out_df.at[i, prefix + column_suffix] = df[coord].mean()

    
def _set_convex_nan(out_df, i, prefix):
    '''Mark every convex hull feature of row i (for this prefix) as missing.'''
    for suffix in ('ConvexMaxX', 'ConvexMaxY', 'ConvexMinX', 'ConvexMinY',
                   'ConvexCenterX', 'ConvexCenterY', 'ConvexMaxSpeed',
                   'ConvexFarDistance', 'ConvexClosestDistance'):
        out_df.at[i, prefix + suffix] = np.nan


def _convex_max_speed(df, prev_df, player_ids):
    '''
    Return the highest speed among the given players between prev_df and df,
    or NaN when none of them appears in prev_df.
    '''
    if prev_df.empty:
        return np.nan

    time_cols = ['half', 'minute', 'second']
    max_speed = None
    for pid in player_ids:
        prev_row = prev_df[prev_df['player_id'] == pid]
        if prev_row.empty:
            # player was not tracked in the previous second
            continue

        curr_row = df[df['player_id'] == pid]
        t = (hms_to_sec(curr_row[time_cols].values[0, :]) -
             hms_to_sec(prev_row[time_cols].values[0, :]))
        curr_speed = speed(
            curr_row['x'].values[0],
            curr_row['y'].values[0],
            prev_row['x'].values[0],
            prev_row['y'].values[0],
            t
        )
        max_speed = curr_speed if max_speed is None else max(max_speed, curr_speed)

    return np.nan if max_speed is None else max_speed


def convex_stats(df, prev_df, prefix, out_df, i):
    '''
    Compute convex hull features of the points in df and write them to row i
    of out_df under columns starting with prefix + 'Convex'.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the current second; needs the columns x, y, player_id, half,
        minute and second.
    prev_df : pandas.DataFrame
        Rows of the previous second, used for the max speed of hull players.
        Only consulted when i > 0.
    prefix : str
        Column name prefix, e.g. 'home', 'away' or 'player'.
    out_df : pandas.DataFrame
        Feature frame that is modified in place.
    i : int
        Row label of out_df to write to. For i == 0 the max speed is 0.

    Notes
    -----
    All nine features are set to NaN when no hull can be built (fewer than
    three points, or qhull rejects the input, e.g. collinear points).
    ConvexMaxSpeed is NaN when none of the hull players is present in
    prev_df; previously the internal -1e60 sentinel leaked into the output.
    '''
    xy = df[['x', 'y']].values

    # A 2-D hull needs at least three points. Checking up front also covers
    # the empty frame, which ConvexHull rejects with an error other than
    # QhullError.
    if len(xy) < 3:
        _set_convex_nan(out_df, i, prefix)
        return

    try:
        convex_indices = ConvexHull(xy).vertices
    except QhullError:
        _set_convex_nan(out_df, i, prefix)
        return

    points = xy[convex_indices]
    player_ids = df['player_id'].values[convex_indices]

    # convex max, min, center (center = mean of the hull vertices)
    center_x = points[:, 0].mean()
    center_y = points[:, 1].mean()
    out_df.at[i, prefix + 'ConvexMaxX'] = points[:, 0].max()
    out_df.at[i, prefix + 'ConvexMaxY'] = points[:, 1].max()
    out_df.at[i, prefix + 'ConvexMinX'] = points[:, 0].min()
    out_df.at[i, prefix + 'ConvexMinY'] = points[:, 1].min()
    out_df.at[i, prefix + 'ConvexCenterX'] = center_x
    out_df.at[i, prefix + 'ConvexCenterY'] = center_y

    # convex max speed
    if i > 0:
        max_speed = _convex_max_speed(df, prev_df, player_ids)
    else:
        max_speed = 0
    out_df.at[i, prefix + 'ConvexMaxSpeed'] = max_speed

    # convex distance stats: hull vertex distances to the hull center
    norms = np.linalg.norm(points - np.array([center_x, center_y]), axis=1)
    out_df.at[i, prefix + 'ConvexFarDistance'] = norms.max()
    out_df.at[i, prefix + 'ConvexClosestDistance'] = norms.min()
        
        
def cluster_stats(df, prefix, out_df, index, random_state=None):
    '''
    Split the points of df into two k-means clusters and store the density of
    the denser and of the sparser cluster in row index of out_df.

    The density of a cluster is its number of points divided by the largest
    distance between a member and the cluster centroid. A cluster whose
    members all coincide with the centroid (or that has no members) gets
    density 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with the columns x and y.
    prefix : str
        Column prefix; the results go to prefix + 'DenseClusterDensity' and
        prefix + 'SparseClusterDensity'.
    out_df : pandas.DataFrame
        Feature frame that is modified in place.
    index : int
        Row label of out_df to write to.
    random_state : int or None, optional
        Passed to KMeans; give an int to make the clustering reproducible.
        The default (None) keeps the previous unseeded behaviour.

    Notes
    -----
    Both features are NaN when KMeans rejects the input with a ValueError
    (for example fewer points than clusters).
    '''
    xy = df[['x', 'y']].values
    n_clusters = 2
    dense_col = prefix + 'DenseClusterDensity'
    sparse_col = prefix + 'SparseClusterDensity'

    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    try:
        kmeans = kmeans.fit(xy)
    except ValueError:
        # Only input validation failures are expected here; a bare except
        # would also swallow KeyboardInterrupt during long runs.
        out_df.at[index, dense_col] = np.nan
        out_df.at[index, sparse_col] = np.nan
        return

    centroids = kmeans.cluster_centers_

    densities = []
    for label in range(n_clusters):
        cluster = xy[kmeans.labels_ == label]
        n_points = len(cluster)
        if n_points == 0:
            # With duplicated points k-means can return fewer distinct
            # clusters than requested; np.max would fail on the empty one.
            densities.append(0)
            continue

        max_dist = np.max(np.sqrt(np.sum((cluster - centroids[label])**2, axis=1)))
        different_players = not np.isclose(max_dist, 0)
        densities.append(n_points/max_dist if different_players else 0)

    out_df.at[index, dense_col] = max(densities)
    out_df.at[index, sparse_col] = min(densities)


def linearity_stats(df, prefix, out_df, index, random_state=None):
    '''
    Measure how strongly the points of df line up in vertical lines (groups
    of points sharing a similar x coordinate).

    The x coordinates are split into four k-means clusters. For each cluster
    the density is its number of points divided by the largest |x - centroid|
    of its members (0 when all members coincide with the centroid or the
    cluster is empty). The largest density is stored in
    prefix + 'VerticalLinearity' at row index of out_df.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with the columns x and y (only x is used).
    prefix : str
        Column prefix of the output feature.
    out_df : pandas.DataFrame
        Feature frame that is modified in place.
    index : int
        Row label of out_df to write to.
    random_state : int or None, optional
        Passed to KMeans; give an int to make the clustering reproducible.
        The default (None) keeps the previous unseeded behaviour.

    Notes
    -----
    The feature is NaN when KMeans rejects the input with a ValueError
    (for example fewer points than clusters).
    '''
    n_clusters = 4
    out_col = prefix + 'VerticalLinearity'
    # KMeans expects a 2-D array: one column holding the x coordinates
    x = df[['x', 'y']].values[:, 0].reshape(-1, 1)

    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    try:
        kmeans = kmeans.fit(x)
    except ValueError:
        # Only input validation failures are expected here; a bare except
        # would also swallow KeyboardInterrupt during long runs.
        out_df.at[index, out_col] = np.nan
        return

    centroids = kmeans.cluster_centers_

    vert_densities = []
    for label in range(n_clusters):
        cluster_x = x[kmeans.labels_ == label]
        n_points = len(cluster_x)
        if n_points == 0:
            # With duplicated x values k-means can return fewer distinct
            # clusters than requested; np.max would fail on the empty one.
            vert_densities.append(0)
            continue

        max_dist = np.max(np.abs(cluster_x - centroids[label]))
        different_players = not np.isclose(max_dist, 0)
        vert_densities.append(n_points/max_dist if different_players else 0)

    out_df.at[index, out_col] = max(vert_densities)
    
    
def player_mixing_stats(player_df, out_df, index, random_state=None):
    '''
    Measure how much the teams are mixed on the pitch.

    The (x, y) positions in player_df are split into four k-means clusters;
    gini_impurity (project helper) is evaluated on the player_type values of
    each non-empty cluster and the largest result is stored in
    'maxClusterImpurity' at row index of out_df.

    Parameters
    ----------
    player_df : pandas.DataFrame
        Rows with the columns x, y and player_type.
    out_df : pandas.DataFrame
        Feature frame that is modified in place.
    index : int
        Row label of out_df to write to.
    random_state : int or None, optional
        Passed to KMeans; give an int to make the clustering reproducible.
        The default (None) keeps the previous unseeded behaviour.

    Notes
    -----
    The feature is NaN when KMeans rejects the input with a ValueError
    (for example fewer points than clusters).
    '''
    n_clusters = 4
    xy = player_df[['x', 'y']].values
    player_type = player_df['player_type'].values

    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    try:
        kmeans = kmeans.fit(xy)
    except ValueError:
        # Only input validation failures are expected here; a bare except
        # would also swallow KeyboardInterrupt during long runs.
        out_df.at[index, 'maxClusterImpurity'] = np.nan
        return

    # -1 is only a starting value: at least one cluster is non-empty after a
    # successful fit, so it is always replaced.
    max_impurity = -1
    for label in range(n_clusters):
        cluster_team = player_type[kmeans.labels_ == label]
        if len(cluster_team) == 0:
            # With duplicated points k-means can return fewer distinct
            # clusters than requested; an empty cluster has nothing to mix.
            continue
        max_impurity = max(max_impurity, gini_impurity(cluster_team))

    out_df.at[index, 'maxClusterImpurity'] = max_impurity

### Computation

In [70]:
def compute_features(raw_df):
    '''
    Given a pandas.DataFrame for raw data, computes a pandas.DataFrame containing
    predetermined features, for each second.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Raw match data; the columns half, minute, second, player_type,
        player_id, x and y are read here.

    Returns
    -------
    pandas.DataFrame
        Columns half, minute, second followed by the names of the
        module-level ``features`` list in sorted order. Rows that were never
        filled in (all zeros) are dropped. Features that could not be
        computed for a second are NaN.

    Notes
    -----
    Speed features of the first processed second are 0 because there is no
    previous second to compare with.
    '''
    # get rid of frames with missing player values
    raw_df = remove_missing_raw_rows(raw_df)

    # convert to second apart df
    raw_df = raw_to_sec_df(raw_df)

    # init: one zero row per raw row; unused rows are removed before returning
    time_cols = ['half', 'minute', 'second']
    time = raw_df[time_cols].values
    feature_df = pd.DataFrame(
        data=np.zeros((len(time), len(features) + 3), dtype='float'),
        columns=time_cols + sorted(features)
    )
    feature_df[time_cols] = feature_df[time_cols].astype(int)
    # NOTE(review): group_indices is assumed to return the row offsets at
    # which each second starts, consecutive entries delimiting one second;
    # confirm against utils.
    indices = group_indices(time)

    # loop over seconds
    sec_df = pd.DataFrame()
    prev_player = pd.DataFrame()
    for i in range(len(indices) - 1):
        # set half, min, sec (the first three columns). .at only supports
        # scalar access, so the positional indexer is used for the slice.
        feature_df.iloc[i, :3] = time[indices[i]]

        # dataframe for the current second
        prev_df = sec_df
        sec_df = raw_df.iloc[indices[i]:indices[i + 1], :]

        # player groups (type 0 = home player, 1 = away player, 2 = referee)
        player = sec_df[(sec_df['player_type'] == 0) | (sec_df['player_type'] == 1)]
        home = sec_df[sec_df['player_type'] == 0]
        away = sec_df[sec_df['player_type'] == 1]
        ref = sec_df[sec_df['player_type'] == 2]

        # outfield players of the previous second, used for speed features
        if i > 0:
            prev_player = prev_df[(prev_df['player_type'] == 0) | (prev_df['player_type'] == 1)]

        # avg, min, max stats
        avg_min_max_stats(home, 'home', feature_df, i)
        avg_min_max_stats(away, 'away', feature_df, i)

        # referee stats: store the scalar position of the first referee row
        # instead of assigning a whole Series to a single cell
        feature_df.at[i, 'refX'] = np.nan if ref.empty else ref['x'].iloc[0]
        feature_df.at[i, 'refY'] = np.nan if ref.empty else ref['y'].iloc[0]
        # referee speed
        if i > 0:
            # rows i and i - 1 of feature_df hold exactly these time values;
            # reading them from `time` avoids rebuilding an array from the
            # whole frame on every iteration
            timespan = (hms_to_sec(time[indices[i]]) -
                        hms_to_sec(time[indices[i - 1]]))
            feature_df.at[i, 'refSpeed'] = speed(
                feature_df.at[i, 'refX'],
                feature_df.at[i, 'refY'],
                feature_df.at[i - 1, 'refX'],
                feature_df.at[i - 1, 'refY'],
                timespan
            )

        # convex stats; prev_player contains both teams, so it serves as the
        # previous-second lookup for all three groups
        convex_stats(player, prev_player, 'player', feature_df, i)
        convex_stats(home, prev_player, 'home', feature_df, i)
        convex_stats(away, prev_player, 'away', feature_df, i)

        # inner distance stats
        feature_df.at[i, 'homeInnerDistance'] = np.sum(inner_dist(home[['x', 'y']].values))
        feature_df.at[i, 'awayInnerDistance'] = np.sum(inner_dist(away[['x', 'y']].values))

        # cluster stats
        cluster_stats(player, 'player', feature_df, i)
        cluster_stats(home, 'home', feature_df, i)
        cluster_stats(away, 'away', feature_df, i)

        # linearity stats
        linearity_stats(player, 'player', feature_df, i)

        # player mixing stats
        player_mixing_stats(player, feature_df, i)

    # get rid of zero rows and return
    return feature_df[(feature_df.T != 0).any()]

## Computing Features with Multiple Cores via Multiprocessing

In [90]:
import multiprocessing


def compute_features_multiprocessing(df):
    '''
    Compute the per-second features of a raw match frame on all CPU cores.

    The frame is cut at second boundaries into one chunk per CPU,
    compute_features runs on every chunk in a worker process and the
    partial results are concatenated in order.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw match data as accepted by compute_features.

    Returns
    -------
    pandas.DataFrame
        Concatenated feature frame with a fresh 0..n-1 index. NaN values are
        replaced with the most recent value above them in the same column;
        leading NaNs stay NaN.

    Notes
    -----
    Every chunk is processed independently, so compute_features treats the
    first second of each chunk as having no previous second.
    '''
    df_indices = group_indices(df[['half', 'minute', 'second']].values)
    n = len(df_indices)
    n_cpu = multiprocessing.cpu_count()
    x = n//n_cpu
    # boundaries (in seconds) of the chunks, then translated to row offsets
    sec_indices = [x*i for i in range(n_cpu)] + [n - 1]
    split_indices = [df_indices[sec] for sec in sec_indices]
    args = [
        df.iloc[start:stop, :]
        for start, stop in zip(split_indices, split_indices[1:])
    ]

    # the context manager shuts the worker processes down once map returns
    with multiprocessing.Pool() as pool:
        res = pool.map(compute_features, args)
    out_df = pd.concat(res).reset_index(drop=True)

    # replace any NaN values with the most recently computed value
    return out_df.ffill()

In [108]:
%%sh
# Show the CPU model and the thread/core/CPU counts of the machine used for
# the timing below.
lscpu | grep 'Model name:';
lscpu | grep 'Thread(s) per core:';
lscpu | grep 'Core(s) per socket:';
lscpu | grep '^CPU(s):'

Model name:          Intel(R) Core(TM) i5-2410M CPU @ 2.30GHz
Thread(s) per core:  2
Core(s) per socket:  2
CPU(s):              4


In [11]:
import psutil

# Report physical vs. logical core counts as seen by psutil.
print('Number of physical cores: {}'.format(psutil.cpu_count(logical=False)))
print('Number of logical cores: {}'.format(psutil.cpu_count(logical=True)))

Number of physical cores: 2
Number of logical cores: 4


In [98]:
# Time the multiprocessing feature computation on the match loaded above.
%time out_df = compute_features_multiprocessing(df)

CPU times: user 5.31 s, sys: 7.63 s, total: 12.9 s
Wall time: 6min 30s


### C++ Feature Computer Speed
Here, we compare the speed of our Python code with the C++ version. Even though the C++ version is not multithreaded, it can still compute the features much faster than our naive Python implementation.

In [97]:
%%sh
# Time the C++ feature computer on the same raw match file; the second
# argument is the CSV file it writes.
time ../cpp_feature/feature ../data/train_raw/20125_raw.txt ../data/train_raw/20125_cpp_feature.csv


real	0m3.977s
user	0m3.040s
sys	0m0.879s


## Computing Features For All Raw Match Data
In this section, we compute several features for all the matches in the raw match data directory and store the resulting feature data in the feature directory.

### Warning
Note that since the Python version is really slow, you shouldn't use this code anymore. Consider computing features using C++ feature computer program as mentioned in the README.md.

In [None]:
%%time
data_dir = '../data/test_raw'
out_dir = os.path.join(os.path.dirname(data_dir), 'test_feature')
for csv_file in raw_csv_files:
    match_id = csv_file.split('_')[0]
    df = pd.read_csv(os.path.join(data_dir, csv_file))
    feature_df = compute_features_multiprocessing(df)
    out_csv = os.path.join(out_dir, match_id + '_feature.csv')
    home = df[df['teamType'] == 1]['teamId'].iloc[0]
    away = df[df['teamType'] == 2]['teamId'].iloc[0]
    with open(out_csv, 'w') as f:
        f.write('# home = {}\n'.format(home))
        f.write('# away = {}\n'.format(away))        
        feature_df.to_csv(f, index=False)