In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the raw tracking data. Column names are supplied explicitly via
# `names=`, so the first line of the file is read as data, not as a header.
columns = ['timestamp', 'tag_id', 'x_pos', 'y_pos', 'heading', 'direction', 'energy', 'speed', 'total_distance']
data = pd.read_csv('../data/input/raw/tromso_tottenham.csv', names=columns)

# Convert timestamps to seconds elapsed since the first row of the file.
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['timestamp'] = (data['timestamp'] - data['timestamp'].iloc[0]).dt.total_seconds()

# All tag ids, in order of first appearance.
player_ids = data['tag_id'].unique()

# Mean position of every tag; reindex so row i corresponds to player_ids[i].
position_means = data.groupby('tag_id')[['x_pos', 'y_pos']].mean().reindex(player_ids)
x_means = position_means['x_pos'].to_numpy()  # x-position mean for each player
y_means = position_means['y_pos'].to_numpy()  # y-position mean for each player

# Heuristics used to drop non-outfield tags:
#   * the goalkeeper has the lowest x-position mean
#   * the substitute players have the highest y-position means
N_SUBSTITUTES = 3
goalkeeper_idx = np.argsort(x_means)[:1]
substitute_idx = np.argsort(y_means)[-N_SUBSTITUTES:]

# Both index sets refer to the ORIGINAL ordering of player_ids, so they must be
# removed in a single np.delete call. Deleting the goalkeeper first would shift
# every later position by one and make the substitute indices point at the
# wrong players (or past the end of the array).
player_ids = np.delete(player_ids, np.concatenate([goalkeeper_idx, substitute_idx]))
data = data[data['tag_id'].isin(player_ids)]

print(data)

        timestamp  tag_id      x_pos      y_pos   heading  direction  \
1             0.0       4  66.376432  34.829683 -1.920454   2.462961   
2             0.0       5  49.250835  37.848381 -3.093733   2.746578   
3             0.0       6  50.108018   7.670564 -2.723989   2.800532   
4             0.0       8  54.919299  47.613906  3.029670   2.900680   
5             0.0      11  57.584265  38.440233  2.832617   2.683097   
...           ...     ...        ...        ...       ...        ...   
493657     2305.0      11  54.195137  40.247683 -1.500097  -1.046535   
493658     2305.0      12  40.058306  43.957064  0.210195  -0.437417   
493660     2305.0      14  59.547065  31.256923 -0.439922   0.397525   
493661     2305.0       1  56.626668  22.002915 -1.447710  -1.502751   
493662     2305.0       2  57.256046  43.943287 -1.797140  -0.129198   

             energy     speed  total_distance  
1       1215.564210  1.088930      832.342371  
2        940.508332  2.880416      939.

In [4]:
# Resample every player's position onto a common quarter-second grid.
#
# Outputs (consumed by the next cell):
#   timestamp_new : list of bin start times for which ALL players have data
#   x_new, y_new  : list of per-bin lists holding each player's mean x / y
#                   position, ordered like data['tag_id'].unique()
BIN_WIDTH = 0.25  # seconds

tag_ids = data['tag_id'].unique()

# Candidate bin starts: observed timestamps lying exactly on a bin boundary.
timestamp_quarter = [t for t in data['timestamp'].unique() if t % BIN_WIDTH == 0]

# Map each sample to the start of the half-open bin [start, start + BIN_WIDTH)
# that contains it. Multiples of 0.25 are exactly representable in binary
# floating point, so these labels compare equal to the values above.
bin_start = (np.floor(data['timestamp'] / BIN_WIDTH) * BIN_WIDTH).rename('bin_start')

# One pass over the data instead of a boolean mask per (bin, player) pair.
binned_means = data.groupby([bin_start, 'tag_id'])[['x_pos', 'y_pos']].mean()

# Rows = candidate bins (in their original order), columns = players (in
# tag_ids order). A missing (bin, player) combination shows up as NaN.
x_table = binned_means['x_pos'].unstack('tag_id').reindex(index=timestamp_quarter, columns=tag_ids)
y_table = binned_means['y_pos'].unstack('tag_id').reindex(index=timestamp_quarter, columns=tag_ids)

# Keep only bins in which every player has both coordinates. NaN is detected
# with notna() rather than `np.nan in list`, which only matches by object
# identity because nan != nan.
complete = (x_table.notna().all(axis=1) & y_table.notna().all(axis=1)).to_numpy()

timestamp_new = x_table.index[complete].tolist()
x_new = x_table.loc[complete].to_numpy().tolist()
y_new = y_table.loc[complete].to_numpy().tolist()

In [10]:
# Assemble the resampled positions into a long-format frame with one row per
# (timestamp, player).
tag_ids = data['tag_id'].unique()

new_data = pd.DataFrame({
    # Repeat each timestamp once per player. Deriving the count from tag_ids
    # (instead of a literal 10) and repeating in place (instead of sorting a
    # separately built list) keeps every timestamp attached to its own
    # x_new / y_new row.
    'timestamp': np.repeat(np.asarray(timestamp_new, dtype=float), len(tag_ids)),
    'tag_id': np.tile(tag_ids, len(timestamp_new)),
    'x_pos': np.asarray(x_new, dtype=float).ravel(),
    'y_pos': np.asarray(y_new, dtype=float).ravel(),
})

# Present rows in ascending time order; a stable sort preserves the player
# order within each timestamp.
new_data = new_data.sort_values('timestamp', kind='mergesort').reset_index(drop=True)

In [13]:
# Preview the first rows of the resampled frame.
new_data.head()

Unnamed: 0,timestamp,tag_id,x_pos,y_pos
0,2.0,4,66.950929,33.928899
1,2.0,5,51.175881,32.666592
2,2.0,6,51.662418,6.631294
3,2.0,8,56.301009,44.716114
4,2.0,11,58.838487,33.070221


In [14]:
# Persist the resampled positions as CSV; index=False leaves the row index out of the file.
new_data.to_csv('../data/input/processed/prepped_tromso_tottenham.csv', index=False)