In [1]:
# Copyright 2018 Esref Ozdemir
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pkl to csv Conversion
In this notebook we convert **.pkl** files containing raw and event data to csv files.

In [2]:
import csv
import pickle
from os import listdir
from os.path import join
import pandas as pd
import re

In [3]:
# Directories the pickle files are read from and the csv files are written to.
pickle_raw_dir = '../data/pickle/processed'
pickle_event_dir = '../data/pickle/event'
csv_raw_dir = '../data/processed'
csv_event_dir = '../data/event'

# Header row of an event csv file; one name per element of a pickled event tuple.
event_cols = [
    'teamId',
    'eventId',
    'jersey',
    'half',
    'minute',
    'second',
    'location',
    'bodyPart',
    'postLocation',
    'custom'
]

# Header row of a raw csv file. The first three columns come from the
# 'half-minute-second' dictionary key, the rest from each pickled tuple.
raw_cols = [
    'half',
    'minute',
    'second',
    'isHomeTeam',
    'teamId',
    'jerseyNumber',
    'x',
    'y',
    'distance',
    'speed',
    'hasballTeam',
    'teamPoss',
    'jerseyPoss',
    'teamType'
]

# Filename filters used with `.match`, which only anchors the start of the name.
# The dot is escaped and the end is anchored so that only names of the exact
# form '<digits>_event.pkl' / '<digits>_raw.pkl' are selected (not e.g.
# '1_event.pkl.bak' or '1_eventXpkl').
event_pkl_regex = re.compile(r'\d+_event\.pkl$')
raw_pkl_regex = re.compile(r'\d+_raw\.pkl$')

In [3]:
def write_csv(data, cols, out):
    '''
    Write a header row followed by the given rows to a csv file.

    The file is opened with ``newline=''`` as the csv module requires.
    Without it, the writer's CRLF row terminator is translated a second time
    on platforms whose native line ending is CRLF, which leaves a blank line
    after every row.


    Parameters
    ----------
    data: An iterable of rows (e.g. a list of tuples) containing the values
        for the specified columns.
    cols: A list of strings containing the column names.
    out: Path of the output csv file. An existing file is overwritten.
    '''
    with open(out, 'w', newline='') as f:
        csv_out = csv.writer(f)
        csv_out.writerow(cols)
        csv_out.writerows(data)
            
            
def convert_event(pkl_filename):
    '''
    Convert a single pickled event file to a csv file.

    The pickle is loaded from ``pickle_event_dir`` and its content is written
    to ``csv_event_dir`` with ``event_cols`` as the header row. The csv file
    is named after the part of ``pkl_filename`` before its first '.', with a
    '.csv' extension.


    Parameters
    ----------
    pkl_filename: Name of the pickle file inside ``pickle_event_dir``.
    '''
    source_path = join(pickle_event_dir, pkl_filename)
    with open(source_path, 'rb') as pkl_file:
        events = pickle.load(pkl_file)

    stem = pkl_filename.partition('.')[0]
    target_path = join(csv_event_dir, stem + '.csv')
    write_csv(events, event_cols, target_path)
    
    
def convert_raw(pkl_filename):
    '''
    Convert a single pickled raw data file to a csv file.

    The pickle is loaded from ``pickle_raw_dir`` and is iterated as a
    dictionary. Every '-'-separated key is split into a tuple of integers and
    prepended to each tuple in the list stored under that key. The resulting
    rows are sorted by their first three values and written to ``csv_raw_dir``
    with ``raw_cols`` as the header row. The csv file is named after the part
    of ``pkl_filename`` before its first '.', with a '.csv' extension.


    Parameters
    ----------
    pkl_filename: Name of the pickle file inside ``pickle_raw_dir``.
    '''
    source_path = join(pickle_raw_dir, pkl_filename)
    with open(source_path, 'rb') as pkl_file:
        samples_by_time = pickle.load(pkl_file)

    rows = []
    for time_key, samples in samples_by_time.items():
        time_prefix = tuple(map(int, time_key.split('-')))
        rows.extend(time_prefix + sample for sample in samples)

    # Lexicographic order on (half, minute, second); the sort is stable, so
    # rows sharing a timestamp keep their original relative order.
    rows = sorted(rows, key=lambda row: row[:3])

    stem = pkl_filename.partition('.')[0]
    target_path = join(csv_raw_dir, stem + '.csv')
    write_csv(rows, raw_cols, target_path)

# Conversion

In [4]:
import multiprocessing

## Event Conversion

### Event .pkl Format
An event **.pkl** file contains a list of tuples. Each tuple has the following format:

```tuple = (teamId, eventId, jersey, half, minute, second, location, bodyPart, postLocation, custom)```

In [5]:
event_pkl_files = (f for f in listdir(pickle_event_dir) if event_pkl_regex.match(f))

# Convert the files in parallel, one file per task. `map` blocks until every
# file is converted; leaving the with-block then terminates the worker
# processes, so they do not stay alive for the rest of the kernel session.
with multiprocessing.Pool() as pool:
    pool.map(convert_event, event_pkl_files)

## Raw Data Conversion

### Raw .pkl Format
A rawdata **.pkl** file contains a dictionary with
* **key:** ```'%d-%d-%d' % (half, minute, second)```
* **val:** A list of tuples. Each tuple is
  * ```tuple = (isHomeTeam, teamId, jerseyNumber, x, y, distance, speed, hasballTeam, teamPoss, jerseyPoss, teamType)```
    * **hasballTeam = teamId**
    * **teamPoss = 1** if home team has possession.
    * **teamType**
      1. home team player
      2. away team player
      3. referee
      4. home goalkeeper
      5. away goalkeeper
      
We first convert the dictionary structure to a single list of tuples and then write the data to corresponding csv files.
* Keys are written as three separate columns.
* Each list is written sequentially.
* Rows are written in increasing half-minute-second order to make later data processing easier.

In [7]:
raw_pkl_files = (f for f in listdir(pickle_raw_dir) if raw_pkl_regex.match(f))

# Convert the files in parallel, one file per task. `map` blocks until every
# file is converted; leaving the with-block then terminates the worker
# processes, so they do not stay alive for the rest of the kernel session.
with multiprocessing.Pool() as pool:
    pool.map(convert_raw, raw_pkl_files)