In [1]:
# Copyright 2018 Esref Ozdemir
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Constructing Datasets with Only hasball Data
In this notebook, we create one csv file per match, containing the hasball data for each second of that match.

In [2]:
import re
from os import listdir
from os.path import join

import numpy as np
import pandas as pd

In [3]:
# Directory the raw csv files are read from.
raw_dir = '../data/processed'
# Directory the generated hasball csv files are written to.
out_dir = '../data/hasball'

# Columns kept in the hasball output, in this order.
out_cols = ['half', 'minute', 'second', 'teamPoss', 'hasballTeam']

# Matches raw csv filenames of the form '<digits>_raw.csv'. The dot is escaped
# and the pattern is anchored at the end so that names such as
# '12_raw.csv.bak' or '12_rawXcsv' are not accepted by raw_regex.match().
raw_regex = re.compile(r'\d+_raw\.csv$')

In [4]:
from utils import remove_missing_raw_rows


def raw_to_hasball(raw_filename):
    """
    Creates a hasball csv file in out_dir from a given raw csv filename.

    The raw csv is loaded from raw_dir, passed through
    remove_missing_raw_rows, restricted to the columns in out_cols and
    reduced to its distinct rows. The result is written to out_dir as
    '<match_id>_hasball.csv', where <match_id> is the part of raw_filename
    before the first underscore.

    Parameters
    ----------
    raw_filename: Name of raw csv file in raw_dir.
    """
    df = pd.read_csv(join(raw_dir, raw_filename))
    df = remove_missing_raw_rows(df)

    # np.unique with axis=0 keeps one copy of each distinct row and returns
    # the rows in sorted order. This needs numpy imported as np in the
    # notebook's import cell.
    data = df[out_cols].values
    data_uniq = np.unique(data, axis=0)
    out_df = pd.DataFrame(data=data_uniq, columns=out_cols)

    # '<match_id>_raw.csv' -> '<match_id>_hasball.csv'
    match_id = raw_filename.split('_')[0]
    out_file = '{}_hasball.csv'.format(match_id)
    out_df.to_csv(join(out_dir, out_file), index=False)

In [5]:
import multiprocessing


# Collect the raw csv files to convert. A sorted list (rather than a one-shot
# generator) keeps the work list deterministic and lets it be inspected or
# reused after the pool has consumed it.
raw_filenames = sorted(f for f in listdir(raw_dir) if raw_regex.match(f))

# Convert the files in parallel. map() blocks until every file is processed;
# leaving the with-block then shuts the worker processes down instead of
# keeping them alive for the lifetime of the kernel.
with multiprocessing.Pool() as pool:
    pool.map(raw_to_hasball, raw_filenames)