In [103]:
import pandas as pd
import os
import numpy as np

pd.options.mode.chained_assignment = None  # default='warn'

In [104]:
def load_files(practice_num):
    
    practice_num = str(practice_num)
    ez = pd.read_csv(os.path.join('practice' + practice_num, 'ez.csv')) 
    sl = pd.read_csv(os.path.join('practice' + practice_num, 'sl.csv'))
    
    return ez, sl

In [105]:
# Name columns

def name_columns(ez, sl):
    columns = ['hr_start', 'min_start', 'sec_start', 'frame_start', 'hr_end', 'min_end', 'sec_end', 'frame_end', 'filename', 'label', 'contacting object', 'opposing object', 'comments']
    ez.columns = columns
    sl.columns = columns
    
    return ez, sl

In [106]:
# Get columns of interest

def columns_of_interest(ez, sl):
    columns_of_interest = ['hr_start', 'min_start', 'sec_start', 'label', 'contacting object', 'opposing object']
    ez = ez[columns_of_interest]
    sl = sl[columns_of_interest]
    
    return ez, sl

In [107]:
# Cut off first index because Google Docs included titles as a row

def remove_first(ez, sl):
    ez = ez[1:]
    sl = sl[1:]
    
    return ez, sl

In [108]:
# Get only instances with head contact (HC label)

def isolate_hc(ez, sl):
    ez = ez[ez['label'] == 'HC']
    sl = sl[sl['label'] == 'HC']
    
    return ez, sl

In [109]:
# Combine both into one DataFrame

def combine(ez, sl):

    df = pd.concat([ez, sl])
    df.sort_values(['hr_start', 'min_start', 'sec_start'])
    
    return df

In [110]:
# Remove duplicate values

def drop_duplicates(df):

    df = df.drop_duplicates(subset=['hr_start', 'min_start', 'sec_start'], keep='last')
    
    return df

In [111]:
def labels_as_strings(ez, sl):
    ez['label'] = ez['label'].astype(str)
    sl['label'] = sl['label'].astype(str)
    
    return ez, sl

In [144]:
# Return number of head collisions

def count_head_impacts(df):

    return df['label'].value_counts()['HC']

In [131]:
def play_counter_row(df):
    
    y_cond = (df['label'].shift(1) == 'NC') & (df['label'] != 'NC')

    df['play'] = np.where(y_cond, '1', '0')

    return df

In [132]:
def prep_data_pipeline(ez, sl):
    
    ez, sl = name_columns(ez, sl)
    
    ez, sl = columns_of_interest(ez, sl)
    
    ez, sl = labels_as_strings(ez, sl)
    
    ez, sl = remove_first(ez, sl)
    
    return ez, sl
    

In [150]:
def average_hc_practice(practice_list):
    total = 0

    for i in practice_list:
        total += hc_per_practice(i)
    
    return total / len(practices)

In [151]:
# Returns (mean, std)
def average_hc_play(practice_list):
    total = 0
    lyst = []

    for i in practice_list:
        lyst.append(hc_per_play(i))
        total += hc_per_play(i)
    
    mean = total / len(practices)
    std = np.std(lyst)
    
    return mean, std

In [152]:
# Calculates head contacts per practice

def hc_per_practice(number):
    
    ez, sl = load_files(number)
    
    ez, sl = prep_data_pipeline(ez, sl)
    
    ez, sl = isolate_hc(ez, sl)
    
    df = combine(ez, sl)
    
    df = drop_duplicates(df)
    
    count = count_head_impacts(df)
    
    return count
    
    

In [153]:
# Calculates average head impacts per play

def hc_per_play(number):
    
    ez, sl = load_files(number)
    
    ez, sl = prep_data_pipeline(ez, sl)
    
    df = combine(ez, sl)
    
    df = drop_duplicates(df)
    
    df = play_counter_row(df)
    
    plays = df.shape[0] - (df['play'].value_counts()['0'])
    
    hc = df['label'].value_counts()['HC']
    
    return hc / plays

In [154]:
practices = [1, 2, 3]

print(average_hc_practice(practices), "head contacts per practice")

12.3333333333 head contacts per practice


In [156]:
print(average_hc_play(practices)[1], "head contacts per play")

0.0669637708005 head contacts per play
