# Expected Score Model - Data Preparation

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split

import sys
sys.path.append("..")
from expected_score_model.config import raw_data_file_path

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

Import data

In [2]:
chain_data = pd.read_csv(raw_data_file_path, low_memory=False)
chain_data.head()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year
0,1,centreBounce,goal,1.0,1,13,13.0,Brisbane Lions,,,,Centre Bounce,0.0,0.0,,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021
1,1,centreBounce,goal,2.0,1,13,24.0,Brisbane Lions,Brisbane Lions,Dayne Zorko,Dayne_Zorko,Hard Ball Get,8.0,-5.0,,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021
2,1,centreBounce,goal,3.0,1,13,24.0,Brisbane Lions,Brisbane Lions,Dayne Zorko,Dayne_Zorko,Handball,9.0,-6.0,ineffective,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021
3,1,centreBounce,goal,4.0,1,13,28.0,Brisbane Lions,Sydney,Oliver Florent,Oliver_Florent,Loose Ball Get,11.0,-7.0,,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021
4,1,centreBounce,goal,5.0,1,13,29.0,Brisbane Lions,Sydney,Oliver Florent,Oliver_Florent,Handball,12.0,-5.0,effective,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021


Mapping Response

In [3]:
final_state_map = {
    'goal':'goal',
    'behind':'behind',
    'turnover':'miss',
    'rushed':'miss',
    'outOfBounds':'miss',
    'ballUpCall':'miss',
    'endQuarter':'miss',
    'rushedOpp':'miss',
}
chain_data['Final_State'] = chain_data['Final_State'].replace(final_state_map)

chain_data['Goal'] = np.where((chain_data['Shot_At_Goal'] == True) & (chain_data['Final_State'] == "goal"), 1, 0)
chain_data['Behind'] = np.where((chain_data['Shot_At_Goal'] == True) & (chain_data['Final_State'] == "behind"), 1, 0)
chain_data['Miss'] = np.where((chain_data['Shot_At_Goal'] == True) & (chain_data['Final_State'] == "miss"), 1, 0)

chain_data['Score'] = np.where(chain_data['Goal']==1, 6,
                               np.where(chain_data['Behind']==1, 1, 
                                        0))

Getting Current Score

In [4]:
chain_data['Home_Score'] = np.where(chain_data['Team'] == chain_data['Home_Team'], chain_data['Score'], 0)
chain_data['Away_Score'] = np.where(chain_data['Team'] == chain_data['Away_Team'], chain_data['Score'], 0)

In [5]:
chain_data['Home_Current_Score'] = chain_data.groupby(['Match_ID'])['Home_Score'].cumsum().shift(1)
chain_data['Away_Current_Score'] = chain_data.groupby(['Match_ID'])['Away_Score'].cumsum().shift(1)
chain_data['Current_Margin'] = chain_data['Home_Current_Score'] - chain_data['Away_Current_Score']

Getting previous actions in chain

In [6]:
chain_data['Event_Type0'] = chain_data['Description']
chain_data['Event_Type1'] = chain_data['Description'].shift(1)
chain_data['Event_Type2'] = chain_data['Description'].shift(2)
chain_data['Event_Type3'] = chain_data['Description'].shift(3)

chain_data['x0'] = chain_data['x']
chain_data['x1'] = chain_data['x'].shift(1)
chain_data['x2'] = chain_data['x'].shift(2)
chain_data['x3'] = chain_data['x'].shift(3)

chain_data['y0'] = chain_data['y']
chain_data['y1'] = chain_data['y'].shift(1)
chain_data['y2'] = chain_data['y'].shift(2)
chain_data['y3'] = chain_data['y'].shift(3)

chain_data['Quarter_Duration0'] = chain_data['Quarter_Duration']
chain_data['Quarter_Duration1'] = chain_data['Quarter_Duration'].shift(1)
chain_data['Quarter_Duration2'] = chain_data['Quarter_Duration'].shift(2)
chain_data['Quarter_Duration3'] = chain_data['Quarter_Duration'].shift(3)

Time in Chain

In [7]:
chain_data['Chain_Duration'] = chain_data['Quarter_Duration'] - chain_data['Quarter_Duration_Chain_Start']

Time and Distance to Last Action

In [8]:
chain_data['Time_Since_Last_Action'] = chain_data['Quarter_Duration0'] - chain_data['Quarter_Duration1']
chain_data['Distance_Since_Last_Action'] = ((chain_data['x1'] - chain_data['x0'])**2 + (chain_data['y1'] - chain_data['y0'])**2)**0.5

Angle and Distance to Goal

In [9]:
chain_data['Distance_to_Right_Goal_x'] = chain_data['Venue_Length']/2 - chain_data['x0']
chain_data['Distance_to_Middle_y'] = abs(chain_data['y0'])

chain_data['Distance_to_Middle_Goal'] = (chain_data['Distance_to_Right_Goal_x']**2 + chain_data['Distance_to_Middle_y']**2)**0.5
chain_data['Angle_to_Middle_Goal'] = np.arctan2(chain_data['Distance_to_Middle_y'], chain_data['Distance_to_Right_Goal_x'])
chain_data['Angle_to_Middle_Goal_degrees'] = np.degrees(chain_data['Angle_to_Middle_Goal'])

chain_data['Visible_Goal_Angle'] = (6.4*chain_data['Distance_to_Right_Goal_x']) / (chain_data['Distance_to_Right_Goal_x']**2 + chain_data['Distance_to_Middle_y']**2-(6.4/2)**2)
chain_data['Visible_Goal_Angle_degrees'] = np.degrees(chain_data['Visible_Goal_Angle'])

chain_data['Visible_Behind_Angle'] = ((6.4*3)*chain_data['Distance_to_Right_Goal_x']) / (chain_data['Distance_to_Right_Goal_x']**2 + chain_data['Distance_to_Middle_y']**2-((3*6.4)/2)**2)
chain_data['Visible_Behind_Angle_degrees'] = np.degrees(chain_data['Visible_Behind_Angle'])

Log(Distance) and Distance-squared

In [10]:
chain_data['Squared_Distance_to_Middle_Goal'] = chain_data['Distance_to_Right_Goal_x']**2
chain_data['Log_Distance_to_Middle_Goal'] = np.log(chain_data['Distance_to_Right_Goal_x'])

In [11]:
## Create definition
# Counter attack?
# travelling towards goal?

## Merge player information
# Player - number of kicks/shots, kick length, longest kick distance
# Player heigh / weight
# Left / right footed

## Merge Venue information
# Venue 
# Player kicking at home venue? 

## Merge weather data
# Weather?
# Wind Speed / Direction

Shots

In [12]:
df_shots = chain_data[chain_data['Shot_At_Goal'] == True]

In [13]:
df_shots.head(2)

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Goal,Behind,Miss,Score,Home_Score,Away_Score,Home_Current_Score,Away_Current_Score,Current_Margin,Event_Type0,Event_Type1,Event_Type2,Event_Type3,x0,x1,x2,x3,y0,y1,y2,y3,Quarter_Duration0,Quarter_Duration1,Quarter_Duration2,Quarter_Duration3,Chain_Duration,Time_Since_Last_Action,Distance_Since_Last_Action,Distance_to_Right_Goal_x,Distance_to_Middle_y,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Angle_to_Middle_Goal_degrees,Visible_Goal_Angle,Visible_Goal_Angle_degrees,Visible_Behind_Angle,Visible_Behind_Angle_degrees,Squared_Distance_to_Middle_Goal,Log_Distance_to_Middle_Goal
10,1,centreBounce,goal,11.0,1,13,39.0,Brisbane Lions,Brisbane Lions,Zac Bailey,Zac_Bailey,Kick,26.0,-21.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021,1,0,0,6,6,0,0.0,0.0,0.0,Kick,Handball Received,Handball,Gather from Opposition,26.0,18.0,11.0,6.0,-21.0,-23.0,-26.0,-27.0,39.0,38.0,37.0,36.0,26.0,1.0,8.246211,52.0,21.0,56.0803,0.383818,21.99113,0.106164,6.082774,0.32704,18.737997,2704.0,3.951244
60,6,possGain,goal,61.0,1,149,168.0,Brisbane Lions,Brisbane Lions,Zac Bailey,Zac_Bailey,Kick,35.0,19.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021,1,0,0,6,6,0,6.0,0.0,6.0,Kick,Handball Received,Handball,Loose Ball Get,35.0,28.0,25.0,22.0,19.0,19.0,26.0,29.0,168.0,167.0,165.0,165.0,19.0,1.0,7.0,43.0,19.0,47.010637,0.416065,23.83874,0.125105,7.167963,0.389831,22.33568,1849.0,3.7612


Filter Open Play v Set Shots

In [14]:
df_shots['Set_Shot'] = df_shots['Event_Type1'].apply(lambda x: ("Mark" in x) or ("Free" in x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_shots['Set_Shot'] = df_shots['Event_Type1'].apply(lambda x: ("Mark" in x) or ("Free" in x))


In [15]:
df_shots['Set_Shot'].value_counts()

True     11057
False     9216
Name: Set_Shot, dtype: int64

In [31]:
df_set_shots = df_shots[df_shots['Set_Shot']]
df_open_shots = df_shots[~df_shots['Set_Shot']]

Training, Test, Validation Sets

In [32]:
def get_stratified_train_test_val_columns(data, response):
    
    X, y = data.drop(columns=[response]), data[response]
    X_modelling, X_test, y_modelling, y_test = train_test_split(X, y, test_size = 0.2, random_state=2407, stratify=y)
    X_train, X_val, y_train, y_val = train_test_split(X_modelling, y_modelling, test_size = 0.2, random_state=2407, stratify=y_modelling)
    X_train[response+'TrainingSet'] = True
    X_test[response+'TestSet'] = True
    X_val[response+'ValidationSet'] = True
    
    if [response+'TrainingSet', response+'TestSet', response+'ValidationSet'] not in list(data):
        data = pd.merge(data, X_train[response+'TrainingSet'], how="left", left_index=True, right_index=True) 
        data = pd.merge(data, X_test[response+'TestSet'], how="left", left_index=True, right_index=True) 
        data = pd.merge(data, X_val[response+'ValidationSet'], how="left", left_index=True, right_index=True)
        data[[response+'TrainingSet', response+'TestSet', response+'ValidationSet']] = data[[response+'TrainingSet', response+'TestSet', response+'ValidationSet']].fillna(False) 
        
    return data

In [33]:
df_set_shots = get_stratified_train_test_val_columns(df_set_shots, response = "Goal")
df_set_shots = get_stratified_train_test_val_columns(df_set_shots, response = "Behind")
df_set_shots = get_stratified_train_test_val_columns(df_set_shots, response = "Miss")

In [34]:
df_open_shots = get_stratified_train_test_val_columns(df_open_shots, response = "Goal")
df_open_shots = get_stratified_train_test_val_columns(df_open_shots, response = "Behind")
df_open_shots = get_stratified_train_test_val_columns(df_open_shots, response = "Miss")

In [38]:
df_set_shots[df_set_shots['GoalTrainingSet']]['Goal'].mean(), df_set_shots[df_set_shots['GoalTestSet']]['Goal'].mean(), df_set_shots[df_set_shots['GoalValidationSet']]['Goal'].mean()

(0.5412662521198417, 0.5415913200723327, 0.5415488976823064)

In [39]:
df_set_shots[df_set_shots['BehindTrainingSet']]['Behind'].mean(), df_set_shots[df_set_shots['BehindTestSet']]['Behind'].mean(), df_set_shots[df_set_shots['BehindValidationSet']]['Behind'].mean()

(0.33352176370830977, 0.3336347197106691, 0.33352176370830977)

In [40]:
df_set_shots[df_set_shots['MissTrainingSet']]['Miss'].mean(), df_set_shots[df_set_shots['MissTestSet']]['Miss'].mean(), df_set_shots[df_set_shots['MissValidationSet']]['Miss'].mean()

(0.12507066139061618, 0.1252260397830018, 0.12492933860938384)

In [41]:
df_open_shots[df_open_shots['GoalTrainingSet']]['Goal'].mean(), df_open_shots[df_open_shots['GoalTestSet']]['Goal'].mean(), df_open_shots[df_open_shots['GoalValidationSet']]['Goal'].mean()

(0.4280142445311175, 0.42787418655097614, 0.4277966101694915)

In [42]:
df_open_shots[df_open_shots['BehindTrainingSet']]['Behind'].mean(), df_open_shots[df_open_shots['BehindTestSet']]['Behind'].mean(), df_open_shots[df_open_shots['BehindValidationSet']]['Behind'].mean()

(0.3647617432592844, 0.36496746203904556, 0.36474576271186443)

In [43]:
df_open_shots[df_open_shots['MissTrainingSet']]['Miss'].mean(), df_open_shots[df_open_shots['MissTestSet']]['Miss'].mean(), df_open_shots[df_open_shots['MissValidationSet']]['Miss'].mean()

(0.2072240122095981, 0.20715835140997832, 0.20745762711864407)

Export .csv

In [None]:
df_set_shots.to_csv("../data/set_shots.csv", index=False)
df_open_shots.to_csv("../data/open_shots.csv", index=False)