# Expected Score Model - Data Preparation

In [1]:
import pandas as pd
import numpy as np
import joblib
import math
from sklearn.model_selection import train_test_split

import sys
sys.path.append("..")
from expected_score_model.config import chain_file_path
from expected_score_model.domain.preprocessing.data_preprocessor import DataPreprocessor
from expected_score_model.domain.preprocessing.preprocessing import expected_score_response_processing, final_state_map, get_stratified_train_test_val_columns

# Widen pandas' display limits so wide/long frames render in full in cell output.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 999)

Import data

In [2]:
# Read the chain-level event data from the configured path.
# low_memory=False parses the file in one pass so each column gets a single
# inferred dtype instead of chunk-by-chunk (possibly mixed) inference.
chain_data = pd.read_csv(
    chain_file_path,
    low_memory=False,
)
chain_data.head(5)

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season
0,1,centreBounce,goal,1.0,1,13,13.0,Brisbane Lions,,,,Centre Bounce,0.0,0.0,,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,
1,1,centreBounce,goal,2.0,1,13,24.0,Brisbane Lions,Brisbane Lions,Dayne Zorko,Dayne_Zorko,Hard Ball Get,8.0,-5.0,,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,
2,1,centreBounce,goal,3.0,1,13,24.0,Brisbane Lions,Brisbane Lions,Dayne Zorko,Dayne_Zorko,Handball,9.0,-6.0,ineffective,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,
3,1,centreBounce,goal,4.0,1,13,28.0,Brisbane Lions,Sydney,Oliver Florent,Oliver_Florent,Loose Ball Get,11.0,-7.0,,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,
4,1,centreBounce,goal,5.0,1,13,29.0,Brisbane Lions,Sydney,Oliver Florent,Oliver_Florent,Handball,12.0,-5.0,effective,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,


Mapping Response

In [3]:
# Run the project's response-mapping step over the chain data. Compared with the
# raw frame, the preview below gains Goal, Behind, Miss and Score columns.
chain_data = expected_score_response_processing(chain_data)
chain_data.head(5)

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season,Goal,Behind,Miss,Score
0,1,centreBounce,goal,1.0,1,13,13.0,Brisbane Lions,,,,Centre Bounce,0.0,0.0,,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,0,0
1,1,centreBounce,goal,2.0,1,13,24.0,Brisbane Lions,Brisbane Lions,Dayne Zorko,Dayne_Zorko,Hard Ball Get,8.0,-5.0,,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,0,0
2,1,centreBounce,goal,3.0,1,13,24.0,Brisbane Lions,Brisbane Lions,Dayne Zorko,Dayne_Zorko,Handball,9.0,-6.0,ineffective,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,0,0
3,1,centreBounce,goal,4.0,1,13,28.0,Brisbane Lions,Sydney,Oliver Florent,Oliver_Florent,Loose Ball Get,11.0,-7.0,,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,0,0
4,1,centreBounce,goal,5.0,1,13,29.0,Brisbane Lions,Sydney,Oliver Florent,Oliver_Florent,Handball,12.0,-5.0,effective,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,0,0


Shots

In [4]:
# Tag every event with the Description of the row immediately before it.
# NOTE(review): shift(1) runs over the whole frame with no grouping, so a row at
# the start of a chain/match picks up the last event of the previous one — confirm
# this is intended.
chain_data['Event_Type1'] = chain_data['Description'].shift(1)

# Take an explicit copy so that adding Set_Shot below writes to an independent
# frame rather than to a slice of chain_data (avoids SettingWithCopyWarning).
df_shots = chain_data[chain_data['Shot_At_Goal'] == True].copy()

# A shot counts as a set shot when the preceding event mentions "Mark" or "Free".
# Vectorised, case-sensitive substring match; na=False classifies a missing
# preceding event as "not a set shot" instead of raising TypeError the way a
# per-row `in` test on NaN would.
df_shots['Set_Shot'] = df_shots['Event_Type1'].str.contains("Mark|Free", regex=True, na=False)

# Partition the shots into set shots and open-play shots.
df_set_shots = df_shots[df_shots['Set_Shot']]
df_open_shots = df_shots[~df_shots['Set_Shot']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_shots['Set_Shot'] = df_shots['Event_Type1'].apply(lambda x: ("Mark" in x) or ("Free" in x))


In [5]:
# Quick size check of the set-shot subset: (n_rows, n_columns).
df_set_shots.shape

(16310, 32)

Set Goal

In [6]:
# Set shots / "Goal" response: build the preprocessor, derive features, then
# persist both the modelling table and the preprocessor.
set_goal_preproc = DataPreprocessor(set_shot=True, model_response="Goal")
set_goal_preproc.fit()
set_goal_features = set_goal_preproc.transform(chain_data)

# Column-wise concat joins the shot rows and feature rows on their index labels.
# NOTE(review): assumes transform() returns rows indexed like df_set_shots — confirm.
set_goal_modelling_data = get_stratified_train_test_val_columns(
    pd.concat([df_set_shots, set_goal_features], axis="columns"),
    response="Goal",
)

set_goal_modelling_data.to_csv("../data/set_goal_modelling_data.csv", index=False)
joblib.dump(set_goal_preproc, "../model_outputs/preprocessors/set_goal_preproc.joblib")

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_shots['Set_Shot'] = X_shots['Event_Type1'].apply(lambda x: ("Mark" in x) or ("Free" in x))


['../model_outputs/preprocessors/set_goal_preproc.joblib']

Set Behind

In [7]:
# Set shots / "Behind" response: build the preprocessor, derive features, then
# persist both the modelling table and the preprocessor.
set_behind_preproc = DataPreprocessor(set_shot=True, model_response="Behind")
set_behind_preproc.fit()
set_behind_features = set_behind_preproc.transform(chain_data)

# Column-wise concat joins the shot rows and feature rows on their index labels.
# NOTE(review): assumes transform() returns rows indexed like df_set_shots — confirm.
set_behind_modelling_data = get_stratified_train_test_val_columns(
    pd.concat([df_set_shots, set_behind_features], axis="columns"),
    response="Behind",
)

set_behind_modelling_data.to_csv("../data/set_behind_modelling_data.csv", index=False)
joblib.dump(set_behind_preproc, "../model_outputs/preprocessors/set_behind_preproc.joblib")

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_shots['Set_Shot'] = X_shots['Event_Type1'].apply(lambda x: ("Mark" in x) or ("Free" in x))


['../model_outputs/preprocessors/set_behind_preproc.joblib']

Set Miss

In [8]:
# Set shots / "Miss" response: build the preprocessor, derive features, then
# persist both the modelling table and the preprocessor.
set_miss_preproc = DataPreprocessor(set_shot=True, model_response="Miss")
set_miss_preproc.fit()
set_miss_features = set_miss_preproc.transform(chain_data)

# Column-wise concat joins the shot rows and feature rows on their index labels.
# NOTE(review): assumes transform() returns rows indexed like df_set_shots — confirm.
set_miss_modelling_data = get_stratified_train_test_val_columns(
    pd.concat([df_set_shots, set_miss_features], axis="columns"),
    response="Miss",
)

set_miss_modelling_data.to_csv("../data/set_miss_modelling_data.csv", index=False)
joblib.dump(set_miss_preproc, "../model_outputs/preprocessors/set_miss_preproc.joblib")

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_shots['Set_Shot'] = X_shots['Event_Type1'].apply(lambda x: ("Mark" in x) or ("Free" in x))


['../model_outputs/preprocessors/set_miss_preproc.joblib']

Open Goal

In [9]:
# Open-play shots / "Goal" response: build the preprocessor, derive features,
# then persist both the modelling table and the preprocessor.
open_goal_preproc = DataPreprocessor(set_shot=False, model_response="Goal")
open_goal_preproc.fit()
open_goal_features = open_goal_preproc.transform(chain_data)

# Column-wise concat joins the shot rows and feature rows on their index labels.
# NOTE(review): assumes transform() returns rows indexed like df_open_shots — confirm.
open_goal_modelling_data = get_stratified_train_test_val_columns(
    pd.concat([df_open_shots, open_goal_features], axis="columns"),
    response="Goal",
)

open_goal_modelling_data.to_csv("../data/open_goal_modelling_data.csv", index=False)
joblib.dump(open_goal_preproc, "../model_outputs/preprocessors/open_goal_preproc.joblib")

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_shots['Set_Shot'] = X_shots['Event_Type1'].apply(lambda x: ("Mark" in x) or ("Free" in x))


['../model_outputs/preprocessors/open_goal_preproc.joblib']

Open Behind

In [10]:
# Open-play shots / "Behind" response: build the preprocessor, derive features,
# then persist both the modelling table and the preprocessor.
open_behind_preproc = DataPreprocessor(set_shot=False, model_response = "Behind")
open_behind_preproc.fit()
open_behind_features = open_behind_preproc.transform(chain_data)

# Column-wise concat joins the shot rows and feature rows on their index labels.
# NOTE(review): assumes transform() returns rows indexed like df_open_shots — confirm.
open_behind_modelling_data = pd.concat([df_open_shots, open_behind_features], axis=1)

open_behind_modelling_data = get_stratified_train_test_val_columns(open_behind_modelling_data, response = "Behind")
# Write to a dedicated open_behind file; this cell previously targeted
# open_goal_modelling_data.csv and overwrote the Open Goal table.
open_behind_modelling_data.to_csv("../data/open_behind_modelling_data.csv", index=False)
joblib.dump(open_behind_preproc, "../model_outputs/preprocessors/open_behind_preproc.joblib")

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_shots['Set_Shot'] = X_shots['Event_Type1'].apply(lambda x: ("Mark" in x) or ("Free" in x))


['../model_outputs/preprocessors/open_behind_preproc.joblib']

Open Miss

In [11]:
# Open-play shots / "Miss" response: build the preprocessor, derive features,
# then persist both the modelling table and the preprocessor.
open_miss_preproc = DataPreprocessor(set_shot=False, model_response = "Miss")
open_miss_preproc.fit()
open_miss_features = open_miss_preproc.transform(chain_data)

# Column-wise concat joins the shot rows and feature rows on their index labels.
# NOTE(review): assumes transform() returns rows indexed like df_open_shots — confirm.
open_miss_modelling_data = pd.concat([df_open_shots, open_miss_features], axis=1)

open_miss_modelling_data = get_stratified_train_test_val_columns(open_miss_modelling_data, response = "Miss")
# Write to a dedicated open_miss file; this cell previously targeted
# open_goal_modelling_data.csv and overwrote the Open Goal table.
open_miss_modelling_data.to_csv("../data/open_miss_modelling_data.csv", index=False)
joblib.dump(open_miss_preproc, "../model_outputs/preprocessors/open_miss_preproc.joblib")

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_shots['Set_Shot'] = X_shots['Event_Type1'].apply(lambda x: ("Mark" in x) or ("Free" in x))


['../model_outputs/preprocessors/open_miss_preproc.joblib']