## 0. Libraries and Personal Tools

In [1]:
import sys

from gc import collect
from os.path import abspath
from pandas import DataFrame, read_csv, read_feather
from pandas.api.types import CategoricalDtype

from sklearn.model_selection import train_test_split

In [2]:
# Put the parent directory (resolved to an absolute path) at the front of
# sys.path so the project's custom modules (e.g. `src`) can be imported.
sys.path.insert(0, abspath('..'))

In [3]:
from src.utils import reduce_mem_usage

## 1. Get Train Data

In [4]:
# Load the interim training sample and index every row by
# (game_num, event_id, event_time). The chained set_index returns the indexed
# frame directly, so no in-place mutation of an already-bound name is needed.
df = read_feather("../data/interim/small_train_compressed.ftr").set_index(
    ["game_num", "event_id", "event_time"]
)

In [5]:
# Run a garbage-collection pass to free some memory after loading the data.
# The value displayed below the cell is collect()'s return value.
collect()

0

## 2. Get Features and Targets

In [6]:
# The dtypes file lists the model input columns in its "column" field;
# every listed name except the "id" key is used as a model feature.
model_features = read_csv("../data/raw/test_dtypes.csv")
FEATURES = model_features.loc[model_features["column"] != "id", "column"].tolist()

In [7]:
# Number of model features (every listed column except "id").
len(FEATURES)

54

In [8]:
# Team whose scoring event is modelled; it selects the target column below.
TEAM = "A"
TARGET = f"team_{TEAM}_scoring_within_10sec"
# Class balance of the target as proportions, counting missing values too.
df[TARGET].value_counts(normalize=True, dropna=False)

0    0.941689
1    0.058311
Name: team_A_scoring_within_10sec, dtype: float64

## 3. Train and Test Split

In [9]:
from sklearn.model_selection import GroupShuffleSplit

In [10]:
# Split by game: all events of a given game land on the same side of the
# split, with 30% of the games held out for testing.
# Only the first split is consumed through next(), so a single split is
# requested; the splits are drawn sequentially from the seeded generator, so
# the first one does not depend on how many are requested.
gsp = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=777)
train_index, test_index = next(gsp.split(df, groups=df.index.get_level_values("game_num")))

X_train = df[FEATURES].iloc[train_index]
y_train = df[TARGET].iloc[train_index]

X_test = df[FEATURES].iloc[test_index]
y_test = df[TARGET].iloc[test_index]

# Sanity check: no game may appear on both sides of the split.
assert set(X_train.index.get_level_values("game_num")).isdisjoint(
    X_test.index.get_level_values("game_num")
), "train and test share at least one game_num"

In [11]:
# Release the reference to the full frame now that the train/test subsets exist.
del df

In [12]:
# Run a garbage-collection pass after dropping the full frame.
collect()

0

## 4. Feature Engineering

1. Join distance from two team members to the ball
2. Distance from the ball to the goal post
3. Speed of the ball calculated from velocity vector
4. Speed for each team member calculated from velocity vector
5. Distance from team members to orbs positions
6. Angles between team members and the ball

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

from sklearn.impute import SimpleImputer

### 4.1 Missing Values (Player Features)

In [14]:
# Single-step pipeline that replaces missing values in the columns it is
# applied to with the constant 0.0.
player_imputer = Pipeline(
    steps=[("player_imputer", SimpleImputer(strategy="constant", fill_value=0.0))]
)

### 4.3 Distance from the ball to the goal post

In [15]:
# Check Mahdee's discussion for more details about the goal post
# Source: https://www.kaggle.com/competitions/tabular-playground-series-oct-2022/discussion/357633
# Each post is an (x, y, z) coordinate, compared against the
# ball_pos_x / ball_pos_y / ball_pos_z columns by the distance transformer.
GOAL_POST1 = (0.0, -100.0, 6.8)
GOAL_POST2 = (0.0, 100.0, 6.8)

In [16]:
from scipy.spatial.distance import euclidean
from sklearn.base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin

class DistanceBallGoalPosts(BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin):
    """Turn the ball position into its Euclidean distance to each goal post.

    Parameters
    ----------
    posts : sequence of (x, y, z) tuples
        Coordinates of the goal posts. One output column is produced per post,
        in the order given.

    Notes
    -----
    The transformer is stateless: ``fit`` learns nothing from the data.
    ``transform`` expects a DataFrame with the columns ``ball_pos_x``,
    ``ball_pos_y`` and ``ball_pos_z`` and returns ONLY the distance columns, so
    the output width and column names always agree with
    ``get_feature_names_out``.

    NOTE(review): ``_ClassNamePrefixFeaturesOutMixin`` is a private
    scikit-learn helper; its ``get_feature_names_out`` is overridden here, so
    the base class is kept only to leave the class hierarchy unchanged.
    """

    def __init__(self, posts):
        self.posts = posts

    def get_feature_names_out(self, input_features=None):
        """Return one ``ball_distance_to_post<k>`` name per post (k starts at 1).

        ``input_features`` is accepted for API compatibility and ignored.
        """
        return [f"ball_distance_to_post{i+1}" for i in range(len(self.posts))]

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Compute the distance from the ball to every post in ``self.posts``.

        Parameters
        ----------
        X : DataFrame
            Must contain ``ball_pos_x``, ``ball_pos_y`` and ``ball_pos_z``.

        Returns
        -------
        DataFrame
            One column per post, named as in ``get_feature_names_out``, sharing
            the index of ``X``. ``X`` itself is not modified.
        """
        distances = {}
        # Pair each post with its output name so the columns and the reported
        # feature names cannot drift apart.
        for name, post in zip(self.get_feature_names_out(), self.posts):
            # Vectorized Euclidean distance to *this* post (not a fixed global).
            squared_distance = (
                (X["ball_pos_x"] - post[0]).pow(2)
                + (X["ball_pos_y"] - post[1]).pow(2)
                + (X["ball_pos_z"] - post[2]).pow(2)
            )
            distances[name] = squared_distance.pow(0.5)

        # Passing the index explicitly keeps the rows aligned with X even when
        # `posts` is empty and there are no columns to infer it from.
        return DataFrame(distances, index=X.index)

In [17]:
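# TODO: An alternative implementation based on FunctionTransformer is discussed here:
# https://github.com/scikit-learn/scikit-learn/discussions/23992
# NOTE: the draft below is untested. It discards the vectorized distance, always
# measures against GOAL_POST1, and `double_feature_names` is only a placeholder.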
# 
# from sklearn.preprocessing import FunctionTransformer
# 
# def get_distance_ball_to_goal_post(X, posts):
#     X_trans = X.copy()
#     ball_postions = X_trans[["ball_pos_x", "ball_pos_y", "ball_pos_z"]]
#     
#     for i, post in enumerate(posts):
#         diff_x = (X_trans["ball_pos_x"] - post[0]).pow(2)
#         diff_y = (X_trans["ball_pos_y"] - post[1]).pow(2)
#         diff_z = (X_trans["ball_pos_z"] - post[2]).pow(2)
# 
#         diff_x.add(diff_y).add(diff_z).pow(0.5)
#         X_trans[f"ball_distance_to_post{i}"] = ball_postions.apply(lambda x: euclidean(x, GOAL_POST1), axis=1)
# 
#     return X_trans
# 
# def double_feature_names(transformer, feature_names):
#     return [f'{col}_doubled' for col in feature_names]
# 
# FunctionTransformer(get_distance_ball_to_goal_post, feature_names_out=double_feature_names)

In [41]:
# Column-wise preprocessing: each transformer receives only the columns its
# selector picks by regex on the column name.
transformations = ColumnTransformer(
    transformers=[
        # Columns whose name matches "p[0-5]_": missing values become 0.0.
        ("player_imputer", player_imputer, selector(pattern="p[0-5]_")),
        # Columns whose name matches "ball_pos": handed to the goal-post distance transformer.
        ("distance_ball_goal_posts", DistanceBallGoalPosts(posts=(GOAL_POST1, GOAL_POST2)), selector(pattern="ball_pos")),
        ], 
    # Run the transformers in parallel on all available processors.
    n_jobs=-1,
    # Columns picked by neither selector are kept unchanged.
    remainder="passthrough",
    # Uncomment to drop the "<transformer name>__" prefix from the output feature names:
    # verbose_feature_names_out=False,
)

In [42]:
# Wrap the column transformations in a pipeline; it currently has this single step.
pipe = Pipeline(
    steps=[
        ("transformations", transformations),
        ]
    )

In [43]:
# Display the pipeline.
pipe

In [21]:
# Fit the pipeline on the training features and transform them in one call.
X_trans = pipe.fit_transform(X_train)

In [40]:
# List the output feature names reported by the fitted pipeline.
pipe.get_feature_names_out().tolist()

['p0_pos_x',
 'p0_pos_y',
 'p0_pos_z',
 'p0_vel_x',
 'p0_vel_y',
 'p0_vel_z',
 'p0_boost',
 'p1_pos_x',
 'p1_pos_y',
 'p1_pos_z',
 'p1_vel_x',
 'p1_vel_y',
 'p1_vel_z',
 'p1_boost',
 'p2_pos_x',
 'p2_pos_y',
 'p2_pos_z',
 'p2_vel_x',
 'p2_vel_y',
 'p2_vel_z',
 'p2_boost',
 'p3_pos_x',
 'p3_pos_y',
 'p3_pos_z',
 'p3_vel_x',
 'p3_vel_y',
 'p3_vel_z',
 'p3_boost',
 'p4_pos_x',
 'p4_pos_y',
 'p4_pos_z',
 'p4_vel_x',
 'p4_vel_y',
 'p4_vel_z',
 'p4_boost',
 'p5_pos_x',
 'p5_pos_y',
 'p5_pos_z',
 'p5_vel_x',
 'p5_vel_y',
 'p5_vel_z',
 'p5_boost',
 'ball_distance_to_post0',
 'ball_distance_to_post1',
 'ball_vel_x',
 'ball_vel_y',
 'ball_vel_z',
 'boost0_timer',
 'boost1_timer',
 'boost2_timer',
 'boost3_timer',
 'boost4_timer',
 'boost5_timer']

In [37]:
from pandas import DataFrame
# Keep the feature names reported by the fitted pipeline for labelling its output.
TRANS_FEATURES = pipe.get_feature_names_out().tolist()
# TODO: preview as a labelled frame (requires len(TRANS_FEATURES) == X_trans.shape[1]):
# DataFrame(X_trans, columns=TRANS_FEATURES).head()