## 0. Libraries and Personal Tools

In [1]:
import sys

from gc import collect
from os.path import abspath
from pandas import read_feather, read_csv
from pandas.api.types import CategoricalDtype

from sklearn.model_selection import train_test_split

In [2]:
# Put the parent directory (presumably the project root) at the front of the
# module search path so the project-local `src` package can be imported.
project_root = abspath('..')
sys.path.insert(0, project_root)

In [3]:
from src.utils import reduce_mem_usage

## 1. Get Train Data

In [4]:
# Load the compressed training set. The path is a plain string literal:
# it contains no placeholders, so no f-string is needed.
df = read_feather("../data/interim/train_compressed.ftr")

# Move the identifying columns into a MultiIndex; the "game_num" level is
# used later as the grouping key for the train/test split.
df.set_index(["game_num", "event_id", "event_time"], inplace=True)

In [5]:
# Run the garbage collector to free memory after loading the data
# (the displayed value is the number of unreachable objects it found)
collect()

0

## 2. Get Features and Targets

In [6]:
# The feature list is taken from the test-set schema file; the "id" column
# is excluded because it is an identifier, not a model input.
model_features = read_csv("../data/raw/test_dtypes.csv")
FEATURES = [feature for feature in model_features["column"].tolist() if feature != "id"]

# Collapse the two per-team targets into a single binary label:
# 1 if either team scores within 10 seconds, 0 otherwise.
model_targets = ["team_A_scoring_within_10sec", "team_B_scoring_within_10sec"]
df["scoring_within_10sec"] = (
    df[model_targets]
    .sum(axis=1)
    # Cap at 1 so a row where both flags are set still maps to the positive
    # class; a raw sum of 2 is outside the declared categories and would be
    # silently converted to NaN by the categorical cast below.
    .clip(upper=1)
    .astype(CategoricalDtype(categories=[0, 1], ordered=False))
)

In [7]:
# Number of model features (every schema column except "id")
len(FEATURES)

54

In [8]:
# Name of the combined binary label used as the modelling target
TARGET = "scoring_within_10sec"

# Relative class frequencies, with missing labels (if any) counted as well
target_distribution = df[TARGET].value_counts(normalize=True, dropna=False)
target_distribution

0    0.887607
1    0.112393
Name: scoring_within_10sec, dtype: float64

## 3. Train and Test Split

In [9]:
from sklearn.model_selection import GroupShuffleSplit

In [15]:
# Split by game rather than by row: every event of a given game_num ends up
# on the same side of the split. test_size=0.3 is the share of games held out.
# Only the first split is consumed, so a single split is requested; for a fixed
# random_state the first split does not depend on n_splits.
gsp = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=777)
train_index, test_index = next(gsp.split(df, groups=df.index.get_level_values("game_num")))

# With no explicit train_size, the two index sets together cover every row.
assert len(train_index) + len(test_index) == len(df)

# Select the feature columns once and slice both subsets from that selection,
# instead of materialising df[FEATURES] separately for train and test.
features = df[FEATURES]
X_train = features.iloc[train_index]
X_test = features.iloc[test_index]
del features  # drop the intermediate selection; only the two subsets are needed

y_train = df[TARGET].iloc[train_index]
y_test = df[TARGET].iloc[test_index]

In [21]:
# Run the garbage collector after building the train/test subsets
collect()

481

## 4. Missing Values

In [22]:
# Player features can be missing while that player is off the field
# (i.e. was demolished).
# Show the five columns with the most missing entries, as a fraction of rows.
missing_counts = X_train.isna().sum(axis=0)
most_missing = missing_counts.sort_values(ascending=False).head()
most_missing / len(X_train)

p3_pos_x    0.009167
p3_boost    0.009167
p3_pos_z    0.009167
p3_vel_x    0.009167
p3_vel_y    0.009167
dtype: float64

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

from sklearn.impute import SimpleImputer

In [24]:
# (transformer name, column-name regex) for each group of player features
# whose missing values are replaced by the constant 0.0.
ZERO_FILL_GROUPS = [
    ("player_pos_imputer", "p[0-5]_pos_"),
    ("player_vel_imputer", "p[0-5]_vel_"),
    ("player_boost_imputer", "p[0-5]_boost"),
]

# One constant imputer per group, applied to the columns matching its regex;
# all columns not matched by any group are passed through untouched.
imputers = ColumnTransformer(
    transformers=[
        (
            step_name,
            SimpleImputer(strategy="constant", fill_value=0.0),
            selector(pattern=column_regex),
        )
        for step_name, column_regex in ZERO_FILL_GROUPS
    ],
    remainder="passthrough",
    n_jobs=-1,
)

In [25]:
# Preprocessing pipeline; at this point it has one stage, missing-value imputation.
preprocessing_steps = [("impute_missings", imputers)]
pipe = Pipeline(steps=preprocessing_steps)

In [26]:
# Display the pipeline as the cell output to inspect its steps
pipe

In [27]:
# Fit the pipeline on the training split and transform that same split.
# NOTE(review): ColumnTransformer places the imputed column groups first and the
# passthrough remainder last, so the column order of X_trans presumably differs
# from X_train — confirm that downstream code accounts for this.
X_trans = pipe.fit_transform(X_train)

In [28]:
# Free some memory after fitting the pipeline and transforming the training data
collect()

125