## Data Preprocessing Code

`Version Control: 0.0.1`

In [1]:
__VERSION__ = "0.0.1"

`Prep: Libraries Used`

In [2]:
# Environment related libraries
import io
import os
import warnings

In [3]:
# Web related libraries
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

In [4]:
# Data related libraries
import math
import numpy as np
import pandas as pd

In [5]:
# Type related libraries
from typing import Dict, List, Tuple, Any

In [6]:
# Deep Neural Network related libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [7]:
# Metrics related libraries
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score

In [8]:
# Python File dumper
# version 0.0.1.2 revesion 250328

import gzip
import lzma
import pickle
from typing import Any

# Kompress can be:
#  gzip
#  lzma

def save(obj: Any, filename: str, *, kompress: Any = None, protocol: int | None = None, **kwargs):
    """
    Save a Python object to a file using pickle.
    Directly save without wrapping.
    """
    # Uncompress
    if kompress is None:
        with open(filename, 'wb') as f:
            pickle.dump(obj, f, protocol = protocol)
    # Compress
    else:
        with kompress.open(filename, 'wb', **kwargs) as f:
            pickle.dump(obj, f, protocol = protocol)
            
def dump(obj: Any, filename: str, *, kompress: Any = None, protocol: int | None = None, **kwargs):
    """
    Save a Python object to a file using pickle.
    Dump with wrapper and check.
    """
    wrapper = {}
    wrapper["~attr~"] = "~dump~"
    wrapper["~hash~"] = str(hash(obj))
    wrapper["data"] = obj
    
    # Uncompress
    if kompress is None:
        with open(filename, 'wb') as f:
            pickle.dump(wrapper, f, protocol = protocol)
    # Compress
    else:
        with kompress.open(filename, 'wb', **kwargs) as f:
            pickle.dump(obj, f, protocol = protocol)

def load(filename: str, *, kompress: Any = None) -> Any:
    """
    Load a Python object from a pickle file.
    Generally loading. Try to unwrap if possible
    
    Exception:
        Throw a ValueError when in the dumping mode and failed to
        pass the hash test.
    """
    # Uncompress
    if kompress is None:
        with open(filename, 'rb') as f:
            obj = pickle.load(f)
    else:
        with kompress.open(filename, 'rb') as f:
            obj = pickle.load(f)
    
    # No need to unwrap
    if isinstance(obj, dict) == False:
        return obj
    elif isinstance(obj, dict) == True and obj.get("~attr~", None) is None:
        return obj
    
    # Need to unwrap
    if isinstance(obj, dict) == True and obj.get("~attr~", None) == "~dump~":
        if obj.get("~hash~", None) is None:
            raise ValueError("Corrupted dumpped file. Hash attribute has Nonetype.")
        elif isinstance(obj.get("~hash~", None), str) == False:
            raise ValueError("Corrupted dumpped file. Hash attribute has Non-string type.")
        if obj.get("data", None) is None:
            raise ValueError("Corrupted dumpped file. Data attribute is Nonetype.")
        if obj.get("~hash~", None) != str(hash(obj.get("data"))):
            raise ValueError("Corrupted dumpped file. Data hash mismatched.")
        return obj["data"] 
    
    else:
        return obj


In [9]:
# Disable warnings
warnings.filterwarnings('ignore')

`Section: Data Importing`

In [10]:
# Data importer. Import all data either from a local address or from the remote site.
def import_all_data_v1(source: str = 'local') -> Tuple[Any, Any, Any, Any, Any, Any, Any, Any, Any]:
    """
    Import all data from CSV files either from a local drive or from an online source.
    
    Parameters:
    source (str): Determines the source of the files. Accepted values are 'local' or 'online'.
                  - 'local': The CSV files are read from a fixed relative local directory.
                  - 'online': The CSV files are read from a fixed online URL.
                  
    Returns:
    tuple: A tuple containing nine pandas DataFrames in the following order:
           1. train_away_player_statistics_df
           2. train_away_team_statistics_df
           3. train_home_player_statistics_df
           4. train_home_team_statistics_df
           5. test_away_player_statistics_df
           6. test_away_team_statistics_df
           7. test_home_player_statistics_df
           8. test_home_team_statistics_df
           9. Y_train
    """
    
    # If source is not `local` or `remote`
    if source not in ["local", "remote"]:
        raise ValueError(f"Arg `source` must be one of `local` or `remote` but you have `{source}`.")
    
    # Define the file names
    file_names = [
        "train_away_player_statistics_df.csv",
        "train_away_team_statistics_df.csv",
        "train_home_player_statistics_df.csv",
        "train_home_team_statistics_df.csv",
        "test_away_player_statistics_df.csv",
        "test_away_team_statistics_df.csv",
        "test_home_player_statistics_df.csv",
        "test_home_team_statistics_df.csv",
        "Y_train.csv"
    ]
    
    # Define fixed addresses for local and online sources.
    local_dir = "./data/"
    
    # For the online source, set a base URL where the CSV files are hosted.
    online_base_url = "https://huggingface.co/datasets/bh2821/soccer_pred/resolve/main/"
    # MIT Licensed data source.
    
    # List to hold the DataFrames
    dataframes = [] # a List of pd.DataFrames
    
    # Iterate over the file names and import the CSV files accordingly
    for file_name in file_names:
        if source == 'local':
            # Construct the full file path for the local files
            file_path = os.path.join(local_dir, file_name)
            try:
                df = pd.read_csv(file_path)
            except Exception as e:
                raise FileNotFoundError(f"Error reading local file '{file_path}': {e}")
        
        elif source == 'remote':
            # Construct the full URL for the remote files
            file_url = online_base_url + file_name
            try:
                # Disable warnings for insecure requests if certificate verification is disabled
                warnings.simplefilter('ignore', InsecureRequestWarning)
                # Fetch the content of the CSV file with SSL verification disabled
                response = requests.get(file_url, verify = False)
                response.raise_for_status()  # Raise an error for bad responses
                # Use io.StringIO to convert the response text into a file-like object for pandas
                df = pd.read_csv(io.StringIO(response.text), )
            except Exception as e:
                raise ConnectionError(f"Error reading online file '{file_url}': {e}")
        else:
            raise ValueError("Invalid source. Choose 'local' or 'remote'.")
        
        # Append the DataFrame to the list
        dataframes.append(df)
    
    # Return the DataFrames as a tuple
    return tuple(dataframes)


In [None]:
# Load the data from the HF remote source.
(train_away_player, train_away_team, train_home_player, train_home_team,
 test_away_player, test_away_team, test_home_player, test_home_team,
 train_labels) = import_all_data_v1(source='remote')


In [None]:
# Drop the extra columns for train data
train_away_team = train_away_team.drop(["LEAGUE", "TEAM_NAME"], axis=1)
train_home_team = train_home_team.drop(["LEAGUE", "TEAM_NAME"], axis=1)
train_away_player = train_away_player.drop(["LEAGUE", "TEAM_NAME", "PLAYER_NAME"], axis=1)
train_home_player = train_home_player.drop(["LEAGUE", "TEAM_NAME", "PLAYER_NAME"], axis=1)

In [None]:
#save((train_away_player, train_away_team, train_home_player, train_home_team,
# test_away_player, test_away_team, test_home_player, test_home_team,
# train_labels), "./imported_data.bin")
#(train_away_player, train_away_team, train_home_player, train_home_team,
# test_away_player, test_away_team, test_home_player, test_home_team,
# train_labels)  = load("./imported_data.bin")

`Section: Data Preprocessing`

v1

In [None]:
# Preprocesses team and player data for training or testing.
def preprocess_data_v1(data_type='train'):
    """
    Preprocesses team and player data for training or testing.

    For teams:
      - Renames columns (except "ID") with prefix "AWAY_" or "HOME_"
      - Inner joins the away and home team dataframes on "ID"
      - Fills NaN with 0 and divides numerical values by 10.

    For players:
      - One-hot encodes the "POSITION" column (treating NaN as a separate category).
      - For each team, pivots the player rows into a single wide row (max 30 players per team, padding with zeros if needed)
      - Renames columns for each player with prefix "AWAY" or "HOME" and an index suffix.
      - Joins away and home wide tables on "ID", fills NaNs with 0, and divides by 100.

    Args:
      data_type (str): 'train' or 'test'
      
    Returns:
      If data_type=='train': (team_data, player_data, labels)
      Else: (team_data, player_data)
    """
    
    # Select data based on train/test
    if data_type == 'train':
        away_player = train_away_player.copy()
        home_player = train_home_player.copy()
        away_team   = train_away_team.copy()
        home_team   = train_home_team.copy()
        labels      = train_labels.copy()
    elif data_type == 'test':
        away_player = test_away_player.copy()
        home_player = test_home_player.copy()
        away_team   = test_away_team.copy()
        home_team   = test_home_team.copy()
    else:
        raise ValueError("data_type must be 'train' or 'test'")
    
    # --- TEAM DATA PROCESSING ---
    # Function to prefix columns (except 'ID')
    def prefix_columns(df, prefix):
        new_columns = {col: f"{prefix}{col}" for col in df.columns if col != "ID"}
        return df.rename(columns=new_columns)
    
    # Rename columns for team data
    away_team_pref = prefix_columns(away_team, "AWAY_")
    home_team_pref = prefix_columns(home_team, "HOME_")
    
    # Merge on ID (inner join)
    team_data = pd.merge(away_team_pref, home_team_pref, on="ID", how="inner")
    # Standardize: fill NaNs and divide by 10
    team_data = team_data.fillna(0)
    # Assuming that the "ID" column is not numeric to standardize; we apply division to numeric columns only.
    numeric_cols = team_data.select_dtypes(include=[np.number]).columns
    team_data[numeric_cols] = team_data[numeric_cols] / 10.0

    # --- PLAYER DATA PROCESSING ---
    # One-hot encoding for "POSITION", treating NaN as separate category.
    def encode_positions(df):
        position_dummies = pd.get_dummies(df["POSITION"], prefix="POSITION", dummy_na=True)
        df = df.drop(columns=["POSITION"])
        return pd.concat([df, position_dummies], axis=1)
    
    away_player = encode_positions(away_player)
    home_player = encode_positions(home_player)
    
    # Function to pivot player rows into a wide table per team.
    def pivot_players(df, prefix, max_players=30):
        wide_rows = []
        # Group players by team ID
        for team_id, group in df.groupby("ID"):
            # For reproducibility, sort the group by index (or any specific order you need)
            group = group.sort_index().reset_index(drop=True)
            # Limit to max_players (pad if less than max_players)
            if len(group) < max_players:
                # Create a DataFrame with zeros for padding with same columns as group
                pad_df = pd.DataFrame(0, index=range(max_players - len(group)), columns=group.columns)
                group = pd.concat([group, pad_df], ignore_index=True)
            else:
                group = group.head(max_players).reset_index(drop=True)
            
            # Create a single row for this team
            wide_data = {"ID": team_id}
            # For each player row, add columns with a suffix indicating the player index (1-indexed)
            for i in range(max_players):
                player_data = group.iloc[i]
                for col in group.columns:
                    if col == "ID":
                        continue
                    new_col_name = f"{prefix}{col}_{i+1}"
                    wide_data[new_col_name] = player_data[col]
            wide_rows.append(wide_data)
        return pd.DataFrame(wide_rows)
    
    # Pivot away and home players separately
    away_players_wide = pivot_players(away_player, "AWAY_")
    home_players_wide = pivot_players(home_player, "HOME_")
    
    # Merge the wide tables on "ID"
    player_data = pd.merge(away_players_wide, home_players_wide, on="ID", how="inner")
    # Fill NaNs and standardize by dividing by 100.
    player_data = player_data.fillna(0)
    numeric_cols = player_data.select_dtypes(include=[np.number]).columns
    player_data[numeric_cols] = player_data[numeric_cols] / 100.0

    # Return outputs according to data_type
    if data_type == 'train':
        return team_data, player_data, labels
    else:
        return team_data, player_data


v2

In [None]:
# Preprocesses team and player data for training or testing.
def preprocess_data_v2(data_type='train'):
    """
    Preprocess team and player data for training or testing.
    
    TEAM DATA:
      - Rename team columns (except "ID") with prefixes "AWAY_" or "HOME_".
      - Outer join the two team dataframes on "ID".
      - Fill NaNs with 0 and standardize numeric columns (except "ID") by dividing by 10.
    
    PLAYER DATA (for both AWAY and HOME):
      - Work on the raw player dataframe (without any prefix).
      - For each team, fill missing POSITION with 'missing'.
      - Split players into five groups by POSITION with allowed limits:
            goalkeeper: max 8, defender: max 12, midfielder: max 12, attacker: max 12, missing: max 4.
      - Within each group, sort descending by "PLAYER_MINUTES_PLAYED_season_sum".
      - Concatenate the groups in fixed order: goalkeeper, defender, midfielder, attacker, missing.
      - Pad with rows (of zeros) so that each team has exactly 48 players.
      - One-hot encode the (post-alignment) POSITION column.
      - Finally, pivot the 48 player rows into a single wide row. Only after this pivot add the prefix 
        ("AWAY_PLAYER_" or "HOME_PLAYER_") to the resulting columns.
      - Standardize numeric columns (except "ID") by dividing by 100.
    
    FINAL STEP:
      - Outer join the processed team table with the processed (wide) player table on "ID".
      - Return the full merged dataframe (and also labels for training).
    
    Args:
      data_type (str): 'train' or 'test'
    
    Returns:
      For 'train': (full_data, labels)
      For 'test': full_data
    """
    
    # ===== Import Data =====
    # It is assumed that the following variables are already imported:
    # (train_away_player, train_away_team, train_home_player, train_home_team,
    #  test_away_player, test_away_team, test_home_player, test_home_team,
    #  train_labels) = import_all_data(source='remote')
    if data_type == 'train':
        away_player = train_away_player.copy()
        home_player = train_home_player.copy()
        away_team   = train_away_team.copy()
        home_team   = train_home_team.copy()
        labels      = train_labels.copy()
    elif data_type == 'test':
        away_player = test_away_player.copy()
        home_player = test_home_player.copy()
        away_team   = test_away_team.copy()
        home_team   = test_home_team.copy()
    else:
        raise ValueError("data_type must be 'train' or 'test'")
    
    # ===== TEAM DATA PROCESSING =====
    def prefix_team_columns(df, prefix):
        # Rename all columns except "ID" with the given prefix.
        new_cols = {col: f"{prefix}{col}" for col in df.columns if col != "ID"}
        return df.rename(columns=new_cols)
    
    away_team_pref = prefix_team_columns(away_team, "AWAY_")
    home_team_pref = prefix_team_columns(home_team, "HOME_")
    
    # Outer join teams on "ID" so that missing IDs on either side are retained.
    team_data = pd.merge(away_team_pref, home_team_pref, on="ID", how="outer")
    team_data = team_data.fillna(0)
    # Standardize numeric columns except "ID"
    num_cols = team_data.select_dtypes(include=[np.number]).columns.tolist()
    if "ID" in num_cols:
        num_cols.remove("ID")
    team_data[num_cols] = team_data[num_cols] / 10.0
    
    # ===== PLAYER DATA PROCESSING =====
    # Alignment parameters
    pos_order_map = {'goalkeeper': 0, 'defender': 1, 'midfielder': 2, 'attacker': 3, 'missing': 4}
    max_allowed   = {'goalkeeper': 8, 'defender': 12, 'midfielder': 12, 'attacker': 12, 'missing': 4}
    
    def process_team_group(group):
        # Process players for one team.
        # Fill missing POSITION.
        group = group.copy()
        group['POSITION'] = group['POSITION'].fillna('missing')
        # Map position to ordering.
        group['pos_order'] = group['POSITION'].map(pos_order_map)
        # Within each position, rank by PLAYER_MINUTES_PLAYED_season_sum (descending)
        group['pos_rank'] = group.groupby('POSITION')['PLAYER_MINUTES_PLAYED_season_sum']\
                                  .rank(method='first', ascending=False)
        # Filter out players that exceed the allowed count per POSITION.
        group = group[group.apply(lambda r: r['pos_rank'] <= max_allowed[r['POSITION']], axis=1)]
        # Sort by fixed order then by rank.
        group = group.sort_values(by=['pos_order', 'pos_rank'], ascending=[True, True]).reset_index(drop=True)
        # Assign a sequential player index.
        group['player_index'] = np.arange(1, len(group) + 1)
        # Pad with rows of zeros so that there are exactly 48 rows.
        if len(group) < 48:
            pad = pd.DataFrame(0, index=range(48 - len(group)), columns=group.columns)
            pad['player_index'] = np.arange(len(group) + 1, 49)
            # Ensure that the team ID is filled in the padding.
            if 'ID' in group.columns and not group['ID'].empty:
                pad['ID'] = group['ID'].iloc[0]
            group = pd.concat([group, pad], ignore_index=True)
        else:
            group = group.head(48)
        # One-hot encode the POSITION column.
        group['POSITION'] = group['POSITION'].replace(0, 'missing')
        pos_oh = pd.get_dummies(group['POSITION'], prefix="POSITION").astype(int)
        group = pd.concat([group.drop(columns=['POSITION']), pos_oh], axis=1)
        return group
    
    # We'll define a function that pivots players for each team.
    def process_players_df(df, prefix):
        # Process each team using groupby-apply.
        proc = df.groupby("ID", group_keys=True).apply(process_team_group)
        # The result has a MultiIndex: first level is team ID, second level is the row number within that team.
        # Drop helper columns that are not needed.
        proc = proc.drop(columns=['pos_order', 'pos_rank'])
        # Remove the constant 'ID' column (it will be reattached after pivoting).
        proc_no_id = proc.drop(columns=["ID"])
        # Set 'player_index' as an additional index level.
        proc_no_id = proc_no_id.set_index('player_index', append=True)
        # Now drop the extra index level created by groupby (the original row number) so that the index is [ID, player_index]
        proc_no_id.index = proc_no_id.index.droplevel(1)
        # Pivot: unstack the 'player_index' level so that each player's data becomes columns.
        wide_table = proc_no_id.unstack('player_index')
        # Flatten the multi-index columns and add the desired prefix.
        wide_table.columns = [f"{prefix}{col}_{idx}" for col, idx in wide_table.columns]
        wide_table = wide_table.reset_index()
        return wide_table
    
    # Process away and home players.
    away_players_wide = process_players_df(away_player.copy(), "AWAY_PLAYER_")
    home_players_wide = process_players_df(home_player.copy(), "HOME_PLAYER_")
    
    # Outer join the two wide player tables on "ID"
    player_data = pd.merge(away_players_wide, home_players_wide, on="ID", how="outer")
    player_data = player_data.fillna(0)
    # Standardize numeric columns (except "ID") by dividing by 100.
    num_cols = player_data.select_dtypes(include=[np.number]).columns.tolist()
    if "ID" in num_cols:
        num_cols.remove("ID")
    player_data[num_cols] = player_data[num_cols] / 100.0
    
    # ===== FINAL MERGE =====
    full_data = pd.merge(team_data, player_data, on="ID", how="outer")
    full_data = full_data.fillna(0)
    
    if data_type == 'train':
        return full_data, labels
    else:
        return full_data


v3

In [None]:
# Preprocesses team and player data for training or testing.
def preprocess_data_v3(data_type='train'):
    """
    Process raw team and player data into a 3D array for CNN models.
    
    For each match (unique ID) we gather:
      [HOME_TEAM_ROW, AWAY_TEAM_ROW, HOME_PLAYER_ROWS, AWAY_PLAYER_ROWS]
    (Here team rows and player rows are kept in their raw form without renaming.)
    
    Then for each match we build a 2D array (shape: 147 x W) where:
      - The first 3 rows are team info:
          Row 1: HOME team (padded to width W)
          Row 2: AWAY team (padded to width W)
          Row 3: HOME - AWAY difference (padded to width W)
      - The next 144 rows are built from players:
          * For each team we build a player table with exactly 48 rows.
            For each POSITION category, players are sorted descending by 
            "PLAYER_MINUTES_PLAYED_season_sum" and padded to fixed numbers:
                goalkeeper: 8, defender: 12, midfielder: 12, attacker: 12, missing: 4.
            The padded rows have all numeric features = 0 but the POSITION one-hot 
            (later) is set to that category.
          * Then we compute DIFF = HOME_player_table - AWAY_player_table.
          * Finally, we interleave row-wise: for each of the 48 indices, stack:
              HOME row, AWAY row, and DIFF row.
    
    Standardization:
      - The team section (first 3 rows) is divided by 10.
      - The player section (remaining 144 rows) is divided by 100.
    
    Returns:
      A 3D numpy array of shape (n_matches, 147, W).
    """
    # --- Import Data ---
    # It is assumed that the following variables have been loaded:
    #   train_home_team, train_away_team, train_home_player, train_away_player,
    #   test_home_team, test_away_team, test_home_player, test_away_player, train_labels
    if data_type == 'train':
        home_team_df = train_home_team.copy()
        away_team_df = train_away_team.copy()
        home_player_df = train_home_player.copy()
        away_player_df = train_away_player.copy()
        labels      = train_labels.copy()
    elif data_type == 'test':
        home_team_df = test_home_team.copy()
        away_team_df = test_away_team.copy()
        home_player_df = test_home_player.copy()
        away_player_df = test_away_player.copy()
    else:
        raise ValueError("data_type must be 'train' or 'test'")
    
    # --- Build dictionaries for team info ---
    # Keyed by ID; drop the "ID" later.
    home_team_dict = {row['ID']: row.drop('ID') for _, row in home_team_df.iterrows()}
    away_team_dict = {row['ID']: row.drop('ID') for _, row in away_team_df.iterrows()}
    
    # --- Build dictionaries for player data ---
    # Group by ID for players.
    home_players_dict = {k: v.copy() for k, v in home_player_df.groupby('ID')}
    away_players_dict = {k: v.copy() for k, v in away_player_df.groupby('ID')}
    
    # All match IDs:
    match_ids = set(home_team_dict.keys()) | set(away_team_dict.keys()) | \
                set(home_players_dict.keys()) | set(away_players_dict.keys())
    
    # --- Function to build a 48-row player table for one team ---
    def build_player_table(team_players):
        """
        Given a DataFrame of players for one team, build a table of exactly 48 rows.
        For each POSITION, sort descending by "PLAYER_MINUTES_PLAYED_season_sum" and pad
        to the fixed allowed number:
          goalkeeper: 8, defender: 12, midfielder: 12, attacker: 12, missing: 4.
        Padded rows have numeric features = 0 and POSITION set appropriately.
        """
        # Fixed order and allowed counts.
        pos_order = ['goalkeeper', 'defender', 'midfielder', 'attacker', 'missing']
        allowed = {'goalkeeper': 8, 'defender': 12, 'midfielder': 12, 'attacker': 12, 'missing': 4}
        
        # Work on a copy and fill missing POSITION.
        df = team_players.copy()
        df['POSITION'] = df['POSITION'].fillna('missing')
        
        # List to hold per-category DataFrames.
        parts = []
        for pos in pos_order:
            sub = df[df['POSITION'] == pos].copy()
            # Sort descending by "PLAYER_MINUTES_PLAYED_season_sum"
            if not sub.empty and "PLAYER_MINUTES_PLAYED_season_sum" in sub.columns:
                sub = sub.sort_values(by="PLAYER_MINUTES_PLAYED_season_sum", ascending=False)
            sub = sub.head(allowed[pos])
            n = len(sub)
            if n < allowed[pos]:
                # Create a DataFrame of padded rows.
                pad = pd.DataFrame(0, index=np.arange(allowed[pos]-n), columns=df.columns)
                pad['POSITION'] = pos  # mark the category for one-hot later
                sub = pd.concat([sub, pad], ignore_index=True)
            parts.append(sub)
        # Concatenate in fixed order: total rows = 48.
        table = pd.concat(parts, axis=0).reset_index(drop=True)
        table = table.iloc[:48, :]  # ensure exactly 48 rows
        
        # Drop ID if present.
        if 'ID' in table.columns:
            table = table.drop(columns=['ID'])
        # --- One-hot encode POSITION ---
        # Do it on the entire table so that padded rows get the proper indicator.
        pos_dummies = pd.get_dummies(table['POSITION'], prefix='POSITION').astype(int)
        # Ensure all five columns exist.
        for pos in pos_order:
            col = f"POSITION_{pos}"
            if col not in pos_dummies.columns:
                pos_dummies[col] = 0
        # Order the one-hot columns in a fixed order.
        pos_dummies = pos_dummies[[f"POSITION_{p}" for p in pos_order]]
        table = table.drop(columns=['POSITION'])
        table = pd.concat([table, pos_dummies], axis=1)
        table = table.fillna(0) 
        return table.reset_index(drop=True)
    
    # --- Process one match ---
    def process_match(match_id):
        # TEAM SECTION:
        # Get team info (as Series); if missing, create a Series of zeros.
        def pad_team_series(s, target_length):
            arr = s.to_numpy(dtype=float)
            if len(arr) < target_length:
                arr = np.concatenate([arr, np.zeros(target_length - len(arr))])
            else:
                arr = arr[:target_length]
            return arr
        
        # Build a dummy player table to know the target width.
        dummy = build_player_table(pd.DataFrame(columns=home_player_df.columns))
        target_width = dummy.shape[1]
        
        home_team = home_team_dict.get(match_id, pd.Series(0, index=dummy.columns))
        away_team = away_team_dict.get(match_id, pd.Series(0, index=dummy.columns))
        # Convert Series to numpy after filling NaN
        home_team_arr = pad_team_series(home_team.fillna(0), target_width)
        away_team_arr = pad_team_series(away_team.fillna(0), target_width)
        team_diff_arr = home_team_arr - away_team_arr
        team_section = np.vstack([home_team_arr, away_team_arr, team_diff_arr])  # shape (3, target_width)
        
        # PLAYER SECTION:
        # Build player tables (48 rows each) for home and away.
        home_players = home_players_dict.get(match_id, pd.DataFrame(columns=home_player_df.columns))
        away_players = away_players_dict.get(match_id, pd.DataFrame(columns=away_player_df.columns))
        home_table = build_player_table(home_players)
        away_table = build_player_table(away_players)
        home_arr = home_table.to_numpy(dtype=float)
        away_arr = away_table.to_numpy(dtype=float)
        diff_arr = home_arr - away_arr
        
        # Interleave rows: for each index i, rows are:
        # home_table[i], away_table[i], diff_arr[i]
        interleaved = np.empty((48*3, home_arr.shape[1]), dtype=float)
        interleaved[0::3, :] = home_arr
        interleaved[1::3, :] = away_arr
        interleaved[2::3, :] = diff_arr
        
        # Final 2D array: 3 team rows + 144 player rows = 147 rows.
        final_2d = np.vstack([team_section, interleaved])
        # Standardize: team section (first 3 rows) divided by 10, players (remaining) by 100.
        final_2d[:3, :] = final_2d[:3, :] / 10.0
        final_2d[3:, :] = final_2d[3:, :] / 100.0
        return final_2d
    
    # --- Process all matches ---
    results = [process_match(mid) for mid in match_ids]
    final_3d = np.stack(results, axis=0)  # shape: (n_matches, 147, target_width)

    if data_type == "train":
        return final_3d, labels
    elif data_type == "test":
        return final_3d, 
    else:
        raise ValueError("data_type must be 'train' or 'test'")
    

In [None]:
# Preprocesses team and player data for training or testing.
def preprocess_data_v3_1(data_type='train'):
    """
    Process raw team and player data into a 3D array for CNN models.
    
    For each match (unique ID) we gather:
      [HOME_TEAM_ROW, AWAY_TEAM_ROW, HOME_PLAYER_ROWS, AWAY_PLAYER_ROWS]
    (Here team rows and player rows are kept in their raw form without renaming.)
    
    Then for each match we build a 2D array (shape: 151 x W) where:
      - The first 3 rows are team info:
          Row 1: HOME team (padded to width W)
          Row 2: AWAY team (padded to width W)
          Row 3: HOME - AWAY difference (padded to width W)
      - The next 148 rows are built from players:
          * For each team we build a player table with exactly 48 rows.
            For each POSITION category, players are sorted descending by 
            "PLAYER_MINUTES_PLAYED_season_sum" and padded to fixed numbers:
                goalkeeper: 8, defender: 12, midfielder: 12, attacker: 12, missing: 8.
            The padded rows have all numeric features = 0 but the POSITION one-hot 
            (later) is set to that category.
          * Then we compute DIFF = HOME_player_table - AWAY_player_table.
          * Finally, we interleave row-wise: for each of the 48 indices, stack:
              HOME row, AWAY row, and DIFF row.
    
    Standardization:
      - The team section (first 3 rows) is divided by 10.
      - The player section (remaining 148 rows) is divided by 100.
    
    Returns:
      A 3D numpy array of shape (n_matches, 151, W).
    """
    # --- Import Data ---
    # It is assumed that the following variables have been loaded:
    #   train_home_team, train_away_team, train_home_player, train_away_player,
    #   test_home_team, test_away_team, test_home_player, test_away_player, train_labels
    if data_type == 'train':
        home_team_df = train_home_team.copy()
        away_team_df = train_away_team.copy()
        home_player_df = train_home_player.copy()
        away_player_df = train_away_player.copy()
        labels      = train_labels.copy()
    elif data_type == 'test':
        home_team_df = test_home_team.copy()
        away_team_df = test_away_team.copy()
        home_player_df = test_home_player.copy()
        away_player_df = test_away_player.copy()
    else:
        raise ValueError("data_type must be 'train' or 'test'")
    
    # --- Build dictionaries for team info ---
    # Keyed by ID; drop the "ID" later.
    home_team_dict = {row['ID']: row.drop('ID') for _, row in home_team_df.iterrows()}
    away_team_dict = {row['ID']: row.drop('ID') for _, row in away_team_df.iterrows()}
    
    # --- Build dictionaries for player data ---
    # Group by ID for players.
    home_players_dict = {k: v.copy() for k, v in home_player_df.groupby('ID')}
    away_players_dict = {k: v.copy() for k, v in away_player_df.groupby('ID')}
    
    # All match IDs:
    match_ids = set(home_team_dict.keys()) | set(away_team_dict.keys()) | \
                set(home_players_dict.keys()) | set(away_players_dict.keys())
    
    # --- Function to build a 52-row player table for one team ---
    def build_player_table(team_players):
        """
        Given a DataFrame of players for one team, build a table of exactly 52 rows.
        For each POSITION, sort descending by "PLAYER_MINUTES_PLAYED_season_sum" and pad
        to the fixed allowed number:
          goalkeeper: 8, defender: 12, midfielder: 12, attacker: 12, missing: 8.
        Padded rows have numeric features = 0 and POSITION set appropriately.
        """
        # Fixed order and allowed counts.
        pos_order = ['goalkeeper', 'defender', 'midfielder', 'attacker', 'missing']
        allowed = {'goalkeeper': 8, 'defender': 12, 'midfielder': 12, 'attacker': 12, 'missing': 8}
        
        # Work on a copy and fill missing POSITION.
        df = team_players.copy()
        df['POSITION'] = df['POSITION'].fillna('missing')
        
        # List to hold per-category DataFrames.
        parts = []
        for pos in pos_order:
            sub = df[df['POSITION'] == pos].copy()
            # Sort descending by "PLAYER_MINUTES_PLAYED_season_sum"
            if not sub.empty and "PLAYER_MINUTES_PLAYED_season_sum" in sub.columns:
                sub = sub.sort_values(by="PLAYER_MINUTES_PLAYED_season_sum", ascending=False)
            sub = sub.head(allowed[pos])
            n = len(sub)
            if n < allowed[pos]:
                # Create a DataFrame of padded rows.
                pad = pd.DataFrame(0, index=np.arange(allowed[pos]-n), columns=df.columns)
                pad['POSITION'] = pos  # mark the category for one-hot later
                sub = pd.concat([sub, pad], ignore_index=True)
            parts.append(sub)
        # Concatenate in fixed order: total rows = 52.
        table = pd.concat(parts, axis=0).reset_index(drop=True)
        table = table.iloc[:52, :]  # ensure exactly 52 rows
        
        # Drop ID if present.
        if 'ID' in table.columns:
            table = table.drop(columns=['ID'])
        # --- One-hot encode POSITION ---
        # Do it on the entire table so that padded rows get the proper indicator.
        pos_dummies = pd.get_dummies(table['POSITION'], prefix='POSITION').astype(int)
        # Ensure all five columns exist.
        for pos in pos_order:
            col = f"POSITION_{pos}"
            if col not in pos_dummies.columns:
                pos_dummies[col] = 0
        # Order the one-hot columns in a fixed order.
        pos_dummies = pos_dummies[[f"POSITION_{p}" for p in pos_order]]
        table = table.drop(columns=['POSITION'])
        table = pd.concat([table, pos_dummies], axis=1)
        table = table.fillna(0) 
        return table.reset_index(drop=True)
    
    # --- Process one match ---
    def process_match(match_id):
        # TEAM SECTION:
        # Get team info (as Series); if missing, create a Series of zeros.
        def pad_team_series(s, target_length):
            arr = s.to_numpy(dtype=float)
            if len(arr) < target_length:
                arr = np.concatenate([arr, np.zeros(target_length - len(arr))])
            else:
                arr = arr[:target_length]
            return arr
        
        # Build a dummy player table to know the target width.
        dummy = build_player_table(pd.DataFrame(columns=home_player_df.columns))
        target_width = dummy.shape[1]
        
        home_team = home_team_dict.get(match_id, pd.Series(0, index=dummy.columns))
        away_team = away_team_dict.get(match_id, pd.Series(0, index=dummy.columns))
        # Convert Series to numpy after filling NaN
        home_team_arr = pad_team_series(home_team.fillna(0), target_width)
        away_team_arr = pad_team_series(away_team.fillna(0), target_width)
        team_diff_arr = home_team_arr - away_team_arr
        team_section = np.vstack([home_team_arr, away_team_arr, team_diff_arr])  # shape (3, target_width)
        
        # PLAYER SECTION:
        # Build player tables (52 rows each) for home and away.
        home_players = home_players_dict.get(match_id, pd.DataFrame(columns=home_player_df.columns))
        away_players = away_players_dict.get(match_id, pd.DataFrame(columns=away_player_df.columns))
        home_table = build_player_table(home_players)
        away_table = build_player_table(away_players)
        home_arr = home_table.to_numpy(dtype=float)
        away_arr = away_table.to_numpy(dtype=float)
        diff_arr = home_arr - away_arr

        # Interleave rows: for each index i, rows are:
        # home_table[i], away_table[i], diff_arr[i]
        interleaved = np.empty((52*3, home_arr.shape[1]), dtype=float)
        interleaved[0::3, :] = home_arr
        interleaved[1::3, :] = away_arr
        interleaved[2::3, :] = diff_arr
        
        # Final 2D array: 3 team rows + 148 player rows = 151 rows.
        final_2d = np.vstack([team_section, interleaved])
        # Standardize: team section (first 3 rows) divided by 10, players (remaining) by 100.
        final_2d[:3, :] = final_2d[:3, :] / 10.0
        final_2d[3:, :] = final_2d[3:, :] / 100.0
        return final_2d
    
    # --- Process all matches ---
    results = [process_match(mid) for mid in match_ids]
    final_3d = np.stack(results, axis=0)  # shape: (n_matches, 151, target_width)

    if data_type == "train":
        return final_3d, labels
    elif data_type == "test":
        return final_3d, 
    else:
        raise ValueError("data_type must be 'train' or 'test'")
    

In [10]:
# Preprocesses team and player data for training or testing.
def preprocess_data_v3_2(data_type='train'):
    """
    Process raw team and player data into a 4D array for CNN models.
    
    For each match (unique ID) we gather:
      [HOME_TEAM_ROW, AWAY_TEAM_ROW, HOME_PLAYER_ROWS, AWAY_PLAYER_ROWS]
    (Here team rows and player rows are kept in their raw form without renaming.)
    
    For each match we return TWO 2‑D blocks shaped (53, W):
      block[0]  -> home team  (1 team‑row  + 52 player‑rows)
      block[1]  -> away team  (1 team‑row  + 52 player‑rows)

    Detail: for each match we build TWO 2D array (shape: 53 x W) where:
      - The [0] is for home, [1] is for away
      - The first is team info:
          Row 1: Team information
      - The next 52 rows are team players:
          * For each team we build a player table with exactly 52 rows.
            For each POSITION category, players are sorted descending by 
            "PLAYER_MINUTES_PLAYED_season_sum" and padded to fixed numbers:
                goalkeeper: 8, defender: 12, midfielder: 12, attacker: 12, missing: 8.
            The padded rows have all numeric features = 0 but the POSITION one-hot 
            (later) is set to that category.
    
    Standardization:
      - The team section (first row) is divided by 10.
      - The player section (remaining 152 rows) is divided by 100.
    
    Returns:
      A 4D numpy array of shape (n_matches, 2, 53, W).
    """
    # 1. Select the correct raw tables
    if data_type == "train":
        home_team_df  = train_home_team.copy()
        away_team_df  = train_away_team.copy()
        home_pl_df    = train_home_player.copy()
        away_pl_df    = train_away_player.copy()
        labels        = train_labels.copy()
    elif data_type == "test":
        home_team_df  = test_home_team.copy()
        away_team_df  = test_away_team.copy()
        home_pl_df    = test_home_player.copy()
        away_pl_df    = test_away_player.copy()
    else:
        raise ValueError("data_type must be 'train' or 'test'")

    # 2. Fast dictionaries keyed by match‑ID
    home_team = {row.ID: row.drop("ID") for _, row in home_team_df.iterrows()}
    away_team = {row.ID: row.drop("ID") for _, row in away_team_df.iterrows()}

    home_pl   = {k: v.copy() for k, v in home_pl_df.groupby("ID")}
    away_pl   = {k: v.copy() for k, v in away_pl_df.groupby("ID")}

    match_ids = sorted(set(home_team) | set(away_team) | set(home_pl) | set(away_pl))

    # 3. Helpers
    POS_ORDER = ["goalkeeper", "defender", "midfielder", "attacker", "missing"]
    LIMITS    = {"goalkeeper": 8, "defender": 12, "midfielder": 12,
                 "attacker": 12, "missing": 8} 

    # Build once so we know the final column layout for players
    _empty_player_tbl = pd.DataFrame(columns=home_pl_df.columns)
    _dummy            = pd.get_dummies(pd.Series(POS_ORDER, name="POSITION"), prefix="POSITION")
    PLAYER_COLS       = (
        _empty_player_tbl.drop(columns=["ID", "POSITION"], errors="ignore").columns.tolist()
        + [f"POSITION_{p}" for p in POS_ORDER]
    )

    PLAYER_WIDTH = len(PLAYER_COLS)
    TEAM_WIDTH   = len(home_team_df.drop(columns=["ID"]).columns)
    WIDTH        = max(PLAYER_WIDTH, TEAM_WIDTH)

    def _build_player_block(df: pd.DataFrame) -> np.ndarray:
        """
        Return (52, WIDTH) player matrix for a single team.
        """
        if df.empty:
            # All‑zero except the one‑hot that marks the padded position
            blocks = []
            for p in POS_ORDER:
                blk = np.zeros((LIMITS[p], WIDTH), dtype=np.float32)
                col_idx = PLAYER_COLS.index(f"POSITION_{p}")
                blk[:, col_idx] = 1.0
                blocks.append(blk)
            return np.vstack(blocks)

        df = df.copy()
        df["POSITION"] = df["POSITION"].fillna("missing")

        mats = []
        for p in POS_ORDER:
            sub = df[df["POSITION"] == p]
            if "PLAYER_MINUTES_PLAYED_season_sum" in sub.columns and not sub.empty:
                sub = sub.nlargest(LIMITS[p], "PLAYER_MINUTES_PLAYED_season_sum")
            # Pad/truncate to exact limit
            if len(sub) < LIMITS[p]:
                pad = pd.DataFrame(
                    0,
                    index=np.arange(LIMITS[p] - len(sub)),
                    columns=sub.columns,
                )
                pad["POSITION"] = p
                sub = pd.concat([sub, pad], ignore_index=True)
            else:
                sub = sub.iloc[: LIMITS[p]]

            # numeric block (no ID / POSITION) + one‑hot
            X_num = sub.drop(columns=["ID", "POSITION"], errors="ignore").to_numpy(float)
            # make sure numeric has WIDTH – len(one_hot) columns
            if X_num.shape[1] < PLAYER_WIDTH - len(POS_ORDER):
                X_num = np.pad(
                    X_num, ((0, 0), (0, PLAYER_WIDTH - len(POS_ORDER) - X_num.shape[1]))
                )

            oh = pd.get_dummies(sub["POSITION"], prefix="POSITION").reindex(
                columns=[f"POSITION_{q}" for q in POS_ORDER], fill_value=0
            ).to_numpy(int)

            mats.append(np.hstack([X_num, oh]))

        out = np.vstack(mats)
        # pad to WIDTH
        if PLAYER_WIDTH < WIDTH:
            out = np.pad(out, ((0, 0), (0, WIDTH - PLAYER_WIDTH)))
        return out.astype(np.float32)

    def _team_vector(s: pd.Series) -> np.ndarray:
        """
        Return (WIDTH,) vector for a team row, padded/truncated as needed.
        """
        if s.empty:
            vec = np.zeros(WIDTH, dtype=np.float32)
        else:
            v = s.to_numpy(float)
            if len(v) < WIDTH:
                v = np.pad(v, (0, WIDTH - len(v)))
            elif len(v) > WIDTH:
                v = v[:WIDTH]
            vec = v.astype(np.float32)
        return vec

    # 4. Main loop over matches
    blocks = []
    for mid in match_ids:
        home_team_vec = _team_vector(home_team.get(mid, pd.Series(dtype=float)))
        away_team_vec = _team_vector(away_team.get(mid, pd.Series(dtype=float)))

        home_players  = _build_player_block(home_pl.get(mid, pd.DataFrame()))
        away_players  = _build_player_block(away_pl.get(mid, pd.DataFrame()))

        # Assemble 2×53×W
        home_block = np.vstack([home_team_vec, home_players])   # (53, W)
        away_block = np.vstack([away_team_vec, away_players])   # (53, W)

        # Scaling
        home_block[0] /= 10.0
        away_block[0] /= 10.0
        home_block[1:] /= 100.0
        away_block[1:] /= 100.0

        # ── NEW: wipe remaining NaNs ──────────────────────────
        np.nan_to_num(home_block, copy=False)
        np.nan_to_num(away_block, copy=False)

        blocks.append(np.stack([home_block, away_block], axis=0))  # (2,53,W)

    X = np.stack(blocks, axis=0)       # (n_matches, 2, 53, W)

    if data_type == "train":
        return X, labels
    return X,
    

In [9]:
# Create augmented data for training
# 3. Future: Enhanced samples:
# 1. The results are swapped after the host and guest are swapped,
# 2. Players in the same position can be randomly mixed

def augment_match_dataset_v3_2(X, y=None,
        *,
        swap_teams: bool = True,
        shuffle_positions: bool = True,
        n_row_shuffles: int = 2,
        include_shuffled_swapped: bool = True,
        random_state: int | None = None,
):
    """
    Data‑augmentation for tensors shaped (n_matches, 2, 53, W).

    Parameters
    ----------
    X : ndarray
        Original tensor from `preprocess_data_v3_2` with shape (N, 2, 53, W).
    y : ndarray | None, default=None
        1‑D label np.array.  Binary (0/1) or 3‑class (0‑draw, 1‑home‑win, 2‑away‑win)
        are both handled automatically.  If ``None`` only `X_aug` is returned.
    swap_teams : bool, default=True
        When ``True`` adds a copy with the two team blocks [home, away]
        swapped to [away, home].
    shuffle_positions : bool, default=True
        Randomly permute players *within* each position block
        (size = 8 + 12 + 12 + 12 + 8 = 52) **without** touching the order of
        the blocks themselves.
    n_row_shuffles : int, default=1
        How many independent shuffles to generate *per* original sample.
    include_shuffled_swapped : bool, default=False
        If ``True`` also produces a “swap‑then‑shuffle” version.
    random_state : int | None
        Seed for reproducibility.

    Returns
    -------
    X_aug : ndarray, shape (N_aug, 2, 53, W)
    y_aug : ndarray, shape (N_aug,)   (only if *y* supplied)
    """
    rng = np.random.default_rng(random_state)

    N, _, R, W = X.shape
    assert R == 53, "Expected second dimension size 53."

    # indices (1–52) for each position group, **within a single 53‑row block**
    pos_idx = {
        "GK":  np.arange(1, 1 + 8),
        "DEF": np.arange(9, 9 + 12),
        "MID": np.arange(21, 21 + 12),
        "ATT": np.arange(33, 33 + 12),
        "MIS": np.arange(45, 45 + 8),
    }

    def _permute_positions(block: np.ndarray) -> np.ndarray:
        """Return a copy where rows inside each position are shuffled."""
        blk = block.copy()
        for idx in pos_idx.values():
            perm = rng.permutation(idx)
            blk[idx] = blk[perm]
        return blk

    # Label‑swap helper
    def _swap_label(lbl):
        # binary {0,1}           → 1 - lbl
        # tri‑class {0,1,2}      → 0 stays, 1 exchanges 2
        if lbl in (0, 1):
            return 1 - lbl
        if lbl in (0, 1, 2):
            return {0: 0, 1: 2, 2: 1}[lbl]
        raise ValueError(
            "Unrecognised label value. "
            "Pass `y=None` or handle labels outside this function."
        )

    # Build the augmented lists
    X_out, y_out = [], []

    for i in range(N):
        x0 = X[i]

        # 1. original
        X_out.append(x0)
        if y is not None:
            y_out.append(y[i])

        # 2. swapped home and away
        if swap_teams:
            x_sw = x0[::-1].copy()        # reverse first axis
            X_out.append(x_sw)
            if y is not None:
                y_out.append(_swap_label(y[i]))

        # 3. row‑shuffles
        if shuffle_positions:
            for _ in range(n_row_shuffles):
                x_sh = x0.copy()
                x_sh[0] = _permute_positions(x_sh[0])   # home
                x_sh[1] = _permute_positions(x_sh[1])   # away
                X_out.append(x_sh)
                if y is not None:
                    y_out.append(y[i])

                # optionally shuffle the swapped version too
                if swap_teams and include_shuffled_swapped:
                    x_sws = x_sw.copy()
                    x_sws[0] = _permute_positions(x_sws[0])
                    x_sws[1] = _permute_positions(x_sws[1])
                    X_out.append(x_sws)
                    if y is not None:
                        y_out.append(_swap_label(y[i]))

    X_aug = np.stack(X_out, axis=0)
    if y is None:
        return X_aug,
    return X_aug, np.asarray(y_out)


Do the data processing - v1

In [None]:
team_train, player_train, labels = preprocess_data_v1(data_type='train')

In [10]:
team_test, player_test = preprocess_data_v1(data_type="test")

In [None]:
# Only for v1, since the IDs are divided by 100/10
team_train["ID"] = team_train["ID"] * 10
team_train["ID"] = team_train["ID"].astype(int)
player_train["ID"] = player_train["ID"] * 100
player_train["ID"] = player_train["ID"].astype(int)

In [None]:
# Only for v1, since the IDs are divided by 100/10
team_test["ID"] = team_test["ID"] * 10
team_test["ID"] = team_test["ID"].astype(int)
player_test["ID"] = player_test["ID"] * 100
player_test["ID"] = player_test["ID"].astype(int)

team_test.index = team_test["ID"]
player_test.index = player_test["ID"]

In [None]:
#save((team_train, player_train, labels), "./processed_train_v1.bin")
#(team_train, player_train, labels) = load("./processed_train_v1.bin")

In [None]:
#save((team_test, player_test), "./processed_test_v1.bin")
#(team_test, player_test) = load("./processed_test_v1.bin")

In [None]:
X_train = pd.merge(team_train, player_train, how = "inner", on = "ID")
X_train

In [17]:
player_test.index.name = ""
team_test.index.name = ""
X_test = pd.merge(team_test, player_test, how = "inner", on = "ID")
X_test.index = X_test["ID"].to_list()
X_test

Unnamed: 0,ID,AWAY_TEAM_SHOTS_TOTAL_season_sum,AWAY_TEAM_SHOTS_INSIDEBOX_season_sum,AWAY_TEAM_SHOTS_OFF_TARGET_season_sum,AWAY_TEAM_SHOTS_ON_TARGET_season_sum,AWAY_TEAM_SHOTS_OUTSIDEBOX_season_sum,AWAY_TEAM_PASSES_season_sum,AWAY_TEAM_SUCCESSFUL_PASSES_season_sum,AWAY_TEAM_SAVES_season_sum,AWAY_TEAM_CORNERS_season_sum,...,HOME_PLAYER_YELLOWCARDS_5_last_match_std_30,HOME_PLAYER_PUNCHES_5_last_match_std_30,HOME_PLAYER_LONG_BALLS_5_last_match_std_30,HOME_PLAYER_LONG_BALLS_WON_5_last_match_std_30,HOME_PLAYER_SHOTS_OFF_TARGET_5_last_match_std_30,HOME_POSITION_attacker_30,HOME_POSITION_defender_30,HOME_POSITION_goalkeeper_30,HOME_POSITION_midfielder_30,HOME_POSITION_nan_30
12303,12303,0.6,0.6,0.5,0.7,0.7,0.4,0.3,0.6,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12304,12304,0.4,0.4,0.4,0.4,0.3,0.8,0.8,0.4,0.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12305,12305,0.1,0.1,0.2,0.2,0.2,0.6,0.5,0.4,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12306,12306,0.6,0.9,0.6,0.9,0.4,0.4,0.4,0.1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12307,12307,0.5,0.5,0.2,0.4,0.4,0.1,0.2,0.3,0.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37666,37666,0.8,0.8,0.6,1.0,0.7,0.8,0.8,1.0,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37667,37667,0.7,0.4,0.4,0.9,0.8,0.7,0.6,0.3,0.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37668,37668,0.3,0.4,0.2,0.3,0.1,0.1,0.1,0.3,0.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37669,37669,0.5,0.5,0.5,0.3,0.4,0.3,0.3,0.3,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#save((X_test,), "./merged_X_test_v1.bin")
#(X_test,) = load("./merged_X_test_v1.bin")

Do the data processing - v2

No Extra Processing Needed

In [None]:
X_train, labels = preprocess_data_v2(data_type='train')

In [27]:
X_train

Unnamed: 0,ID,AWAY_TEAM_SHOTS_TOTAL_season_sum,AWAY_TEAM_SHOTS_INSIDEBOX_season_sum,AWAY_TEAM_SHOTS_OFF_TARGET_season_sum,AWAY_TEAM_SHOTS_ON_TARGET_season_sum,AWAY_TEAM_SHOTS_OUTSIDEBOX_season_sum,AWAY_TEAM_PASSES_season_sum,AWAY_TEAM_SUCCESSFUL_PASSES_season_sum,AWAY_TEAM_SAVES_season_sum,AWAY_TEAM_CORNERS_season_sum,...,HOME_PLAYER_POSITION_attacker_39,HOME_PLAYER_POSITION_attacker_40,HOME_PLAYER_POSITION_attacker_41,HOME_PLAYER_POSITION_attacker_42,HOME_PLAYER_POSITION_attacker_43,HOME_PLAYER_POSITION_attacker_44,HOME_PLAYER_POSITION_attacker_45,HOME_PLAYER_POSITION_attacker_46,HOME_PLAYER_POSITION_attacker_47,HOME_PLAYER_POSITION_attacker_48
0,0,0.4,0.1,0.4,0.1,0.5,0.4,0.3,0.6,0.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.4,0.3,0.4,0.3,0.8,0.8,0.7,0.8,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.4,0.3,0.4,0.2,0.6,0.4,0.4,0.1,0.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.1,0.3,0.3,0.8,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.5,0.4,0.5,0.6,0.5,0.6,0.6,0.4,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12298,12298,0.6,0.6,0.5,0.7,0.7,0.4,0.3,0.4,0.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12299,12299,0.3,0.2,0.6,0.1,0.3,0.4,0.4,0.7,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12300,12300,0.7,0.7,0.6,0.7,0.3,0.3,0.2,0.4,0.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12301,12301,0.1,0.0,0.6,0.7,0.0,0.0,0.0,0.0,0.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
labels

Unnamed: 0,ID,HOME_WINS,DRAW,AWAY_WINS
0,0,0,0,1
1,1,0,1,0
2,2,0,0,1
3,3,1,0,0
4,4,0,1,0
...,...,...,...,...
12298,12298,0,0,1
12299,12299,0,0,1
12300,12300,0,0,1
12301,12301,1,0,0


Do the data processing - v3_2

No extra processing needed

In [13]:
X_train, labels = preprocess_data_v3_2(data_type='train')

In [18]:
X_train

array([[[[0.3 , 0.2 , 0.5 , ..., 0.  , 0.  , 0.  ],
         [0.  , 0.33, 0.02, ..., 0.  , 0.  , 0.  ],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
         ...,
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01]],

        [[0.4 , 0.1 , 0.4 , ..., 0.  , 0.  , 0.  ],
         [0.  , 0.27, 0.05, ..., 0.  , 0.  , 0.  ],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
         ...,
         [0.  , 0.01, 0.  , ..., 0.  , 0.  , 0.01],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01]]],


       [[[0.6 , 0.8 , 0.3 , ..., 0.  , 0.  , 0.  ],
         [0.  , 0.28, 0.06, ..., 0.  , 0.  , 0.  ],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
         ...,
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01]],

        [[0.4 

In [19]:
labels

Unnamed: 0,ID,HOME_WINS,DRAW,AWAY_WINS
0,0,0,0,1
1,1,0,1,0
2,2,0,0,1
3,3,1,0,0
4,4,0,1,0
...,...,...,...,...
12298,12298,0,0,1
12299,12299,0,0,1
12300,12300,0,0,1
12301,12301,1,0,0


In [None]:
#save((X_train, labels), "./processed_train_v3_2.bin")
#(X_train, labels) = load("./processed_train_v3_2.bin")

In [12]:
X_test = preprocess_data_v3_2(data_type="test")

In [13]:
X_test[0]

array([[[[0.3 , 0.6 , 0.5 , ..., 0.  , 0.  , 0.  ],
         [0.  , 0.22, 0.01, ..., 0.  , 0.  , 0.  ],
         [0.  , 0.04, 0.01, ..., 0.  , 0.  , 0.  ],
         ...,
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01]],

        [[0.6 , 0.6 , 0.5 , ..., 0.  , 0.  , 0.  ],
         [0.  , 0.2 , 0.05, ..., 0.  , 0.  , 0.  ],
         [0.  , 0.03, 0.  , ..., 0.  , 0.  , 0.  ],
         ...,
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01]]],


       [[[0.3 , 0.2 , 0.4 , ..., 0.  , 0.  , 0.  ],
         [0.  , 0.28, 0.06, ..., 0.  , 0.  , 0.  ],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
         ...,
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01],
         [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.01]],

        [[0.4 

In [None]:
#save((X_test,), "./processed_test_v3_2.bin")
#(X_test,) = load("./processed_test_v3_2.bin")

In [None]:
#save((X_test,), "./processed_test_v3_2.gz.bin", kompress=gzip, protocol = 5)
#(X_test,) = load("./processed_test_v3_2.gz.bin", kompress=gzip)

Train X y process v1

In [29]:
y_train = labels["HOME_WINS"] - labels["AWAY_WINS"] + 1
y_train = pd.DataFrame(y_train, index=y_train.index, columns=["y"])
y_train["ID"] = y_train.index

In [30]:
merged = pd.merge(X_train, y_train, how="inner", on = "ID")

In [31]:
X_train = merged.drop(["ID", "y"], axis=1)
y_train = merged[["y"]]

In [32]:
X_train

Unnamed: 0,AWAY_TEAM_SHOTS_TOTAL_season_sum,AWAY_TEAM_SHOTS_INSIDEBOX_season_sum,AWAY_TEAM_SHOTS_OFF_TARGET_season_sum,AWAY_TEAM_SHOTS_ON_TARGET_season_sum,AWAY_TEAM_SHOTS_OUTSIDEBOX_season_sum,AWAY_TEAM_PASSES_season_sum,AWAY_TEAM_SUCCESSFUL_PASSES_season_sum,AWAY_TEAM_SAVES_season_sum,AWAY_TEAM_CORNERS_season_sum,AWAY_TEAM_FOULS_season_sum,...,HOME_PLAYER_POSITION_attacker_39,HOME_PLAYER_POSITION_attacker_40,HOME_PLAYER_POSITION_attacker_41,HOME_PLAYER_POSITION_attacker_42,HOME_PLAYER_POSITION_attacker_43,HOME_PLAYER_POSITION_attacker_44,HOME_PLAYER_POSITION_attacker_45,HOME_PLAYER_POSITION_attacker_46,HOME_PLAYER_POSITION_attacker_47,HOME_PLAYER_POSITION_attacker_48
0,0.4,0.1,0.4,0.1,0.5,0.4,0.3,0.6,0.4,0.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.4,0.3,0.4,0.3,0.8,0.8,0.7,0.8,0.5,0.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.4,0.3,0.4,0.2,0.6,0.4,0.4,0.1,0.6,0.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.1,0.3,0.3,0.8,0.1,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.5,0.4,0.5,0.6,0.5,0.6,0.6,0.4,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12298,0.6,0.6,0.5,0.7,0.7,0.4,0.3,0.4,0.8,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12299,0.3,0.2,0.6,0.1,0.3,0.4,0.4,0.7,0.0,0.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12300,0.7,0.7,0.6,0.7,0.3,0.3,0.2,0.4,0.4,0.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12301,0.1,0.0,0.6,0.7,0.0,0.0,0.0,0.0,0.6,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#save((X_train, y_train), "./processed_train_ready_v2.bin")
#(X_train, y_train) = load("./processed_train_ready_v2.bin")

Train X, y process and Augmentation - v3_2

In [12]:
# Definition: 0 DRAW, 1 HOME WINS, 2 AWAY WINS
y_train = labels["AWAY_WINS"] - labels["DRAW"] + 1
y_train = pd.DataFrame(y_train, index=y_train.index, columns=["y"])
y_train = y_train.to_numpy()
y_train

array([[2],
       [0],
       [2],
       ...,
       [2],
       [1],
       [1]], dtype=int64)

In [None]:
# Augmentation
X_train_aug, y_train_aug = augment_match_dataset_v3_2(X_train, y_train.flatten())
y_train_aug = y_train_aug.reshape([-1, 1])


In [None]:
# Non-aug Normal Save

#save((X_train, y_train), "./processed_train_nonaugmented_v3_2.bin")
#(X_train, y_train) = load("./processed_train_nonaugmented_v3_2.bin")

In [None]:
# Non-augGZIP Save

#save((X_train, y_train), "./processed_train_nonaugmented_v3_2.gz.bin", kompress = gzip, protocol = 5)
#(X_train, y_train) = load("./processed_train_nonaugmented_v3_2.gz.bin", kompress = gzip)

In [None]:
# Normal Save

#save((X_train_aug, y_train_aug), "./processed_train_augmented_v3_2.bin")
#(X_train_aug, y_train_aug) = load("./processed_train_augmented_v3_2.bin")

In [None]:
# GZIP Save

#save((X_train_aug, y_train_aug), "./processed_train_augmented_v3_2.gz.bin", kompress = gzip, protocol = 5)
#(X_train_aug, y_train_aug) = load("./processed_train_augmented_v3_2.gz.bin", kompress = gzip)

Test X process v1

In [10]:
X_test = X_test.drop(["ID"], axis = 1)

In [11]:
X_test

Unnamed: 0,AWAY_TEAM_SHOTS_TOTAL_season_sum,AWAY_TEAM_SHOTS_INSIDEBOX_season_sum,AWAY_TEAM_SHOTS_OFF_TARGET_season_sum,AWAY_TEAM_SHOTS_ON_TARGET_season_sum,AWAY_TEAM_SHOTS_OUTSIDEBOX_season_sum,AWAY_TEAM_PASSES_season_sum,AWAY_TEAM_SUCCESSFUL_PASSES_season_sum,AWAY_TEAM_SAVES_season_sum,AWAY_TEAM_CORNERS_season_sum,AWAY_TEAM_FOULS_season_sum,...,HOME_PLAYER_YELLOWCARDS_5_last_match_std_30,HOME_PLAYER_PUNCHES_5_last_match_std_30,HOME_PLAYER_LONG_BALLS_5_last_match_std_30,HOME_PLAYER_LONG_BALLS_WON_5_last_match_std_30,HOME_PLAYER_SHOTS_OFF_TARGET_5_last_match_std_30,HOME_POSITION_attacker_30,HOME_POSITION_defender_30,HOME_POSITION_goalkeeper_30,HOME_POSITION_midfielder_30,HOME_POSITION_nan_30
12303,0.6,0.6,0.5,0.7,0.7,0.4,0.3,0.6,0.3,0.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12304,0.4,0.4,0.4,0.4,0.3,0.8,0.8,0.4,0.7,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12305,0.1,0.1,0.2,0.2,0.2,0.6,0.5,0.4,0.2,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12306,0.6,0.9,0.6,0.9,0.4,0.4,0.4,0.1,1.0,0.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12307,0.5,0.5,0.2,0.4,0.4,0.1,0.2,0.3,0.8,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37666,0.8,0.8,0.6,1.0,0.7,0.8,0.8,1.0,0.3,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37667,0.7,0.4,0.4,0.9,0.8,0.7,0.6,0.3,0.9,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37668,0.3,0.4,0.2,0.3,0.1,0.1,0.1,0.3,0.4,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37669,0.5,0.5,0.5,0.3,0.4,0.3,0.3,0.3,0.5,0.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


`Split the data into train-test set`

In [None]:
from sklearn import model_selection

X_spl_train, X_spl_test, y_spl_train, y_spl_test = model_selection.train_test_split(X_train, y_train, train_size=0.8, random_state=2821)

In [None]:
#save((X_spl_train, X_spl_test, y_spl_train, y_spl_test), "./splitready_v1(2821)_train.bin")
#(X_spl_train, X_spl_test, y_spl_train, y_spl_test) = load("./splitready_v1(2821)_train.bin")

In [None]:
#save((X_spl_train, X_spl_test, y_spl_train, y_spl_test), "./splitready_v2(2821)_train.bin")
#(X_spl_train, X_spl_test, y_spl_train, y_spl_test) = load("./splitready_v2(2821)_train.bin")

In [None]:
#save((X_spl_train, X_spl_test, y_spl_train, y_spl_test), "./splitready_v3(2821)_train.bin")
#(X_spl_train, X_spl_test, y_spl_train, y_spl_test) = load("./splitready_v3(2821)_train.bin")

In [15]:
from sklearn import model_selection

X_spl_train, X_spl_test, y_spl_train, y_spl_test = model_selection.train_test_split(X_train_aug, y_train_aug, train_size=0.9, random_state=2821)

In [None]:
#save((X_spl_train, X_spl_test, y_spl_train, y_spl_test), "./splitready_v3_2_Aug(2821)_train.bin")
#(X_spl_train, X_spl_test, y_spl_train, y_spl_test) = load("./splitready_v3_2_Aug(2821)_train.bin")

In [None]:
#save((X_spl_train, X_spl_test, y_spl_train, y_spl_test), "./splitready_v3_2_Aug(2821)_train.gz.bin", kompress = gzip, protocol = 5)
#(X_spl_train, X_spl_test, y_spl_train, y_spl_test) = load("./splitready_v3_2_Aug(2821)_train.gz.bin", kompress = gzip)