In [191]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sn
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
from scipy.stats import skew, kurtosis
from scipy.signal import welch
from scipy.signal import hilbert

import json 
import pandas as pd
import numpy as np
import os
import glob

from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed

import random

## Functions

In [None]:
def analyze_dataset(file_path):
    # Load dataset
    df = pd.read_csv(file_path)
    print("\n=== Basic Info ===")
    print(df.info())

    print("\n=== Statistical Summary ===")
    print(df.describe(include='all'))

    # Check for missing data
    print("\n=== Missing Values ===")
    print(df.isnull().sum())

    # Correlation matrix
    print("\n=== Correlation Matrix ===")
    print(df.corr(numeric_only=True))

    # Plot numeric columns
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        df[numeric_cols].hist(figsize=(12, 8))
        plt.rcParams['font.size'] = 3
        plt.suptitle("Feature Distributions")
        plt.show()


## Code: 

In [None]:
df = pd.read_csv('battle.csv', nrows=10_000)
# This usually takes ~3 minutes for the 9.8GB dataset

In [None]:
df.head()

In [None]:
for col in df.columns: 
    print(col)

In [None]:
# Drop the columns that are not important 
cols_to_drop = ['winner.cards.list', 'loser.cards.list', 'average.startingTrophies', 'Unnamed: 0', 'battleTime', 'arena.id', 'gameMode.id', 'winner.tag', 'winner.clan.tag', 'winner.clan.badgeId', 'loser.tag', 'loser.clan.tag', 'loser.clan.badgeId', 'tournamentTag']
try: 
    df = df.drop(columns=cols_to_drop, axis=1)
except Exception as e: 
    print(f"Columns most likely already dropped:\n{e}")
df.head()

In [183]:
print(f"Shape: ({df.shape[0]:,}, {df.shape[1]:,})")

Shape: (10,000, 63)


## Convert to dictionary

In [253]:
def row_to_match(row):
    match = {} 
    
    # randomize the winner and loser 
    player_num = random.randint(0, 1)

    for side in ["winner", "loser"]:
        side_data = {}

        # extract all columns for this side (e.g., winner.*)
        side_cols = {k: v for k, v in row.items() if k.startswith(side + ".")}

        # cards go in a dict
        for i in range(1, 9):  # for 8 cards
            card_dict = {}
            # print(f"looking for: {side}.card{i}.id: \t{side_cols.get(f"{side}.card{i}.id")}")
            card_dict["id"] = side_cols.get(f"{side}.card{i}.id")
            card_dict["level"] = side_cols.get(f"{side}.card{i}.level")
            side_data["card" + str(i)] = card_dict

        # add all non-card columns
        for k, v in side_cols.items():
            if not k.startswith(f"{side}.card"):
                side_data[k.split(".", 1)[1]] = v # basically remove the winner. or the loser. 
        
        # mark the winner
        match['winner'] = player_num

        player_num = 1 if not player_num else 0 # switch the player number for the loser 
        match['player' +  str(player_num)] = side_data

        
    return dict(sorted(match.items())) # sorting this essentially hides winning order

In [278]:
records = df.to_dict(orient='records')
matches_list = [row_to_match(row) for row in records]

for match in matches_list[:2]: 
    print(match)

{'player0': {'card1': {'id': 26000036, 'level': 13}, 'card2': {'id': 28000015, 'level': 13}, 'card3': {'id': 26000050, 'level': 13}, 'card4': {'id': 26000044, 'level': 13}, 'card5': {'id': 26000054, 'level': 13}, 'card6': {'id': 28000016, 'level': 13}, 'card7': {'id': 26000043, 'level': 13}, 'card8': {'id': 26000062, 'level': 13}, 'startingTrophies': 6581.0, 'trophyChange': 31.0, 'crowns': 2.0, 'kingTowerHitPoints': 4768.0, 'princessTowersHitPoints': '[1218]', 'totalcard.level': 104, 'troop.count': 7, 'structure.count': 0, 'spell.count': 1, 'common.count': 1, 'rare.count': 2, 'epic.count': 3, 'legendary.count': 2, 'elixir.average': 3.625}, 'player1': {'card1': {'id': 28000004, 'level': 13}, 'card2': {'id': 26000041, 'level': 13}, 'card3': {'id': 26000026, 'level': 13}, 'card4': {'id': 26000030, 'level': 13}, 'card5': {'id': 26000000, 'level': 13}, 'card6': {'id': 28000011, 'level': 13}, 'card7': {'id': 28000003, 'level': 13}, 'card8': {'id': 27000006, 'level': 13}, 'startingTrophies': 

## Model

IDK WHAT THE BELOW CODE DOES??? CHATGPT!!

In [279]:
from typing import Dict, Any, List
import math
import ast
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

def _parse_possibly_stringified(value):
    # Handle things like '[1218]' or 'nan'
    if isinstance(value, str):
        v = value.strip()
        if v.lower() == 'nan':
            return np.nan
        if (v.startswith('[') and v.endswith(']')):
            try:
                lst = ast.literal_eval(v)
                # If it’s a 1-element list, return the element; else return length
                if isinstance(lst, list):
                    return lst[0] if len(lst) == 1 else len(lst)
            except Exception:
                pass
        # Not parseable → leave as is (will be ignored unless numeric)
        return value
    return value

def extract_features(match: Dict[str, Any]) -> Dict[str, float]:
    """
    Turn your nested match dict into a flat feature dict.
    """
    feat = {}

    def player_feats(pkey: str):
        p = match[pkey]

        # 1) Deck features: one-hot card ids; numeric level signals (optional)
        card_ids = []
        for i in range(1, 9):
            c = p.get(f'card{i}', None)
            if not c: 
                continue
            cid = c.get('id')
            lvl = c.get('level')
            if cid is not None:
                card_ids.append(int(cid))
                feat[f'{pkey}_card_id_{cid}'] = 1.0
                # Optional: level features keyed by card id (keeps numeric info)
                if lvl is not None:
                    feat[f'{pkey}_card_{cid}_level'] = float(lvl)

        # 2) Raw numeric stats
        numeric_keys = [
            'startingTrophies','trophyChange','crowns','kingTowerHitPoints',
            'princessTowersHitPoints','totalcard.level','troop.count','structure.count',
            'spell.count','common.count','rare.count','epic.count','legendary.count',
            'elixir.average'
        ]
        for k in numeric_keys:
            v = _parse_possibly_stringified(p.get(k, np.nan))
            # keep only numeric-ish
            if isinstance(v, (int, float)) and not (isinstance(v, float) and math.isnan(v)):
                feat[f'{pkey}_{k}'] = float(v)

        return set(card_ids)

    p0_cards = player_feats('player0')
    p1_cards = player_feats('player1')

    # 3) Matchup features
    overlap = len(p0_cards & p1_cards)
    feat['deck_overlap_count'] = float(overlap)

    # Differences (player1 - player0)
    for base in [
        'startingTrophies','elixir.average','totalcard.level','troop.count',
        'structure.count','spell.count','common.count','rare.count','epic.count','legendary.count',
        'kingTowerHitPoints','crowns'
    ]:
        v1 = feat.get(f'player1_{base}', np.nan)
        v0 = feat.get(f'player0_{base}', np.nan)
        if not (isinstance(v1, float) and isinstance(v0, float)):
            continue
        feat[f'diff_{base}'] = v1 - v0

    return feat

# ----- Example usage with a list of matches -----

# # Your single example (put many of these in a list for training):
# matches_list: List[Dict[str, Any]] = [
#     {
#         'player0': {
#             'card1': {'id': 28000004, 'level': 13},
#             'card2': {'id': 26000041, 'level': 13},
#             'card3': {'id': 26000026, 'level': 13},
#             'card4': {'id': 26000030, 'level': 13},
#             'card5': {'id': 26000000, 'level': 13},
#             'card6': {'id': 28000011, 'level': 13},
#             'card7': {'id': 28000003, 'level': 13},
#             'card8': {'id': 27000006, 'level': 13},
#             'startingTrophies': 6599.0,
#             'trophyChange': -31.0,
#             'crowns': 1.0,
#             'kingTowerHitPoints': 147.0,
#             'princessTowersHitPoints': float('nan'),
#             'totalcard.level': 104,
#             'troop.count': 4,
#             'structure.count': 1,
#             'spell.count': 3,
#             'common.count': 4,
#             'rare.count': 1,
#             'epic.count': 1,
#             'legendary.count': 2,
#             'elixir.average': 3.125
#         },
#         'player1': {
#             'card1': {'id': 26000036, 'level': 13},
#             'card2': {'id': 28000015, 'level': 13},
#             'card3': {'id': 26000050, 'level': 13},
#             'card4': {'id': 26000044, 'level': 13},
#             'card5': {'id': 26000054, 'level': 13},
#             'card6': {'id': 28000016, 'level': 13},
#             'card7': {'id': 26000043, 'level': 13},
#             'card8': {'id': 26000062, 'level': 13},
#             'startingTrophies': 6581.0,
#             'trophyChange': 31.0,
#             'crowns': 2.0,
#             'kingTowerHitPoints': 4768.0,
#             'princessTowersHitPoints': '[1218]',
#             'totalcard.level': 104,
#             'troop.count': 7,
#             'structure.count': 0,
#             'spell.count': 1,
#             'common.count': 1,
#             'rare.count': 2,
#             'epic.count': 3,
#             'legendary.count': 2,
#             'elixir.average': 3.625
#         },
#         'winner': 1
#     }
# ]

# Build X (feature dicts) and y (labels)
X_dicts = [extract_features(m) for m in matches_list]
y = np.array([1 if m['winner'] == 1 else 0 for m in matches_list], dtype=int)

# Pipeline: DictVectorizer -> Imputer -> HistGradientBoosting (tree boosting)
pipe = Pipeline([
    ('vect', DictVectorizer(sparse=False)),      # dense for HistGB; if too big, swap model to one that supports sparse
    ('imp', SimpleImputer(strategy='median')),
    ('clf', HistGradientBoostingClassifier(random_state=42))
])

# If you have many matches, split; with one example this is just for reference:
# X_train, X_test, y_train, y_test = train_test_split(X_dicts, y, test_size=0.2, random_state=42, stratify=y)

pipe.fit(X_dicts, y)

# Example prediction on the same data
pred_proba = pipe.predict_proba(X_dicts)[:, 1]
pred = (pred_proba >= 0.5).astype(int)

print("Predicted prob player1 wins:", pred_proba)
print("Predicted class:", pred)


Predicted prob player1 wins: [2.21297362e-05 2.21297362e-05 9.99978327e-01 ... 2.21297362e-05
 2.21297362e-05 9.99978327e-01]
Predicted class: [0 0 1 ... 0 0 1]
