In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sn
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
from scipy.stats import skew, kurtosis
from scipy.signal import welch
from scipy.signal import hilbert

import json 
import pandas as pd
import numpy as np
import os
import glob

from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed


## Functions

In [None]:
def analyze_dataset(file_path):
    """Print an exploratory summary of a CSV file and plot its numeric columns.

    Reads ``file_path`` with ``pd.read_csv`` and prints, in order: the
    ``DataFrame.info()`` report, ``describe(include='all')``, the per-column
    missing-value counts, and the correlation matrix of the numeric columns.
    If the frame has at least one numeric column, a grid of histograms is
    shown as well.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    None
        The function only prints and plots.
    """
    df = pd.read_csv(file_path)

    print("\n=== Basic Info ===")
    # info() writes its report to stdout itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()

    print("\n=== Statistical Summary ===")
    print(df.describe(include='all'))

    print("\n=== Missing Values ===")
    print(df.isnull().sum())

    print("\n=== Correlation Matrix ===")
    print(df.corr(numeric_only=True))

    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        # Scope the small font to this figure only. Assigning to plt.rcParams
        # directly would leak into every later plot in the session, and doing
        # it after hist() would leave the already-created histogram labels at
        # the previous size.
        with plt.rc_context({'font.size': 3}):
            df[numeric_cols].hist(figsize=(12, 8))
            plt.suptitle("Feature Distributions")
            plt.show()


## Load and clean the data

In [None]:
# Load only the first N_ROWS rows so the notebook stays quick to iterate on.
# Reading the full 9.8GB dataset usually takes ~3 minutes; set N_ROWS = None
# to load every row.
DATA_PATH = 'battle.csv'
N_ROWS = 10_000
df = pd.read_csv(DATA_PATH, nrows=N_ROWS)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# List every column name, one per line (iterating a DataFrame yields its
# column labels).
for col in df:
    print(col)

In [None]:
# Drop the columns that are not needed for the analysis.
cols_to_drop = [
    'winner.cards.list', 'loser.cards.list', 'average.startingTrophies',
    'Unnamed: 0', 'battleTime', 'arena.id', 'gameMode.id',
    'winner.tag', 'winner.clan.tag', 'winner.clan.badgeId',
    'loser.tag', 'loser.clan.tag', 'loser.clan.badgeId', 'tournamentTag',
]

# Name the labels that are absent (e.g. because this cell was already run)
# instead of catching every exception, which would also hide unrelated errors.
already_missing = [label for label in cols_to_drop if label not in df.columns]
if already_missing:
    print(f"Columns most likely already dropped:\n{already_missing}")

# errors='ignore' skips absent labels, so the columns that are still present
# get dropped even when some are missing, and re-running the cell is safe.
df = df.drop(columns=cols_to_drop, errors='ignore')
df.head()

In [183]:
# Report the frame's dimensions (rows, columns) with thousands separators.
print(f"Shape: ({df.shape[0]:,}, {df.shape[1]:,})")

Shape: (10,000, 63)


## Convert rows to match dictionaries

In [None]:
def row_to_match(row):
    """Reshape one flat battle row into a nested two-player match dict.

    Parameters
    ----------
    row : mapping
        Maps column names (strings) to values and must provide ``.items()``.
        Columns prefixed with ``winner.`` or ``loser.`` are used; everything
        else is ignored.

    Returns
    -------
    dict
        ``{'player0': <winner data>, 'player1': <loser data>}``. Each player
        dict holds ``card1`` .. ``card8`` entries of the form
        ``{'id': ..., 'level': ...}`` (``None`` where the column is absent),
        followed by that side's remaining columns with the side prefix
        stripped.
    """

    def build_player(side):
        # Keep only this side's columns, e.g. everything under "winner.".
        own_prefix = side + "."
        own_columns = {
            name: value for name, value in row.items() if name.startswith(own_prefix)
        }

        # The eight deck slots come first, each as an id/level pair.
        player = {
            "card" + str(slot): {
                "id": own_columns.get(f"{side}.card{slot}.id"),
                "level": own_columns.get(f"{side}.card{slot}.level"),
            }
            for slot in range(1, 9)
        }

        # Then every non-card column, keyed without the "winner."/"loser." prefix.
        card_prefix = f"{side}.card"
        player.update(
            (name.split(".", 1)[1], value)
            for name, value in own_columns.items()
            if not name.startswith(card_prefix)
        )
        return player

    # The winner is always player0 and the loser player1.
    return {"player0": build_player("winner"), "player1": build_player("loser")}


In [188]:
# Turn each DataFrame row into a plain dict, then into a nested match dict.
records = df.to_dict('records')
matches = list(map(row_to_match, records))

# Print the first 100 converted matches to check the structure by eye.
for match in matches[:100]:
    print(match)

{'player0': {'card1': {'id': 26000036, 'level': 13}, 'card2': {'id': 28000015, 'level': 13}, 'card3': {'id': 26000050, 'level': 13}, 'card4': {'id': 26000044, 'level': 13}, 'card5': {'id': 26000054, 'level': 13}, 'card6': {'id': 28000016, 'level': 13}, 'card7': {'id': 26000043, 'level': 13}, 'card8': {'id': 26000062, 'level': 13}, 'startingTrophies': 6581.0, 'trophyChange': 31.0, 'crowns': 2.0, 'kingTowerHitPoints': 4768.0, 'princessTowersHitPoints': '[1218]', 'totalcard.level': 104, 'troop.count': 7, 'structure.count': 0, 'spell.count': 1, 'common.count': 1, 'rare.count': 2, 'epic.count': 3, 'legendary.count': 2, 'elixir.average': 3.625}, 'player1': {'card1': {'id': 28000004, 'level': 13}, 'card2': {'id': 26000041, 'level': 13}, 'card3': {'id': 26000026, 'level': 13}, 'card4': {'id': 26000030, 'level': 13}, 'card5': {'id': 26000000, 'level': 13}, 'card6': {'id': 28000011, 'level': 13}, 'card7': {'id': 28000003, 'level': 13}, 'card8': {'id': 27000006, 'level': 13}, 'startingTrophies': 