In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sn
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
from scipy.stats import skew, kurtosis
from scipy.signal import welch
from scipy.signal import hilbert

import json 
import pandas as pd
import numpy as np
import os
import glob

from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed


## Functions

In [4]:
def analyze_dataset(file_path):
    """Load a CSV file and print a quick exploratory summary of it.

    Prints, in order: the frame's structural info, the
    ``describe(include='all')`` statistics, the per-column missing-value
    counts and the correlation matrix of the numeric columns. When the file
    has at least one numeric column, their histograms are shown in one figure.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    None
        Everything is reported through stdout and the displayed figure.
    """
    # Load dataset
    df = pd.read_csv(file_path)

    print("\n=== Basic Info ===")
    # DataFrame.info() writes its own report and returns None, so wrapping it
    # in print() would append a stray "None" line to the output.
    df.info()

    print("\n=== Statistical Summary ===")
    print(df.describe(include='all'))

    # Check for missing data
    print("\n=== Missing Values ===")
    print(df.isnull().sum())

    # Correlation matrix
    print("\n=== Correlation Matrix ===")
    print(df.corr(numeric_only=True))

    # Plot numeric columns
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        # Apply the small font while the figure is being built, and only for
        # this figure: assigning plt.rcParams directly after plotting would
        # miss the current axes and leak the setting into every later plot.
        with plt.rc_context({'font.size': 3}):
            df[numeric_cols].hist(figsize=(12, 8))
            plt.suptitle("Feature Distributions")
            plt.show()


## Load and explore the battle data

In [None]:
# Read the whole battle log into memory as a single DataFrame.
# This usually takes ~3 minutes for the 9.8GB dataset
df = pd.read_csv(filepath_or_buffer='battle.csv')

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,battleTime,arena.id,gameMode.id,average.startingTrophies,winner.tag,winner.startingTrophies,winner.trophyChange,winner.crowns,winner.kingTowerHitPoints,...,loser.cards.list,loser.totalcard.level,loser.troop.count,loser.structure.count,loser.spell.count,loser.common.count,loser.rare.count,loser.epic.count,loser.legendary.count,loser.elixir.average
0,0,2020-12-07 07:00:00+00:00,54000049.0,72000201.0,6590.0,#28RR8PJP0,6581.0,31.0,2.0,4768.0,...,"[26000000, 26000026, 26000030, 26000041, 27000...",104,4,1,3,4,1,1,2,3.125
1,1,2020-12-07 07:00:00+00:00,54000049.0,72000201.0,5582.5,#YV9VQUVP,5592.0,28.0,3.0,2014.0,...,"[26000000, 26000003, 26000007, 26000011, 26000...",104,6,0,2,2,3,3,0,4.125
2,2,2020-12-07 07:00:02+00:00,54000049.0,72000201.0,5684.0,#LPR2G0Q9L,5678.0,31.0,3.0,5304.0,...,"[26000011, 26000026, 26000030, 26000041, 27000...",103,4,1,3,3,2,2,1,2.875
3,3,2020-12-07 07:00:03+00:00,54000049.0,72000201.0,6031.0,#2GL899VCJ,6035.0,29.0,2.0,3368.0,...,"[26000032, 26000040, 26000041, 26000049, 26000...",104,6,1,1,3,2,1,2,3.375
4,4,2020-12-07 07:00:06+00:00,54000049.0,72000201.0,5140.0,#9Y2YJPGG2,5140.0,30.0,3.0,1507.0,...,"[26000012, 26000024, 26000045, 26000056, 26000...",93,5,1,2,3,1,4,0,3.875


In [None]:
# Iterating a DataFrame yields its column labels; print one per line.
for col in df:
    print(col)

Unnamed: 0
battleTime
arena.id
gameMode.id
average.startingTrophies
winner.tag
winner.startingTrophies
winner.trophyChange
winner.crowns
winner.kingTowerHitPoints
winner.princessTowersHitPoints
winner.clan.tag
winner.clan.badgeId
loser.tag
loser.startingTrophies
loser.trophyChange
loser.crowns
loser.kingTowerHitPoints
loser.clan.tag
loser.clan.badgeId
loser.princessTowersHitPoints
tournamentTag
winner.card1.id
winner.card1.level
winner.card2.id
winner.card2.level
winner.card3.id
winner.card3.level
winner.card4.id
winner.card4.level
winner.card5.id
winner.card5.level
winner.card6.id
winner.card6.level
winner.card7.id
winner.card7.level
winner.card8.id
winner.card8.level
winner.cards.list
winner.totalcard.level
winner.troop.count
winner.structure.count
winner.spell.count
winner.common.count
winner.rare.count
winner.epic.count
winner.legendary.count
winner.elixir.average
loser.card1.id
loser.card1.level
loser.card2.id
loser.card2.level
loser.card3.id
loser.card3.level
loser.card4.id
loser

In [None]:
# Drop the columns that are not important 
cols_to_drop = ['Unnamed: 0', 'battleTime', 'arena.id', 'gameMode.id', 'winner.tag', 'winner.clan.tag', 'winner.clan.badgeId', 'loser.tag', 'loser.clan.tag', 'loser.clan.badgeId', 'tournamentTag']
try: 
    df = df.drop(columns=cols_to_drop, axis=1)
except Exception as e: 
    print(f"Columns most likely already dropped:\n{e}")
df.head()

Unnamed: 0,average.startingTrophies,winner.startingTrophies,winner.trophyChange,winner.crowns,winner.kingTowerHitPoints,winner.princessTowersHitPoints,loser.startingTrophies,loser.trophyChange,loser.crowns,loser.kingTowerHitPoints,...,loser.cards.list,loser.totalcard.level,loser.troop.count,loser.structure.count,loser.spell.count,loser.common.count,loser.rare.count,loser.epic.count,loser.legendary.count,loser.elixir.average
0,6590.0,6581.0,31.0,2.0,4768.0,[1218],6599.0,-31.0,1.0,147.0,...,"[26000000, 26000026, 26000030, 26000041, 27000...",104,4,1,3,4,1,1,2,3.125
1,5582.5,5592.0,28.0,3.0,2014.0,[2349],5573.0,-28.0,1.0,,...,"[26000000, 26000003, 26000007, 26000011, 26000...",104,6,0,2,2,3,3,0,4.125
2,5684.0,5678.0,31.0,3.0,5304.0,"[3346, 3346]",5690.0,-31.0,0.0,,...,"[26000011, 26000026, 26000030, 26000041, 27000...",103,4,1,3,3,2,2,1,2.875
3,6031.0,6035.0,29.0,2.0,3368.0,[2368],6027.0,-29.0,1.0,5832.0,...,"[26000032, 26000040, 26000041, 26000049, 26000...",104,6,1,1,3,2,1,2,3.375
4,5140.0,5140.0,30.0,3.0,1507.0,[2236],5140.0,-30.0,1.0,,...,"[26000012, 26000024, 26000045, 26000056, 26000...",93,5,1,2,3,1,4,0,3.875


In [None]:
# Print every column whose name mentions the winning player.
for col in filter(lambda name: "winner" in name, df.columns):
    print(col)

winner.startingTrophies
winner.trophyChange
winner.crowns
winner.kingTowerHitPoints
winner.princessTowersHitPoints
winner.card1.id
winner.card1.level
winner.card2.id
winner.card2.level
winner.card3.id
winner.card3.level
winner.card4.id
winner.card4.level
winner.card5.id
winner.card5.level
winner.card6.id
winner.card6.level
winner.card7.id
winner.card7.level
winner.card8.id
winner.card8.level
winner.cards.list
winner.totalcard.level
winner.troop.count
winner.structure.count
winner.spell.count
winner.common.count
winner.rare.count
winner.epic.count
winner.legendary.count
winner.elixir.average


In [None]:
# Print every column whose name mentions the losing player.
for col in filter(lambda name: "loser" in name, df.columns):
    print(col)

loser.startingTrophies
loser.trophyChange
loser.crowns
loser.kingTowerHitPoints
loser.princessTowersHitPoints
loser.card1.id
loser.card1.level
loser.card2.id
loser.card2.level
loser.card3.id
loser.card3.level
loser.card4.id
loser.card4.level
loser.card5.id
loser.card5.level
loser.card6.id
loser.card6.level
loser.card7.id
loser.card7.level
loser.card8.id
loser.card8.level
loser.cards.list
loser.totalcard.level
loser.troop.count
loser.structure.count
loser.spell.count
loser.common.count
loser.rare.count
loser.epic.count
loser.legendary.count
loser.elixir.average


In [None]:
# Print the remaining columns: those tied to neither the winner nor the loser.
for col in filter(lambda name: not ("loser" in name or "winner" in name), df.columns):
    print(col)

average.startingTrophies


In [None]:
# Max / min / mean of the three starting-trophy columns, one group per column,
# with a divider line printed between consecutive groups.
trophy_columns = [
    ("average starting", "average.startingTrophies"),
    ("winner", "winner.startingTrophies"),
    ("loser", "loser.startingTrophies"),
]
divider = "--------------------------------------------------------------------"

for position, (label, column) in enumerate(trophy_columns):
    if position > 0:
        print(divider)
    trophies = df[column]
    print(f"Max {label} trophies:", trophies.max())
    print(f"Min {label} trophies:", trophies.min())
    print(f"Average {label} trophies:", trophies.mean())

Max average starting trophies: 8220.0
Min average starting trophies: 13.5
Average average starting trophies: 4596.092209262954
--------------------------------------------------------------------
Max winner trophies: 8212.0
Min winner trophies: 0.0
Average winner trophies: 4596.400062598391
--------------------------------------------------------------------
Max loser trophies: 8233.0
Min loser trophies: 24.0
Average loser trophies: 4595.784355927518


In [None]:
# Report the frame's dimensions with thousands separators.
row_count, column_count = df.shape
print(f"Shape: ({row_count:,}, {column_count:,})")

NameError: name 'df' is not defined

## Convert to dictionary

In [None]:
# Preview the troop counts of the first three battles.
# Only the first three rows are iterated: calling iterrows() on the whole
# frame builds a Series for every row of the multi-GB dataset just to print
# three of them. This assumes the default 0..n-1 index produced by read_csv,
# so that the first three rows are exactly the rows with index < 3.
for index, row in df.head(3).iterrows():
    print(f"Index: {index}, Col1: {row['winner.troop.count']}, Col2: {row['loser.troop.count']}")
    
    
match = {}