# MTG Archetype Discovery and Clustering

## Imports

In [57]:

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from datetime import datetime
from docx import Document
from docx.shared import Inches

import hdbscan
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


## Load and Normalize Deck Data

In [93]:

DATA_PATH = Path("M7_Decks/mtg/modern/mtg_modern.json")

# Define lands to skip during analysis to avoid clustering issues
basic_lands = ["Plains", "Island", "Swamp", "Mountain", "Forest", "Wastes"]
snow_lands = ["Snow-Covered Plains", "Snow-Covered Island", "Snow-Covered Swamp",
              "Snow-Covered Mountain", "Snow-Covered Forest", "Snow-Covered Wastes"]
fetch_lands = ["Flooded Strand", "Polluted Delta", "Bloodstained Mire", "Windswept Heath",
    "Wooded Foothills", "Scalding Tarn", "Marsh Flats", "Arid Mesa",
    "Verdant Catacombs", "Misty Rainforest", "Prismatic Vista"]
lands_to_skip = set([normalize_card_name(name) for name in basic_lands + snow_lands + fetch_lands])

with open(DATA_PATH, "r") as f:
    raw_decks = json.load(f)

def normalize_card_name(name):
    name = name.lower()
    if "//" in name:
        name = name.split("//")[0].strip()
    
    return (
        name.replace(" ", "_")
            .replace(",", "")
            .replace("'", "")
    )

def deck_to_text(deck, include_sideboard=False):
    cards = []
    for entry in deck["maindeck"]:
        token = normalize_card_name(entry["card"])
        if token not in lands_to_skip:
            cards.extend([token] * entry["quantity"])

    if include_sideboard:
        for entry in deck["sideboard"]:
            token = normalize_card_name(entry["card"])
            if token not in lands_to_skip:
                cards.extend([token] * entry["quantity"])

    return " ".join(cards)

rows = []
for deck in raw_decks:
    rows.append({
        "date": datetime.fromisoformat(deck["date"]),
        "deck_text": deck_to_text(deck),
        "record": deck.get("record"),
        "url": deck.get("url")
    })

df = pd.DataFrame(rows).sort_values("date").reset_index(drop=True)
df.head()


Unnamed: 0,date,deck_text,record,url
0,2025-09-21,ral_monsoon_mage ral_monsoon_mage ral_monsoon_...,4-1-2,https://melee.gg/Decklist/View/0e1bee94-da43-4...
1,2025-09-21,pinnacle_monk pinnacle_monk pinnacle_monk pinn...,3-2-1,https://melee.gg/Decklist/View/f05db1ce-f2ad-4...
2,2025-09-21,ral_monsoon_mage ral_monsoon_mage ral_monsoon_...,0-3-0,https://melee.gg/Decklist/View/ee7fd52a-f73c-4...
3,2025-09-21,overlord_of_the_balemurk overlord_of_the_balem...,1-1-2,https://melee.gg/Decklist/View/ed501546-9289-4...
4,2025-09-21,pinnacle_monk pinnacle_monk pinnacle_monk pinn...,0-4-0,https://melee.gg/Decklist/View/bb00eb8a-dbed-4...


## Split into Train and Test Sets

In [97]:
# Split into Train and Test Sets ranther than Temporal Split
split_index = int(0.8 * len(df))
df_train = df.iloc[:split_index].reset_index(drop=True)
df_test = df.iloc[split_index:].reset_index(drop=True)
print(f"Training set size: {len(df_train)} decks")
print(f"Test set size: {len(df_test)} decks")

Training set size: 10126 decks
Test set size: 2532 decks


## Vectorization & Dimensionality Reduction

In [98]:

vectorizer = TfidfVectorizer(token_pattern=r"[^ ]+", min_df=2, max_df=0.9)
X_train_tfidf = vectorizer.fit_transform(df_train["deck_text"])
X_test_tfidf = vectorizer.transform(df_test["deck_text"])

svd = TruncatedSVD(n_components=100, random_state=42)
X_train_reduced = normalize(svd.fit_transform(X_train_tfidf))
X_test_reduced  = normalize(svd.transform(X_test_tfidf))


## Clustering

In [99]:

clusterer = hdbscan.HDBSCAN(min_cluster_size=15)
df_train["cluster"] = clusterer.fit_predict(X_train_reduced)

feature_names = vectorizer.get_feature_names_out()

def top_cards_for_cluster(cluster_label, top_n=10):
    idx = df_train["cluster"] == cluster_label
    cluster_tfidf = X_train_tfidf[idx].mean(axis=0)
    cluster_tfidf = np.asarray(cluster_tfidf).flatten()

    top_indices = cluster_tfidf.argsort()[::-1][:top_n]
    return [(feature_names[i], cluster_tfidf[i]) for i in top_indices]

for cluster_label in sorted(df_train["cluster"].unique()):
    if cluster_label == -1:
        continue  # Skip noise
    print(f"Cluster {cluster_label}:")
    for card, score in top_cards_for_cluster(cluster_label):
        print(f"  {card}: {score:.4f}")
    print()

Cluster 0:
  irencrag_feat: 0.3174
  pinnacle_monk: 0.3065
  stormscale_scion: 0.3011
  shatterskull_smashing: 0.2888
  bitter_reunion: 0.2753
  sundering_eruption: 0.2751
  strike_it_rich: 0.2383
  pyretic_ritual: 0.2137
  desperate_ritual: 0.2133
  manamorphose: 0.2054

Cluster 1:
  ziatoras_proving_ground: 0.0000
  goblin_matron: 0.0000
  glasspool_mimic: 0.0000
  glasswing_grace: 0.0000
  glimmervoid: 0.0000
  glimpse_of_tomorrow: 0.0000
  glimpse_the_impossible: 0.0000
  glimpse_the_unthinkable: 0.0000
  glistener_elf: 0.0000
  gloomlake_verge: 0.0000

Cluster 2:
  disrupting_shoal: 0.2935
  lotus_bloom: 0.2933
  sea_gate_restoration: 0.2925
  suppression_ray: 0.2920
  tameshi_reality_architect: 0.2901
  whir_of_invention: 0.2900
  goblin_charbelcher: 0.2888
  hydroelectric_specimen: 0.2761
  jwari_disruption: 0.2597
  sink_into_stupor: 0.1785

Cluster 3:
  lava_spike: 0.2957
  goblin_guide: 0.2955
  skewer_the_critics: 0.2762
  boltwave: 0.2646
  searing_blaze: 0.2597
  boros_cha

## Merge similar clusters

In [105]:
# Parameters
similarity_threshold = 0.5

unique_clusters = sorted([c for c in df_train['cluster'].unique() if c != -1])

cluster_centroids = {}
for cluster in unique_clusters:
    cluster_indices = df_train.index[df_train['cluster'] == cluster].tolist()
    cluster_centroids[cluster] = X_train_reduced[cluster_indices].mean(axis=0)
centroids_matrix = np.vstack([cluster_centroids[c] for c in unique_clusters])

similarity_matrix = cosine_similarity(centroids_matrix)

merged_cluster_map = {}
current_new_label = 0
visited = set()
for i, cluster_i in enumerate(unique_clusters):
    if cluster_i in visited:
        continue
    # Start a new merged cluster
    merged_cluster_map[cluster_i] = current_new_label
    visited.add(cluster_i)
    # Merge similar clusters
    for j, cluster_j in enumerate(unique_clusters):
        if cluster_j in visited:
            continue
        if similarity_matrix[i, j] >= similarity_threshold:
            merged_cluster_map[cluster_j] = current_new_label
            visited.add(cluster_j)
    current_new_label += 1

# Apply merged clusters to the dataframe
df_train['merged_cluster'] = df_train['cluster'].map(lambda x: -1 if x == -1 else merged_cluster_map[x])


print("Old number of clusters:", len(unique_clusters))
print("New number of merged clusters:", df_train['merged_cluster'].nunique() - (1 if -1 in df_train['merged_cluster'].unique() else 0))

# Display top cards in merged clusters
for cluster_label in sorted(df_train["merged_cluster"].unique()):
    if cluster_label == -1:
        continue
    cluster_decks = df_train[df_train["merged_cluster"] == cluster_label]
    top_cards = {}
    for text in cluster_decks["deck_text"]:
        for card in text.split():
            top_cards[card] = top_cards.get(card, 0) + 1
    top_cards_sorted = sorted(top_cards.items(), key=lambda x: x[1], reverse=True)[:10]
    print(f"Cluster {cluster_label}: {top_cards_sorted}")


Old number of clusters: 118
New number of merged clusters: 42
Cluster 0: [('irencrag_feat', 92), ('strike_it_rich', 92), ('desperate_ritual', 92), ('pyretic_ritual', 92), ('goblin_charbelcher', 92), ('stormscale_scion', 91), ('pinnacle_monk', 90), ('manamorphose', 89), ('shatterskull_smashing', 86), ('sundering_eruption', 84)]
Cluster 1: []
Cluster 2: [('hydroelectric_specimen', 948), ('sink_into_stupor', 948), ('goblin_charbelcher', 948), ('lotus_bloom', 948), ('disrupting_shoal', 947), ('whir_of_invention', 947), ('sea_gate_restoration', 946), ('tameshi_reality_architect', 945), ('suppression_ray', 944), ('fallaji_archaeologist', 860)]
Cluster 3: [('lightning_bolt', 132), ('lava_spike', 120), ('goblin_guide', 119), ('monastery_swiftspear', 116), ('skewer_the_critics', 112), ('boltwave', 106), ('searing_blaze', 105), ('boros_charm', 99), ('sunbaked_canyon', 95), ('rift_bolt', 86)]
Cluster 4: [('hedron_crab', 428), ('ruin_crab', 428), ('archive_trap', 428), ('fractured_sanity', 425), (

## Map clusters to archetypes

In [106]:
cluster_mapping = {
    0: "R Belcher",
    1: "Other",
    2: "U Belcher",
    3: "RW Burn",
    4: "UB Mill",
    5: "R Storm",
    6: "UG Neoform",
    7: "Dredge",
    8: "Hammertime",
    9: "Merfolk",
    10: "Infect",
    11: "Samwise Combo",
    12: "UR Breach",
    13: "Eldrazi Aggro",
    14: "Living End",
    15: "Yawgmoth",
    16: "RG Eldrazi",
    17: "Wx Ponza",
    18: "Hollow One",
    19: "Broodscale Combo",
    20: "Eldrazi Tron",
    21: "UR Mox Cutter",
    22: "Jund Saga",
    23: "B Necro",
    24: "4-Color Ritual",
    25: "Reanimator",
    26: "Asmo Food",
    27: "Affinity",
    28: "Amulet Titan",
    29: "UG Ritual",
    30: "4-Color Control",
    31: "Domain Zoo",
    32: "UW Energy",
    33: "Jeskai Wizards",
    34: "UB Frog",
    35: "UR Prowess",
    36: "Esper Blink",
    37: "Jeskai Blink",
    38: "Affnity",
    39: "UW Control",
    40: "Esper Goryo's",
    41: "RW Energy",
}

df_train['archetype'] = df_train['merged_cluster'].map(lambda x: cluster_mapping.get(x, "Other"))


## Train Classifier & Predict on Test Set

In [107]:
mask = df_train["cluster"] != -1
X_train_masked = X_train_reduced[mask]
y_train_masked = df_train.loc[mask, "archetype"]

clf = LogisticRegression(max_iter=500)
clf.fit(X_train_masked, y_train_masked)

# Predict archetypes for test set
df_test['archetype'] = clf.predict(X_test_reduced)
df_test['confidence'] = clf.predict_proba(X_test_reduced).max(axis=1)


## Metrics/Visualizations Generation

In [108]:
# ----------------------------
# Archetype Metrics
# ----------------------------
archetype_counts = df_train['archetype'].value_counts()
print("Number of decks per archetype:\n", archetype_counts)

# ----------------------------
# Test Set Confidence
# ----------------------------
mean_confidence = df_test['confidence'].mean() * 100
mean_conf_per_archetype = df_test.groupby('archetype')['confidence'].mean() * 100
print(f"Mean classifier confidence on test set: {mean_confidence:.2f}%")
print("Mean confidence per predicted archetype (test set):\n", mean_conf_per_archetype)

# ----------------------------
# Charts
# ----------------------------

# 1. Number of decks per archetype (train)
plt.figure(figsize=(12,6))
archetype_counts.plot(kind='bar')
plt.title("Number of Decks per Archetype (Train Set)")
plt.xlabel("Archetype")
plt.ylabel("Number of Decks")
plt.tight_layout()
plt.savefig("archetype_counts.png")
plt.close()

# 2. Confidence distribution by archetype (test set)
median_conf = df_test.groupby('archetype')['confidence'].median().sort_values(ascending=True)
sorted_archetypes = median_conf.index.tolist()
data_to_plot = [df_test[df_test['archetype'] == archetype]['confidence'] 
                for archetype in sorted_archetypes]
plt.figure(figsize=(10, 12))
box = plt.boxplot(data_to_plot, vert=False, labels=sorted_archetypes, patch_artist=True)
for i, line in enumerate(box['medians']):
    x, y = line.get_xdata()[1], line.get_ydata()[1]
    plt.text(x-0.02, y + 0.17, f"{x:.2f}", va='center', fontsize=9, color='black')
plt.boxplot(data_to_plot, vert=False, labels=sorted_archetypes)
plt.xlabel("Confidence")
plt.ylabel("Predicted Archetype")
plt.title("Classifier Confidence by Predicted Archetype (Test Set)")
plt.tight_layout()
plt.savefig("confidence_by_archetype_test.png")
plt.close()


Number of decks per archetype:
 archetype
Other               4396
UR Prowess           603
Amulet Titan         497
RW Energy            440
Jeskai Blink         440
Eldrazi Tron         434
Domain Zoo           292
Affinity             289
Esper Goryo's        252
RG Eldrazi           238
U Belcher            237
Broodscale Combo     181
R Storm              175
UB Frog              157
Esper Blink          137
UG Ritual            122
UW Energy            118
UB Mill              107
Yawgmoth              92
UW Control            91
Samwise Combo         89
UR Mox Cutter         71
Reanimator            67
Jeskai Wizards        60
Living End            51
UG Neoform            50
Eldrazi Aggro         46
Merfolk               42
Hollow One            36
RW Burn               33
Affnity               32
Dredge                30
UR Breach             27
Hammertime            26
B Necro               26
4-Color Ritual        25
Infect                24
R Belcher             23
Jund Sag