In [1]:
import os
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist


In [2]:
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (``manual_seed`` plus the two CUDA seeding calls), and sets
    ``cudnn.deterministic = True`` / ``cudnn.benchmark = False``.

    Args:
        seed: Integer seed shared by all generators.
    """
    # The cuDNN flags are plain attribute writes, independent of the seeding calls.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE


device(type='cpu')

In [3]:
class Encoder(nn.Module):
    """MLP that maps input features to a latent code.

    Layer widths are ``input_dim -> 128 -> 64 -> latent_dim`` with a ReLU
    after each of the first two linear layers and no activation on the output.
    """

    def __init__(self, input_dim, latent_dim=10):
        super().__init__()
        # Layers are built in application order, so parameter initialisation
        # under a fixed seed and the ``model.<idx>`` state-dict keys are unchanged.
        layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the latent representation of ``x``."""
        latent = self.model(x)
        return latent

class Decoder(nn.Module):
    """MLP that maps a latent code back to feature space.

    Layer widths are ``latent_dim -> 64 -> 128 -> output_dim`` with a ReLU
    after each of the first two linear layers and no activation on the output.
    """

    def __init__(self, latent_dim, output_dim):
        super().__init__()
        # Same construction order as application order; keeps the
        # ``model.<idx>`` state-dict keys and seeded initialisation unchanged.
        layers = [
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, z):
        """Return the reconstruction produced from latent code ``z``."""
        reconstruction = self.model(z)
        return reconstruction

class AutoEncoder(nn.Module):
    """Encoder/decoder pair whose output has the same width as its input."""

    def __init__(self, input_dim, latent_dim=10):
        super().__init__()
        self.encoder = Encoder(input_dim=input_dim, latent_dim=latent_dim)
        self.decoder = Decoder(latent_dim=latent_dim, output_dim=input_dim)

    def forward(self, x):
        """Encode ``x`` and decode the result back to input space."""
        code = self.encoder(x)
        return self.decoder(code)

class DEC(nn.Module):
    """Soft cluster-assignment head on top of an encoder.

    Args:
        encoder: Module mapping a batch of inputs to latent vectors.
        cluster_centers: Tensor of initial centroids, one row per cluster.
            It is wrapped in ``nn.Parameter`` (no copy is made here), so the
            centroids are trainable.
    """

    def __init__(self, encoder, cluster_centers):
        super().__init__()
        self.encoder = encoder
        self.clusters = nn.Parameter(cluster_centers)

    def forward(self, x):
        """Return soft assignments of ``x`` to the clusters; each row sums to 1."""
        latent = self.encoder(x)
        # Squared Euclidean distance between every latent vector and every centroid.
        offsets = latent.unsqueeze(1) - self.clusters
        sq_dist = torch.sum(offsets ** 2, dim=2)
        # Similarity kernel 1 / (1 + d^2), normalised across clusters.
        kernel = 1.0 / (1.0 + sq_dist)
        row_total = torch.sum(kernel, dim=1, keepdim=True)
        return kernel / row_total

def target_distribution(q):
    """Sharpen soft assignments ``q`` into the auxiliary target distribution.

    Every entry is squared and divided by its column (cluster) total, then
    each row is renormalised so that it sums to one.
    """
    cluster_mass = q.sum(0)
    sharpened = q ** 2 / cluster_mass
    row_totals = sharpened.sum(1)
    # Transpose so the per-row totals broadcast along the last axis, then undo it.
    return (sharpened.t() / row_totals).t()


In [4]:
def compute_indices(X, labels):
    """Compute four internal cluster-validity indices for a labelling of ``X``.

    Args:
        X: Array-like of shape (n_samples, n_features).
        labels: Array-like with one cluster id per row of ``X``. Ids may be
            arbitrary values; they do not have to be ``0..k-1``.

    Returns:
        Tuple ``(Iwcss, Isep, Idistcc, Idens)``:
            Iwcss   - total within-cluster sum of squared distances to the centroid.
            Isep    - mean of the full centroid-to-centroid Euclidean distance
                      matrix (the zero diagonal is part of the mean).
            Idistcc - mean over clusters of the mean point-to-centroid distance.
            Idens   - mean over clusters of the standard deviation of the
                      point-to-centroid distances.
        Clusters with fewer than two points are skipped for ``Idistcc`` and
        ``Idens``; both are 0.0 when no cluster qualifies.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)

    # Iterate over the label values that actually occur. Indexing with
    # range(n_clusters) silently mis-assigned points whenever the ids were not
    # exactly 0..k-1 (e.g. {0, 2} gave a zero centroid for "1" and ignored "2").
    cluster_ids = np.unique(labels)
    members = [X[labels == cid] for cid in cluster_ids]
    cluster_centers = np.array([pts.mean(axis=0) for pts in members])

    Iwcss = sum(
        np.sum((pts - center) ** 2) for pts, center in zip(members, cluster_centers)
    )
    Isep = np.mean(cdist(cluster_centers, cluster_centers))

    valid_dists, valid_dens = [], []
    for pts, center in zip(members, cluster_centers):
        # Spread statistics are meaningless for singleton clusters.
        if len(pts) < 2:
            continue
        d = np.linalg.norm(pts - center, axis=1)
        valid_dists.append(np.mean(d))
        valid_dens.append(np.std(d))

    Idistcc = np.mean(valid_dists) if valid_dists else 0.0
    Idens = np.mean(valid_dens) if valid_dens else 0.0
    return Iwcss, Isep, Idistcc, Idens

def minmax_normalize(val, min_val, max_val, larger_is_better):
    """Map ``val`` onto a score relative to the range ``[min_val, max_val]``.

    Args:
        val: Value to score.
        min_val: Lower end of the range.
        max_val: Upper end of the range.
        larger_is_better: When falsy the score is flipped (``1 - score``).

    Returns:
        ``(val - min_val) / (max_val - min_val)``, flipped if requested.
        Returns 1.0 when the range width is close to zero, regardless of
        ``larger_is_better``.
    """
    span = max_val - min_val
    if np.isclose(span, 0):
        return 1.0
    scaled = (val - min_val) / span
    if larger_is_better:
        return scaled
    return 1 - scaled


In [5]:
def team_majority_vote(team_match_labels: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate per-match cluster labels into one row per team.

    Input must include columns: teamId, dec_label
    Output: one row per teamId with, for every label, the share of that
    team's rows carrying it, plus ``max_label`` (the label with the largest
    share; on ties ``idxmax`` keeps the first, i.e. smallest, label).
    """
    # Row counts per (team, label) pair and per team.
    pair_counts = (
        team_match_labels.groupby(["teamId", "dec_label"])
        .size()
        .reset_index(name="counts")
    )
    team_totals = team_match_labels["teamId"].value_counts().reset_index()
    team_totals.columns = ["teamId", "total"]

    shares = pair_counts.merge(team_totals, on="teamId")
    shares["probability"] = shares["counts"] / shares["total"]

    # Wide layout: one column per label; labels a team never received become 0.
    wide = shares.pivot(index="teamId", columns="dec_label", values="probability")
    wide = wide.fillna(0)

    # Make sure every label present in the input has a column.
    label_order = sorted(team_match_labels["dec_label"].unique().tolist())
    absent = [lab for lab in label_order if lab not in wide.columns]
    for lab in absent:
        wide[lab] = 0.0

    wide["max_label"] = wide[label_order].idxmax(axis=1)
    return wide.reset_index()


## IP — in possession

In [18]:
# Phase processed by the cells that follow.
PHASE_NAME = "ip"  # change: ip / op / pt / nt

# Input CSV for each phase key.
CSV_PATHS = dict(
    ip="../create_datasets/final_final_possesion_df.csv",
    nt="../create_datasets/final_transition_out_of_poss_df.csv",
    pt="../create_datasets/final_transition_in_poss_df.csv",   # needs rename Unnamed: 0
    op="../create_datasets/final_out_of_possesion_df.csv",
)

# True only when the selected phase is "pt"; the loader then renames an
# "Unnamed: 0" column (if present) to uniqueTeamId.
RENAME_UNNAMED_TO_UNIQUE = PHASE_NAME == "pt"

# Reproducibility and model size.
SEED = 42
LATENT_DIM = 10  # width of the encoder output
K = 2            # number of clusters (KMeans n_clusters / DEC centroids)

# The output folder is derived from K so that changing the cluster count
# cannot overwrite the CSVs of a run made with a different K.
OUT_DIR = f"../labels_k{K}"
os.makedirs(OUT_DIR, exist_ok=True)

# Full-batch optimisation schedule shared by every phase.
AE_EPOCHS = 3000   # autoencoder pre-training steps
DEC_EPOCHS = 3000  # DEC fine-tuning steps
LR = 1e-3          # Adam learning rate for both stages


In [7]:
# Re-seed so this phase does not depend on the random state left by earlier cells.
set_seed(SEED)

df = pd.read_csv(CSV_PATHS[PHASE_NAME])

# When the flag is set, an "Unnamed: 0" column is taken as the team-match id.
if RENAME_UNNAMED_TO_UNIQUE and "Unnamed: 0" in df.columns:
    df = df.rename(columns={"Unnamed: 0": "uniqueTeamId"})

assert "uniqueTeamId" in df.columns, "uniqueTeamId not found!"

# Ids are kept in row order for the output tables. Every other column is used
# as a feature, with missing values replaced by 0 before standardisation.
id_series = df["uniqueTeamId"].copy()
df = df.set_index("uniqueTeamId").fillna(0)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.to_numpy())
X_tensor = torch.tensor(X_scaled, dtype=torch.float32, device=DEVICE)

input_dim = X_scaled.shape[1]
print("Phase:", PHASE_NAME, "| rows:", X_scaled.shape[0], "| features:", input_dim, "| device:", DEVICE)


Phase: ip | rows: 3652 | features: 30 | device: cpu


In [8]:
# Fresh autoencoder for this phase, fitted full-batch on the scaled features.
ae = AutoEncoder(input_dim, LATENT_DIM).to(DEVICE)
criterion = nn.MSELoss()
opt = optim.Adam(ae.parameters(), lr=LR)

ae.train()
for epoch in range(AE_EPOCHS):
    # Forward pass: reconstruction error over the whole dataset.
    recon = ae(X_tensor)
    loss = criterion(recon, X_tensor)

    # Backward pass and parameter update.
    opt.zero_grad()
    loss.backward()
    opt.step()

    # Progress line after every 500th step.
    if not (epoch + 1) % 500:
        print(f"[{PHASE_NAME}] AE {epoch+1}/{AE_EPOCHS} | loss={loss.item():.6f}")


[ip] AE 500/3000 | loss=0.128892
[ip] AE 1000/3000 | loss=0.119967
[ip] AE 1500/3000 | loss=0.115992
[ip] AE 2000/3000 | loss=0.112270
[ip] AE 2500/3000 | loss=0.110121
[ip] AE 3000/3000 | loss=0.108307


In [9]:
# Latent codes from the pre-trained encoder (no autograd graph is built here).
ae.eval()
with torch.no_grad():
    z0 = ae.encoder(X_tensor).cpu().numpy()

# n_init is pinned because scikit-learn's default changed between releases
# (10 restarts -> 'auto'); an explicit value keeps the initial centroids
# independent of the installed version.
kmeans = KMeans(n_clusters=K, n_init=10, random_state=SEED).fit(z0)
init_centers = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32).to(DEVICE)

print(f"[{PHASE_NAME}] KMeans init done for k={K}")


[ip] KMeans init done for k=2


In [10]:
# The DEC head reuses the pre-trained encoder object (so the encoder keeps
# training here); centroids start from a copy of the k-means centres.
dec = DEC(ae.encoder, init_centers.clone()).to(DEVICE)
dec_opt = optim.Adam(dec.parameters(), lr=LR)

dec.train()
for epoch in range(DEC_EPOCHS):
    dec_opt.zero_grad()

    q = dec(X_tensor)
    # The target is built from detached q, so it is a constant for this step.
    p = target_distribution(q.detach())
    kl = nn.functional.kl_div(torch.log(q), p, reduction="batchmean")

    kl.backward()
    dec_opt.step()

    # Progress line after every 500th step.
    if not (epoch + 1) % 500:
        print(f"[{PHASE_NAME}] DEC {epoch+1}/{DEC_EPOCHS} | KL={kl.item():.6f}")


[ip] DEC 500/3000 | KL=0.011110
[ip] DEC 1000/3000 | KL=0.007646
[ip] DEC 1500/3000 | KL=0.005899
[ip] DEC 2000/3000 | KL=0.004775
[ip] DEC 2500/3000 | KL=0.003969
[ip] DEC 3000/3000 | KL=0.003370


In [11]:
# Final soft assignments, hard labels and latent codes (inference only).
dec.eval()
with torch.no_grad():
    q_final = dec(X_tensor).cpu().numpy()
    labels = np.argmax(q_final, axis=1).astype(int)
    z_final = dec.encoder(X_tensor).cpu().numpy()

out_tm = pd.DataFrame({"uniqueTeamId": id_series.values})
out_tm["dec_label"] = labels

# uniqueTeamId format: teamId---matchId (split once, reuse both parts)
id_parts = out_tm["uniqueTeamId"].astype(str).str.split("---")
out_tm["matchId"] = id_parts.str[1].astype(int)
out_tm["teamId"] = id_parts.str[0].astype(int)

out_tm["phase"] = PHASE_NAME
out_tm["k"] = K

# Save probabilities q_0..q_{K-1}, one column per cluster
for c in range(K):
    out_tm[f"q_{c}"] = q_final[:, c]

# Save latent coordinates z_0..z_{d-1}, d = encoder output width (optional but useful)
for j in range(z_final.shape[1]):
    out_tm[f"z_{j}"] = z_final[:, j]

tm_path = os.path.join(OUT_DIR, f"{PHASE_NAME}_team_match_labels.csv")
out_tm.to_csv(tm_path, index=False)
print("Saved:", tm_path)

out_tm.head()


Saved: ../labels_k2\ip_team_match_labels.csv


Unnamed: 0,uniqueTeamId,dec_label,matchId,teamId,phase,k,q_0,q_1,z_0,z_1,z_2,z_3,z_4,z_5,z_6,z_7,z_8,z_9
0,10531---2499726,1,2499726,10531,ip,2,0.003535,0.996465,-5.05268,-7.170287,4.730443,1.202852,-9.701825,-0.344173,4.026637,-9.499918,-2.962513,-3.669589
1,10531---2499736,0,2499736,10531,ip,2,0.996588,0.003412,1.395555,-1.302132,2.24771,-4.081516,-2.216829,4.667476,-0.811734,-2.517665,3.039354,-4.578185
2,10531---2499741,0,2499741,10531,ip,2,0.996554,0.003446,1.403952,-1.325512,2.296862,-4.084894,-2.205998,4.673843,-0.769684,-2.511825,2.969085,-4.542796
3,10531---2499757,0,2499757,10531,ip,2,0.996556,0.003444,1.378231,-1.349186,2.278719,-4.138491,-2.283345,4.7361,-0.755828,-2.605655,3.025097,-4.602322
4,10531---2499766,1,2499766,10531,ip,2,0.003413,0.996587,-5.002353,-7.141852,4.667868,1.094104,-9.606672,-0.215228,3.98207,-9.458146,-2.832905,-3.651589


In [12]:
# Collapse match-level labels to one row per team and tag the run.
team_major = team_majority_vote(out_tm[["teamId", "dec_label"]].copy()).assign(
    phase=PHASE_NAME,
    k=K,
)

team_path = os.path.join(OUT_DIR, f"{PHASE_NAME}_team_labels_majority.csv")
team_major.to_csv(team_path, index=False)
print("Saved:", team_path)

team_major.head()


Saved: ../labels_k2\ip_team_labels_majority.csv


dec_label,teamId,0,1,max_label,phase,k
0,674,0.763158,0.236842,0,ip,2
1,675,0.973684,0.026316,0,ip,2
2,676,1.0,0.0,0,ip,2
3,677,0.342105,0.657895,1,ip,2
4,678,0.421053,0.578947,1,ip,2


## PT — transition into possession

In [20]:
# Phase processed by the cells that follow.
PHASE_NAME = "pt"  # change: ip / op / pt / nt

# True for this phase: the loader renames an "Unnamed: 0" column (if present)
# to uniqueTeamId.
RENAME_UNNAMED_TO_UNIQUE = PHASE_NAME == "pt"

In [21]:
# Re-seed so this phase does not depend on the random state left by earlier cells.
set_seed(SEED)

df = pd.read_csv(CSV_PATHS[PHASE_NAME])

# When the flag is set, an "Unnamed: 0" column is taken as the team-match id.
if RENAME_UNNAMED_TO_UNIQUE and "Unnamed: 0" in df.columns:
    df = df.rename(columns={"Unnamed: 0": "uniqueTeamId"})

assert "uniqueTeamId" in df.columns, "uniqueTeamId not found!"

# Ids are kept in row order for the output tables. Every other column is used
# as a feature, with missing values replaced by 0 before standardisation.
id_series = df["uniqueTeamId"].copy()
df = df.set_index("uniqueTeamId").fillna(0)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.to_numpy())
X_tensor = torch.tensor(X_scaled, dtype=torch.float32, device=DEVICE)

input_dim = X_scaled.shape[1]
print("Phase:", PHASE_NAME, "| rows:", X_scaled.shape[0], "| features:", input_dim, "| device:", DEVICE)


Phase: pt | rows: 3652 | features: 35 | device: cpu


In [22]:
# Fresh autoencoder for this phase, fitted full-batch on the scaled features.
ae = AutoEncoder(input_dim, LATENT_DIM).to(DEVICE)
criterion = nn.MSELoss()
opt = optim.Adam(ae.parameters(), lr=LR)

ae.train()
for epoch in range(AE_EPOCHS):
    # Forward pass: reconstruction error over the whole dataset.
    recon = ae(X_tensor)
    loss = criterion(recon, X_tensor)

    # Backward pass and parameter update.
    opt.zero_grad()
    loss.backward()
    opt.step()

    # Progress line after every 500th step.
    if not (epoch + 1) % 500:
        print(f"[{PHASE_NAME}] AE {epoch+1}/{AE_EPOCHS} | loss={loss.item():.6f}")


[pt] AE 500/3000 | loss=0.204559
[pt] AE 1000/3000 | loss=0.190395
[pt] AE 1500/3000 | loss=0.181486
[pt] AE 2000/3000 | loss=0.174930
[pt] AE 2500/3000 | loss=0.171473
[pt] AE 3000/3000 | loss=0.168324


In [23]:
# Latent codes from the pre-trained encoder (no autograd graph is built here).
ae.eval()
with torch.no_grad():
    z0 = ae.encoder(X_tensor).cpu().numpy()

# n_init is pinned because scikit-learn's default changed between releases
# (10 restarts -> 'auto'); an explicit value keeps the initial centroids
# independent of the installed version.
kmeans = KMeans(n_clusters=K, n_init=10, random_state=SEED).fit(z0)
init_centers = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32).to(DEVICE)

print(f"[{PHASE_NAME}] KMeans init done for k={K}")


[pt] KMeans init done for k=2


In [24]:
# The DEC head reuses the pre-trained encoder object (so the encoder keeps
# training here); centroids start from a copy of the k-means centres.
dec = DEC(ae.encoder, init_centers.clone()).to(DEVICE)
dec_opt = optim.Adam(dec.parameters(), lr=LR)

dec.train()
for epoch in range(DEC_EPOCHS):
    dec_opt.zero_grad()

    q = dec(X_tensor)
    # The target is built from detached q, so it is a constant for this step.
    p = target_distribution(q.detach())
    kl = nn.functional.kl_div(torch.log(q), p, reduction="batchmean")

    kl.backward()
    dec_opt.step()

    # Progress line after every 500th step.
    if not (epoch + 1) % 500:
        print(f"[{PHASE_NAME}] DEC {epoch+1}/{DEC_EPOCHS} | KL={kl.item():.6f}")


[pt] DEC 500/3000 | KL=0.018379
[pt] DEC 1000/3000 | KL=0.010955
[pt] DEC 1500/3000 | KL=0.007961
[pt] DEC 2000/3000 | KL=0.006238
[pt] DEC 2500/3000 | KL=0.005004
[pt] DEC 3000/3000 | KL=0.004217


In [25]:
# Final soft assignments, hard labels and latent codes (inference only).
dec.eval()
with torch.no_grad():
    q_final = dec(X_tensor).cpu().numpy()
    labels = np.argmax(q_final, axis=1).astype(int)
    z_final = dec.encoder(X_tensor).cpu().numpy()

out_tm = pd.DataFrame({"uniqueTeamId": id_series.values})
out_tm["dec_label"] = labels

# uniqueTeamId format: teamId---matchId (split once, reuse both parts)
id_parts = out_tm["uniqueTeamId"].astype(str).str.split("---")
out_tm["matchId"] = id_parts.str[1].astype(int)
out_tm["teamId"] = id_parts.str[0].astype(int)

out_tm["phase"] = PHASE_NAME
out_tm["k"] = K

# Save probabilities q_0..q_{K-1}, one column per cluster
for c in range(K):
    out_tm[f"q_{c}"] = q_final[:, c]

# Save latent coordinates z_0..z_{d-1}, d = encoder output width (optional but useful)
for j in range(z_final.shape[1]):
    out_tm[f"z_{j}"] = z_final[:, j]

tm_path = os.path.join(OUT_DIR, f"{PHASE_NAME}_team_match_labels.csv")
out_tm.to_csv(tm_path, index=False)
print("Saved:", tm_path)

out_tm.head()


Saved: ../labels_k2\pt_team_match_labels.csv


Unnamed: 0,uniqueTeamId,dec_label,matchId,teamId,phase,k,q_0,q_1,z_0,z_1,z_2,z_3,z_4,z_5,z_6,z_7,z_8,z_9
0,10531---2499726,0,2499726,10531,pt,2,0.996159,0.003841,-2.226733,-0.075997,-0.767249,1.222714,4.193018,0.637631,-4.134865,1.865572,-1.14347,0.873497
1,10531---2499736,1,2499736,10531,pt,2,0.005651,0.994349,3.146387,6.01805,-4.88665,-3.443526,10.607821,3.052655,-9.647493,-2.914775,-6.947804,3.29341
2,10531---2499741,1,2499741,10531,pt,2,0.003957,0.996043,3.37909,6.193444,-5.026008,-3.591833,10.745466,3.125397,-9.761405,-3.058889,-7.129265,3.344765
3,10531---2499757,1,2499757,10531,pt,2,0.004028,0.995972,3.342971,6.199474,-5.020159,-3.564582,10.731514,3.111959,-9.756318,-3.058627,-7.107853,3.335284
4,10531---2499766,1,2499766,10531,pt,2,0.003812,0.996188,3.301532,6.258128,-5.035612,-3.610282,10.885191,3.115404,-9.86807,-3.0175,-7.173415,3.346421


In [26]:
# Collapse match-level labels to one row per team and tag the run.
team_major = team_majority_vote(out_tm[["teamId", "dec_label"]].copy()).assign(
    phase=PHASE_NAME,
    k=K,
)

team_path = os.path.join(OUT_DIR, f"{PHASE_NAME}_team_labels_majority.csv")
team_major.to_csv(team_path, index=False)
print("Saved:", team_path)

team_major.head()


Saved: ../labels_k2\pt_team_labels_majority.csv


dec_label,teamId,0,1,max_label,phase,k
0,674,0.5,0.5,0,pt,2
1,675,0.842105,0.157895,0,pt,2
2,676,0.763158,0.236842,0,pt,2
3,677,0.526316,0.473684,0,pt,2
4,678,0.631579,0.368421,0,pt,2


## NT — transition out of possession

In [27]:
# Phase processed by the cells that follow.
PHASE_NAME = "nt"  # change: ip / op / pt / nt

# NOTE(review): the comparison is against this cell's own phase, so the flag
# is always True here (the IP config compares against "pt" instead). The
# loader then renames an "Unnamed: 0" column (if present) to uniqueTeamId.
RENAME_UNNAMED_TO_UNIQUE = PHASE_NAME == "nt"

In [28]:
# Re-seed so this phase does not depend on the random state left by earlier cells.
set_seed(SEED)

df = pd.read_csv(CSV_PATHS[PHASE_NAME])

# When the flag is set, an "Unnamed: 0" column is taken as the team-match id.
if RENAME_UNNAMED_TO_UNIQUE and "Unnamed: 0" in df.columns:
    df = df.rename(columns={"Unnamed: 0": "uniqueTeamId"})

assert "uniqueTeamId" in df.columns, "uniqueTeamId not found!"

# Ids are kept in row order for the output tables. Every other column is used
# as a feature, with missing values replaced by 0 before standardisation.
id_series = df["uniqueTeamId"].copy()
df = df.set_index("uniqueTeamId").fillna(0)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.to_numpy())
X_tensor = torch.tensor(X_scaled, dtype=torch.float32, device=DEVICE)

input_dim = X_scaled.shape[1]
print("Phase:", PHASE_NAME, "| rows:", X_scaled.shape[0], "| features:", input_dim, "| device:", DEVICE)


Phase: nt | rows: 3652 | features: 45 | device: cpu


In [29]:
# Fresh autoencoder for this phase, fitted full-batch on the scaled features.
ae = AutoEncoder(input_dim, LATENT_DIM).to(DEVICE)
criterion = nn.MSELoss()
opt = optim.Adam(ae.parameters(), lr=LR)

ae.train()
for epoch in range(AE_EPOCHS):
    # Forward pass: reconstruction error over the whole dataset.
    recon = ae(X_tensor)
    loss = criterion(recon, X_tensor)

    # Backward pass and parameter update.
    opt.zero_grad()
    loss.backward()
    opt.step()

    # Progress line after every 500th step.
    if not (epoch + 1) % 500:
        print(f"[{PHASE_NAME}] AE {epoch+1}/{AE_EPOCHS} | loss={loss.item():.6f}")


[nt] AE 500/3000 | loss=0.273092
[nt] AE 1000/3000 | loss=0.249239
[nt] AE 1500/3000 | loss=0.240035
[nt] AE 2000/3000 | loss=0.234723
[nt] AE 2500/3000 | loss=0.230460
[nt] AE 3000/3000 | loss=0.227605


In [30]:
# Latent codes from the pre-trained encoder (no autograd graph is built here).
ae.eval()
with torch.no_grad():
    z0 = ae.encoder(X_tensor).cpu().numpy()

# n_init is pinned because scikit-learn's default changed between releases
# (10 restarts -> 'auto'); an explicit value keeps the initial centroids
# independent of the installed version.
kmeans = KMeans(n_clusters=K, n_init=10, random_state=SEED).fit(z0)
init_centers = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32).to(DEVICE)

print(f"[{PHASE_NAME}] KMeans init done for k={K}")


[nt] KMeans init done for k=2


In [31]:
# The DEC head reuses the pre-trained encoder object (so the encoder keeps
# training here); centroids start from a copy of the k-means centres.
dec = DEC(ae.encoder, init_centers.clone()).to(DEVICE)
dec_opt = optim.Adam(dec.parameters(), lr=LR)

dec.train()
for epoch in range(DEC_EPOCHS):
    dec_opt.zero_grad()

    q = dec(X_tensor)
    # The target is built from detached q, so it is a constant for this step.
    p = target_distribution(q.detach())
    kl = nn.functional.kl_div(torch.log(q), p, reduction="batchmean")

    kl.backward()
    dec_opt.step()

    # Progress line after every 500th step.
    if not (epoch + 1) % 500:
        print(f"[{PHASE_NAME}] DEC {epoch+1}/{DEC_EPOCHS} | KL={kl.item():.6f}")


[nt] DEC 500/3000 | KL=0.003636
[nt] DEC 1000/3000 | KL=0.001687
[nt] DEC 1500/3000 | KL=0.001313
[nt] DEC 2000/3000 | KL=0.001145
[nt] DEC 2500/3000 | KL=0.001027
[nt] DEC 3000/3000 | KL=0.000926


In [32]:
# Final soft assignments, hard labels and latent codes (inference only).
dec.eval()
with torch.no_grad():
    q_final = dec(X_tensor).cpu().numpy()
    labels = np.argmax(q_final, axis=1).astype(int)
    z_final = dec.encoder(X_tensor).cpu().numpy()

out_tm = pd.DataFrame({"uniqueTeamId": id_series.values})
out_tm["dec_label"] = labels

# uniqueTeamId format: teamId---matchId (split once, reuse both parts)
id_parts = out_tm["uniqueTeamId"].astype(str).str.split("---")
out_tm["matchId"] = id_parts.str[1].astype(int)
out_tm["teamId"] = id_parts.str[0].astype(int)

out_tm["phase"] = PHASE_NAME
out_tm["k"] = K

# Save probabilities q_0..q_{K-1}, one column per cluster
for c in range(K):
    out_tm[f"q_{c}"] = q_final[:, c]

# Save latent coordinates z_0..z_{d-1}, d = encoder output width (optional but useful)
for j in range(z_final.shape[1]):
    out_tm[f"z_{j}"] = z_final[:, j]

tm_path = os.path.join(OUT_DIR, f"{PHASE_NAME}_team_match_labels.csv")
out_tm.to_csv(tm_path, index=False)
print("Saved:", tm_path)

out_tm.head()


Saved: ../labels_k2\nt_team_match_labels.csv


Unnamed: 0,uniqueTeamId,dec_label,matchId,teamId,phase,k,q_0,q_1,z_0,z_1,z_2,z_3,z_4,z_5,z_6,z_7,z_8,z_9
0,10531---2499726,0,2499726,10531,nt,2,0.999036,0.000964,-0.869528,-0.693438,-1.509858,-2.531914,-0.160335,-0.804405,-3.300501,-3.203357,1.637879,4.434292
1,10531---2499736,0,2499736,10531,nt,2,0.999043,0.000957,-0.974788,-0.727489,-1.433544,-2.271192,-0.229476,-0.771304,-2.971188,-2.836963,1.666163,4.101511
2,10531---2499741,0,2499741,10531,nt,2,0.999129,0.000871,-0.848585,-0.690543,-1.490334,-2.421262,-0.217178,-0.792934,-3.107694,-3.024022,1.623971,4.28938
3,10531---2499757,0,2499757,10531,nt,2,0.999032,0.000968,-1.031548,-0.788307,-1.431023,-2.309644,-0.248559,-0.789373,-3.047791,-2.752174,1.729367,4.160289
4,10531---2499766,1,2499766,10531,nt,2,0.001539,0.998461,8.825151,3.75639,-5.921632,-13.745937,4.486325,-2.974756,-15.310389,-25.961107,-1.797795,19.206905


In [33]:
# Collapse match-level labels to one row per team and tag the run.
team_major = team_majority_vote(out_tm[["teamId", "dec_label"]].copy()).assign(
    phase=PHASE_NAME,
    k=K,
)

team_path = os.path.join(OUT_DIR, f"{PHASE_NAME}_team_labels_majority.csv")
team_major.to_csv(team_path, index=False)
print("Saved:", team_path)

team_major.head()


Saved: ../labels_k2\nt_team_labels_majority.csv


dec_label,teamId,0,1,max_label,phase,k
0,674,0.394737,0.605263,1,nt,2
1,675,0.368421,0.631579,1,nt,2
2,676,0.394737,0.605263,1,nt,2
3,677,0.710526,0.289474,0,nt,2
4,678,0.578947,0.421053,0,nt,2


## OP — out of possession

In [34]:
# Phase processed by the cells that follow.
PHASE_NAME = "op"  # change: ip / op / pt / nt

# NOTE(review): the comparison is against this cell's own phase, so the flag
# is always True here (the IP config compares against "pt" instead). The
# loader then renames an "Unnamed: 0" column (if present) to uniqueTeamId.
RENAME_UNNAMED_TO_UNIQUE = PHASE_NAME == "op"

In [35]:
# Re-seed so this phase does not depend on the random state left by earlier cells.
set_seed(SEED)

df = pd.read_csv(CSV_PATHS[PHASE_NAME])

# When the flag is set, an "Unnamed: 0" column is taken as the team-match id.
if RENAME_UNNAMED_TO_UNIQUE and "Unnamed: 0" in df.columns:
    df = df.rename(columns={"Unnamed: 0": "uniqueTeamId"})

assert "uniqueTeamId" in df.columns, "uniqueTeamId not found!"

# Ids are kept in row order for the output tables. Every other column is used
# as a feature, with missing values replaced by 0 before standardisation.
id_series = df["uniqueTeamId"].copy()
df = df.set_index("uniqueTeamId").fillna(0)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.to_numpy())
X_tensor = torch.tensor(X_scaled, dtype=torch.float32, device=DEVICE)

input_dim = X_scaled.shape[1]
print("Phase:", PHASE_NAME, "| rows:", X_scaled.shape[0], "| features:", input_dim, "| device:", DEVICE)


Phase: op | rows: 3652 | features: 45 | device: cpu


In [36]:
# Fresh autoencoder for this phase, fitted full-batch on the scaled features.
ae = AutoEncoder(input_dim, LATENT_DIM).to(DEVICE)
criterion = nn.MSELoss()
opt = optim.Adam(ae.parameters(), lr=LR)

ae.train()
for epoch in range(AE_EPOCHS):
    # Forward pass: reconstruction error over the whole dataset.
    recon = ae(X_tensor)
    loss = criterion(recon, X_tensor)

    # Backward pass and parameter update.
    opt.zero_grad()
    loss.backward()
    opt.step()

    # Progress line after every 500th step.
    if not (epoch + 1) % 500:
        print(f"[{PHASE_NAME}] AE {epoch+1}/{AE_EPOCHS} | loss={loss.item():.6f}")


[op] AE 500/3000 | loss=0.287245
[op] AE 1000/3000 | loss=0.266672
[op] AE 1500/3000 | loss=0.257917
[op] AE 2000/3000 | loss=0.251960
[op] AE 2500/3000 | loss=0.248061
[op] AE 3000/3000 | loss=0.244943


In [37]:
# Latent codes from the pre-trained encoder (no autograd graph is built here).
ae.eval()
with torch.no_grad():
    z0 = ae.encoder(X_tensor).cpu().numpy()

# n_init is pinned because scikit-learn's default changed between releases
# (10 restarts -> 'auto'); an explicit value keeps the initial centroids
# independent of the installed version.
kmeans = KMeans(n_clusters=K, n_init=10, random_state=SEED).fit(z0)
init_centers = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32).to(DEVICE)

print(f"[{PHASE_NAME}] KMeans init done for k={K}")


[op] KMeans init done for k=2


In [38]:
# The DEC head reuses the pre-trained encoder object (so the encoder keeps
# training here); centroids start from a copy of the k-means centres.
dec = DEC(ae.encoder, init_centers.clone()).to(DEVICE)
dec_opt = optim.Adam(dec.parameters(), lr=LR)

dec.train()
for epoch in range(DEC_EPOCHS):
    dec_opt.zero_grad()

    q = dec(X_tensor)
    # The target is built from detached q, so it is a constant for this step.
    p = target_distribution(q.detach())
    kl = nn.functional.kl_div(torch.log(q), p, reduction="batchmean")

    kl.backward()
    dec_opt.step()

    # Progress line after every 500th step.
    if not (epoch + 1) % 500:
        print(f"[{PHASE_NAME}] DEC {epoch+1}/{DEC_EPOCHS} | KL={kl.item():.6f}")


[op] DEC 500/3000 | KL=0.004732
[op] DEC 1000/3000 | KL=0.001805
[op] DEC 1500/3000 | KL=0.001318
[op] DEC 2000/3000 | KL=0.001131
[op] DEC 2500/3000 | KL=0.001025
[op] DEC 3000/3000 | KL=0.000934


In [39]:
# Final soft assignments, hard labels and latent codes (inference only).
dec.eval()
with torch.no_grad():
    q_final = dec(X_tensor).cpu().numpy()
    labels = np.argmax(q_final, axis=1).astype(int)
    z_final = dec.encoder(X_tensor).cpu().numpy()

out_tm = pd.DataFrame({"uniqueTeamId": id_series.values})
out_tm["dec_label"] = labels

# uniqueTeamId format: teamId---matchId (split once, reuse both parts)
id_parts = out_tm["uniqueTeamId"].astype(str).str.split("---")
out_tm["matchId"] = id_parts.str[1].astype(int)
out_tm["teamId"] = id_parts.str[0].astype(int)

out_tm["phase"] = PHASE_NAME
out_tm["k"] = K

# Save probabilities q_0..q_{K-1}, one column per cluster
for c in range(K):
    out_tm[f"q_{c}"] = q_final[:, c]

# Save latent coordinates z_0..z_{d-1}, d = encoder output width (optional but useful)
for j in range(z_final.shape[1]):
    out_tm[f"z_{j}"] = z_final[:, j]

tm_path = os.path.join(OUT_DIR, f"{PHASE_NAME}_team_match_labels.csv")
out_tm.to_csv(tm_path, index=False)
print("Saved:", tm_path)

out_tm.head()


Saved: ../labels_k2\op_team_match_labels.csv


Unnamed: 0,uniqueTeamId,dec_label,matchId,teamId,phase,k,q_0,q_1,z_0,z_1,z_2,z_3,z_4,z_5,z_6,z_7,z_8,z_9
0,10531---2499726,1,2499726,10531,op,2,0.001013,0.998987,-0.772935,-0.286178,-0.921619,-4.254721,-0.253838,-1.38199,-4.278471,-3.157329,0.850064,4.302069
1,10531---2499736,1,2499736,10531,op,2,0.000898,0.999102,-0.816703,-0.306604,-0.808244,-4.02337,-0.368242,-1.365797,-4.115334,-2.941272,0.836609,4.077654
2,10531---2499741,0,2499741,10531,op,2,0.999102,0.000898,8.381251,4.579641,-9.01088,-18.120449,10.181544,-5.038311,-13.472561,-19.527399,4.902805,19.977169
3,10531---2499757,1,2499757,10531,op,2,0.000898,0.999102,-0.798637,-0.271621,-0.845683,-4.043488,-0.354027,-1.358925,-4.060384,-2.986989,0.848555,4.064178
4,10531---2499766,0,2499766,10531,op,2,0.998958,0.001042,8.167813,4.550439,-8.993102,-18.146616,9.984687,-5.124774,-13.739801,-19.574709,4.868281,20.139133


In [40]:
# Collapse match-level labels to one row per team and tag the run.
team_major = team_majority_vote(out_tm[["teamId", "dec_label"]].copy()).assign(
    phase=PHASE_NAME,
    k=K,
)

team_path = os.path.join(OUT_DIR, f"{PHASE_NAME}_team_labels_majority.csv")
team_major.to_csv(team_path, index=False)
print("Saved:", team_path)

team_major.head()


Saved: ../labels_k2\op_team_labels_majority.csv


dec_label,teamId,0,1,max_label,phase,k
0,674,0.710526,0.289474,0,op,2
1,675,0.605263,0.394737,0,op,2
2,676,0.394737,0.605263,1,op,2
3,677,0.473684,0.526316,1,op,2
4,678,0.447368,0.552632,1,op,2
