In [None]:
import os
import torch
import kagglehub
import numpy as np
import polars as pl
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from dota import Dota2
from heroes import get_heroes
from model import Dota2Autoencoder
from dataset import get_dataset
from leagues import get_tier_one
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import DBSCAN
from itertools import product
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import OPTICS

def plot_clustering_grid(latent_space, title, n_clusters=4):
    """
    Run four clustering algorithms on the latent space and draw them in a 2x2 grid.

    Each panel scatters latent columns 0 and 1, coloured by the labels of one
    algorithm (KMeans, Agglomerative, Gaussian Mixture, OPTICS). The unique
    labels found by each algorithm are printed right after it is fitted.

    Args:
        latent_space: NumPy array with the data in latent space
        title: Main title of the plot (e.g. tournament name)
        n_clusters: Number of clusters/components for KMeans, Agglomerative
            and Gaussian Mixture (default: 4); OPTICS does not receive it

    Returns:
        dict with keys "kmeans", "agglomerative", "gmm" and "optics", each
        holding the label array produced by that algorithm.
    """
    fig, axs = plt.subplots(2, 2, figsize=(10, 8))

    # One row per panel: (result key, printed prefix, panel title, estimator, axes)
    panels = [
        ("kmeans", "Cluster", "KMeans Clustering",
         KMeans(n_clusters=n_clusters, random_state=42), axs[0, 0]),
        ("agglomerative", "Agglomerative", "Agglomerative Clustering",
         AgglomerativeClustering(n_clusters=n_clusters), axs[0, 1]),
        ("gmm", "GMM", "Gaussian Mixture Clustering",
         GaussianMixture(n_components=n_clusters, random_state=42), axs[1, 0]),
        ("optics", "OPTICS", "OPTICS Clustering",
         OPTICS(min_samples=10, xi=0.05, min_cluster_size=0.05), axs[1, 1]),
    ]

    results = {}
    for key, log_prefix, panel_title, estimator, ax in panels:
        labels = estimator.fit_predict(latent_space)
        found = np.unique(labels)
        print(f"{log_prefix} labels: {found}")

        for label in found:
            members = labels == label
            xs = latent_space[members, 0]
            ys = latent_space[members, 1]
            # Only the OPTICS panel draws label -1 separately, in black, as noise
            if key == "optics" and label == -1:
                ax.scatter(xs, ys, label="Ruído", alpha=0.5, c="k")
            else:
                ax.scatter(xs, ys, label=f"Cluster {label}", alpha=0.7)

        ax.set_xlabel("Latent X")
        ax.set_ylabel("Latent Y")
        ax.set_title(panel_title)
        ax.legend()
        results[key] = labels

    plt.tight_layout()
    plt.suptitle(f"Clustering de {title}", y=1.02)
    plt.show()

    return results

In [None]:
def plot_clustering_grid_3d(latent_space, title, n_clusters=4, figsize=(12, 10), alpha=0.7, elev=30, azim=45):
    """
    Run four clustering algorithms on the latent space and draw them in a 2x2 grid of 3D plots.

    Each panel scatters latent columns 0, 1 and 2, coloured by the labels of
    one algorithm (KMeans, Agglomerative, Gaussian Mixture, OPTICS). The unique
    labels found by each algorithm are printed right after it is fitted.

    Args:
        latent_space: NumPy array with the data in latent space (needs at least 3 dimensions)
        title: Main title of the plot (e.g. tournament name)
        n_clusters: Number of clusters/components for KMeans, Agglomerative
            and Gaussian Mixture (default: 4); OPTICS does not receive it
        figsize: Figure size (default: (12, 10))
        alpha: Point transparency (default: 0.7)
        elev: Elevation of the 3D view (default: 30)
        azim: Azimuth of the 3D view (default: 45)

    Returns:
        dict with keys "kmeans", "agglomerative", "gmm" and "optics", each
        holding the label array produced by that algorithm.

    Raises:
        ValueError: if `latent_space` has fewer than 3 columns.
    """
    # A 3D scatter needs three coordinates per point
    if latent_space.shape[1] < 3:
        raise ValueError("O espaço latente precisa ter pelo menos 3 dimensões para visualização 3D")

    fig = plt.figure(figsize=figsize)

    # One row per panel, in subplot order: (result key, printed prefix, panel title, estimator)
    panels = [
        ("kmeans", "Cluster", "KMeans Clustering",
         KMeans(n_clusters=n_clusters, random_state=42)),
        ("agglomerative", "Agglomerative", "Agglomerative Clustering",
         AgglomerativeClustering(n_clusters=n_clusters)),
        ("gmm", "GMM", "Gaussian Mixture Clustering",
         GaussianMixture(n_components=n_clusters, random_state=42)),
        ("optics", "OPTICS", "OPTICS Clustering",
         OPTICS(min_samples=10, xi=0.05, min_cluster_size=0.05)),
    ]

    results = {}
    for position, (key, log_prefix, panel_title, estimator) in enumerate(panels, start=1):
        ax = fig.add_subplot(2, 2, position, projection='3d')
        labels = estimator.fit_predict(latent_space)
        found = np.unique(labels)
        print(f"{log_prefix} labels: {found}")

        for label in found:
            members = labels == label
            # Only the OPTICS panel draws label -1 separately, in black, as noise
            if key == "optics" and label == -1:
                style = {"label": "Ruído", "c": "k"}
            else:
                style = {"label": f"Cluster {label}"}
            ax.scatter(
                latent_space[members, 0],
                latent_space[members, 1],
                latent_space[members, 2],
                alpha=alpha,
                **style,
            )

        ax.set_xlabel("Latent X")
        ax.set_ylabel("Latent Y")
        ax.set_zlabel("Latent Z")
        ax.set_title(panel_title)
        ax.legend()
        ax.view_init(elev=elev, azim=azim)
        results[key] = labels

    plt.tight_layout()
    plt.suptitle(f"Clustering 3D de {title}", y=1.02)
    plt.show()

    return results

In [None]:
import os
import torch
import kagglehub
import numpy as np
import polars as pl
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from dota import Dota2
from heroes import get_heroes
from model import Dota2Autoencoder
from dataset import get_dataset
from leagues import get_tier_one
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import DBSCAN
from itertools import product
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import OPTICS

# Fetch the Kaggle dataset; the value returned by kagglehub is reused below as `path`.
path = kagglehub.dataset_download("bwandowando/dota-2-pro-league-matches-2023")
# Hero metadata and the tier-one league table are both read from `path`.
heroes, hero_cols, dict_attributes, dict_roles = get_heroes(path)
tier_one_matches = get_tier_one(path)
# `heroes` is collected only to count its rows.
n_heroes = len(heroes.collect())
# Empty lists; a later cell appends one entry per tournament dataset to each.
player_cols = []
# NOTE(review): this rebinds `hero_cols`, discarding the value unpacked from
# get_heroes() three lines above — confirm that value is not needed.
hero_cols = []

def cluster(datasets: list[pl.DataFrame], internationals: list[pl.DataFrame], hero_cols, player_cols, latent_dim=2, n_clusters=6, force: bool = True):
    """
    Train (or load) one autoencoder per dataset, encode every tournament with
    every autoencoder, and cluster each resulting latent space.

    Relies on the module-level names `dict_roles`, `n_heroes`,
    `plot_clustering_grid` and `plot_clustering_grid_3d`.

    Args:
        datasets: One training DataFrame per tournament. Index ``i`` is
            labelled as TI ``2024 - i`` in model names, file names and logs.
        internationals: DataFrames to encode. Each must yield exactly one
            unique ``league_name`` (read with ``.item()``) and have a
            ``match_id`` column.
        hero_cols: Per-dataset hero column description, indexed like `datasets`.
        player_cols: Per-dataset player column description, indexed like `datasets`.
        latent_dim: Size of the autoencoder latent space. 2 and 3 also produce
            plots; any other value only prints the cluster labels.
        n_clusters: Number of clusters/components for KMeans, Agglomerative
            and Gaussian Mixture clustering.
        force: When True (default) every autoencoder is retrained. When False,
            a previously saved ``ti_<year>_<latent_dim>_autoencoder.h5`` is
            loaded instead if it exists.

    Returns:
        A list with one tuple per (tournament, autoencoder) pair:
        ``(league_name, matches_df, latent_space, match_id_batches, autoencoder_name)``.
    """
    hero_pick_embedding_dim: int = 16
    hero_role_embedding_dim: int = 8
    n_players: int = 5
    n_bans: int = 7
    hidden_layers: list[int] = [256, 128, 64, 32]
    dropout: float = 0.3
    learning_rate: float = 0.001
    batch_size: int = 32

    autoencoders: list[Dota2Autoencoder] = []

    for ti, dataset in enumerate(datasets):
        year = 2024 - ti
        tag = f"ti_{year}_{latent_dim}"
        model_path = f"{tag}_autoencoder.h5"

        autoencoder = Dota2Autoencoder(
            dict_roles=dict_roles,
            hero_cols=hero_cols[ti],
            player_cols=player_cols[ti],
            n_heroes=n_heroes,
            hero_pick_embedding_dim=hero_pick_embedding_dim,
            hero_role_embedding_dim=hero_role_embedding_dim,
            n_players=n_players,
            n_bans=n_bans,
            latent_dim=latent_dim,
            hidden_layers=hidden_layers,
            dropout=dropout,
            learning_rate=learning_rate,
            name=f"{tag}_autoencoder",
        )

        # Check for the same file that is loaded (the original tested the
        # best-model checkpoint but loaded the final-model file).
        if not force and os.path.exists(model_path):
            print(f"Loading pre-trained model for TI {year}")
            autoencoder.load_model(model_path, silent=True)
        else:
            print(f"Training autoencoder for TI {year}")
            # Shuffle once and slice so the 70/15/15 splits are disjoint.
            # Three independent sample() calls with the same seed gave
            # identical val/test sets that also overlapped the training set.
            shuffled = dataset.sample(fraction=1.0, shuffle=True, seed=42)
            n_train = int(shuffled.height * 0.7)
            n_val = int(shuffled.height * 0.15)
            train_df = shuffled.slice(0, n_train)
            val_df = shuffled.slice(n_train, n_val)
            test_df = shuffled.slice(n_train + n_val)

            autoencoder.train_data(train_df, val_df, epochs=100, patience=20,
                                   best_model_filename=f"{tag}_best_model.h5", silent=True)
            autoencoder.save_loss_history(f"{tag}_loss_history.csv", silent=True)
            autoencoder.save_model(model_path, silent=True)
            accuracy, mse, _, _ = autoencoder.test_model(test_df)
            print(
                f"TI {year} - Accuracy: {accuracy}, MSE: {mse}, Loss: {autoencoder.best_val_loss}")
            print("=" * 50)
        autoencoders.append(autoencoder)

    latent_spaces = []
    # Every tournament is encoded by every autoencoder (cross product).
    for ti_matches, autoencoder in product(internationals, autoencoders):
        ti = ti_matches.select('league_name').unique().item()
        print(f"Processing {ti_matches.shape[0]} matches from {ti}")
        print(f"Autoencoder name: {autoencoder.name}")
        autoencoder.eval()

        encoded = []
        matches_encoded = []
        # NOTE(review): accumulated but never reported or returned — confirm
        # whether a mean reconstruction similarity should be surfaced.
        total_similarity = 0
        with torch.no_grad():
            for batch in ti_matches.iter_slices(batch_size):
                data_np = batch.to_numpy()
                rows = min(batch_size, batch.shape[0])
                try:
                    matches_encoded.append(batch.select("match_id").to_numpy())
                    latent, reconstructed = autoencoder.encode(
                        data_np, rows, ti_matches.columns)
                    similarity = torch.cosine_similarity(
                        autoencoder.flatten(data_np, rows, ti_matches.columns),
                        reconstructed)
                    total_similarity += similarity.sum().item()
                    encoded.append(latent.cpu().numpy())
                except RuntimeError as e:
                    # Print shape diagnostics, then re-raise unchanged.
                    print(f"RuntimeError: {e}")
                    print("Check if the input shape matches the model's expected input size.")
                    print(f"Expected input size: {autoencoder.input_dim if hasattr(autoencoder, 'input_dim') else 'unknown'}")
                    print(f"Actual input size: {data_np.shape[1]}")
                    raise

        latent_space = np.concatenate(encoded, axis=0)
        latent_spaces.append((ti, ti_matches, latent_space, matches_encoded, autoencoder.name))

        # Exactly one branch runs per latent space. The original used a second
        # `if`, so for latent_dim == 2 the `else` clustering ran as well.
        if latent_dim == 2:
            plot_clustering_grid(latent_space, f"{ti}  {autoencoder.name}", n_clusters=n_clusters)
        elif latent_dim == 3:
            cluster_results_3d = plot_clustering_grid_3d(
                latent_space,
                f"{ti} - {autoencoder.name}",
                n_clusters=n_clusters,
                elev=30,   # elevation chosen for a clearer view
                azim=120,  # viewing angle chosen for a clearer view
            )
            print(f"Cluster results for {ti} - {autoencoder.name}: {cluster_results_3d}")
        else:
            # No plot for other latent sizes: fit the four algorithms and
            # print the labels each one found.
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            cluster_labels = kmeans.fit_predict(latent_space)

            agglo = AgglomerativeClustering(n_clusters=n_clusters)
            agglo_labels = agglo.fit_predict(latent_space)

            gmm = GaussianMixture(n_components=n_clusters, random_state=42)
            gmm_labels = gmm.fit_predict(latent_space)

            optics = OPTICS(min_samples=10, xi=0.05, min_cluster_size=0.05)
            optics_labels = optics.fit_predict(latent_space)

            print(f"Cluster labels: {np.unique(cluster_labels)}")
            print(f"Agglomerative labels: {np.unique(agglo_labels)}")
            print(f"GMM labels: {np.unique(gmm_labels)}")
            print(f"OPTICS labels: {np.unique(optics_labels)}")

    return latent_spaces

In [None]:
import os
import torch
import kagglehub
import numpy as np
import polars as pl
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from dota import Dota2
from heroes import get_heroes
from model import Dota2Autoencoder
from dataset import get_dataset
from leagues import get_tier_one
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import DBSCAN
from itertools import product
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import OPTICS

# Reload hero metadata and the tier-one league table from `path`, and reset
# the per-tournament column lists before refilling them below.
heroes, hero_cols, dict_attributes, dict_roles = get_heroes(path)
tier_one_matches = get_tier_one(path)
n_heroes = len(heroes.collect())
player_cols = []
hero_cols = []

# Values passed as `specific_patches` for each The International edition.
# Order matters: `cluster` labels the dataset at index i as TI 2024 - i.
ti_patches = {2024: [56], 2023: [53], 2022: [51], 2021: [49, 48]}

print("Carregando dados de torneios...")
loaded_frames = []
for patches in ti_patches.values():
    ti_frame, p_cols, h_cols = get_dataset(path, specific_patches=patches)
    loaded_frames.append(ti_frame)
    player_cols.append(p_cols)
    hero_cols.append(h_cols)
ti_2024, ti_2023, ti_2022, ti_2021 = loaded_frames

print("Carregando dados de torneios concluído.")
# Attach league info to each dataset and keep only that year's The International rows.
internationals = [
    frame.join(tier_one_matches, on="league_id", how="left").filter(
        pl.col("league_name") == f"The International {year}")
    for year, frame in zip(ti_patches, loaded_frames)
]
matches_ti_2024, matches_ti_2023, matches_ti_2022, matches_ti_2021 = internationals
datasets = [ti_2024, ti_2023, ti_2022, ti_2021]
