In [8]:
import numpy as np
from melon_clustering import PatternExtractorGraph, ClusterEvaluator, ClusterManager, Loader, reference_clusters_de
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import networkx as nx
import numpy as np
from collections import deque
import pandas as pd

from melon_clustering import PatternExtractorGraph, Node, CACHE_DIR

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network.

    The first ``GCNConv`` maps ``input_dim`` features to ``hidden_dim``,
    a ReLU is applied, and the second ``GCNConv`` maps ``hidden_dim`` to
    ``output_dim``. No activation follows the second layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        """Run both convolutions over ``x`` using ``edge_index`` as the graph."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

class PatternExtractorGNN(PatternExtractorGraph):
    """Pattern extractor that derives node embeddings from a two-layer GCN.

    The parent/child links held in ``id_to_node`` are mirrored into a
    ``networkx.DiGraph``. A ``GCN`` is then trained to reconstruct randomly
    initialised node features, and its output is stored as one embedding
    row per node.
    """

    def __init__(self):
        super().__init__()
        self.graph = nx.DiGraph()
        # Placeholder; replaced by a 2-D numpy array (one row per node) once
        # initialize_node_embeddings() has run.
        self.node_embeddings = {}

    def set_up_digraph(self):
        """Add a directed edge ``node.id -> child.id`` for every child link."""
        for node in self.id_to_node.values():
            for child_node in node.children.values():
                self.graph.add_edge(node.id, child_node.id)

    def build_graph(self):
        """Return the graph's edges as a ``(2, num_edges)`` long tensor.

        An edgeless graph yields an empty ``(2, 0)`` tensor rather than the
        1-D float tensor that ``torch.tensor([])`` would produce.
        """
        edges = list(self.graph.edges)
        if not edges:
            return torch.empty((2, 0), dtype=torch.long)
        return torch.tensor(edges, dtype=torch.long).t().contiguous()

    def initialize_node_features(self, feature_dim=100):
        """Return random normal features of shape ``(node_counter, feature_dim)``."""
        num_nodes = self.node_counter
        node_features = torch.randn((num_nodes, feature_dim), requires_grad=True)
        return node_features

    def initialize_node_embeddings(self, hidden_dim=64, output_dim=100, epochs=200):
        """Train a GCN on the pattern graph and store its output as embeddings.

        Args:
            hidden_dim: Width of the hidden GCN layer.
            output_dim: Width of the embeddings. The random input features
                use the same width because the loss compares the network
                output with its input element-wise.
            epochs: Number of optimisation steps; ``0`` stores the output of
                the untrained network.

        Side effects:
            Populates ``self.graph`` and sets ``self.node_embeddings`` to a
            numpy array with one row per node.
        """
        # Operate on this instance, not on a module-level variable.
        self.set_up_digraph()
        edge_index = self.build_graph()
        # NOTE(review): assumes node ids are the integers 0..node_counter-1 so
        # that edge indices address rows of the feature matrix - confirm
        # against the parent class.
        node_features = self.initialize_node_features(feature_dim=output_dim)

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = GCN(
            input_dim=node_features.shape[1],
            hidden_dim=hidden_dim,
            output_dim=output_dim,
        ).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        edge_index = edge_index.to(device)
        node_features = node_features.to(device)

        model.train()
        for _ in range(epochs):
            optimizer.zero_grad()
            out = model(node_features, edge_index)
            loss = F.mse_loss(out, node_features)  # Unsupervised reconstruction loss
            loss.backward()
            optimizer.step()

        # A final gradient-free pass makes the stored embeddings reflect the
        # last optimizer step and keeps ``epochs=0`` from hitting an unbound
        # ``out``.
        model.eval()
        with torch.no_grad():
            out = model(node_features, edge_index)
        self.node_embeddings = out.cpu().numpy()

    def compute_weighted_sentence_embedding(self, sentence_path, steepness=1.0):
        """Return the position-weighted mean embedding of a node-id path.

        The node at 1-based position ``p`` is weighted by
        ``1 - sigmoid(steepness * p)``, so later positions contribute less.
        The weighted sum is divided by the path length. An empty path yields
        the zero vector.
        """
        weighted_embedding = np.zeros(self.node_embeddings.shape[1])
        if len(sentence_path) == 0:
            # Avoid the 0/0 division that would otherwise produce NaNs.
            return weighted_embedding
        for position, node_id in enumerate(sentence_path, start=1):
            weight = 1 - (1 / (1 + np.exp(-steepness * position)))
            weighted_embedding += weight * self.node_embeddings[node_id]
        return weighted_embedding / len(sentence_path)

dim_reduction_methods = ['LSA', 'PCA', 't-SNE', 'MDS']
clustering_methods = ['KMeans', 'Agglomerative', 'DBSCAN']
cluster_manager = ClusterManager()

results_collection = []

# Grid search over sigmoid steepness, overlap threshold and the number of
# additional sentences, evaluated for every reference word. For a quick smoke
# run, shrink each grid to a single value (e.g. [0.2], [0.2] and [5]).
for sigmoid_steepness in [0.2, 0.4, 0.6, 0.8, 1]:
    for overlap_threshold in [0.2, 0.4, 0.6, 0.8, 1]:
        for n_sentences in [5, 50, 500, 5000, 10000]:
            for ref_word, reference_clusters in reference_clusters_de.items():
                additional_sentences_dict = Loader.load_sentences_from_word(ref_word, 'de', n_sentences=n_sentences)
                evaluator = ClusterEvaluator(reference_clusters=reference_clusters)
                sentences_dict_ref = evaluator.add_reference_sentences(reference_clusters)
                sentences_dict_db = evaluator.add_additional_sentences(additional_sentences_dict)

                extractor = PatternExtractorGNN()
                extractor.initialize(evaluator.sentences_dict, overlap_threshold=overlap_threshold)
                extractor.initialize_node_embeddings()
                embeddings, sentences_list = extractor.get_all_sentence_embeddings(evaluator.sentences_dict, steepness=sigmoid_steepness)

                # Look up the additional sentences of the word being evaluated
                # instead of the literal 'krass'.
                # NOTE(review): assumes sentences_dict_db is keyed by the
                # reference word - confirm against ClusterEvaluator.
                vectors_reference = evaluator.isolate_reference_embeddings(
                    embeddings,
                    num_additional_sentences=len(sentences_dict_db[ref_word]),
                )
                reference_sentence_ids = sorted(evaluator.reference_sentence_ids)
                reference_sentence_paths = [(sid, evaluator.sentence_id_to_original[sid]) for sid in reference_sentence_ids]

                clustering_results = cluster_manager.run_all_configurations(
                    vectors=vectors_reference,
                    dim_methods=dim_reduction_methods,
                    cluster_methods=clustering_methods,
                    sentence_paths=reference_sentence_paths,
                    plot_seaborn=False,
                    plot_plt=False,
                    annotate_plt=True
                )

                results_df = evaluator.evaluate_multiple_configurations(clustering_results, ref_word, plot=False,
                                                                        overlap_threshold=str(overlap_threshold),
                                                                        n_sentences=str(n_sentences),
                                                                        sigmoid_steepness=str(sigmoid_steepness))

                # Tag every evaluated configuration with the grid point that
                # produced it so all runs can be compared in one table.
                results_collection.extend(
                    {
                        'Ref Word': ref_word,
                        'Overlap Threshold': overlap_threshold,
                        'Num Sentences': n_sentences,
                        'Sigmoid Steepness': sigmoid_steepness,
                        'Dimensionality Reduction': row['Dimensionality Reduction'],
                        'Clustering Method': row['Clustering Method'],
                        'Average Jaccard Similarity': row['Average Jaccard Similarity'],
                    }
                    for _, row in results_df.iterrows()
                )

# Persist the collected rows and show the first few in the notebook.
df = pd.DataFrame(results_collection)
df.to_csv(CACHE_DIR / 'clustering_results.csv', index=False)
df.head()


sentence_clusters [[['Krass das habe ich noch nie gesehen', 'Krass'], ['Krass das war wirklich beeindruckend', 'Krass'], ['Krass wie schnell du das geschafft hast', 'Krass']], [['Das war eine krass schwierige Entscheidung', 'krass'], ['Er hatte eine krass interessante Idee', 'krass'], ['Die krasse Veränderung hat alle überrascht', 'krasse']], [['Die Party war krass gut organisiert', 'krass'], ['Er hat das krass schnell erledigt', 'krass'], ['Sie war krass begeistert von dem Ergebnis', 'krass'], ['Der Vortrag war krass langweilig', 'krass']]]
sentence_clusters [[('Finde ich schon <ROOT>', 'krass')], [('Das war <ROOT>', 'krass')], [('Amerikanische Universitäten sind <ROOT> elitär', 'krass')], [('normalen Jar Jar nicht den <ROOT> Sith Jar Jar Das Ding ist aber Mit all diesen Entscheidungen', 'krassen')], [('dieses <ROOT> Gefühl', 'krasse')]]


ValueError: not enough values to unpack (expected 2, got 1)