In [1]:
from itertools import product

import pandas as pd
from melon_clustering import PatternExtractorGNN, ClusterEvaluator, CACHE_DIR, Loader, reference_clusters_de

# Grid search over the GNN sentence-embedding pipeline: for every parameter
# combination and every reference word, build sentence embeddings, run every
# (dimensionality reduction, clustering) pair on them, and collect the
# per-run result frames into one CSV.

# Method names handed to ClusterEvaluator.run_all_configurations on every run.
dim_reduction_methods = ['LSA', 'PCA', 't-SNE']
clustering_methods = ['KMeans', 'Agglomerative', 'DBSCAN']

# Hyper-parameter grid.
# The values are built with ``i / 10`` instead of ``i * 0.1``: the product
# form yields artefacts such as 0.30000000000000004 and 0.6000000000000001,
# whereas the division yields the float closest to the intended decimal
# (0.3, 0.6). These values are passed to run_all_configurations and are
# presumably written into the results table, where clean values make
# filtering and grouping by parameter reliable.
# NOTE(review): both ranges start at 0.0 -- confirm that a steepness or an
# overlap threshold of exactly 0 is a meaningful setting for the extractor.
# Alternative grid kept for reference (currently disabled):
#   sigmoid_steepness / overlap_threshold in [0.2, 0.4, 0.6, 0.8, 1]
#   n_sentences in [5, 50, 500, 5000, 10000]
sigmoid_steepness_values = [i / 10 for i in range(10)]        # 0.0 .. 0.9
overlap_threshold_values = [i / 10 for i in range(0, 10, 2)]  # 0.0 .. 0.8
n_sentences_values = [0, 50]

results_collection = []

# product() varies its last argument fastest, so the iteration order matches
# the equivalent nested loops: steepness -> threshold -> n_sentences -> word.
parameter_grid = product(
    sigmoid_steepness_values,
    overlap_threshold_values,
    n_sentences_values,
    reference_clusters_de.items(),
)

for sigmoid_steepness, overlap_threshold, n_sentences, ref_item in parameter_grid:
    ref_word, reference_clusters = ref_item

    additional_sentences_dict = Loader.load_sentences_from_word(ref_word, 'de', n_sentences=n_sentences)

    # A fresh evaluator per run. The two add_* calls presumably populate
    # evaluator.sentences_dict, which is what the extractor consumes below;
    # their return values are not used further in this cell.
    evaluator = ClusterEvaluator(reference_clusters=reference_clusters)
    sentences_dict_ref = evaluator.add_reference_sentences(reference_clusters)
    sentences_dict_db = evaluator.add_additional_sentences(additional_sentences_dict)

    # A fresh extractor per run, initialised with the current overlap threshold.
    extractor = PatternExtractorGNN()
    extractor.initialize(evaluator.sentences_dict, overlap_threshold=overlap_threshold)
    extractor.initialize_node_embeddings()
    embeddings, sentences_list = extractor.get_all_sentence_embeddings(
        evaluator.sentences_dict,
        steepness=sigmoid_steepness,
    )

    # (sentence id, original sentence) pairs for the reference sentences.
    reference_sentence_paths = [
        (sid, evaluator.sentence_id_to_original[sid])
        for sid in evaluator.reference_sentence_ids
    ]

    # All plotting and per-run saving is switched off; only the returned
    # frame is kept.
    results_df = evaluator.run_all_configurations(
        vectors=embeddings,
        ref_word=ref_word,
        overlap_threshold=overlap_threshold,
        n_sentences=n_sentences,
        sigmoid_steepness=sigmoid_steepness,
        dim_methods=dim_reduction_methods,
        cluster_methods=clustering_methods,
        sentence_paths=reference_sentence_paths,
        plot_seaborn=False,
        plot_plt=False,
        annotate_plt=True,
        save=False,
    )

    results_collection.append(results_df)

# Stack the per-run frames into one table with a fresh 0..n-1 index and
# persist it next to the other cached artefacts.
df = pd.concat(results_collection, ignore_index=True)
df.to_csv(CACHE_DIR / 'final_clustering_results_gnn.csv', index=False)
df


eva LSA KMeans
liys
red
labels [1 2 1 0 0 0 0 1 1 2]
lis
comp
eva LSA Agglomerative
liys
red
labels [2 0 2 1 1 1 1 2 2 0]
lis
comp
eva LSA DBSCAN
liys
red
labels [ 0 -1  0  1  1  1  1  0  0 -1]
lis
comp
eva PCA KMeans
liys
red
labels [1 2 1 0 0 0 0 2 2 1]
lis
comp
eva PCA Agglomerative
liys
red
labels [2 0 2 1 1 1 1 0 0 2]
lis
comp
eva PCA DBSCAN
liys
red
labels [ 0 -1  0  1  1  2  2  3  3 -1]
lis
comp
eva t-SNE KMeans
liys
red
labels [2 1 2 0 0 0 0 1 1 2]
lis
comp
eva t-SNE Agglomerative
liys
red
labels [2 0 2 1 1 1 1 0 0 2]
lis
comp
eva t-SNE DBSCAN
liys
red
labels [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
lis
comp
eva LSA KMeans
liys
red
labels [1 2 1 2 2 2 1 0 2]
lis
comp
eva LSA Agglomerative
liys
red
labels [0 2 0 2 2 2 0 1 2]
lis
comp
eva LSA DBSCAN
liys
red
labels [ 0  1 -1  2  2  2  0 -1  1]
lis
comp
eva PCA KMeans
liys
red
labels [1 1 2 2 2 2 1 0 1]
lis
comp
eva PCA Agglomerative
liys
red
labels [0 0 1 1 1 1 0 2 0]
lis
comp
eva PCA DBSCAN
liys
red
labels [ 0  1 -1  2  2  2  0 -1  1]
li