In [1]:
import random
import pandas as pd
from sklearn.model_selection import KFold

from helpers_ import (
    train,
    eval,
    get_fields_to_report_for_spectral_clustering,
    calculate_results_for_spectral_clustering_method,
    present_results,
    calculate_average,
    present_parameters_per_method,
    print_results
)

In [2]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------

# Words that the cross-validation folds are drawn from.
TARGET_WORDS = [
    "überspannen", "Manschette", "Fuß", "Rezeption",
    "abgebrüht", "Dynamik", "Engpaß", "abbauen",
    "Mißklang", "Abgesang", "Knotenpunkt", "Spielball",
    "zersetzen", "Armenhaus", "Ohrwurm", "Eintagsfliege",
    "Seminar", "Sensation", "Titel", "Schmiere",
    "ausspannen", "packen", "artikulieren", "abdecken",
]

# Expected size of TARGET_WORDS; the assert below catches an accidentally
# dropped or duplicated line in the list above.
NUMBER_OF_WORDS = 24

# Methods that are trained and evaluated one after another.
METHODS = [
    "chinese_whispers", "correlation_clustering",
    "wsbm", "spectral_clustering",
]

# Number of cross-validation folds (used as KFold's n_splits).
k_fold = 5

assert NUMBER_OF_WORDS == len(TARGET_WORDS)

In [3]:
# Per-method accumulators filled in by the cross-validation loop:
#   parameters_per_method[m] -> lists that collect one entry per fold
#   results_per_method[m]    -> running score totals
parameters_per_method = {}
results_per_method = {}
for m in METHODS:
    parameters_per_method[m] = {"ari": [], "spr_lscd": []}
    results_per_method[m] = {
        # Spectral clustering keeps its ARI in the structure supplied by the
        # helper instead of a single float; every other method starts at 0.0.
        "ari": (
            get_fields_to_report_for_spectral_clustering()
            if m == "spectral_clustering"
            else 0.0
        ),
        "spr_lscd": 0.0,
    }


In [4]:
# Shuffled k-fold split over the target words. The fixed random_state makes
# the fold assignment reproducible, so every method sees the same folds.
kf = KFold(n_splits=k_fold, shuffle=True, random_state=32)

for m in METHODS:
    print(f"Method: {m}")
    for fold_no, (train_idx, test_idx) in enumerate(
        kf.split(TARGET_WORDS), start=1
    ):
        # Configurations chosen on the training split, one per metric
        # (selection logic lives in helpers_.train).
        best_configuration_ari, best_configuration_jsd = train(
            m, train_idx, TARGET_WORDS
        )
        parameters_per_method[m]["ari"].append(best_configuration_ari)
        parameters_per_method[m]["spr_lscd"].append(best_configuration_jsd)

        # Score the chosen configurations on the held-out split.
        ari, spr_lscd = eval(
            m, test_idx, best_configuration_ari, best_configuration_jsd, TARGET_WORDS
        )
        results_per_method[m]["spr_lscd"] += spr_lscd

        # The dev-ARI shown is the "ari" entry when the configuration has
        # one; otherwise the configuration object itself is passed through.
        if "ari" in best_configuration_ari:
            dev_ari = best_configuration_ari["ari"]
        else:
            dev_ari = best_configuration_ari
        print_results(fold_no, dev_ari, ari, best_configuration_jsd["jsd"], spr_lscd)

        if m == "spectral_clustering":
            # ARI for spectral clustering is not a plain float sum; the
            # helper updates the accumulator structure it is handed.
            calculate_results_for_spectral_clustering_method(
                results_per_method[m]["ari"], ari
            )
        else:
            results_per_method[m]["ari"] += ari

# Convert the per-fold totals into averages. The return value is discarded,
# so this presumably updates results_per_method in place — confirm in helpers_.
calculate_average(results_per_method, METHODS, k_fold)

Method: chinese_whispers
  Fold-1:
    dev-ARI: 0.6994453736616235 test-ARI: 0.5516903886516474
    dev-Spr_LSCD: 0.8430789592560737 test-Spr_LSCD: 0.9746794344808963

  Fold-2:
    dev-ARI: 0.6792590170598993 test-ARI: 0.6076291157738213
    dev-Spr_LSCD: 0.8597198876528641 test-Spr_LSCD: 0.8999999999999998

  Fold-3:
    dev-ARI: 0.7204040209746494 test-ARI: 0.5200483273426302
    dev-Spr_LSCD: 0.9200355045989487 test-Spr_LSCD: 0.3

  Fold-4:
    dev-ARI: 0.7386727431467213 test-ARI: 0.5256599023950035
    dev-Spr_LSCD: 0.9635328851888704 test-Spr_LSCD: 0.5642880936468347

  Fold-5:
    dev-ARI: 0.6826800224478888 test-ARI: 0.5984515823986658
    dev-Spr_LSCD: 0.8447056087291156 test-Spr_LSCD: 1.0

Method: correlation_clustering
  Fold-1:
    dev-ARI: 0.7182303089218093 test-ARI: 0.8042905366730719
    dev-Spr_LSCD: 0.8042145329547151 test-Spr_LSCD: 0.9746794344808963

  Fold-2:
    dev-ARI: 0.7356176144611001 test-ARI: 0.7382187756237669
    dev-Spr_LSCD: 0.8127803778557416 test-Spr

In [5]:
# Print the final scores for every method in METHODS. results_per_method was
# passed through calculate_average in the cross-validation cell, so these are
# presumably fold averages rather than raw sums — confirm in helpers_.
present_results(results_per_method, METHODS)

chinese_whispers
    Spr_LSCD: 0.7477935056255461
    ARI: 0.5606958633123537

correlation_clustering
    Spr_LSCD: 0.7149358868961792
    ARI: 0.7379364570255248

wsbm
    Spr_LSCD: 0.6944163198544762
    ARI: 0.5436614823130781

spectral_clustering
    Spr_LSCD: 0.794416319854476
    AVG_ARI_SILHOUETTE: 0.6912630694220034
    AVG_ARI_CALINSKI: 0.6621919430746371
    AVG_ARI_EIGENGAP: 0.6853691283764869
    AVG_ARI_SILHOUETTE_OLD: 0.6623189740754645
    AVG_ARI_CALINSKI_OLD: 0.6163425904644021
    AVG_ARI_EIGENGAP_OLD: 0.6657986047593586
    AVG_ARI_SILHOUETTE_NEW: 0.7026432475958058
    AVG_ARI_CALINSKI_NEW: 0.6905615985291641
    AVG_ARI_EIGENGAP_NEW: 0.6016686117663294



In [6]:
# Show the configurations recorded for each method: parameters_per_method
# holds, per method, one "ari" and one "spr_lscd" entry appended per fold.
present_parameters_per_method(parameters_per_method, METHODS)