In [1]:
import random
import pandas as pd
from sklearn.model_selection import KFold

from helpers_ import (
    train,
    eval,
    calculate_results_for_spectral_clustering_method,
    calculate_average,
    save_parameters_per_method,
    print_results,
)

In [2]:
# --- Experiment configuration -------------------------------------------
# Number of cross-validation folds.
k_fold = 5

# Clustering methods that are tuned and evaluated, in reporting order.
METHODS = ["chinese_whispers", "correlation_clustering", "wsbm", "spectral_clustering"]

# Target words that are split into train/test folds; the expected count is
# pinned so an accidental edit of the list is caught immediately.
NUMBER_OF_WORDS = 24
TARGET_WORDS = [
    "überspannen", "Manschette", "Fuß", "Rezeption",
    "abgebrüht", "Dynamik", "Engpaß", "abbauen",
    "Mißklang", "Abgesang", "Knotenpunkt", "Spielball",
    "zersetzen", "Armenhaus", "Ohrwurm", "Eintagsfliege",
    "Seminar", "Sensation", "Titel", "Schmiere",
    "ausspannen", "packen", "artikulieren", "abdecken",
]
assert len(TARGET_WORDS) == NUMBER_OF_WORDS

In [3]:
def get_fields_to_report_results():
    """Build fresh, zero-initialised accumulators for every entry of ``METHODS``.

    Returns:
        tuple: ``(results_per_method, parameters_per_method)``.

        * ``results_per_method`` maps each method name to a dict of score
          fields, all starting at ``0.0``. ``"spectral_clustering"`` gets the
          twelve per-criterion fields listed below; every other method gets
          ``"ari"`` and ``"spr_lscd"``.
        * ``parameters_per_method`` maps each method name to
          ``{"ari": [], "spr_lscd": []}``, with separate list objects per
          method.
    """
    # Field order is kept stable because the dict preserves insertion order.
    spectral_fields = (
        "avg_ari_silhouette",
        "avg_ari_calinski",
        "avg_ari_eigengap",
        "avg_ari_silhouette_old",
        "avg_ari_calinski_old",
        "avg_ari_eigengap_old",
        "avg_ari_silhouette_new",
        "avg_ari_calinski_new",
        "avg_ari_eigengap_new",
        "spr_lscd_silhouette",
        "spr_lscd_calinski",
        "spr_lscd_eigengap",
    )

    # A new dict with new lists is created on every iteration, so no two
    # methods share mutable state.
    parameters_per_method = {name: {"ari": [], "spr_lscd": []} for name in METHODS}

    results_per_method = {
        name: (
            dict.fromkeys(spectral_fields, 0.0)
            if name == "spectral_clustering"
            else {"ari": 0.0, "spr_lscd": 0.0}
        )
        for name in METHODS
    }

    return results_per_method, parameters_per_method

# Create the empty score accumulators and parameter logs that the
# cross-validation loop below adds to.
results_per_method, parameters_per_method = get_fields_to_report_results()

In [4]:
# Cross-validation: for every method, tune on the training words of each fold
# and evaluate on the held-out words.

# Start from empty accumulators so that re-running this cell on its own does
# not add a second run on top of the sums and lists of a previous one.
results_per_method, parameters_per_method = get_fields_to_report_results()

# With shuffle=True and an integer random_state, each call to kf.split()
# produces the same folds, so every method is compared on identical splits.
kf = KFold(n_splits=k_fold, random_state=32, shuffle=True)

for method in METHODS:
    print(f"Method: {method}")

    # train_/test_ are index arrays into TARGET_WORDS; folds are numbered from 1.
    for iteration, (train_, test_) in enumerate(kf.split(TARGET_WORDS), start=1):
        best_combination_for_ari, best_combination_for_spr_lscd = train(
            method, train_, TARGET_WORDS
        )
        parameters_per_method[method]["ari"].append(best_combination_for_ari)
        parameters_per_method[method]["spr_lscd"].append(
            best_combination_for_spr_lscd
        )

        # NOTE: `eval` is the helper imported from helpers_, not the built-in.
        ari, spr_lscd = eval(
            method,
            test_,
            best_combination_for_ari,
            best_combination_for_spr_lscd,
            TARGET_WORDS,
        )

        # Report the dev score stored under "ari"/"spr_lscd" when the best
        # combination carries one; otherwise pass the combination through as-is.
        dev_ari = (
            best_combination_for_ari["ari"]
            if "ari" in best_combination_for_ari
            else best_combination_for_ari
        )
        dev_spr_lscd = (
            best_combination_for_spr_lscd["spr_lscd"]
            if "spr_lscd" in best_combination_for_spr_lscd
            else best_combination_for_spr_lscd
        )
        print_results(iteration, dev_ari, ari, dev_spr_lscd, spr_lscd)

        if method != "spectral_clustering":
            results_per_method[method]["ari"] += ari
            results_per_method[method]["spr_lscd"] += spr_lscd
        else:
            # For spectral clustering both results are mappings; merge them
            # and let the helper fold them into the running totals.
            ari_and_spr_lscd_calculated_fields = {**ari, **spr_lscd}
            calculate_results_for_spectral_clustering_method(
                results_per_method["spectral_clustering"],
                ari_and_spr_lscd_calculated_fields,
            )

# Presumably turns the accumulated sums into per-fold averages in place (the
# return value is not captured) - TODO confirm against helpers_.
calculate_average(results_per_method, METHODS, k_fold)

Method: chinese_whispers




  Fold-1:
    dev-ARI: 0.6994453736616235 test-ARI: 0.5516903886516474
    dev-Spr_LSCD: 0.8541313718020878 test-Spr_LSCD: 0.3947368421052632

  Fold-2:
    dev-ARI: 0.6792590170598993 test-ARI: 0.6076291157738213
    dev-Spr_LSCD: 0.8729306113538294 test-Spr_LSCD: 0.8999999999999998

  Fold-3:
    dev-ARI: 0.7204040209746494 test-ARI: 0.5200483273426302
    dev-Spr_LSCD: 0.9200355045989487 test-Spr_LSCD: 0.3

  Fold-4:
    dev-ARI: 0.7386727431467213 test-ARI: 0.5256599023950035
    dev-Spr_LSCD: 0.9683659274766395 test-Spr_LSCD: 0.35909242322980395

  Fold-5:
    dev-ARI: 0.6894950731814958 test-ARI: 0.6811327266270648
    dev-Spr_LSCD: 0.8447056087291156 test-Spr_LSCD: 1.0

Method: correlation_clustering
  Fold-1:
    dev-ARI: 0.7182303089218093 test-ARI: 0.8034800735512881
    dev-Spr_LSCD: 0.8042145329547151 test-Spr_LSCD: 0.9746794344808963

  Fold-2:
    dev-ARI: 0.7354043346922096 test-ARI: 0.7382187756237669
    dev-Spr_LSCD: 0.8127803778557416 test-Spr_LSCD: 0.999999999999999

In [5]:
# Hand the per-fold best parameter combinations collected so far to the
# helper that saves them (format and destination are decided in helpers_).
save_parameters_per_method(parameters_per_method, METHODS)

In [6]:
# Replace both accumulators with fresh, empty ones so the next run does not
# add onto the totals and parameter lists of the previous run.
results_per_method, parameters_per_method = get_fields_to_report_results()

In [7]:
# Second cross-validation run: same procedure as before, but evaluation is
# called with exchange_optimized_parameters=True (presumably each metric is
# then scored with the parameters tuned for the other one - TODO confirm
# against helpers_).

# Start from empty accumulators so that re-running this cell on its own does
# not add a second run on top of the sums and lists of a previous one.
results_per_method, parameters_per_method = get_fields_to_report_results()

# Relies on `kf`, the KFold splitter created in the earlier cross-validation
# cell, so both runs use the same splitter configuration.
for method in METHODS:
    print(f"Method: {method}")

    # train_/test_ are index arrays into TARGET_WORDS; folds are numbered from 1.
    for iteration, (train_, test_) in enumerate(kf.split(TARGET_WORDS), start=1):
        best_combination_for_ari, best_combination_for_spr_lscd = train(
            method, train_, TARGET_WORDS
        )
        parameters_per_method[method]["ari"].append(best_combination_for_ari)
        parameters_per_method[method]["spr_lscd"].append(
            best_combination_for_spr_lscd
        )

        # NOTE: `eval` is the helper imported from helpers_, not the built-in.
        ari, spr_lscd = eval(
            method,
            test_,
            best_combination_for_ari,
            best_combination_for_spr_lscd,
            TARGET_WORDS,
            exchange_optimized_parameters=True,
        )

        # Report the dev score stored under "ari"/"spr_lscd" when the best
        # combination carries one; otherwise pass the combination through as-is.
        dev_ari = (
            best_combination_for_ari["ari"]
            if "ari" in best_combination_for_ari
            else best_combination_for_ari
        )
        dev_spr_lscd = (
            best_combination_for_spr_lscd["spr_lscd"]
            if "spr_lscd" in best_combination_for_spr_lscd
            else best_combination_for_spr_lscd
        )
        print_results(iteration, dev_ari, ari, dev_spr_lscd, spr_lscd)

        if method != "spectral_clustering":
            results_per_method[method]["ari"] += ari
            results_per_method[method]["spr_lscd"] += spr_lscd
        else:
            # For spectral clustering both results are mappings; merge them
            # and let the helper fold them into the running totals.
            ari_and_spr_lscd_calculated_fields = {**ari, **spr_lscd}
            calculate_results_for_spectral_clustering_method(
                results_per_method["spectral_clustering"],
                ari_and_spr_lscd_calculated_fields,
            )

# Presumably turns the accumulated sums into per-fold averages in place (the
# return value is not captured) - TODO confirm against helpers_.
calculate_average(results_per_method, METHODS, k_fold)

Method: chinese_whispers
  Fold-1:
    dev-ARI: 0.6994453736616235 test-ARI: 0.4472319613793063
    dev-Spr_LSCD: 0.8541313718020878 test-Spr_LSCD: 0.9746794344808963

  Fold-2:
    dev-ARI: 0.6792590170598993 test-ARI: 0.7222222259639388
    dev-Spr_LSCD: 0.8729306113538294 test-Spr_LSCD: 0.7999999999999999

  Fold-3:
    dev-ARI: 0.7204040209746494 test-ARI: 0.49203251628172284
    dev-Spr_LSCD: 0.9200355045989487 test-Spr_LSCD: 0.3

  Fold-4:
    dev-ARI: 0.7386727431467213 test-ARI: 0.6175118515494609
    dev-Spr_LSCD: 0.9683659274766395 test-Spr_LSCD: 0.5642880936468347

  Fold-5:
    dev-ARI: 0.6894950731814958 test-ARI: 0.6189588558827837
    dev-Spr_LSCD: 0.8447056087291156 test-Spr_LSCD: 0.7999999999999999

Method: correlation_clustering
  Fold-1:
    dev-ARI: 0.7182303089218093 test-ARI: 0.5995437211318104
    dev-Spr_LSCD: 0.8042145329547151 test-Spr_LSCD: 0.9746794344808963

  Fold-2:
    dev-ARI: 0.7354043346922096 test-ARI: 0.7725347913931706
    dev-Spr_LSCD: 0.812780377