In [1]:
import logging
import os
from datetime import datetime

from common import *
from sklearn.cluster import SpectralClustering

In [2]:
def get_clusters(adj_matrix, n_clusters, hyperparameters, seed=0):
    """Cluster a precomputed affinity matrix with spectral clustering.

    Parameters
    ----------
    adj_matrix
        Passed to ``SpectralClustering.fit``. Because the estimator is built
        with ``affinity="precomputed"``, it is used directly as the affinity
        matrix.
    n_clusters
        Number of clusters requested from the estimator.
    hyperparameters
        Mapping that must contain the key ``"strategy"``; its value is
        forwarded as ``assign_labels``.
    seed
        Forwarded as ``random_state``. Defaults to 0.

    Returns
    -------
    The ``labels_`` attribute of the fitted estimator.
    """
    label_strategy = hyperparameters["strategy"]
    model = SpectralClustering(
        n_clusters=n_clusters,
        affinity="precomputed",
        assign_labels=label_strategy,
        random_state=seed,
    )
    model.fit(adj_matrix)
    return model.labels_

In [3]:
# One hyperparameter dict per label-assignment strategy to evaluate;
# each dict is later read by get_clusters via its "strategy" key.
model_hyperparameter_combinations = [
    {"strategy": strategy_name} for strategy_name in ["kmeans"]
]

In [4]:
# Name of the clustering method; used in the log-file path and passed to
# grid_search as `method` in the cells below.
method = "spectral_clustering"
# Passed to grid_search as `run_experiments` in the cells below.
no_experiments = 5

In [5]:
# Log file for this method; "a+" appends, so records accumulate across runs.
log_path = f"../logs/{method}/logs.txt"

# basicConfig opens the log file as soon as it installs the file handler and
# raises FileNotFoundError if the parent directory is missing (for example on
# a fresh checkout), so make sure the directory exists first.
os.makedirs(os.path.dirname(log_path), exist_ok=True)

logging.basicConfig(
    filename=log_path,
    filemode="a+",
    format="%(asctime)s : %(message)s",
    datefmt="%y-%m-%d %I:%M:%S %p",
    level=logging.INFO,
)

In [6]:
# Glob-style path templates for the score files, keyed by a short name.
# "{0}" is left as a literal placeholder here.
# NOTE(review): presumably filled in by grid_search — confirm.
score_paths = {
    "rusemshift-finetune": "Data/l1ndotn_schemas/rusemshift/finetune/german/{0}/dev.*.scores",
    "rusemshift-train": "Data/l1ndotn_schemas/rusemshift/train/german/{0}/dev.*.scores",
    "ru-ru": "Data/l1ndotn_schemas/ru-ru/german/{0}/dev.*.scores",
    "en-en": "Data/l1ndotn_schemas/en-en/german/{0}/dev.*.scores",
}

We run the algorithm on 7 datasets:
1. DWUG-DE (annotated sentences only)
2. DWUG-DE
3. DWUG-DE (old time-period; annotated sentences only)
4. DWUG-DE (old time-period)
5. DWUG-DE (new time-period; annotated sentences only)
6. DWUG-DE (new time-period)
7. BTS-RNC

In [7]:
start_time = datetime.now()

# Per the original author's note, the positional `5` below is the maximum
# number of clusters we want. Increasing it can lead to errors for certain
# words with less data in datasets like DWUG-DE-old-annotated.
# NOTE(review): the original note called it the "second last parameter" and
# described the last parameter as an Excel file holding the full grid-search
# results; this call now passes CSV paths through `cache` instead — confirm
# against grid_search's signature.

# "{result}" and "{method}" are literal placeholders (these are not f-strings).
# NOTE(review): presumably substituted inside grid_search — confirm.
cache_paths = {
    "name": "../outputs/experiment-results/{result}/{method}/cache/cache_dwug_data_annotated_only.csv",
    "result_name": "../outputs/experiment-results/{result}/{method}/results/results_dwug_data_annotated_only.csv",
}

grid_search(
    get_dwug_data_annotated_only,
    get_clusters,
    score_paths,
    model_hyperparameter_combinations,
    5,
    method=method,
    logger_message={"logging": logging},
    cache=cache_paths,
    run_experiments=no_experiments,
    dataset="dwug_data_annotated_only",
)

elapsed = datetime.now() - start_time
print(f"Elapsed time: {elapsed}")

{'adjusted_rand_score': 0.7839637572509619, 'silhouette': 0.6837056089103292, 'calinski_harabasz': 0.6365052357372195, 'eigengap': 0.5754762506995857, 'ari_silhouette': 0.6837056089103292, 'ari_calinski': 0.6365052357372195, 'ari_eigengap': 0.5754762506995857} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'strategy': 'kmeans'}, 'threshold': 0.5}
Saving. Total results: 1 / 80
{'adjusted_rand_score': 0.8431288465384235, 'silhouette': 0.6944952620286667, 'calinski_harabasz': 0.6993671245917672, 'eigengap': 0.6003568897823307, 'ari_silhouette': 0.6944952620286667, 'ari_calinski': 0.6993671245917672, 'ari_eigengap': 0.6003568897823307} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-train', 'model_hyperparameters': {'strategy': 'kmeans'}, 'threshold': 0.5}
{'adjusted_rand_score': 0.7819518521504835, 'silhouette': 0.7093317601409336, 'calinski_harabasz': 0.68527949

In [8]:
start_time = datetime.now()

# Grid search over the dataset loaded by get_dwug_old_data_annotated_only.
# "{result}" and "{method}" are literal placeholders (these are not f-strings).
# NOTE(review): presumably substituted inside grid_search — confirm.
cache_paths = {
    "name": "../outputs/experiment-results/{result}/{method}/cache/cache_dwug_old_data_annotated_only.csv",
    "result_name": "../outputs/experiment-results/{result}/{method}/results/results_dwug_old_data_annotated_only.csv",
}

grid_search(
    get_dwug_old_data_annotated_only,
    get_clusters,
    score_paths,
    model_hyperparameter_combinations,
    5,
    method=method,
    logger_message={"logging": logging},
    cache=cache_paths,
    run_experiments=no_experiments,
    dataset="dwug_old_data_annotated_only",
)

elapsed = datetime.now() - start_time
print(f"Elapsed time: {elapsed}")

{'adjusted_rand_score': 0.8029095551359178, 'silhouette': 0.3938759503549472, 'calinski_harabasz': 0.3817987678339447, 'eigengap': 0.671275150021934, 'ari_silhouette': 0.3938759503549472, 'ari_calinski': 0.3817987678339447, 'ari_eigengap': 0.671275150021934} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'strategy': 'kmeans'}, 'threshold': 0.5}
Saving. Total results: 1 / 80
{'adjusted_rand_score': 0.7862330367187426, 'silhouette': 0.33684046135344875, 'calinski_harabasz': 0.3281899579833332, 'eigengap': 0.6081944143237398, 'ari_silhouette': 0.33684046135344875, 'ari_calinski': 0.3281899579833332, 'ari_eigengap': 0.6081944143237398} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-train', 'model_hyperparameters': {'strategy': 'kmeans'}, 'threshold': 0.5}
{'adjusted_rand_score': 0.7983788760576822, 'silhouette': 0.4021725814245683, 'calinski_harabasz': 0.40503335

In [9]:
start_time = datetime.now()

# Grid search over the dataset loaded by get_dwug_new_data_annotated_only.
# "{result}" and "{method}" are literal placeholders (these are not f-strings).
# NOTE(review): presumably substituted inside grid_search — confirm.
cache_paths = {
    "name": "../outputs/experiment-results/{result}/{method}/cache/cache_dwug_new_data_annotated_only.csv",
    "result_name": "../outputs/experiment-results/{result}/{method}/results/results_dwug_new_data_annotated_only.csv",
}

grid_search(
    get_dwug_new_data_annotated_only,
    get_clusters,
    score_paths,
    model_hyperparameter_combinations,
    5,
    method=method,
    logger_message={"logging": logging},
    cache=cache_paths,
    run_experiments=no_experiments,
    dataset="dwug_new_data_annotated_only",
)

elapsed = datetime.now() - start_time
print(f"Elapsed time: {elapsed}")

{'adjusted_rand_score': 0.6214907448773, 'silhouette': 0.3065872834104278, 'calinski_harabasz': 0.34323013230583, 'eigengap': 0.42450408100254505, 'ari_silhouette': 0.3065872834104278, 'ari_calinski': 0.34323013230583, 'ari_eigengap': 0.42450408100254505} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'strategy': 'kmeans'}, 'threshold': 0.5}
Saving. Total results: 1 / 80
{'adjusted_rand_score': 0.691594094067295, 'silhouette': 0.40447553494319716, 'calinski_harabasz': 0.40610791950691394, 'eigengap': 0.4505832486792138, 'ari_silhouette': 0.40447553494319716, 'ari_calinski': 0.40610791950691394, 'ari_eigengap': 0.4505832486792138} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-train', 'model_hyperparameters': {'strategy': 'kmeans'}, 'threshold': 0.5}
{'adjusted_rand_score': 0.7107676433024931, 'silhouette': 0.4095596281022698, 'calinski_harabasz': 0.4537124993