In [1]:
import logging
from datetime import datetime

from common import *
import clustering

In [2]:
def get_clusters(adj_matrix, hyperparameters, seed=0):
    graph = clustering._adjacency_matrix_to_nxgraph(adj_matrix)
    if "seed" not in hyperparameters:
        hyperparameters.update({"seed": seed})

    clusters = clustering.chinese_whispers_clustering(graph, **hyperparameters)    
    return clustering._convert_graph_cluster_list_set_to_list(graph, clusters)

In [3]:
model_hyperparameter_combinations = []

for weighting in ['top', 'lin', 'log']:
    model_hyperparameter_combinations.append({'weighting': weighting})

In [4]:
score_paths = {
    'rusemshift-finetune': u'Data/l1ndotn_schemas/rusemshift/finetune/german/{0}/dev.*.scores',
    'rusemshift-train': u'Data/l1ndotn_schemas/rusemshift/train/german/{0}/dev.*.scores',
    'ru-ru': u'Data/l1ndotn_schemas/ru-ru/german/{0}/dev.*.scores',
    'en-en': u'Data/l1ndotn_schemas/en-en/german/{0}/dev.*.scores'
}

In [5]:
logging.basicConfig(
    filename="../logs/chinese_whispers/logs.txt",
    filemode="a+",
    format="%(asctime)s : %(message)s",
    datefmt="%y-%m-%d %I:%M:%S %p",
    level=logging.INFO
)

In [6]:
method = "chinese_whispers"
no_experiments = 5

In [7]:
start_time = datetime.now()

grid_search_without_nclusters(
    get_dwug_data_annotated_only,
    get_clusters,
    score_paths,
    model_hyperparameter_combinations,
    include_binarize=True,
    method=f"{method}",
    logger_message={
        "logging": logging
    },
    cache={
        "name": "../outputs/experiment-results/{result}/{method}/cache/cache_dwug_data_annotated_only.csv",
        "result_name": "../outputs/experiment-results/{result}/{method}/results/results_dwug_data_annotated_only.csv",
    },
    run_experiments=no_experiments,
    dataset="dwug_data_annotated_only",
)

print(f"Elapsed time: {datetime.now() - start_time}")

{'adjusted_rand_score': 0.5998985562545917} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'weighting': 'top', 'seed': 90}, 'threshold': 0.5}
Saving. Total results: 1 / 240
{'adjusted_rand_score': 0.6820732025254044} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'weighting': 'lin', 'seed': 90}, 'threshold': 0.5}
{'adjusted_rand_score': 0.6256995981165755} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'weighting': 'log', 'seed': 90}, 'threshold': 0.5}
{'adjusted_rand_score': 0.5582877916869152} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-train', 'model_hyperparameters': {'weighting': 'top', 'seed': 90}, 'threshold': 0.5}
{'adjusted_rand_score': 0.6584723720683918} {'binarize': True, 'percenti

In [8]:
start_time = datetime.now()

grid_search_without_nclusters(
    get_dwug_old_data_annotated_only, 
    get_clusters, 
    score_paths,
    model_hyperparameter_combinations, 
    include_binarize=True,
    method=f"{method}",
    logger_message={
        "logging": logging
    },
    cache={
        "name": "../outputs/experiment-results/{result}/{method}/cache/cache_dwug_old_data_annotated_only.csv",
        "result_name": "../outputs/experiment-results/{result}/{method}/results/results_dwug_old_data_annotated_only.csv"
    },
    run_experiments=no_experiments,
    dataset="dwug_old_data_annotated_only"
)

print(f"Elapsed time: {datetime.now() - start_time}")


{'adjusted_rand_score': 0.60591376515973} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'weighting': 'top', 'seed': 90}, 'threshold': 0.5}
Saving. Total results: 1 / 240
{'adjusted_rand_score': 0.5393195433342782} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'weighting': 'lin', 'seed': 90}, 'threshold': 0.5}
{'adjusted_rand_score': 0.6402176314915132} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'weighting': 'log', 'seed': 90}, 'threshold': 0.5}
{'adjusted_rand_score': 0.6177124180001023} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-train', 'model_hyperparameters': {'weighting': 'top', 'seed': 90}, 'threshold': 0.5}
{'adjusted_rand_score': 0.6550429799903185} {'binarize': True, 'percentile

In [9]:
start_time = datetime.now()

grid_search_without_nclusters(
    get_dwug_new_data_annotated_only, 
    get_clusters, 
    score_paths,
    model_hyperparameter_combinations, 
    include_binarize=True,
    method=f"{method}",
    logger_message={
        "logging": logging
    },
    cache={
        "name": "../outputs/experiment-results/{result}/{method}/cache/cache_dwug_new_data_annotated_only.csv",
        "result_name": "../outputs/experiment-results/{result}/{method}/results/results_dwug_new_data_annotated_only.csv"
    },
    run_experiments=no_experiments,
    dataset="dwug_new_data_annotated_only"
)

print(f"Elapsed time: {datetime.now() - start_time}")


{'adjusted_rand_score': 0.6987955368985102} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'weighting': 'top', 'seed': 90}, 'threshold': 0.5}
Saving. Total results: 1 / 240
{'adjusted_rand_score': 0.713817697998025} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'weighting': 'lin', 'seed': 90}, 'threshold': 0.5}
{'adjusted_rand_score': 0.6564663590957246} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'weighting': 'log', 'seed': 90}, 'threshold': 0.5}
{'adjusted_rand_score': 0.6321426440541073} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-train', 'model_hyperparameters': {'weighting': 'top', 'seed': 90}, 'threshold': 0.5}
{'adjusted_rand_score': 0.6893805392635081} {'binarize': True, 'percentil