In [1]:
import logging
import os
from datetime import datetime

from _correlation import cluster_correlation_search
import clustering
from common import *
import networkx as nx
import numpy as np

In [2]:
def get_clusters(adj_matrix, hyperparameters, seed=0):
    """Cluster an adjacency matrix with the project's correlation clustering.

    The matrix is turned into a graph via
    ``clustering._adjacency_matrix_to_nxgraph``, clustered with
    ``clustering.correlation_clustering`` and the resulting clusters are
    converted with ``clustering._convert_graph_cluster_list_set_to_list``.

    Args:
        adj_matrix: Adjacency matrix handed to the graph builder.
        hyperparameters: Mapping expanded as keyword arguments of
            ``clustering.correlation_clustering``.
        seed: Accepted for signature compatibility; this function does not
            read it or forward it anywhere.

    Returns:
        Whatever ``clustering._convert_graph_cluster_list_set_to_list``
        returns for the graph and its clusters.
    """
    graph = clustering._adjacency_matrix_to_nxgraph(adj_matrix)
    cluster_sets = clustering.correlation_clustering(graph, **hyperparameters)
    return clustering._convert_graph_cluster_list_set_to_list(graph, cluster_sets)

In [3]:
# Score-file path templates keyed by the name of the scoring model/schema.
# Each template keeps a literal `{0}` placeholder and a `dev.*.scores`
# wildcard that are resolved by the code consuming these paths.
score_paths = {
    "rusemshift-finetune": "Data/l1ndotn_schemas/rusemshift/finetune/german/{0}/dev.*.scores",
    "rusemshift-train": "Data/l1ndotn_schemas/rusemshift/train/german/{0}/dev.*.scores",
    "ru-ru": "Data/l1ndotn_schemas/ru-ru/german/{0}/dev.*.scores",
    "en-en": "Data/l1ndotn_schemas/en-en/german/{0}/dev.*.scores",
}

In [4]:
# Clustering method name; used in the log-file path and passed to the grid searches.
method = "correlation_clustering"
# Value passed as `run_experiments` to every grid search in this notebook.
no_experiments = 5

In [5]:
# Append INFO-and-above records to a per-method log file.
log_path = f"../logs/{method}/logs.txt"

# The logging file handler opens the file but does not create missing parent
# directories, so make sure the directory exists; otherwise a fresh checkout
# fails here with FileNotFoundError.
os.makedirs(os.path.dirname(log_path), exist_ok=True)

logging.basicConfig(
    filename=log_path,
    filemode="a+",
    format="%(asctime)s : %(message)s",
    datefmt="%y-%m-%d %I:%M:%S %p",
    level=logging.INFO,
)

In [6]:
def get_thresholds_for_cc(
    get_data,
    score_paths,
    base_threshold=0.5,
    max_senses_grid=(10,),
    max_attempts_grid=(2000,),
    max_iters_grid=(50000,),
):
    """Build the correlation-clustering hyperparameter grid for each score path.

    For every key of ``score_paths`` the raw ``base_threshold`` is passed
    through that path's scaler (``get_scaling(scores)[score_path]``) and the
    scaled value is stored as ``threshold_cc`` in every combination for the
    path.

    Args:
        get_data: Callable invoked as ``get_data(score_paths)``; must return a
            2-tuple whose second element is passed to ``get_scaling``. The
            first element is not used here.
        score_paths: Iterable of score-path names (iterating a dict yields
            its keys).
        base_threshold: Unscaled threshold fed to each scaler. Defaults to
            0.5, the value that was previously hard-coded.
        max_senses_grid: Values tried for ``max_senses`` (default ``(10,)``).
        max_attempts_grid: Values tried for ``max_attempts``
            (default ``(2000,)``).
        max_iters_grid: Values tried for ``max_iters``
            (default ``(50000,)``).

    Returns:
        dict mapping each score path to its own list of hyperparameter dicts
        with keys ``threshold_cc``, ``max_attempts``, ``max_iters`` and
        ``max_senses``, ordered with ``max_senses`` varying slowest and
        ``max_iters`` fastest.
    """
    # Only the scores are needed; the first returned element is ignored.
    _, scores = get_data(score_paths)
    scalers = get_scaling(scores)

    ans = {}
    for score_path in score_paths:
        # The scaled threshold depends only on the score path, so compute it
        # once per path rather than once per hyperparameter combination.
        # A fresh (1, 1) array is built per path in case a scaler transforms
        # its input in place.
        raw_threshold = np.array(base_threshold).reshape(1, -1)
        threshold_cc = scalers[score_path].transform(raw_threshold).item(0)

        ans[score_path] = [
            {
                "threshold_cc": threshold_cc,
                "max_attempts": attempt,
                "max_iters": iteration,
                "max_senses": sense,
            }
            for sense in max_senses_grid
            for attempt in max_attempts_grid
            for iteration in max_iters_grid
        ]

    return ans


In [7]:
# Grid search over the "dwug_data_annotated_only" dataset, timed end to end.
start_time = datetime.now()

# One list of hyperparameter dicts per score path.
model_hyperparameter_combinations = get_thresholds_for_cc(
    get_dwug_data_annotated_only, score_paths
)

# Cache and result CSV path templates; the `{result}` / `{method}`
# placeholders are passed through unformatted (presumably filled in by
# grid_search_without_nclusters -- TODO confirm).
cache_paths = dict(
    name="../outputs/experiment-results/{result}/{method}/cache/cache_dwug_data_annotated_only.csv",
    result_name="../outputs/experiment-results/{result}/{method}/results/results_dwug_data_annotated_only.csv",
)

grid_search_without_nclusters(
    get_dwug_data_annotated_only,
    get_clusters,
    score_paths,
    model_hyperparameter_combinations,
    include_binarize=True,
    method=method,
    logger_message=dict(logging=logging),
    cache=cache_paths,
    run_experiments=no_experiments,
    dataset="dwug_data_annotated_only",
)

elapsed = datetime.now() - start_time
print(f"Elapsed time: {elapsed}")

{'adjusted_rand_score': 0.2364369600228846} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'threshold_cc': 0.5406312699576192, 'max_attempts': 2000, 'max_iters': 50000, 'max_senses': 10}, 'threshold': 0.5}
Saving. Total results: 1 / 80
{'adjusted_rand_score': 0.24599869369757757} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-train', 'model_hyperparameters': {'threshold_cc': 0.4949049830710326, 'max_attempts': 2000, 'max_iters': 50000, 'max_senses': 10}, 'threshold': 0.5}
{'adjusted_rand_score': 0.1628761638844518} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'ru-ru', 'model_hyperparameters': {'threshold_cc': 0.4906564384435157, 'max_attempts': 2000, 'max_iters': 50000, 'max_senses': 10}, 'threshold': 0.5}
{'adjusted_rand_score': 0.1317694342759421} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_p

In [8]:
# Grid search over the "dwug_old_data_annotated_only" dataset, timed end to end.
start_time = datetime.now()

# One list of hyperparameter dicts per score path.
model_hyperparameter_combinations = get_thresholds_for_cc(
    get_dwug_old_data_annotated_only, score_paths
)

# Cache and result CSV path templates; the `{result}` / `{method}`
# placeholders are passed through unformatted (presumably filled in by
# grid_search_without_nclusters -- TODO confirm).
cache_paths = dict(
    name="../outputs/experiment-results/{result}/{method}/cache/cache_dwug_old_data_annotated_only.csv",
    result_name="../outputs/experiment-results/{result}/{method}/results/results_dwug_old_data_annotated_only.csv",
)

grid_search_without_nclusters(
    get_dwug_old_data_annotated_only,
    get_clusters,
    score_paths,
    model_hyperparameter_combinations,
    include_binarize=True,
    method=method,
    logger_message=dict(logging=logging),
    cache=cache_paths,
    run_experiments=no_experiments,
    dataset="dwug_old_data_annotated_only",
)

elapsed = datetime.now() - start_time
print(f"Elapsed time: {elapsed}")

{'adjusted_rand_score': 0.47547898255103427} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'threshold_cc': 0.5132412683336636, 'max_attempts': 2000, 'max_iters': 50000, 'max_senses': 10}, 'threshold': 0.5}
Saving. Total results: 1 / 80
{'adjusted_rand_score': 0.5760457513334357} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-train', 'model_hyperparameters': {'threshold_cc': 0.48032299530612566, 'max_attempts': 2000, 'max_iters': 50000, 'max_senses': 10}, 'threshold': 0.5}
{'adjusted_rand_score': 0.4929236550367369} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'ru-ru', 'model_hyperparameters': {'threshold_cc': 0.47635841415257474, 'max_attempts': 2000, 'max_iters': 50000, 'max_senses': 10}, 'threshold': 0.5}
{'adjusted_rand_score': 0.5345903217034036} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score

In [9]:
# Grid search over the "dwug_new_data_annotated_only" dataset, timed end to end.
start_time = datetime.now()

# One list of hyperparameter dicts per score path.
model_hyperparameter_combinations = get_thresholds_for_cc(
    get_dwug_new_data_annotated_only, score_paths
)

# Cache and result CSV path templates; the `{result}` / `{method}`
# placeholders are passed through unformatted (presumably filled in by
# grid_search_without_nclusters -- TODO confirm).
cache_paths = dict(
    name="../outputs/experiment-results/{result}/{method}/cache/cache_dwug_new_data_annotated_only.csv",
    result_name="../outputs/experiment-results/{result}/{method}/results/results_dwug_new_data_annotated_only.csv",
)

grid_search_without_nclusters(
    get_dwug_new_data_annotated_only,
    get_clusters,
    score_paths,
    model_hyperparameter_combinations,
    include_binarize=True,
    method=method,
    logger_message=dict(logging=logging),
    cache=cache_paths,
    run_experiments=no_experiments,
    dataset="dwug_new_data_annotated_only",
)

elapsed = datetime.now() - start_time
print(f"Elapsed time: {elapsed}")

{'adjusted_rand_score': 0.6733409732182981} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-finetune', 'model_hyperparameters': {'threshold_cc': 0.5378196778052731, 'max_attempts': 2000, 'max_iters': 50000, 'max_senses': 10}, 'threshold': 0.5}
Saving. Total results: 1 / 80
{'adjusted_rand_score': 0.6330718482317365} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'rusemshift-train', 'model_hyperparameters': {'threshold_cc': 0.48778792626991585, 'max_attempts': 2000, 'max_iters': 50000, 'max_senses': 10}, 'threshold': 0.5}
{'adjusted_rand_score': 0.6207610195662276} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_path': 'ru-ru', 'model_hyperparameters': {'threshold_cc': 0.4754095574739985, 'max_attempts': 2000, 'max_iters': 50000, 'max_senses': 10}, 'threshold': 0.5}
{'adjusted_rand_score': 0.6004740885196651} {'binarize': True, 'percentile': 0, 'word_level_threshold': False, 'score_p