In [1]:
from datetime import datetime
import functools as f

from sklearn.cluster import AgglomerativeClustering
import numpy as np
from numpy.typing import NDArray

from common_generate_predictions import grid_search, load_data

In [2]:
# --- Experiment selection -------------------------------------------------
dataset = "dwug_es"  # also used as the sub-directory name in path_to_data below
llm = "wic"
method = "ac"  # presumably short for agglomerative clustering — confirm against grid_search
wic_data = True

# --- Input locations (relative to this notebook) --------------------------
path_to_data = f"../input/wic-scores/{dataset}"
path_to_gold_data = "../test_data_es.csv"

# --- Search space ---------------------------------------------------------
# Inclusive upper bound for the n_clusters values enumerated by
# generate_hyparameters_combinations_for_ac (the lower bound is fixed at 2).
max_number_clusters = 5
prompts = [
    "wic1",
    "wic2",
    "wic3",
    "wic4",
    "wic5",
]

In [3]:
def get_clusters(adj_matrix: NDArray[np.float64 | np.int32], hyperparameters: dict):
    clustering = AgglomerativeClustering(
        n_clusters=hyperparameters["n_clusters"],
        metric=hyperparameters["metric"],
        linkage=hyperparameters["linkage"],
    ).fit(adj_matrix)

    return clustering.labels_
    

In [4]:
def generate_hyparameters_combinations_for_ac(max_number_clusters: int):
    """Enumerate the agglomerative-clustering hyperparameter grid.

    Args:
        max_number_clusters: Largest ``n_clusters`` value to include
            (inclusive). Values below 2 produce an empty grid.

    Returns:
        A list of dicts, each with the keys ``"n_clusters"``, ``"metric"`` and
        ``"linkage"``. Entries are ordered by cluster count first (2 up to
        ``max_number_clusters``), then metric, then linkage.
    """
    cluster_counts = range(2, max_number_clusters + 1)
    metric_names = ("precomputed",)
    linkage_names = ("complete", "average", "single")

    # The nesting order of the ``for`` clauses fixes the order of the grid.
    return [
        {
            "n_clusters": cluster_count,
            "metric": metric_name,
            "linkage": linkage_name,
        }
        for cluster_count in cluster_counts
        for metric_name in metric_names
        for linkage_name in linkage_names
    ]
        

In [5]:
# Run-level settings passed to grid_search through its ``metadata`` keyword.
# The two flags are fixed here; every other entry mirrors a value from the
# configuration cell.
metadata = dict(
    fill_diagonal=True,
    normalize=True,
    method=method,
    path_to_gold_data=path_to_gold_data,
    path_to_data=path_to_data,
    llm=llm,
    prompts=prompts,
    dataset=dataset,
    wic_data=wic_data,
)

In [6]:
# Run the grid search over every hyperparameter combination and report how
# long it took.
start_time = datetime.now()

try:
    grid_search(
        # load_data with its first positional argument bound to path_to_data
        f.partial(load_data, path_to_data),
        get_clusters,
        generate_hyparameters_combinations_for_ac(max_number_clusters=max_number_clusters),
        metadata=metadata,
    )
finally:
    # Print the duration even when the search raises or is interrupted from
    # the kernel (KeyboardInterrupt); the exception still propagates afterwards.
    print(f"Elapsed time: {datetime.now() - start_time}")

2024-08-20 11:00:00,664 - INFO - loading data from ../input/wic-scores/dwug_es/wic1 ...
2024-08-20 11:00:00,898 - INFO - loading data from ../input/wic-scores/dwug_es/wic2 ...
2024-08-20 11:00:01,107 - INFO - loading data from ../input/wic-scores/dwug_es/wic3 ...
2024-08-20 11:00:01,331 - INFO - loading data from ../input/wic-scores/dwug_es/wic4 ...
2024-08-20 11:00:01,541 - INFO - loading data from ../input/wic-scores/dwug_es/wic5 ...
2024-08-20 11:00:01,748 - INFO - prompt: wic1
2024-08-20 11:00:01,757 - INFO - training ac method ...
2024-08-20 11:00:01,757 - INFO - prompt: wic1
2024-08-20 11:00:01,760 - INFO -   1/480 - {'quantile': 0, 'fill_diagonal': True, 'normalize': False, 'model_hyperparameters': {'n_clusters': 2, 'metric': 'precomputed', 'linkage': 'complete'}}
2024-08-20 11:00:01,760 - INFO - get predictions ...
2024-08-20 11:00:01,765 - INFO - building adjacency matrix ...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_inde

KeyboardInterrupt: 