In [1]:
from pathlib import Path
import sys

import pandas as pd
import numpy as np

from common_generate_predictions import (
    load_data,
    calculate_correlation,
    save_cv_results,
    save_correlation
)
from custom_types import Results
from cross_validation import cross_validation_for_dwug_es as cv

In [2]:
# Prompt identifiers; load_scores uses each one as a sub-directory name
# below "<path_data>/<llm>/<dataset>/".
PROMPTS = ["zs", "fs", "ct"]

# Root folder for the cross-validation outputs and root folder of the inputs.
PATH_TO_SAVE_RESULTS = "../cv-apd-experiments-lscd"
path_data = "../input"

# NOTE(review): "mixtral-8xtb-v0.1" may be a typo for "mixtral-8x7b-v0.1";
# confirm against the directory names under `path_data` before changing it.
LLMS = ["llama3.1-8B", "mixtral-8xtb-v0.1"]
WIC_MODELS = [f"wic{i}" for i in range(1, 8)]
DATASETS = ["dwug_es", "dwug_en"]

# Gold-data files, one per dataset; the mapping below is what the
# correlation helper receives.
path_to_gold_data_en = "../test_data_en.csv"
path_to_gold_data_es = "../test_data_es.csv"
PATH_TO_TARGET_WORDS = {
    "dwug_es": path_to_gold_data_es,
    "dwug_en": path_to_gold_data_en,
}

In [3]:
def load_scores(llm: str, dataset: str, prompts: list, wic_data: bool = False):
    """Load one score table per prompt for a model/dataset pair.

    Args:
        llm: Directory name under ``path_data`` to read from.
        dataset: Dataset directory name below ``llm``.
        prompts: Sub-directory names; each becomes a key of the result.
        wic_data: Forwarded unchanged to ``load_data``. The on-disk existence
            check is only performed when this is exactly ``False``.

    Returns:
        Dict mapping every entry of ``prompts`` to whatever ``load_data``
        returned for ``<path_data>/<llm>/<dataset>/<prompt>``.

    Raises:
        AssertionError: If ``wic_data`` is ``False`` and a prompt directory
            does not exist.
    """
    loaded = {}

    for prompt in prompts:
        prompt_dir = Path(f"{path_data}/{llm}/{dataset}/{prompt}")

        # Only the non-WiC layout is validated up front.
        if wic_data is False:
            assert prompt_dir.exists(), f"{prompt_dir} does not exist"

        loaded[prompt] = load_data(prompt_dir, wic_data)

    return loaded

In [4]:
def save_results(word: str, result: Results, parameters: dict, path_to_save: str):
    """Append one ``(word, apd, parameters)`` row to the CSV at ``path_to_save``.

    The file is read in full, extended by one row and written back without the
    index. If the file does not exist yet (or is a zero-byte file) a new table
    with the three columns written here is started, and the parent directory
    is created when missing, so a fresh run does not depend on pre-seeded
    output files.

    Args:
        word: Value stored in the ``word`` column.
        result: Object whose ``jsd`` attribute is stored in the ``apd`` column.
        parameters: Stored as ``str(parameters)`` in the ``parameters`` column.
        path_to_save: Path of the CSV file to extend.
    """
    target = Path(path_to_save)

    try:
        table = pd.read_csv(target)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First write for this fold/file: bootstrap an empty table.
        target.parent.mkdir(parents=True, exist_ok=True)
        table = pd.DataFrame(columns=["word", "apd", "parameters"])

    # read_csv / the fresh frame both yield a 0..n-1 index, so the row count
    # is the next free label.
    new_row = table.shape[0]

    table.loc[new_row, "word"] = word
    table.loc[new_row, "apd"] = result.jsd
    table.loc[new_row, "parameters"] = str(parameters)

    table.to_csv(target, index=False)

In [5]:
def get_APD(hyperparameters: dict, scores: pd.DataFrame, metadata: dict):
    """Compute the scaled mean score of every word and log each one to CSV.

    For each distinct value of ``scores["word"]`` (in order of first
    appearance) the mean of that word's ``score`` column is multiplied by
    ``metadata["factor"]`` and wrapped in a ``Results`` object, using its
    ``jsd`` field to carry the value. Every result is also appended, via
    ``save_results``, to
    ``<path_to_save_results>/<kfold>_fold/<name_file>.csv``.

    Args:
        hyperparameters: Configuration recorded next to every saved row.
        scores: Table with at least the columns ``word`` and ``score``.
        metadata: Must provide ``factor``, ``path_to_save_results``, ``kfold``
            and ``name_file``.

    Returns:
        Dict mapping each word to its ``Results`` object.
    """
    results_by_word = {}

    for word in scores["word"].unique().tolist():
        word_scores = scores.loc[scores["word"] == word, "score"]

        word_result = Results(
            jsd=word_scores.mean() * metadata["factor"],
            cluster_to_freq1=None,
            cluster_to_freq2=None,
        )
        results_by_word[word] = word_result

        # One CSV round-trip per word; the destination depends on the fold
        # and on the file name the caller stored in `metadata`.
        save_results(
            word,
            word_result,
            hyperparameters,
            f"{metadata['path_to_save_results']}/{metadata['kfold']}_fold/{metadata['name_file']}.csv",
        )

    return results_by_word

In [6]:
def train(
    hyperparameters: list[dict],
    scores: dict[str, pd.DataFrame],
    train_set: list,
    metadata: dict,
):
    """Pick the configuration with the highest correlation on the training words.

    Every candidate is scored with ``get_APD`` on the rows of
    ``scores[candidate["prompt"]]`` whose ``word`` is in ``train_set``; the
    correlation against ``PATH_TO_TARGET_WORDS[metadata["dataset"]]`` is
    appended to ``<path_to_save_results>/<kfold>_fold/training.csv``.

    Side effect: sets ``metadata["name_file"]`` to ``"results_training_set"``
    on the caller's dict.

    Returns:
        ``{"optimal_parameters": <best candidate or None>,
        "max_spr_lscd": <its correlation or -10.0>}``.

    NOTE(review): selection uses a strict ``>`` against a ``-10.0`` start
    value, so ties keep the earliest candidate. If ``hyperparameters`` is
    empty, or no correlation compares greater than ``-10.0`` (a NaN never
    does), ``optimal_parameters`` stays ``None`` -- confirm callers cope.
    """
    dataset = metadata["dataset"]
    metadata["name_file"] = "results_training_set"

    best = {"optimal_parameters": None, "max_spr_lscd": -10.0}

    for candidate in hyperparameters:
        prompt_scores = scores[candidate["prompt"]]
        train_scores = prompt_scores[prompt_scores["word"].isin(train_set)]

        spr = calculate_correlation(
            get_APD(candidate, train_scores, metadata),
            PATH_TO_TARGET_WORDS[dataset],
        )

        save_correlation(
            spr,
            candidate,
            f"{metadata['path_to_save_results']}/{metadata['kfold']}_fold/training.csv",
        )

        if spr > best["max_spr_lscd"]:
            best = {"optimal_parameters": candidate, "max_spr_lscd": spr}

    return best

In [7]:
def eval(
    hyperparameters: dict,
    scores: dict[str, pd.DataFrame],
    test_set: list,
    metadata: dict,
):
    """Score one configuration on the held-out words of a fold.

    Note that this name shadows Python's builtin ``eval`` for the rest of the
    notebook.

    The rows of ``scores[hyperparameters["prompt"]]`` whose ``word`` is in
    ``test_set`` are passed to ``get_APD``; the resulting correlation against
    ``PATH_TO_TARGET_WORDS[metadata["dataset"]]`` is appended to
    ``<path_to_save_results>/<kfold>_fold/testing.csv`` and returned.

    Side effect: sets ``metadata["name_file"]`` to ``"results_testing_set"``
    on the caller's dict.

    Returns:
        The value produced by ``calculate_correlation``.
    """
    dataset = metadata["dataset"]
    metadata["name_file"] = "results_testing_set"

    prompt_scores = scores[hyperparameters["prompt"]]
    test_scores = prompt_scores[prompt_scores["word"].isin(test_set)]

    apd_per_words = get_APD(hyperparameters, test_scores, metadata)
    spr = calculate_correlation(apd_per_words, PATH_TO_TARGET_WORDS[dataset])

    save_correlation(
        spr,
        hyperparameters,
        f"{metadata['path_to_save_results']}/{metadata['kfold']}_fold/testing.csv",
    )
    return spr

In [8]:
def cross_validation_using_apd(
    d: str, scores: dict, hyperparameters: list[dict], metadata: dict
):
    """Run train/test over every fold of dataset ``d`` and log a text report.

    The folds come from ``cv[d]``; each fold provides a ``"train"`` and a
    ``"test"`` word list. For every fold the best configuration is selected
    with ``train`` and then scored with ``eval`` (the notebook's function, not
    the builtin), and a short report is appended to
    ``<path_to_save_results>/<fold>_fold/verbose_results.txt``.

    Side effect: ``metadata["kfold"]`` and ``metadata["dataset"]`` are
    overwritten on the caller's dict for every fold.

    Args:
        d: Dataset key into ``cv``.
        scores: Per-prompt score tables, forwarded to ``train`` / ``eval``.
        hyperparameters: Candidate configurations forwarded to ``train``.
        metadata: Must already contain ``path_to_save_results``.

    Returns:
        Dict mapping each fold key to
        ``{"training": <train() result>, "testing": <eval() result>}``.
    """
    results = {}

    for index, split in cv[d].items():
        train_set, test_set = split["train"], split["test"]

        metadata["kfold"] = index
        metadata["dataset"] = d

        configuration = train(hyperparameters, scores, train_set, metadata)
        test_corr = eval(configuration["optimal_parameters"], scores, test_set, metadata)

        results[index] = {"training": configuration, "testing": test_corr}

        path_to_save = (
            f"{metadata['path_to_save_results']}/{index}_fold/verbose_results.txt"
        )
        # The file is opened in append mode, so every record must end with a
        # newline; otherwise the next run's first line is glued onto the
        # previous test score.
        report = (
            "best parameters for training: \n"
            f"  {configuration['optimal_parameters']}\n"
            "training [spr_lscd]: \n"
            f"  {configuration['max_spr_lscd']}\n"
            "\n"
            "testing [spr_lscd]:\n"
            f"  {test_corr}\n"
        )
        with open(path_to_save, "a") as f_out:
            f_out.write(report)

    return results

In [9]:
def execute_experiments(llm: str, hyperparameters: list[dict], metadata: dict):
    """Run the cross-validated experiment on every dataset in ``DATASETS``.

    Args:
        llm: Model name. Used as the input directory unless
            ``metadata["wic_data"]`` is not ``False`` (then ``"wic-scores"``
            is read instead) and always used in the output path.
        hyperparameters: List of candidate configurations (one dict each),
            forwarded to ``cross_validation_using_apd``.
        metadata: Must contain ``wic_data`` and ``prompts``. It is mutated:
            ``path_to_save_results`` is set per dataset here, and the helpers
            called below add further keys.

    Returns:
        None. Results are handed to ``save_cv_results`` per dataset.
    """
    for d in DATASETS:
        use_wic = metadata["wic_data"]

        scores = load_scores(
            llm if use_wic is False else "wic-scores",
            d,
            metadata["prompts"],
            wic_data=use_wic,
        )
        metadata["path_to_save_results"] = f"{PATH_TO_SAVE_RESULTS}/{llm}/{d}"

        results = cross_validation_using_apd(d, scores, hyperparameters, metadata)
        save_cv_results(results, metadata)

In [None]:
# One candidate configuration per prompt: the prompt is the only key set here.
hyperparameters = [{"prompt": prompt_name} for prompt_name in PROMPTS]
hyperparameters

In [11]:
# Run settings shared by the helpers above. `factor` multiplies each word's
# mean score in get_APD, `wic_data` selects the input directory and is
# forwarded to load_data, `prompts` lists the sub-directories to load.
# Further keys are added to this dict while the experiment runs.
metadata = dict(factor=-1.0, wic_data=False, prompts=PROMPTS)

In [None]:
# Run the full cross-validation for one model; note that `metadata` is
# mutated in place by this call.
execute_experiments(
    llm="llama3.1-8B",
    hyperparameters=hyperparameters,
    metadata=metadata,
)

In [13]:
# with open("corr-apd.txt", "w") as f_out:

#     for llm in LLMS:
#         f_out.write(f"{llm}:\n")

#         for d in DATASETS:
#             f_out.write(f"  {d}:\n")

#             for p in PROMPTS:
#                 f_out.write(f"    {p}\n")
#                 print(f"    {p}")
#                 p = Path(f"{path_data}/{llm}/{d}/{p}")

#                 assert p.exists() is True, f"{p} does not exist"

#                 data = load_data(p)
#                 words_apd = {}

#                 for word in data["word"].unique().tolist():
#                     mask = data["word"] == word
#                     score_or_distance = data[mask]

#                     words_apd[word] = Results(
#                         jsd=score_or_distance.select_dtypes(include="number").mean(),
#                         cluster_to_freq1=None,
#                         cluster_to_freq2=None,
#                     )

#                 corr = calculate_correlation(words_apd, PATH_TO_TARGET_WORDS[d])
#                 f_out.write(f"      corr: {corr}")
#                 f_out.write(f"\n\n")

## WiC models APD

In [14]:
# For every dataset and WiC model: average each word's score, flip the sign,
# correlate the per-word values with the dataset's target-word file and write
# the result to corr-apd-wic.txt (overwritten on every run).
with open("corr-apd-wic.txt", "w") as f_out:

    for d in DATASETS:
        f_out.write(f"{d}:\n")

        for wic in WIC_MODELS:
            f_out.write(f"  {wic}:\n")

            p = Path(f"{path_data}/wic-scores/{d}")
            assert p.exists(), f"{p} does not exist"

            path_to_wic_model = p / wic

            data = load_data(path_to_wic_model, wic_data=True)
            words_apd = {}

            for word in data["word"].unique().tolist():
                mask = data["word"] == word
                score_or_distance = data[mask]

                # Same per-word statistic as get_APD with factor -1.0, but
                # nothing is logged to the per-fold CSV files here.
                words_apd[word] = Results(
                    jsd=score_or_distance["score"].mean() * -1.0,
                    cluster_to_freq1=None,
                    cluster_to_freq2=None,
                )

            corr = calculate_correlation(words_apd, PATH_TO_TARGET_WORDS[d])
            f_out.write(f"    corr: {corr}")
            f_out.write("\n\n")