In [1]:
%%capture
!pip install numpy pandas scipy statsmodels

In [2]:
import pandas as pd


class Experiment:
    __model_name: str
    __single_call_dataset: pd.DataFrame
    __multi_call_dataset: pd.DataFrame

    def __init__(self, model_name: str, single_call_dataset: pd.DataFrame, multi_call_dataset: pd.DataFrame):
        self.__model_name = model_name
        self.__single_call_dataset = single_call_dataset
        self.__multi_call_dataset = multi_call_dataset
        assert self.__multi_call_dataset is not None
        assert self.__single_call_dataset is not None

    @property
    def model_name(self):
        return self.__model_name

    @property
    def single_call_dataset(self):
        return self.__single_call_dataset

    @property
    def multi_call_dataset(self):
        return self.__multi_call_dataset

    def __repr__(self):
        return f"{self.__model_name} Experiment"

In [3]:
from typing import List, Dict, Union
from os import listdir
from os.path import join, isdir

resources_path: str = join("..", "resources")

MODELS: List[str] = [d for d in listdir(resources_path) if isdir(join(resources_path, d))]

EXPERIMENTS: Dict[str, Experiment] = {}
for model in MODELS:
    multi_call_df_: pd.DataFrame = pd.read_csv(
        f"{resources_path}/{model}/sampled_reviews_with_output_multicall_{model}_evaluated.csv")
    single_call_df_: pd.DataFrame = pd.read_csv(
        f"{resources_path}/{model}/sampled_reviews_with_output_{model}_evaluated.csv")
    EXPERIMENTS[model] = Experiment(model_name=model, single_call_dataset=single_call_df_,
                                    multi_call_dataset=multi_call_df_)

EXPERIMENTS

{'gemma2_9b': gemma2_9b Experiment,
 'qwen2_7b': qwen2_7b Experiment,
 'llama3.1': llama3.1 Experiment,
 'phi3_medium': phi3_medium Experiment,
 'mistral_7b': mistral_7b Experiment}

In [4]:
EXPERIMENTS["gemma2_9b"].single_call_dataset.head()

Unnamed: 0,index,review,sentiment,entities,json,progressive_index,output,json_output,score,score_detail,review_score,sentiment_score,ner_accuracy,ner_precision,ner_recall,ner_f1
0,0,"1st watched 2/9/2008, 4 out of 10(Dir-J.S. Car...",negative,"[{'label': 'ORG', 'value': 'qwest'}, {'label':...","{""review"": ""1st watched 2/9/2008, 4 out of 10(...",435,"```json\n{\n ""sentiment"": ""negative"",\n ""rev...","{\n ""sentiment"": ""negative"",\n ""review"": ""1s...",0.666667,"{'review_score': 1.0, 'sentiment_score': 1.0, ...",1.0,1.0,0.0,0.0,0.0,0.0
1,1,Following on directly from the last episode of...,positive,"[{'label': 'PERSON', 'value': 'Hacker'}, {'lab...","{""review"": ""Following on directly from the las...",3279,"```json\n{\n ""sentiment"": ""positive"",\n ""rev...","{\n ""sentiment"": ""positive"",\n ""review"": ""Fo...",0.666667,"{'review_score': 1.0, 'sentiment_score': 1.0, ...",1.0,1.0,0.0,0.0,0.0,0.0
2,2,But at least this movie got what it deserved -...,negative,"[{'label': 'PERSON', 'value': 'Mike'}, {'label...","{""review"": ""But at least this movie got what i...",968,"```json\n{\n ""sentiment"": ""positive"",\n ""rev...","{\n ""sentiment"": ""positive"",\n ""review"": ""Bu...",0.544042,"{'review_score': 0.9654578807075669, 'sentimen...",0.965458,0.0,0.5,0.6,0.75,0.666667
3,3,A few buddies and myself have the strange hobb...,negative,"[{'label': 'PERSON', 'value': 'Larry Buchanan'...","{""review"": ""A few buddies and myself have the ...",1983,"```json\n{\n ""sentiment"": ""negative"",\n ""rev...","{\n ""sentiment"": ""negative"",\n ""review"": ""A ...",0.74784,"{'review_score': 0.9935190066267136, 'sentimen...",0.993519,1.0,0.142857,0.5,0.166667,0.25
4,4,On a routine mission in Iraq a group of Delta ...,positive,"[{'label': 'ORG', 'value': 'Delta'}, {'label':...","{""review"": ""On a routine mission in Iraq a gro...",2903,"```json\n{\n ""sentiment"": ""positive"",\n ""rev...","{\n ""sentiment"": ""positive"",\n ""review"": ""On...",0.722222,"{'review_score': 1.0, 'sentiment_score': 1.0, ...",1.0,1.0,0.090909,0.166667,0.166667,0.166667


In [12]:
from typing import Union
from scipy import stats as scipy_stats
import json

def get_mean(dataframe_: pd.DataFrame, metric: str) -> float:
    return dataframe_[metric].mean()


def get_max(dataframe_: pd.DataFrame, metric: str) -> float:
    return dataframe_[metric].max()


def get_min(dataframe_: pd.DataFrame, metric: str) -> float:
    return dataframe_[metric].min()


def get_standard_dev(dataframe_: pd.DataFrame, metric: str) -> float:
    return dataframe_[metric].std()


evaluation_columns: List[str] = [
    "score",
    "review_score",
    "sentiment_score",
    "ner_accuracy",
    "ner_precision",
    "ner_recall",
    "ner_f1"
]

In [22]:
from scipy.stats import shapiro, wilcoxon, friedmanchisquare
from statsmodels.stats.contingency_tables import mcnemar

statistics_: List[Dict[str, Union[str, float]]] = []

friedman_f1_single_task = []
friedman_f1_multi_task = []

friedman_bleu_single_task = []
friedman_bleu_multi_task = []

friedman_sentiment_single_task = []
friedman_sentiment_multi_task = []

friedman_score_single_task = []
friedman_score_multi_task = []

for model_, experiment in EXPERIMENTS.items():

    stats: dict = {
        "model": model_
    }
    single_call_ds_ = experiment.single_call_dataset
    multi_call_ds_ = experiment.multi_call_dataset

    # ---
    friedman_f1_single_task.append(multi_call_ds_["ner_f1"].tolist())
    friedman_f1_multi_task.append(single_call_ds_["ner_f1"].tolist())

    res = wilcoxon(single_call_ds_["ner_f1"].tolist(), multi_call_ds_["ner_f1"].tolist())
    stats["ner_f1_w_test_p_value"] = res.pvalue

    _, p_value = shapiro(single_call_ds_["ner_f1"].tolist())
    stats["ner_f1_multi_task_saphiro_p_value"] = p_value

    _, p_value = shapiro(multi_call_ds_["ner_f1"].tolist())
    stats["ner_f1_single_task_saphiro_p_value"] = p_value

    # ---
    friedman_bleu_single_task.append(multi_call_ds_["review_score"].tolist())
    friedman_bleu_multi_task.append(single_call_ds_["review_score"].tolist())

    res = wilcoxon(single_call_ds_["review_score"].tolist(),
                   multi_call_ds_["review_score"].tolist())
    stats["review_bleu_w_test_p_value"] = res.pvalue

    _, p_value = shapiro(single_call_ds_["review_score"].tolist())
    stats["review_bleu_multi_task_saphiro_p_value"] = p_value

    _, p_value = shapiro(multi_call_ds_["review_score"].tolist())
    stats["review_bleu_single_task_saphiro_p_value"] = p_value

    # ---
    friedman_sentiment_single_task.append(multi_call_ds_["sentiment_score"].tolist())
    friedman_sentiment_multi_task.append(single_call_ds_["sentiment_score"].tolist())

    contingency_table = [[0, 0], [0, 0]]  # Inizializzazione della tabella
    # Popolare la tabella di contingenza
    for val1, val2 in zip(single_call_ds_["sentiment_score"].tolist(), multi_call_ds_["sentiment_score"].tolist()):
        contingency_table[int(val1)][int(val2)] += 1

    res = mcnemar(table=contingency_table, exact=False)
    stats["sentiment_score_mcnemar_p_value"] = res.pvalue

    _, p_value = shapiro(single_call_ds_["sentiment_score"].tolist())
    stats["sentiment_multi_task_saphiro_p_value"] = p_value

    _, p_value = shapiro(multi_call_ds_["sentiment_score"].tolist())
    stats["sentiment_single_task_saphiro_p_value"] = p_value

    # ---
    friedman_score_single_task.append(multi_call_ds_["score"].tolist())
    friedman_score_multi_task.append(single_call_ds_["score"].tolist())

    res = wilcoxon(single_call_ds_["score"].tolist(), multi_call_ds_["score"].tolist())
    stats["score_w_test_p_value"] = res.pvalue

    _, p_value = shapiro(single_call_ds_["score"].tolist())
    stats["score_multi_task_saphiro_p_value"] = p_value

    _, p_value = shapiro(multi_call_ds_["score"].tolist())
    stats["score_single_task_saphiro_p_value"] = p_value

    for column_name in evaluation_columns:
        mean_single: float = get_mean(single_call_ds_, column_name)
        max_single: float = get_max(single_call_ds_, column_name)
        min_single: float = get_min(single_call_ds_, column_name)
        std_single: float = get_standard_dev(single_call_ds_, column_name)

        mean_multi: float = get_mean(multi_call_ds_, column_name)
        max_multi: float = get_max(multi_call_ds_, column_name)
        min_multi: float = get_min(multi_call_ds_, column_name)
        std_multi: float = get_standard_dev(multi_call_ds_, column_name)

        stats[f"single_call_mean_{column_name}"] = mean_single
        stats[f"multi_call_mean_{column_name}"] = mean_multi
        stats[f"single_call_max_{column_name}"] = max_single
        stats[f"multi_call_max_{column_name}"] = max_multi
        stats[f"single_call_min_{column_name}"] = min_single
        stats[f"multi_call_min_{column_name}"] = min_multi
        stats[f"single_call_std_{column_name}"] = std_single
        stats[f"multi_call_std_{column_name}"] = std_multi

    statistics_.append(stats)

metrics_single = [
    friedman_f1_single_task,
    friedman_bleu_single_task,
    friedman_sentiment_single_task,
    friedman_score_single_task
]
metrics_multi = [
    friedman_f1_multi_task,
    friedman_bleu_multi_task,
    friedman_sentiment_multi_task,
    friedman_score_multi_task
]

enum_ = {
    0: "F1",
    1: "BLEU",
    2: "Sentiment",
    3: "Score"
}

# Ciclo per eseguire i test di Friedman per ciascuna metrica in entrambe le condizioni
for i, (single, multi) in enumerate(zip(metrics_single, metrics_multi), start=0):
    # Test di Friedman per la condizione Single-Task
    stat_single, p_value_single = friedmanchisquare(*single)
    print(f"Metrica {enum_[i]} - Single-Task: statistic={stat_single}, p-value={p_value_single}")

    # Test di Friedman per la condizione Multi-Task
    stat_multi, p_value_multi = friedmanchisquare(*multi)
    print(f"Metrica {enum_[i]} - Multi-Task: statistic={stat_multi}, p-value={p_value_multi}")

statistics: pd.DataFrame = pd.DataFrame(statistics_)
statistics.head(10)

Metrica F1 - Single-Task: statistic=1030.8155127679918, p-value=7.485972527689155e-222
Metrica F1 - Multi-Task: statistic=1050.9175598782595, p-value=3.2924241439276734e-226
Metrica BLEU - Single-Task: statistic=1002.1648490996695, p-value=1.211833791393377e-215
Metrica BLEU - Multi-Task: statistic=1970.8805865921793, p-value=0.0
Metrica Sentiment - Single-Task: statistic=655.2125340599448, p-value=1.7341307280109658e-140
Metrica Sentiment - Multi-Task: statistic=792.7410468319543, p-value=2.868563973223419e-170
Metrica Score - Single-Task: statistic=1310.5779702300422, p-value=1.6931866198209944e-282
Metrica Score - Multi-Task: statistic=1808.2569633507846, p-value=0.0


Unnamed: 0,model,ner_f1_w_test_p_value,ner_f1_multi_task_saphiro_p_value,ner_f1_single_task_saphiro_p_value,review_bleu_w_test_p_value,review_bleu_multi_task_saphiro_p_value,review_bleu_single_task_saphiro_p_value,sentiment_score_mcnemar_p_value,sentiment_multi_task_saphiro_p_value,sentiment_single_task_saphiro_p_value,...,single_call_std_ner_recall,multi_call_std_ner_recall,single_call_mean_ner_f1,multi_call_mean_ner_f1,single_call_max_ner_f1,multi_call_max_ner_f1,single_call_min_ner_f1,multi_call_min_ner_f1,single_call_std_ner_f1,multi_call_std_ner_f1
0,gemma2_9b,0.0202706,3.849838e-26,5.657174999999999e-26,0.006172823,3.867686e-53,5.8981980000000004e-52,0.02878397,1.0811129999999999e-50,1.955713e-51,...,0.37271,0.369702,0.547466,0.557513,1.0,1.0,0.0,0.0,0.355707,0.347419
1,qwen2_7b,0.009995237,2.117988e-34,1.559851e-31,1.5850180000000002e-28,3.52217e-36,3.016398e-41,0.1322635,5.96123e-47,1.252548e-47,...,0.334084,0.359914,0.251268,0.269788,1.0,1.0,0.0,0.0,0.315313,0.299897
2,llama3.1,0.05823645,3.910228e-26,4.721143e-26,9.0576e-10,7.981735e-49,3.085769e-44,0.05859475,5.2867179999999995e-48,5.082396e-47,...,0.371594,0.379433,0.429982,0.440959,1.0,1.0,0.0,0.0,0.352558,0.34047
3,phi3_medium,6.961521e-16,6.034476e-47,1.066201e-36,6.979393999999999e-63,4.246282e-46,9.118438e-39,0.2855268,1.323348e-41,1.3638769999999999e-41,...,0.2744,0.344538,0.116834,0.226209,1.0,1.0,0.0,0.0,0.27412,0.30998
4,mistral_7b,0.02580441,7.080879e-36,8.19605e-30,2.452319e-07,6.294476999999999e-38,1.0632789999999999e-42,8.927138000000001e-27,2.020401e-49,2.6618719999999996e-44,...,0.372358,0.360578,0.306004,0.325585,1.0,1.0,0.0,0.0,0.378029,0.326269


In [20]:
statistics.T.head(50)

Unnamed: 0,0,1,2,3,4
model,gemma2_9b,qwen2_7b,llama3.1,phi3_medium,mistral_7b
ner_f1_w_test_p_value,0.020271,0.009995,0.058236,0.0,0.025804
ner_f1_multi_task_saphiro_p_value,0.0,0.0,0.0,0.0,0.0
ner_f1_single_task_saphiro_p_value,0.0,0.0,0.0,0.0,0.0
review_bleu_w_test_p_value,0.006173,0.0,0.0,0.0,0.0
review_bleu_multi_task_saphiro_p_value,0.0,0.0,0.0,0.0,0.0
review_bleu_single_task_saphiro_p_value,0.0,0.0,0.0,0.0,0.0
sentiment_score_mcnemar_p_value,0.028784,0.132264,0.058595,0.285527,0.0
score_w_test_p_value,0.03003,0.0,0.000015,0.0,0.602177
score_multi_task_saphiro_p_value,0.0,0.0,0.0,0.0,0.0


In [21]:
statistics.to_csv("../resources/statistics-revised-v2.csv")