In [1]:
import json
%%capture
!pip install numpy pandas scipy statsmodels

In [2]:
import pandas as pd


class Experiment:
    """Read-only bundle of one model's name and its two result frames.

    The constructor stores the model name together with the single-call and
    the multi-call data frames; all three are exposed through properties
    without setters.
    """

    __model_name: str
    __single_call_dataset: pd.DataFrame
    __multi_call_dataset: pd.DataFrame

    def __init__(self, model_name: str, single_call_dataset: pd.DataFrame, multi_call_dataset: pd.DataFrame):
        """Store the name and both datasets.

        :param model_name: name used to identify the model
        :param single_call_dataset: frame with the single-call results
        :param multi_call_dataset: frame with the multi-call results
        :raises AssertionError: if either dataset is ``None``
        """
        self.__model_name = model_name
        self.__single_call_dataset = single_call_dataset
        self.__multi_call_dataset = multi_call_dataset
        # Both frames are mandatory; the multi-call frame is checked first.
        for required_dataset in (self.__multi_call_dataset, self.__single_call_dataset):
            assert required_dataset is not None

    @property
    def model_name(self) -> str:
        """Name of the model given at construction time."""
        return self.__model_name

    @property
    def single_call_dataset(self) -> pd.DataFrame:
        """Frame with the single-call results."""
        return self.__single_call_dataset

    @property
    def multi_call_dataset(self) -> pd.DataFrame:
        """Frame with the multi-call results."""
        return self.__multi_call_dataset

    def __repr__(self):
        """Return ``"<model name> Experiment"``."""
        return "{} Experiment".format(self.__model_name)

In [3]:
from typing import List, Dict, Union
from os import listdir
from os.path import join, isdir

# Folder that holds one sub-directory per evaluated model.
resources_path: str = join("..", "resources")

# Every sub-directory of the resources folder is treated as a model name.
# NOTE(review): os.listdir gives no ordering guarantee, so the order of MODELS
# (and of everything derived from it) may differ between machines - confirm
# whether a fixed order is wanted.
MODELS: List[str] = [d for d in listdir(resources_path) if isdir(join(resources_path, d))]

# Load the multi-call and single-call evaluation CSVs of every model.
# All paths are built with os.path.join instead of mixing it with
# hard-coded "/" separators.
EXPERIMENTS: Dict[str, Experiment] = {}
for model in MODELS:
    model_dir: str = join(resources_path, model)
    multi_call_df_: pd.DataFrame = pd.read_csv(
        join(model_dir, f"sampled_reviews_with_output_multicall_{model}_evaluated.csv"))
    single_call_df_: pd.DataFrame = pd.read_csv(
        join(model_dir, f"sampled_reviews_with_output_{model}_evaluated.csv"))
    EXPERIMENTS[model] = Experiment(model_name=model, single_call_dataset=single_call_df_,
                                    multi_call_dataset=multi_call_df_)

EXPERIMENTS

{'gemma2_9b': gemma2_9b Experiment,
 'qwen2_7b': qwen2_7b Experiment,
 'llama3.1': llama3.1 Experiment,
 'phi3_medium': phi3_medium Experiment,
 'mistral_7b': mistral_7b Experiment}

In [34]:
# Show the first rows of the single-call results of one model.
gemma2_single_call_preview = EXPERIMENTS["gemma2_9b"].single_call_dataset
gemma2_single_call_preview.head()

Unnamed: 0,index,review,sentiment,entities,json,progressive_index,output,json_output,score,score_detail,review_score,sentiment_score,ner_accuracy,ner_precision,ner_recall,ner_f1
0,0,"1st watched 2/9/2008, 4 out of 10(Dir-J.S. Car...",negative,"[{'label': 'ORG', 'value': 'qwest'}, {'label':...","{""review"": ""1st watched 2/9/2008, 4 out of 10(...",435,"```json\n{\n ""sentiment"": ""negative"",\n ""rev...","{\n ""sentiment"": ""negative"",\n ""review"": ""1s...",0.666667,"{'review_score': 1.0, 'sentiment_score': 1.0, ...",1.0,1.0,0.0,0.0,0.0,0.0
1,1,Following on directly from the last episode of...,positive,"[{'label': 'PERSON', 'value': 'Hacker'}, {'lab...","{""review"": ""Following on directly from the las...",3279,"```json\n{\n ""sentiment"": ""positive"",\n ""rev...","{\n ""sentiment"": ""positive"",\n ""review"": ""Fo...",0.666667,"{'review_score': 1.0, 'sentiment_score': 1.0, ...",1.0,1.0,0.0,0.0,0.0,0.0
2,2,But at least this movie got what it deserved -...,negative,"[{'label': 'PERSON', 'value': 'Mike'}, {'label...","{""review"": ""But at least this movie got what i...",968,"```json\n{\n ""sentiment"": ""positive"",\n ""rev...","{\n ""sentiment"": ""positive"",\n ""review"": ""Bu...",0.544042,"{'review_score': 0.9654578807075669, 'sentimen...",0.965458,0.0,0.5,0.6,0.75,0.666667
3,3,A few buddies and myself have the strange hobb...,negative,"[{'label': 'PERSON', 'value': 'Larry Buchanan'...","{""review"": ""A few buddies and myself have the ...",1983,"```json\n{\n ""sentiment"": ""negative"",\n ""rev...","{\n ""sentiment"": ""negative"",\n ""review"": ""A ...",0.74784,"{'review_score': 0.9935190066267136, 'sentimen...",0.993519,1.0,0.142857,0.5,0.166667,0.25
4,4,On a routine mission in Iraq a group of Delta ...,positive,"[{'label': 'ORG', 'value': 'Delta'}, {'label':...","{""review"": ""On a routine mission in Iraq a gro...",2903,"```json\n{\n ""sentiment"": ""positive"",\n ""rev...","{\n ""sentiment"": ""positive"",\n ""review"": ""On...",0.722222,"{'review_score': 1.0, 'sentiment_score': 1.0, ...",1.0,1.0,0.090909,0.166667,0.166667,0.166667


In [46]:
from typing import Union
from scipy import stats as scipy_stats
import json

# Accumulator: one dict of statistics per model, later turned into a DataFrame.
statistics_: List[Dict[str, Union[str, float]]] = list()


def get_mean(dataframe_: pd.DataFrame, metric: str) -> float:
    """Return the mean of one column.

    :param dataframe_: frame that contains the column
    :param metric: name of the column to summarise
    :return: result of calling ``mean()`` on that column
    """
    metric_values = dataframe_[metric]
    return metric_values.mean()


def get_max(dataframe_: pd.DataFrame, metric: str) -> float:
    """Return the largest value of one column.

    :param dataframe_: frame that contains the column
    :param metric: name of the column to summarise
    :return: result of calling ``max()`` on that column
    """
    metric_values = dataframe_[metric]
    return metric_values.max()


def get_min(dataframe_: pd.DataFrame, metric: str) -> float:
    """Return the smallest value of one column.

    :param dataframe_: frame that contains the column
    :param metric: name of the column to summarise
    :return: result of calling ``min()`` on that column
    """
    metric_values = dataframe_[metric]
    return metric_values.min()


def get_standard_dev(dataframe_: pd.DataFrame, metric: str) -> float:
    """Return the standard deviation of one column.

    The value is whatever pandas' ``std()`` yields with its default settings.

    :param dataframe_: frame that contains the column
    :param metric: name of the column to summarise
    :return: result of calling ``std()`` on that column
    """
    metric_values = dataframe_[metric]
    return metric_values.std()


# Columns for which mean / max / min / std are reported per model.
evaluation_columns: List[str] = [
    # overall and per-task scores
    "score", "review_score", "sentiment_score",
    # named-entity-recognition metrics
    "ner_accuracy", "ner_precision", "ner_recall", "ner_f1",
]

def _paired_t_test_p_value(single_call_frame: pd.DataFrame, multi_call_frame: pd.DataFrame, metric: str) -> float:
    """Return the p-value of a paired t-test on ``metric`` between both frames.

    Values are paired element by element in row order.
    NOTE(review): this assumes both frames hold the same reviews in the same
    row order - confirm against the code that produced the CSVs.
    """
    _, paired_p_value = scipy_stats.ttest_rel(single_call_frame[metric].tolist(),
                                              multi_call_frame[metric].tolist())
    return paired_p_value


def _sentiment_chi2_p_value(single_call_frame: pd.DataFrame, multi_call_frame: pd.DataFrame) -> float:
    """Return the chi-square p-value for the 2x2 table of ``sentiment_score`` values.

    The table is indexed as ``[single-call value][multi-call value]``, so the
    scores must be convertible to the integers 0 or 1.

    NOTE(review): chi2_contingency tests whether the two outcomes are
    independent. For paired binary outcomes on the same reviews, McNemar's
    test is the usual way to test for a difference - confirm which question
    this statistic is meant to answer.

    :return: the p-value, or NaN when scipy rejects the table
    """
    contingency_table = [[0, 0], [0, 0]]
    # Fill the contingency table from the row-wise pairs of scores.
    for single_value, multi_value in zip(single_call_frame["sentiment_score"].tolist(),
                                         multi_call_frame["sentiment_score"].tolist()):
        contingency_table[int(single_value)][int(multi_value)] += 1

    try:
        _, chi2_p_value, _, _ = scipy_stats.chi2_contingency(contingency_table)
    except ValueError:
        # scipy raises when an expected frequency is zero, e.g. when one setup
        # is always right or always wrong. Record NaN for this model instead
        # of aborting the statistics of every model.
        return float("nan")
    return chi2_p_value


# (label used in the output column name, helper computing the summary value)
_SUMMARY_FUNCTIONS = (
    ("mean", get_mean),
    ("max", get_max),
    ("min", get_min),
    ("std", get_standard_dev),
)

for model_, experiment in EXPERIMENTS.items():
    single_call_ds_ = experiment.single_call_dataset
    multi_call_ds_ = experiment.multi_call_dataset

    # Significance tests comparing the single-call and multi-call results.
    # Key order is kept stable because it defines the column order of the
    # resulting DataFrame.
    stats: dict = {
        "model": model_,
        "ner_f1_t_test_p_value": _paired_t_test_p_value(single_call_ds_, multi_call_ds_, "ner_f1"),
        "review_bleu_t_test_p_value": _paired_t_test_p_value(single_call_ds_, multi_call_ds_, "review_score"),
        "sentiment_score_chi2_p_value": _sentiment_chi2_p_value(single_call_ds_, multi_call_ds_),
        "score_t_test_p_value": _paired_t_test_p_value(single_call_ds_, multi_call_ds_, "score"),
    }

    # Descriptive statistics of every evaluation column, single-call first.
    for column_name in evaluation_columns:
        for summary_label, summarize in _SUMMARY_FUNCTIONS:
            stats[f"single_call_{summary_label}_{column_name}"] = summarize(single_call_ds_, column_name)
            stats[f"multi_call_{summary_label}_{column_name}"] = summarize(multi_call_ds_, column_name)

    statistics_.append(stats)

# One row per model, one column per statistic.
statistics: pd.DataFrame = pd.DataFrame(statistics_)


In [47]:
# Show the first rows of the per-model statistics table.
statistics.head(n=5)

Unnamed: 0,model,ner_f1_t_test_p_value,review_bleu_t_test_p_value,sentiment_score_chi2_p_value,score_t_test_p_value,single_call_mean_score,multi_call_mean_score,single_call_max_score,multi_call_max_score,single_call_min_score,...,single_call_std_ner_recall,multi_call_std_ner_recall,single_call_mean_ner_f1,multi_call_mean_ner_f1,single_call_max_ner_f1,multi_call_max_ner_f1,single_call_min_ner_f1,multi_call_min_ner_f1,single_call_std_ner_f1,multi_call_std_ner_f1
0,gemma2_9b,0.274078,0.01126912,2.497441e-125,0.1540392,0.807445,0.813172,1.0,1.0,0.0,...,0.37271,0.369702,0.547466,0.557513,1.0,1.0,0.0,0.0,0.355707,0.347419
1,qwen2_7b,0.08351762,1.643344e-33,1.7819200000000002e-60,1.11507e-17,0.540063,0.609789,1.0,1.0,0.0,...,0.334084,0.359914,0.251268,0.269788,1.0,1.0,0.0,0.0,0.315313,0.299897
2,llama3.1,0.281374,1.3438980000000001e-17,2.8187090000000003e-28,2.006397e-06,0.718807,0.672146,1.0,1.0,0.0,...,0.371594,0.379433,0.429982,0.440959,1.0,1.0,0.0,0.0,0.352558,0.34047
3,phi3_medium,1.258784e-16,3.9043329999999996e-90,0.8134746,3.548911e-34,0.25654,0.436825,1.0,1.0,0.0,...,0.2744,0.344538,0.116834,0.226209,1.0,1.0,0.0,0.0,0.27412,0.30998
4,mistral_7b,0.1980205,7.86997e-05,2.4329149999999997e-44,0.01256399,0.628788,0.602575,1.0,1.0,0.0,...,0.372358,0.360578,0.306004,0.325585,1.0,1.0,0.0,0.0,0.378029,0.326269


In [48]:
# Write the table next to the input data, reusing the shared resources folder
# instead of repeating its location as a separate hard-coded string.
statistics.to_csv(join(resources_path, "statistics-revised.csv"))