In [1]:
%%capture
!pip install pandas

In [2]:
import pandas as pd


class Experiment:
    """Pairs a model name with its single-call and multi-call datasets.

    The three values are kept in name-mangled attributes and exposed through
    read-only properties.
    """

    __model_name: str
    __single_call_dataset: pd.DataFrame
    __multi_call_dataset: pd.DataFrame

    def __init__(self, model_name: str, single_call_dataset: pd.DataFrame, multi_call_dataset: pd.DataFrame):
        """Store the model name and both datasets.

        Raises:
            AssertionError: if either dataset is ``None`` (the multi-call
                dataset is checked first).
        """
        self.__model_name = model_name
        self.__single_call_dataset = single_call_dataset
        self.__multi_call_dataset = multi_call_dataset
        # The attributes hold the very same objects as the arguments, so the
        # checks can be made on the arguments directly.
        assert multi_call_dataset is not None
        assert single_call_dataset is not None

    @property
    def model_name(self) -> str:
        """Name of the model this experiment belongs to."""
        return self.__model_name

    @property
    def single_call_dataset(self) -> pd.DataFrame:
        """Dataset passed in as ``single_call_dataset``."""
        return self.__single_call_dataset

    @property
    def multi_call_dataset(self) -> pd.DataFrame:
        """Dataset passed in as ``multi_call_dataset``."""
        return self.__multi_call_dataset

    def __repr__(self) -> str:
        """Return ``"<model name> Experiment"``."""
        return f"{self.__model_name} Experiment"

In [3]:
from typing import List, Dict, Union
from os import listdir
from os.path import join, isdir

# Folder that contains one sub-directory per model.
resources_path: str = join("..", "resources")

# Every directory directly under the resources folder is taken as a model name;
# entries that are not directories are skipped.
MODELS: List[str] = [
    entry
    for entry in listdir(resources_path)
    if isdir(join(resources_path, entry))
]

# Read the multi-call and single-call CSV of each model and wrap the pair in an
# Experiment, keyed by model name.
EXPERIMENTS: Dict[str, Experiment] = {}
for model in MODELS:
    multi_call_df_: pd.DataFrame = pd.read_csv(
        f"{resources_path}/{model}/sampled_reviews_with_output_multicall_{model}_evaluated.csv"
    )
    single_call_df_: pd.DataFrame = pd.read_csv(
        f"{resources_path}/{model}/sampled_reviews_with_output_{model}_evaluated.csv"
    )
    EXPERIMENTS[model] = Experiment(
        model_name=model,
        single_call_dataset=single_call_df_,
        multi_call_dataset=multi_call_df_,
    )

EXPERIMENTS

{'gemma2_9b': gemma2_9b Experiment,
 'qwen2_7b': qwen2_7b Experiment,
 'llama3.1': llama3.1 Experiment,
 'phi3_medium': phi3_medium Experiment,
 'mistral_7b': mistral_7b Experiment}

In [4]:
# Names of the metric columns to read from each dataset.
evaluation_columns: List[str] = [
    "score",
    "review_score",
    "sentiment_score",
    "ner_accuracy",
    "ner_precision",
    "ner_recall",
    "ner_f1",
]

In [5]:
from typing import Union

# Accumulates one dict of statistics per model; starts empty each time the cell runs.
statistics_: List[Dict[str, Union[str, float]]] = list()


def get_mean(dataframe_: pd.DataFrame, metric: str) -> float:
    """Return the mean of column ``metric`` of ``dataframe_``."""
    column = dataframe_[metric]
    return column.mean()


def get_max(dataframe_: pd.DataFrame, metric: str) -> float:
    """Return the largest value of column ``metric`` of ``dataframe_``."""
    column = dataframe_[metric]
    return column.max()


def get_min(dataframe_: pd.DataFrame, metric: str) -> float:
    """Return the smallest value of column ``metric`` of ``dataframe_``."""
    column = dataframe_[metric]
    return column.min()


def get_standard_dev(dataframe_: pd.DataFrame, metric: str) -> float:
    """Return the standard deviation of column ``metric``, as computed by ``Series.std()``."""
    column = dataframe_[metric]
    return column.std()


def get_25(dataframe_: pd.DataFrame, metric: str) -> float:
    """Return the 0.25 quantile of column ``metric`` of ``dataframe_``."""
    column = dataframe_[metric]
    return column.quantile(0.25)


def get_50(dataframe_: pd.DataFrame, metric: str) -> float:
    """Return the 0.50 quantile of column ``metric`` of ``dataframe_``."""
    column = dataframe_[metric]
    return column.quantile(0.50)


def get_75(dataframe_: pd.DataFrame, metric: str) -> float:
    """Return the 0.75 quantile of column ``metric`` of ``dataframe_``."""
    column = dataframe_[metric]
    return column.quantile(0.75)


def get_90(dataframe_: pd.DataFrame, metric: str) -> float:
    """Return the 0.9 quantile of column ``metric`` of ``dataframe_``."""
    column = dataframe_[metric]
    return column.quantile(0.9)


# Label that appears in the generated column name, paired with the helper that
# computes the statistic. The order of this table fixes the order of the
# generated columns, so adding a statistic is a one-line change here.
stat_functions_ = [
    ("mean", get_mean),
    ("max", get_max),
    ("min", get_min),
    ("std", get_standard_dev),
    ("25", get_25),
    ("50", get_50),
    ("75", get_75),
    ("90", get_90),
]

for model_, experiment in EXPERIMENTS.items():
    # One row per model, starting with the model name.
    stats: dict = {"model": model_}

    # Single-call is listed first so that, for every statistic, the
    # "single_call_*" column precedes its "multi_call_*" counterpart.
    datasets_ = (
        ("single_call", experiment.single_call_dataset),
        ("multi_call", experiment.multi_call_dataset),
    )

    for column_name in evaluation_columns:
        for stat_label, stat_function in stat_functions_:
            for call_type, dataset_ in datasets_:
                # Produces keys such as "single_call_mean_score".
                stats[f"{call_type}_{stat_label}_{column_name}"] = stat_function(dataset_, column_name)

    statistics_.append(stats)

# One row per model, one column per (call type, statistic, metric) combination.
statistics: pd.DataFrame = pd.DataFrame(statistics_)


In [6]:
# Preview the first rows of the per-model statistics table.
statistics.head()

Unnamed: 0,model,single_call_mean_score,multi_call_mean_score,single_call_max_score,multi_call_max_score,single_call_min_score,multi_call_min_score,single_call_std_score,multi_call_std_score,single_call_25_score,...,single_call_std_ner_f1,multi_call_std_ner_f1,single_call_25_ner_f1,multi_call_25_ner_f1,single_call_50_ner_f1,multi_call_50_ner_f1,single_call_75_ner_f1,multi_call_75_ner_f1,single_call_90_ner_f1,multi_call_90_ner_f1
0,gemma2_9b,0.807445,0.813172,1.0,1.0,0.0,0.0,0.171131,0.166355,0.666667,...,0.355707,0.347419,0.285714,0.328947,0.615385,0.666667,0.842105,0.833333,1.0,1.0
1,qwen2_7b,0.540063,0.609789,1.0,1.0,0.0,0.0,0.25285,0.257303,0.354619,...,0.315313,0.299897,0.0,0.0,0.0,0.181818,0.5,0.5,0.769231,0.666667
2,llama3.1,0.718807,0.672146,1.0,1.0,0.0,0.0,0.252694,0.2864,0.666667,...,0.352558,0.34047,0.0,0.0,0.444444,0.5,0.666667,0.701471,1.0,0.888889
3,phi3_medium,0.25654,0.436825,1.0,1.0,0.0,0.0,0.291692,0.343134,0.0,...,0.27412,0.30998,0.0,0.0,0.0,0.0,0.0,0.444444,0.545455,0.666667
4,mistral_7b,0.628788,0.602575,1.0,1.0,0.0,0.0,0.243628,0.307657,0.471104,...,0.378029,0.326269,0.0,0.0,0.0,0.307692,0.6,0.6,1.0,0.8


In [7]:
# Save the summary table inside the resources folder. The path is built from
# resources_path so the folder location is defined in one place only.
# NOTE(review): the row index is written as well (to_csv default); confirm
# whether consumers of this file expect that leading unnamed column.
statistics.to_csv(join(resources_path, "statistics.csv"))