In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score, cohen_kappa_score
from IPython.display import HTML, display
import itertools
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters
import seaborn as sns
from pathlib import Path
dir2dir = {
    "7b_ps_df1": "llama_7b_pretrained_shorter",
    "7b_fts_df1": "llama_7b_fine_tuned_shorter",
    "65b_ps_df1": "llama_65b_pretrained_shorter",
    "65b_fts_df1": "llama_65b_fine_tuned_shorter"
}

In [2]:
_full_df = pd.read_csv("./llama/human_eval/svolgimenti/Svolgimenti_tutti.csv")

# sort data by questionary
full_df = _full_df.sort_values(by=["Questionario", "Data di inizio"])

# answers to float
q_cols = [f"Domanda {i}" for i in range(1, 21)]
full_df.loc[:, q_cols] = full_df.loc[:, q_cols].applymap(
    lambda x: float(
        x.replace("Risposta ", "")
        .replace(" (sicuramente scritto da una macchina)", "")
        .replace(" (sicuramente scritto da una persona)", "")
    ) if type(x) == str else x
)

full_df = full_df.reset_index(drop=True)
full_df = full_df.loc[full_df.Stato == "Completato", :]
full_df = full_df.loc[full_df.filter(regex="Domanda*").notna().all(axis=1), :]
full_df = full_df.loc[full_df.loc[:, "Domanda 7"] > 3, :]

  full_df.loc[:, q_cols] = full_df.loc[:, q_cols].applymap(


In [3]:
mask_7b_pretrain = full_df.Questionario.str.contains("7b_pretrained")
mask_7b_finetune = full_df.Questionario.str.contains("7b_fine_tuned")
mask_65b_pretrain = full_df.Questionario.str.contains("65b_pretrained")
mask_65b_finetune = full_df.Questionario.str.contains("65b_fine_tuned")

df_7b_pretrain = full_df.loc[mask_7b_pretrain, :].copy()
df_7b_finetune = full_df.loc[mask_7b_finetune, :].copy()
df_65b_pretrain = full_df.loc[mask_65b_pretrain, :].copy()
df_65b_finetune = full_df.loc[mask_65b_finetune, :].copy()

In [4]:
# for i in range(5):
#     print(processed_pretrain_65b_df.loc[:, i].isna().sum())
# processed_pretrain_65b_df.loc[:, 3].to_list()
# df_65b_pretrain

In [5]:
def normalize(x):
    return (x - x.mean()) / x.std()

def preprocess_df(df):
    new_bkg_dfs = []
    _new_dfs = []
    _dfs = df.groupby("Questionario")
    _qs = []
    for idx, (q_name, _df) in enumerate(_dfs):
        background_df = pd.read_csv(q_name, index_col=0)
        background_df.index = [f"Domanda {idx * 20 + i}" for i in range(1, 21)]
        no_control_questions = [f"Domanda {idx * 20 + i}" for i in range(1, 21) if i != 7]
        background_df = background_df.loc[no_control_questions, :]
        _df = (
            _df.filter(regex="Domanda*")
            .rename({f"Domanda {i}": f"Domanda {idx * 20 + i}" for i in range(1, 21)}, axis=1)
            .loc[:, no_control_questions].T
        )
        _new_dfs.append(_df)
        new_bkg_dfs.append(background_df)
        for i in range(_df.shape[0]):
            _qs.append(q_name)

    background_df = pd.concat(new_bkg_dfs, axis=0)
    _new_dfs = [pd.DataFrame([i[~np.isnan(i)] for i in _df.values]) for _df in _new_dfs]
    _df = pd.concat(_new_dfs, axis=0)
    _df.loc[:, "Questionario"] = _qs
    assert _df.shape[0] == background_df.shape[0], f"{_df.shape[0]} {background_df.shape[0]}"
    return _df, background_df

def get_metrics_df(_df, background_df):
    _metrics_df = pd.DataFrame()
    for col_idx, col in enumerate(_df.columns):
        if col == "Questionario":
            continue
        if _df.loc[:, col].notna().all():
            _metrics_df.loc[:, f"player {col_idx}"] = _df.loc[:, col].reset_index(drop=True)
    _metrics_df.loc[:, "avg_pred"] = _df.mean(axis=1, numeric_only=True).reset_index(drop=True)
    _metrics_df.loc[:, "bool_pred"] = _metrics_df["avg_pred"] <= 3
    _metrics_df.loc[:, "bool_scaled_pred"] = _metrics_df["avg_pred"] <= _metrics_df["avg_pred"].mean()
    _metrics_df.loc[:, "high_pred"] = _df.max(axis=1, numeric_only=True).reset_index(drop=True)
    _metrics_df.loc[:, "bool_high_pred"] = _metrics_df.high_pred <= 3
    _metrics_df.loc[:, "low_pred"] = _df.min(axis=1, numeric_only=True).reset_index(drop=True)
    _metrics_df.loc[:, "bool_low_pred"] = _metrics_df.low_pred <= 3
    _metrics_df.loc[:, "bool_is_human"] = background_df.is_human.reset_index(drop=True)
    _metrics_df.loc[:, "is_human"] = _metrics_df["bool_is_human"].apply(int)
    _metrics_df.loc[:, "missed"] = _metrics_df["bool_scaled_pred"] != _metrics_df["is_human"]
    _metrics_df.loc[:, "catched"] = _metrics_df["bool_scaled_pred"] == _metrics_df["is_human"]
    _metrics_df.loc[:, "Questionario"] = _df.loc[:, "Questionario"].values
    return _metrics_df

def compute_acc(metrics_df):
    return {
       "scaled mean": np.round(accuracy_score(metrics_df["bool_is_human"], metrics_df["bool_scaled_pred"]), 3) * 100,
       "mean": np.round(accuracy_score(metrics_df["bool_is_human"], metrics_df["bool_pred"]), 3) * 100,
    }

def compute_all_acc(metrics_df):
    n_players = metrics_df.filter(regex="player*").shape[1]
    all_cohens = []
    quest_scores = {}
    quest_scores["accuracy"] = compute_acc(metrics_df)
    players_scores = {}
    all_metrics_df = metrics_df.groupby("Questionario")
    acc_x = {}
    for idx, (_, _metrics_df) in enumerate(all_metrics_df):
        players_scores[f"Chunk {idx}"] = {}
        # quest_scores[f"Chunk {idx}"] = {}
        for key, val in compute_acc(_metrics_df).items():
            if key in acc_x:
                acc_x[key].append(val)
            else:
                acc_x[key] = [val]
        for i in range(n_players):
            acc = np.round(
                accuracy_score(
                    _metrics_df.loc[:, f"bool_is_human"],
                    _metrics_df.loc[:, f"player {i}"] <= 3.0
                ),
                3
            ) * 100

            players_scores[f"Chunk {idx}"][i] = {"accuracy": acc}
        players_scores[f"Chunk {idx}"]["std"] = np.std(
            [j["accuracy"] for j in players_scores[f"Chunk {idx}"].values()]
        )
        all_data = np.zeros((n_players, n_players))
        for i, j in itertools.combinations(range(n_players), r=2):
            all_data[i,j] = cohen_kappa_score(
                _metrics_df.loc[:, f"player {i}"] <= 3,
                _metrics_df.loc[:, f"player {j}"] <= 3
            )
        all_cohens.append(all_data)
    global_cohen = cohen_kappa_score(
            metrics_df.loc[:, f"player {i}"] <= metrics_df.loc[:, f"player {i}"].mean(),
            metrics_df.loc[:, f"player {j}"] <= metrics_df.loc[:, f"player {i}"].mean()
    )
    quest_scores["std"] = {key:np.std(val) for key, val in acc_x.items()}
    return quest_scores, players_scores, all_cohens

def plot_cohen_kappa(all_cohens):
    fig, ax = plt.subplots(1, 5, figsize=(20, 3))
    for i in range(5):
        sns.heatmap(
            all_cohens[i],
            annot=True,
            ax=ax[i]
        )

def _style_for_latex(scores, training):
    df = pd.DataFrame.from_dict(scores).reset_index(names="metric")
    df.loc[:, "model"] = [training for _ in range(df.shape[0])]
    df = df.set_index(["model", "metric"])
    return df

def style_for_latex(scores, trainings):
    dfs = [_style_for_latex(score, training) for score, training in zip(scores, trainings)]
    df = pd.concat(dfs, axis=0)
    return df

In [45]:
processed_pretrain_7b_df, background_pretrain_7b_df = preprocess_df(df_7b_pretrain)
pretrain_metrics_7b_df = get_metrics_df(processed_pretrain_7b_df, background_pretrain_7b_df)
pretrain_7b_quest_scores, pretrain_7b_players_scores, pretrain_7b_all_cohens = compute_all_acc(pretrain_metrics_7b_df)

processed_finetune_7b_df, background_finetune_7b_df = preprocess_df(df_7b_finetune)
finetune_metrics_7b_df = get_metrics_df(processed_finetune_7b_df, background_finetune_7b_df)
finetune_7b_quest_scores, finetune_7b_players_scores, finetune_7b_all_cohens = compute_all_acc(finetune_metrics_7b_df)

processed_pretrain_65b_df, background_pretrain_65b_df = preprocess_df(df_65b_pretrain)
pretrain_metrics_65b_df = get_metrics_df(processed_pretrain_65b_df, background_pretrain_65b_df)
pretrain_65b_quest_scores, pretrain_65b_players_scores, pretrain_65b_all_cohens = compute_all_acc(pretrain_metrics_65b_df)

processed_finetune_65b_df, background_finetune_65b_df = preprocess_df(df_65b_finetune)
finetune_metrics_65b_df = get_metrics_df(processed_finetune_65b_df, background_finetune_65b_df)
finetune_65b_quest_scores, finetune_65b_players_scores, finetune_65b_all_cohens = compute_all_acc(finetune_metrics_65b_df)

In [64]:
scores = [pretrain_7b_quest_scores, finetune_7b_quest_scores, pretrain_65b_quest_scores, finetune_65b_quest_scores]
trainings = ["pretrain", "finetune", "pretrain", "finetune"]
df = style_for_latex(scores, trainings)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,std
model,metric,Unnamed: 2_level_1,Unnamed: 3_level_1
pretrain,scaled mean,85.3,6.149992
pretrain,mean,82.1,7.13033
finetune,scaled mean,73.7,8.773004
finetune,mean,69.5,12.173644
pretrain,scaled mean,73.7,4.695785
pretrain,mean,74.7,6.136253
finetune,scaled mean,66.3,17.188764
finetune,mean,63.2,11.041123


In [60]:
global_cohens = []
fleiss = []
for df in [pretrain_metrics_7b_df, finetune_metrics_7b_df, pretrain_metrics_65b_df, finetune_metrics_65b_df]:
    cohens = []
    fleiss.append(
        fleiss_kappa(
            aggregate_raters(
                (df.filter(regex="player*") <= 3).astype(int).values
            )[0]
        )
    )
    n_players = df.filter(regex="player*").shape[1]
    for i,j in itertools.combinations(range(n_players), r=2):
        # print(i, j)
        cohens.append(cohen_kappa_score(
            # (df.iloc[:, i] <= df.iloc[:, i].median()).astype(int),
            # (df.iloc[:, j] <= df.iloc[:, j].median()).astype(int)
            (df.iloc[:, i] <= 3).astype(int),
            (df.iloc[:, j] <= 3).astype(int)
        ))
    global_cohens.append(np.mean(cohens))
    print("avg cohen", np.mean(cohens))
    print("fleiss", fleiss[-1])

avg cohen 0.3666467816337317
fleiss 0.36459303315210284
avg cohen 0.2224176294331277
fleiss 0.22306730690593854
avg cohen 0.33584722966515984
fleiss 0.33014216922724093
avg cohen 0.21266000463275017
fleiss 0.20568561872909702


In [79]:
from IPython.core.display import HTML, display
for i in range(5):
    df1 = pd.read_csv(f"/home/gpuccetti/Projects/llama/llama/human_eval/data/llama_65b_pretrained_shorter/human_eval_df1_{i}.csv", index_col=0)
    df2 = pd.read_csv(f"/home/gpuccetti/Projects/llama/llama/human_eval/data/llama_65b_pretrained_shorter/human_eval_df2_{i}.csv", index_col=0)
    # df1 = df1.loc[~df1.is_human, :]
    # df2 = df2.loc[~df2.is_human, :]
    for idx in range(df1.shape[0]):
        i1 = df1.iloc[idx, :].loc["Testo domanda"]
        t1 = df1.iloc[idx, :].loc["is_human"]
        i2 = df2.iloc[idx, :].loc["Testo domanda"]
        t2 = df2.iloc[idx, :].loc["is_human"]
        display(f"{idx}")
        display("First", HTML(str(t1)))
        display(HTML(i1))
        display("Second", HTML(str(t2)))
        display(HTML(i2))
    break


'0'

'First'

'Second'

'1'

'First'

'Second'

'2'

'First'

'Second'

'3'

'First'

'Second'

'4'

'First'

'Second'

'5'

'First'

'Second'

'6'

'First'

'Second'

'7'

'First'

'Second'

'8'

'First'

'Second'

'9'

'First'

'Second'

'10'

'First'

'Second'

'11'

'First'

'Second'

'12'

'First'

'Second'

'13'

'First'

'Second'

'14'

'First'

'Second'

'15'

'First'

'Second'

'16'

'First'

'Second'

'17'

'First'

'Second'

'18'

'First'

'Second'

'19'

'First'

'Second'

## T tests

In [12]:
from scipy.stats import ttest_ind
from itertools import combinations

real_mask = finetune_metrics_65b_df.bool_is_human
gen_mask = ~finetune_metrics_65b_df.bool_is_human

real_future_table = {}
gen_future_table = {}

for (name1, ds1), (name2, ds2) in combinations(zip(
    ["Llama 7b pretrain", "Llama 7b finetune", "Llama 65b pretrain", "Llama 65b finetune"],
    [pretrain_metrics_7b_df, finetune_metrics_7b_df, pretrain_metrics_65b_df, finetune_metrics_65b_df]
), 2):


    real_future_table[name1] = real_future_table.get(name1, {})
    real_tab1 = real_future_table[name1]
    real_tab1[name2] = real_tab1.get(name2, {})
    real_tab2 = real_tab1[name2]

    gen_future_table[name1] = gen_future_table.get(name1, {})
    gen_tab1 = gen_future_table[name1]
    # gen_tab1[name2] = gen_tab1.get(name2, {})
    # gen_tab2 = gen_tab1[name2]


    # print(name1, name2)
    gen_ds1 = ds1.loc[gen_mask, :].avg_pred
    gen_ds2 = ds2.loc[gen_mask, :].avg_pred
    gen_mean1 = gen_ds1.mean()
    gen_mean2 = gen_ds2.mean()
    gen_mean_test = ttest_ind(gen_ds1, gen_ds2)
    # print("Gen mean", gen_mean1, gen_mean2)
    # print("\t P value", gen_mean_test.pvalue)
    # gen_tab2["gen pval"] = gen_mean_test.pvalue
    gen_tab1[name2] = gen_mean_test.pvalue

    real_ds1 = ds1.loc[real_mask, :].avg_pred
    real_ds2 = ds2.loc[real_mask, :].avg_pred
    real_mean1 = real_ds1.mean()
    real_mean2 = real_ds2.mean()
    real_mean_test = ttest_ind(real_ds1, real_ds2)
    # print("Real mean", real_mean1, real_mean2)
    # print("\t P value", real_mean_test.pvalue)
    # real_tab2["real pval"] = real_mean_test.pvalue
    real_tab1[name2] = real_mean_test.pvalue

# ttest_ind(pretrain_metrics_7b_df.loc[gen_mask, :].avg_pred, finetune_metrics_65b_df.loc[gen_mask, :].avg_pred)
# ttest_ind(finetune_metrics_7b_df.loc[gen_mask, :].avg_pred, finetune_metrics_65b_df.loc[gen_mask, :].avg_pred)

# ttest_ind(pretrain_metrics_7b_df.loc[real_mask, :].avg_pred, finetune_metrics_7b_df.loc[ real_mask, :].avg_pred)
# ttest_ind(pretrain_metrics_7b_df.loc[real_mask, :].avg_pred, finetune_metrics_65b_df.loc[real_mask, :].avg_pred)
# ttest_ind(finetune_metrics_7b_df.loc[real_mask, :].avg_pred, finetune_metrics_65b_df.loc[real_mask, :].avg_pred)

In [13]:
print(pd.DataFrame.from_dict(real_future_table).T.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  Llama 7b finetune &  Llama 65b pretrain &  Llama 65b finetune \\
\midrule
Llama 7b pretrain  &           0.995618 &            0.412475 &            0.562456 \\
Llama 7b finetune  &                NaN &            0.425561 &            0.574688 \\
Llama 65b pretrain &                NaN &                 NaN &            0.807850 \\
\bottomrule
\end{tabular}



  print(pd.DataFrame.from_dict(real_future_table).T.to_latex())


In [14]:
print(pd.DataFrame.from_dict(gen_future_table).T.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  Llama 7b finetune &  Llama 65b pretrain &  Llama 65b finetune \\
\midrule
Llama 7b pretrain  &           0.000897 &            0.046171 &            0.001997 \\
Llama 7b finetune  &                NaN &            0.273690 &            0.954878 \\
Llama 65b pretrain &                NaN &                 NaN &            0.289662 \\
\bottomrule
\end{tabular}



  print(pd.DataFrame.from_dict(gen_future_table).T.to_latex())
