In [None]:
# !pip install dataframe_image

In [None]:
import os, json, time, pickle
import pandas as pd, numpy as np 
import matplotlib.pyplot as plt

# Work from the project root so that every path below can stay relative.
# NOTE(review): the absolute path ties the notebook to one machine layout.
os.chdir('/home/jovyan/work/')

# Prefix of every figure/table file name written by this notebook.
modelname = 'mistral_'
# Folder with the raw annotation pickles (read only).
indatadir = os.path.join('personas', 'mistraldata_llm_1_2')
# Folder for intermediate pickles written and re-read by this notebook.
outdatadir = os.path.join('personas', 'mistral_study_1_2')
# Folder for figures and tables.
resultsdir = os.path.join('personas', 'mistralresults')

# Create the output folders up front; savefig / pickle.dump would otherwise
# raise FileNotFoundError on a fresh checkout.
for _outdir in (outdatadir, resultsdir):
    os.makedirs(_outdir, exist_ok=True)

In [None]:
import warnings
# Hide FutureWarning and pandas PerformanceWarning messages for the rest of
# the notebook so that cell outputs stay readable.
# NOTE(review): the PerformanceWarning presumably stems from building
# DataFrames column by column in the analysis functions - confirm.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

#### Load data and annotations

The majority-vote labels of the Sap et al. large-scale dataset serve as the ground truth.

In [None]:
# Load the reference data; its 'ogLabelToxic' column is used as ground truth below.
# NOTE(review): read_pickle executes pickled code - only load trusted files.
pd_data = pd.read_pickle('personas/data_ext/lscale_majVote.pkl')
pd_data.head(3)

Load persona annotations on Sap et al. large scale data

In [None]:
# Annotations produced with persona prompts. Later cells treat the first
# three columns as metadata and every further column as one annotator.
pd_persona = pd.read_pickle(os.path.join(indatadir, '1000_persona_you.pkl'))
len(pd_persona.columns)

In [None]:
# Annotations produced without persona prompts; same layout is assumed as for
# pd_persona (three leading metadata columns, then one column per annotator).
pd_nopersona = pd.read_pickle(os.path.join(indatadir, '1000_nopersona_you.pkl'))
len(pd_nopersona.columns)

#### Analysis Functions

In [None]:
from sklearn.metrics import classification_report

def votes_from_runs_crowd(pd_):
    """Majority vote of a crowd that grows one annotator at a time.

    Column ``run_c`` of the result holds, per row, the vote of the first
    ``c + 1`` columns of ``pd_``: 1 if strictly more than half of them are 1,
    otherwise 0 (ties therefore resolve to 0). Missing values contribute
    nothing to the running sum, exactly like a NaN-skipping row sum.

    The running sums are computed with a single cumulative sum instead of
    re-summing every column prefix, and the result frame is built in one go
    instead of being grown column by column.

    Parameters
    ----------
    pd_ : pandas.DataFrame
        One column per annotator, one row per item, values 0/1.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 votes with columns ``run_0 .. run_{n-1}`` and a default
        integer index; an empty frame if ``pd_`` has no columns.
    """
    n_annotators = len(pd_.columns)
    if n_annotators == 0:
        return pd.DataFrame()
    labels = pd_.to_numpy(dtype=float, na_value=np.nan)
    running_sums = np.nancumsum(labels, axis=1)
    # A crowd of size c + 1 votes 1 only with strictly more than (c + 1) / 2 positives.
    majority_threshold = np.arange(1, n_annotators + 1) / 2
    votes = (running_sums > majority_threshold).astype(int)
    return pd.DataFrame(votes, columns=[f'run_{c}' for c in range(n_annotators)])

def votes_from_runs_individual(pd_):
    """Treat every annotator column as its own single-member "run".

    The persona id is taken as the second ``'_'``-separated token of each
    column name; the columns are then renamed to ``run_0 .. run_{n-1}`` so
    that the frame can be passed to ``reports_from_votes``.

    The renaming is done on a copy, so the frame passed in keeps its original
    column names.

    Parameters
    ----------
    pd_ : pandas.DataFrame
        One column per annotator; column names must contain at least one ``'_'``.

    Returns
    -------
    tuple[pandas.DataFrame, list[str]]
        The renamed copy and the persona ids in column order.
    """
    personas = [name.split('_')[1] for name in pd_.columns]
    renamed = pd_.copy()
    renamed.columns = [f'run_{i}' for i in range(len(personas))]
    return renamed, personas

def reports_from_votes(pd_, groundtruth_label):
    """Score every ``run_<i>`` column of ``pd_`` against the ground truth.

    Returns a list with one ``classification_report`` dict (``output_dict=True``)
    per column, in the order ``run_0, run_1, ...``. Columns are looked up by
    that name, so ``pd_`` must use exactly this naming.
    """
    return [
        classification_report(groundtruth_label, pd_[f'run_{idx}'], output_dict=True)
        for idx in range(pd_.shape[1])
    ]

def series_from_reports(list_, personas):
    """Flatten a list of classification-report dicts into one table.

    The first column is ``personaId`` (taken from ``personas``) when
    ``personas`` is truthy, otherwise ``crowdsize`` counting 1..len(list_).
    It is followed by the per-class (``0_*``, ``1_*``), macro-average
    (``mavg_*``) and weighted-average (``wavg_*``) metrics and finally
    ``accuracy``. All ``*_support`` columns are dropped.
    """
    if personas:
        columns = {'personaId': personas}
    else:
        columns = {'crowdsize': list(range(1, len(list_) + 1))}

    template = list_[0]  # the first report defines which metric keys exist
    sections = (('0', '0'), ('1', '1'), ('macro avg', 'mavg'), ('weighted avg', 'wavg'))
    for section, prefix in sections:
        for field in template[section].keys():
            columns[prefix + '_' + field] = [report[section][field] for report in list_]
    columns['accuracy'] = [report['accuracy'] for report in list_]

    table = pd.DataFrame(columns)
    return table.drop(columns=['0_support', '1_support', 'mavg_support', 'wavg_support'])

def plot_series(dict_results, savename):
    """Draw a 5x3 grid with one panel per metric and one line per run.

    ``dict_results`` maps a group name to a dict with the keys ``'dfs'``
    (run id -> result frame), ``'color'`` and ``'label'``. In every frame the
    first column is the x axis and columns 1..13 are the metrics. The two
    unused panels of the last row are removed and the figure is written to
    ``resultsdir`` as ``<modelname><savename>.pdf``.
    """
    fig, axs = plt.subplots(5, 3, figsize=(6,8))
    for group in dict_results.values():
        for metric_idx in range(13):
            # Fill the grid row by row: metric i goes to (i // 3, i % 3).
            panel = axs[divmod(metric_idx, 3)]
            for frame in group['dfs'].values():
                panel.plot(
                    frame.iloc[:, 0],
                    frame.iloc[:, metric_idx + 1],
                    color=group['color'],
                    label=group['label'],
                    linewidth=0.5,
                )
                panel.set_ylim((0,1.05))
                panel.title.set_text(frame.columns[metric_idx + 1])
    for unused in (axs[4,1], axs[4,2]):
        fig.delaxes(unused)
    fig.tight_layout()
    plt.savefig(os.path.join(resultsdir, modelname+f'{savename}.pdf'))
    plt.show()

def plot_single_series(dict_results, metric, savename):
    """Plot one metric against crowd size, one line per run.

    ``dict_results`` has the same layout as for ``plot_series``; every frame
    needs a ``'crowdsize'`` column and a ``metric`` column. The y range is the
    observed value range widened by 0.05 on both sides, where the search for
    the range starts from the bounds (1, 0). The figure is written to
    ``resultsdir`` as ``<modelname><savename>.pdf``.
    """
    fig, ax = plt.subplots(1, 1, figsize=(5,3))
    lowest, highest = 1, 0
    for group in dict_results.values():
        for frame in group['dfs'].values():
            values = frame[metric]
            ax.plot(frame['crowdsize'], values, color=group['color'], label=group['label'], linewidth=0.5)
            lowest = min(lowest, values.min())
            highest = max(highest, values.max())
    ax.set_ylim((lowest-0.05, highest+0.05))
    ax.set(xlabel='crowd size', ylabel=metric)
    plt.savefig(os.path.join(resultsdir, modelname+f'{savename}.pdf'))
    plt.show()

def plot_boxplots(dict_results, savename):
    """Draw a 5x3 grid of box plots, one panel per metric.

    ``dict_results`` maps a group name to a result frame whose first column is
    an id column and whose columns 1..13 are the metrics; every panel shows
    one box per group. The tick labels are fixed to ``'p'`` / ``'np'``, i.e.
    the function expects exactly two groups in that order. The figure is
    written to ``resultsdir`` as ``<modelname><savename>.pdf``.
    """
    metric_names = [list(frame.columns) for frame in dict_results.values()][0][1:]
    fig, axs = plt.subplots(5, 3, figsize=(6,8))
    for metric_idx in range(13):
        # Fill the grid row by row: metric i goes to (i // 3, i % 3).
        panel = axs[divmod(metric_idx, 3)]
        data_ = {name: frame.iloc[:, 1 + metric_idx] for name, frame in dict_results.items()}
        panel.boxplot(data_.values())
        # Adjust the labels by hand when plotting more than the two default groups.
        panel.set_xticklabels(['p','np'], rotation=45)
        panel.set_ylim([0,1.05])
        panel.title.set_text(metric_names[metric_idx])
    for unused in (axs[4,1], axs[4,2]):
        fig.delaxes(unused)
    fig.tight_layout()
    plt.savefig(os.path.join(resultsdir, modelname+f'{savename}.pdf'))
    plt.show()

def plot_single_boxplot(dict_results, metric, savename):
    """Box plot of a single metric, one box per group in ``dict_results``.

    The y range is the observed value range widened by 0.05 on both sides,
    where the search for the range starts from the bounds (1, 0). Tick labels
    are fixed to ``'p'`` / ``'np'``, so exactly two groups are expected. The
    figure is written to ``resultsdir`` as ``<modelname><savename>.pdf``.
    """
    fig, ax = plt.subplots(1,1, figsize=(5,3))
    data_ = {name: frame[metric] for name, frame in dict_results.items()}
    lowest, highest = 1, 0
    for values in data_.values():
        lowest = min(lowest, values.min())
        highest = max(highest, values.max())
    ax.boxplot(data_.values())
    # Adjust the labels by hand when plotting more than the two default groups.
    ax.set_xticklabels(['p','np'])
    ax.set_ylim([lowest-0.05, highest+0.05])
    ax.set(ylabel=metric)
    plt.savefig(os.path.join(resultsdir, modelname+f'{savename}.pdf'))
    plt.show()

import dataframe_image as dfi

def create_table(dict_results, shorts, savename):
    """Summarise every metric per group and export the table as a PNG.

    Parameters
    ----------
    dict_results : dict
        Group name -> result frame. The first column is an id column; every
        further column is a metric. The metric names are taken from the first
        frame and must exist in all frames.
    shorts : dict
        Group name -> short prefix used in the output column names.
    savename : str
        File stem; the image is written to ``resultsdir`` as
        ``<modelname><savename>.png``.

    Returns
    -------
    pandas.DataFrame
        One row per metric with columns ``<short>_<stat>`` for the statistics
        min, mean, 50% and max, rounded to 4 decimals.
    """
    frames = list(dict_results.items())
    metrics = list(frames[0][1].columns)[1:]
    # Describe each group once, restricted to the metric columns, so that the
    # result does not depend on whether the id column happens to be numeric.
    summaries = {name: frame[metrics].describe() for name, frame in frames}

    pd_scores = pd.DataFrame({'metric': metrics})
    for stat in ['min', 'mean', '50%', 'max']:
        for name, _ in frames:
            pd_scores[f'{shorts[name]}_{stat}'] = summaries[name].loc[stat, metrics].to_numpy()
    pd_scores = np.round(pd_scores, 4)
    dfi.export(pd_scores, os.path.join(resultsdir, modelname+f'{savename}.png'), table_conversion='matplotlib')
    return pd_scores

#### Process Annotations

In [None]:
groundtruth = pd_data['ogLabelToxic']
size_crowd = 100


def _crowd_runs(annotations, crowd_size, n_meta_cols=3):
    """Cut the annotator columns into consecutive crowds of ``crowd_size``.

    The first ``n_meta_cols`` columns are skipped; a trailing remainder of
    fewer than ``crowd_size`` columns is ignored. The strings 'FALSE' / 'TRUE'
    are mapped to 0 / 1. Returns a dict run index -> DataFrame.
    """
    n_runs = int((len(annotations.columns) - n_meta_cols) / crowd_size)
    runs = {}
    for run in range(n_runs):
        start = n_meta_cols + run * crowd_size
        runs[run] = annotations.iloc[:, start:start + crowd_size].replace({'FALSE': 0, 'TRUE': 1})
    return runs


raw_runs = {
    'persona': _crowd_runs(pd_persona, size_crowd),
    'nopersona': _crowd_runs(pd_nopersona, size_crowd),
}

In [None]:
results_runs_series = {}
dict_results_boxplots = {}

for k_type, v_type in raw_runs.items():
    # Study 1: score the majority vote of every run while the crowd grows.
    results_runs_series[k_type] = {}
    for k_run, v_run in v_type.items():
        votes = votes_from_runs_crowd(v_run)
        reports = reports_from_votes(votes, groundtruth)
        results_runs_series[k_type][k_run] = series_from_reports(reports, None)

    # Study 2: score every annotator on its own. All runs are joined with a
    # single concat instead of growing a frame inside the loop.
    pd_boxplots = pd.concat(list(v_type.values()), axis=1)
    pd_boxplots, personas = votes_from_runs_individual(pd_boxplots)
    reports = reports_from_votes(pd_boxplots, groundtruth)
    dict_results_boxplots[k_type] = series_from_reports(reports, personas)

In [None]:
# Plot configuration for the crowd trajectories: per group the result frames
# of all runs ('dfs') plus the line colour and label used by plot_series /
# plot_single_series.
dict_results_series = {
    'persona': {'dfs': results_runs_series['persona'], 'color': 'blue', 'label': 'persona'},
    'nopersona': {'dfs': results_runs_series['nopersona'], 'color': 'red', 'label': 'no persona you'},
}

#### Show Descriptive Results

In [None]:
# Macro-averaged F1 against crowd size, one line per run; saved as PDF in resultsdir.
plot_single_series(dict_results_series, 'mavg_f1-score', 'study_1_crowds_mavg_f1-score')

In [None]:
# All 13 metrics against crowd size in one grid; saved as PDF in resultsdir.
plot_series(dict_results_series, 'study_1_crowds')

In [None]:
# Distribution of the per-annotator macro-averaged F1 for both groups; saved as PDF in resultsdir.
plot_single_boxplot(dict_results_boxplots, 'mavg_f1-score','study_2_boxplots_mavg_f1-score')

In [None]:
# Per-annotator distributions of all 13 metrics in one grid; saved as PDF in resultsdir.
plot_boxplots(dict_results_boxplots, 'study_2_boxplots')

In [None]:
# Short prefixes for the column names of the summary table (<short>_<statistic>).
shorts = {
    'persona': 'p',
    'nopersona': 'np',
}

# min / mean / median / max of every metric per group; also exported as PNG to resultsdir.
create_table(dict_results_boxplots, shorts, 'study_2_table')

#### Hypothesis Testing for Equality of Distributions

In [None]:
from scipy.stats import ranksums, ks_2samp

# Metric names = all columns after the id column of the first result frame.
metrics = list(list(dict_results_boxplots.values())[0].columns)[1:]


def _decision(pvalue, alpha=0.05):
    """Verbal test decision at significance level ``alpha``."""
    return 'Reject H_0' if pvalue < alpha else 'Do not reject H_0'


# Compare the persona and no-persona score distributions per metric with the
# Wilcoxon rank-sum test and the two-sample Kolmogorov-Smirnov test.
# NOTE(review): no correction for testing several metrics is applied.
for m in metrics:
    data_p = dict_results_boxplots['persona'][m]
    data_np = dict_results_boxplots['nopersona'][m]
    test_wc = ranksums(data_p, data_np)
    test_ks = ks_2samp(data_p, data_np)
    decision_wc = _decision(test_wc.pvalue)
    decision_ks = _decision(test_ks.pvalue)
    print(f'{m}:\tWilcoxon\t{np.round(test_wc.statistic,2)}\t{test_wc.pvalue}\t{decision_wc}\n\t\tKS\t\t{np.round(test_ks.statistic,2)}\t{test_ks.pvalue}\t{decision_ks}')

#### Simulate Crowd Trajectory Permutations

In [None]:
# Number of random orderings drawn per crowd.
n_permutations = 1000
# Number of annotators per crowd (same value as size_crowd in the processing section).
crowd_size = 100

permutations of persona crowds

In [None]:
# Same file as pd_persona above; split into the three leading metadata
# columns and the annotator columns.
pd_persona_you = pd.read_pickle(os.path.join(indatadir,'1000_persona_you.pkl'))
pd_meta = pd_persona_you.iloc[:,:3]
pd_persona_you = pd_persona_you.iloc[:,3:]

# Draw one fixed crowd of crowd_size annotator columns (seed 1) and map the
# 'FALSE' / 'TRUE' strings to 0 / 1.
pd_persona_crowd0 = pd_persona_you.sample(crowd_size, axis=1, random_state=1).replace({'FALSE': 0, 'TRUE': 1})
# Every permutation re-samples all columns of that crowd, i.e. it only changes
# their order; the permutation index doubles as the seed.
persona_permutations = {i: pd_persona_crowd0.sample(crowd_size, axis=1, random_state=i) for i in range(n_permutations)}

permutations of nopersona crowds

In [None]:
# Same file as pd_nopersona above; split into the three leading metadata
# columns and the annotator columns. Note that pd_meta is overwritten here.
pd_nopersona_you = pd.read_pickle(os.path.join(indatadir,'1000_nopersona_you.pkl'))
pd_meta = pd_nopersona_you.iloc[:,:3]
pd_nopersona_you = pd_nopersona_you.iloc[:,3:]

# Draw one fixed crowd of crowd_size annotator columns (seed 1) and map the
# 'FALSE' / 'TRUE' strings to 0 / 1.
pd_random_crowd0 = pd_nopersona_you.sample(crowd_size, axis=1, random_state=1).replace({'FALSE': 0, 'TRUE': 1})
# Every permutation re-samples all columns of that crowd, i.e. it only changes
# their order; the permutation index doubles as the seed.
random_permutations = {i: pd_random_crowd0.sample(crowd_size, axis=1, random_state=i) for i in range(n_permutations)}

In [None]:
# NOTE(review): this rebinds raw_runs, which the processing section above also
# uses; re-running that section after this cell would operate on the
# permutations instead of the original runs.
raw_runs = {
    'persona_permutations': persona_permutations,
    'random_permutations': random_permutations
}

In [None]:
# Crowd trajectory (metrics per crowd size) for every permutation.
# This computes one classification report per permutation and crowd size, so
# the cell is expensive; its result is cached as a pickle further below.
permutations_series = {}

for k_type, v_type in raw_runs.items():
    permutations_series[k_type] = {}
    for k_run, v_run in v_type.items():
        votes = votes_from_runs_crowd(v_run)
        reports = reports_from_votes(votes, groundtruth)
        permutations_series[k_type][k_run] = series_from_reports(reports, None)

In [None]:
# Plot configuration for the permutation trajectories (same layout as dict_results_series).
dict_permutations_series = {
    'persona_permutations': {'dfs': permutations_series['persona_permutations'], 'color': 'blue', 'label': 'persona permutations'},
    'random_permutations': {'dfs': permutations_series['random_permutations'], 'color': 'red', 'label': 'random permutations'},
}

In [None]:
# Cache the permutation results so the expensive computation need not be repeated.
with open(os.path.join(outdatadir,'crowd_permutations.pkl'), 'wb') as f:
    pickle.dump(dict_permutations_series, f)

In [None]:
# Reload the cached permutation results (entry point when the computation above is skipped).
# NOTE(review): pickle.load executes pickled code - only load files written by this notebook.
with open(os.path.join(outdatadir,'crowd_permutations.pkl'), 'rb') as f:
    dict_permutations_series = pickle.load(f)

In [None]:
# Macro-averaged F1 against crowd size, one line per permutation; saved as PDF in resultsdir.
plot_single_series(dict_permutations_series, 'mavg_f1-score', 'study_1_permutations_mavg_f1-score')

In [None]:
# All 13 metrics against crowd size for every permutation; saved as PDF in resultsdir.
plot_series(dict_permutations_series, 'study_1_permutations')

#### Stability Experiments

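Create performance brackets: the 30 lowest-, 30 middle- and 30 highest-scoring personas by macro-averaged F1 (`mavg_f1-score`).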

In [None]:
# Personas ranked from worst to best macro-averaged F1.
performances = dict_results_boxplots['persona'].sort_values('mavg_f1-score', ascending=True)

bracket_size = 30
# 'w' = worst, 'm' = middle, 'b' = best.
# NOTE(review): the middle window 486:516 is hard-coded; for exactly 1000
# personas the centred window would be 485:515 - confirm the intended rows.
bracket_slices = {
    'w': performances.iloc[:bracket_size],
    'm': performances.iloc[486:516],
    'b': performances.iloc[-bracket_size:],
}

# One concat of the three slices instead of repeatedly appending to an empty
# frame; the labels follow the real slice lengths.
brackets = pd.concat(list(bracket_slices.values()))
brackets['bracket'] = [tag for tag, part in bracket_slices.items() for _ in range(len(part))]
brackets = brackets.reset_index(drop=True)

In [None]:
brackets.head(2)

In [None]:
brackets.tail(2)

In [None]:
# brackets.to_pickle(os.path.join(outdatadir,'performance_brackets.pkl'))

In [None]:
# Replace the freshly computed brackets with the stored copy. The cell that
# writes this file is commented out above, so the file must already exist.
brackets = pd.read_pickle(os.path.join(outdatadir,'performance_brackets.pkl'))

---> sent to qwen_llm_1_2 file for re-annotation of bracket personas

process re-annotation of bracket personas

In [None]:
def series_for_persona(list_):
    """Flatten the classification reports of one persona into a table.

    Each report becomes one row, labelled ``run_<i>`` in the ``run`` column,
    followed by the per-class (``0_*``, ``1_*``), macro-average (``mavg_*``)
    and weighted-average (``wavg_*``) metrics and ``accuracy``. All
    ``*_support`` columns are dropped.
    """
    template = list_[0]  # the first report defines which metric keys exist
    columns = {'run': [f'run_{i}' for i in range(len(list_))]}
    prefixes = {'0': '0', '1': '1', 'macro avg': 'mavg', 'weighted avg': 'wavg'}
    for section, prefix in prefixes.items():
        for field in template[section].keys():
            columns[prefix + '_' + field] = [report[section][field] for report in list_]
    columns['accuracy'] = [report['accuracy'] for report in list_]

    support_cols = ['0_support', '1_support', 'mavg_support', 'wavg_support']
    return pd.DataFrame(columns).drop(support_cols, axis=1)

In [None]:
# Re-annotation results for the bracket personas, produced outside this notebook.
# NOTE(review): pickle.load executes pickled code - only load trusted files.
with open(os.path.join(outdatadir,'performance_brackets_results.pkl'), 'rb') as f:
    brackets_annotated = pickle.load(f)

In [None]:
# Map the string labels to 0 / 1 for every entry. Only existing keys are
# reassigned, so the mapping keeps its size and order.
for entry_key in list(brackets_annotated):
    brackets_annotated[entry_key] = brackets_annotated[entry_key].replace({'TRUE': 1, 'FALSE': 0})

In [None]:
# Reload the reference data and ground-truth column (same file and column as
# at the top of the notebook) so this section can be run on its own.
pd_data = pd.read_pickle('personas/data_ext/lscale_majVote.pkl')
groundtruth = pd_data['ogLabelToxic']
pd_data.head(2)

In [None]:
# Score every repeated annotation of each bracket persona against the ground truth.
bracket_scores = {}

for k,v in brackets_annotated.items():
    # The first two columns are skipped; reports_from_votes looks the remaining
    # columns up as run_0, run_1, ... - presumably they are named that way.
    reports = reports_from_votes(v.iloc[:,2:], groundtruth)
    bracket_scores[k] = series_for_persona(reports)

In [None]:
# Macro-averaged F1 of all re-annotation runs per bracket persona, in bracket
# order; personas without re-annotation scores are left out.
bracket_boxplots = {}
labels_boxplots = []

for i, row in brackets.iterrows():
    pid = int(row['personaId'])
    if pid not in bracket_scores:
        continue
    bracket_boxplots[pid] = bracket_scores[pid]['mavg_f1-score']
    labels_boxplots.append(row['bracket'] + str(i))

In [None]:
# Bundle everything the stability figure needs and store it in the joint
# results folder.
# NOTE(review): the folder personas/joint_results must already exist.
boxplot_data = {'data_': {k: v['mavg_f1-score'] for k,v in dict_results_boxplots.items()},
               'bracket_boxplots': bracket_boxplots,
               'brackets': brackets}

with open(os.path.join('personas','joint_results','mistral_stability.pkl'), 'wb') as f:
    pickle.dump(boxplot_data, f)

In [None]:
fig, (im1, im2) = plt.subplots(1,2,figsize=(11,4),gridspec_kw={'width_ratios': [1, 5]})

# a) per-annotator macro-averaged F1 for both groups
data_ = {k: v['mavg_f1-score'] for k,v in dict_results_boxplots.items()}
im1.boxplot(list(data_.values()), widths=0.7)
im1.set_xticklabels(['pers.','no pers.']) #, rotation=45)
im1.set_ylabel('mavg_f1-score')
im1.set_ylim((0.35,0.85))
im1.set_title('a)')

# b) spread of the re-annotation runs per bracket persona.
# Every box is drawn at the 1-based rank of its persona in `brackets`, the
# same x position as its red marker. Without explicit positions the boxes are
# numbered 1..k, so a persona missing from bracket_boxplots would shift every
# box after it away from its marker.
bracket_pids = [int(pid) for pid in brackets['personaId']]
box_positions = [pos for pos, pid in enumerate(bracket_pids, start=1) if pid in bracket_boxplots]
im2.boxplot([bracket_boxplots[bracket_pids[pos - 1]] for pos in box_positions], positions=box_positions)
im2.set_xlim((-2,92)) # create whitespace left and right
im2.set_ylim((0.35,0.85))
# Red markers: score of each bracket persona in the original annotation.
im2.plot(list(range(1, len(brackets) + 1)), brackets['mavg_f1-score'], linestyle='', marker='o', markersize=2, mec='red')
im2.set_xticks([15,45,75], ['30 min personas', '30 median persona', '30 max personas'])
im2.set_yticks([])
im2.set_title('b)')

plt.subplots_adjust(wspace=0.02)
plt.savefig(os.path.join(resultsdir, modelname+'study_2_stability.png'), bbox_inches='tight')
plt.show()