In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
import math
import os.path
from os.path import join
import numpy as np
import imodelsx
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt
import scipy.stats
import imodelsx.viz
import json
from sklearn.metrics import cohen_kappa_score
import seaborn as sns
import data
import joblib
from matplotlib.gridspec import GridSpec
from scipy.stats import spearmanr
LEVELS = ['Very Negative', 'Negative', 'Neutral',
          'No response', 'Positive', 'Very Positive']

# get data for example site
files_dict = data.load_files_dict_single_site()
site = 'Charlotte'
df = files_dict[site]
qs, responses_df, themes_df = data.split_single_site_df(df)

### Analyze human results

Load human annotations into a df called `annotations_df`

In [16]:
annots = {
    'hum1': 'human1',
    'hum2': 'human2',
    'hum3': 'human3',
}

# template = pd.read_pickle('../figs/human/sentiment_template.pkl')
annotations_df = pd.read_csv('../figs/human/sentiment_template.csv')
for k, v in annots.items():
    hum = pd.read_csv(f'../figs/human/collected/sentiment_{v}.csv', skiprows=1)

    def remove_all_whitespace(s):
        return ''.join(s.split())

    # check for matching index, value range
    assert hum.shape[0] == annotations_df.shape[
        0], f'Shape mismatch for {k}: {hum.shape[0]} vs {annotations_df.shape[0]}'
    assert np.all(hum['Response number'].astype(str).apply(remove_all_whitespace).values ==
                  annotations_df['Response number'].astype(str).apply(remove_all_whitespace).values), f'Error for hum {k}'

    # add col
    annotations_df[k] = hum['Rating'].values.astype(int)
    assert np.all(annotations_df[k].values >= 1)
    assert np.all(annotations_df[k].values <= 5)

checkpoints_all = [
    'gpt-4',
    'gpt-35-turbo',
    'meta-llama/Llama-2-70b-hf',
    'meta-llama/Llama-2-7b-hf',
    'mistralai/Mistral-7B-v0.1',  # 'mistralai/Mixtral-8x7B-v0.1'
]


def find_starting_number(s):
    # if s starts with a number or a number with decimal places, return that number
    # otherwise return nan
    s = str(s).strip()
    if s == 'nan':
        return np.nan
    ans = ''
    while len(s) > 0 and (s[0].isdigit() or s[0] == '.'):
        ans += s[0]
        s = s[1:]
    return float(ans) if len(ans) > 0 else np.nan


for checkpoint in checkpoints_all:
    sites = ['Atlanta', 'Columbus', 'WashingtonDC']
    sent_dfs = []
    for site in sites:
        sent_df = joblib.load(join(
            data.PROCESSED_DIR, f'sentiments_df_{site}_{checkpoint.split("/")[-1]}.pkl'))
        sent_dfs.append(sent_df)

    sent_dfs[0].columns = np.arange(0, 11)
    sent_dfs[1].columns = np.arange(11, 22)
    sent_dfs[2].columns = np.arange(22, 33)
    sent_llm_full = pd.concat(sent_dfs, axis=1).values
    annotations_df[checkpoint] = annotations_df.apply(
        lambda row: sent_llm_full[row['Question number'], row['Response number']], axis=1)
    annotations_df[checkpoint] = annotations_df[checkpoint].apply(
        find_starting_number)
llms_to_ensemble = ['gpt-4', 'gpt-35-turbo',
                    'mistralai/Mistral-7B-v0.1',  # 'meta-llama/Llama-2-70b-hf',
                    # 'mistralai/Mixtral-8x7B-v0.1',
                    ]

annotations_df['Human ensemble'] = annotations_df[[
    'hum1', 'hum2', 'hum3']].mean(axis=1)
annotations_df = annotations_df.rename(columns={
    'hum1': 'Human 1',
    'hum2': 'Human 2',
    'hum3': 'Human 3',
})


annotations_df.columns = list(map(lambda x: imodelsx.viz.CHECKPOINTS_RENAME_DICT.get(
    x, x), annotations_df.columns))
checkpoints_all = list(map(lambda x: imodelsx.viz.CHECKPOINTS_RENAME_DICT.get(
    x, x), checkpoints_all))
llms_to_ensemble = list(map(lambda x: imodelsx.viz.CHECKPOINTS_RENAME_DICT.get(
    x, x), llms_to_ensemble))
annotations_df['LLM ensemble'] = annotations_df[llms_to_ensemble].mean(axis=1)


# note: GPT-4 and GPT-3.5 Turbo got swapped while running, need to swap their column vals back
col_gpt4 = annotations_df['GPT-4'].copy()
col_gpt35 = annotations_df['GPT-3.5 Turbo'].copy()
annotations_df['GPT-4'] = col_gpt35
annotations_df['GPT-3.5 Turbo'] = col_gpt4

# round ensemble
annotations_df['LLM ensemble'] = annotations_df['LLM ensemble'].round()
annotations_df['Human ensemble'] = annotations_df['Human ensemble'].round()

Compute correlations between the columns of `annotations_df` and storr in `corr_df`

In [None]:
# make sure to apply this across all llms
notna = np.ones(len(annotations_df), dtype=bool)
for ckpt in checkpoints_all:
    notna &= annotations_df[ckpt].apply(find_starting_number).notna()
print('num annots', notna.sum())

checkpoints_all_hum = ['Human 1', 'Human 2', 'Human 3', 'Human ensemble'] + \
    ['LLM ensemble'] + checkpoints_all
corr = np.zeros((len(checkpoints_all_hum), len(checkpoints_all_hum)))
for r, cr in enumerate(checkpoints_all_hum):
    for c, cc in enumerate(checkpoints_all_hum):
        x = annotations_df[cr][notna].astype(int)
        y = annotations_df[cc][notna].astype(int)
        # corr[r, c] = np.corrcoef(x, y)[0, 1]  # spearmanr(x, y)
        corr[r, c] = cohen_kappa_score(x, y, weights='quadratic')

# convert to df
# labels[0] = 'Human 1'
# labels[1] = 'Human 2'
# labels[2] = 'Human 3'
labels = checkpoints_all_hum
print(labels, corr.shape)
corr_df = pd.DataFrame(corr,
                       index=labels,
                       columns=labels)

# sort by corr with human
ind = corr_df.sort_values(by='Human ensemble', ascending=False).index
ind.values[:4] = ['Human 1', 'Human 2', 'Human 3', 'Human ensemble']
corr_df = corr_df.reindex(ind)[ind]

# Replace correlations with Human ensemble by excluding the human
cols = ['Human 1', 'Human 2', 'Human 3']
hum_corrs = []
for i, c in enumerate(cols):
    avg_excluding_c = annotations_df[[
        col for col in cols if col != c]].mean(axis=1)
    hum_corrs.append(np.corrcoef(
        annotations_df[c][notna], avg_excluding_c[notna])[0, 1])
# print(hum_corrs)
corr_df.loc[cols, 'Human ensemble'] = hum_corrs
corr_df.loc['Human ensemble', cols] = hum_corrs

# Replace correlations with LLM ensemble by excluding the LLM
cols = list(map(lambda x: imodelsx.viz.CHECKPOINTS_RENAME_DICT.get(
    x, x), llms_to_ensemble))
llm_corrs = []
for i, c in enumerate(cols):
    avg_excluding_c = annotations_df[[
        col for col in cols if col != c]].mean(axis=1)
    llm_corrs.append(np.corrcoef(
        annotations_df[c][notna], avg_excluding_c[notna])[0, 1])
# print(llm_corrs)
# corr_df.loc[cols, 'LLM ensemble'] = llm_corrs
# corr_df.loc['LLM ensemble', cols] = llm_corrs

Make main plot

In [18]:
# Remove 'LLM Ensemble' row and col
corr_df = corr_df.drop('LLM ensemble', axis=0)
corr_df = corr_df.drop('LLM ensemble', axis=1)

In [19]:
# corr_df = corr_df.iloc[1:, 1:]
mask = np.triu(np.ones_like(corr_df, dtype=bool))
mask[np.diag_indices_from(mask)] = False

rename = {
    'Human 1': 'Human reviewer 1',
    'Human 2': 'Human reviewer 2',
    'Human 3': 'Human reviewer 3',
    'Human ensemble': 'Mean human reviewer',
    # 'LLM ensemble': 'Mean LLM',
}
corr_df = corr_df.rename(columns=rename, index=rename)

In [None]:
corr_df

In [21]:
for i in range(1, 4):
    corr_df.loc['Mean human reviewer', f'Human reviewer {i}'] = np.nan

In [22]:
# # rename Mean human reviewer to Median human reviewer
# corr_df = corr_df.rename(index={'Mean human reviewer': 'Median human reviewer'}, columns={
#                          'Mean human reviewer': 'Median human reviewer'})

In [None]:
ax = sns.heatmap(
    corr_df,
    annot=True, fmt='.2f',
    cmap=sns.color_palette("Blues", as_cmap=True), cbar_kws={'label': 'Cohen\'s Kappa'},
    mask=mask,
)

# outline the first row of the elements in the heatmap
color = '#fa755a'
lw = 3
roffset = 3.5
coffset = 0.5
shape = corr_df.shape
r = 0
# color = 'gray'
alpha = 1
for c in range(3, shape[1]):
    rx = r + roffset
    cx = c + coffset
    if c == 2:
        plt.plot([rx - 0.5, rx + 0.5],
                 [cx - 0.5, cx - 0.5], color=color, lw=lw, alpha=alpha)
    if c == shape[1] - 1:
        plt.plot([rx - 0.5, rx + 0.5],
                 [cx + 0.5, cx + 0.5], color=color, lw=lw, alpha=alpha)
    plt.plot([rx - 0.5, rx - 0.5],
             [cx - 0.5, cx + 0.5], color=color, lw=lw, alpha=alpha)
    plt.plot([rx + 0.5, rx + 0.5],
             [cx - 0.5, cx + 0.5], color=color, lw=lw, alpha=alpha)

roffset = 0.5
coffset = 0.5
c = 3
plt.plot([3 - 0.5 + 0.5, 3 + 0.5 + 0.5],
         [3 + 0.5 - 0.5, 3 + 0.5 - 0.5], color=color, lw=lw, alpha=alpha)

# set the color of first three xticklabels and yticklabels to blue
for i, t in enumerate(ax.get_xticklabels()):
    if i < 4:
        t.set_color('#08346c')
for i, t in enumerate(ax.get_yticklabels()):
    if i < 4:
        t.set_color('#08346c')


plt.xlim(-.2, shape[0])
plt.ylim(shape[1] + 0.2, -.2)
# plt.ylabel('Annotator')
# plt.xlabel('Annotator')
plt.savefig('../figs/sentiment_correlation.pdf', bbox_inches='tight')
plt.savefig('../figs/sentiment_correlation.png', bbox_inches='tight', dpi=300)
plt.show()

In [51]:
def kappa_interval(x, y, alpha=0.05):
    notna = x.notna() & y.notna()
    x = x[notna].round().astype(int).values
    y = y[notna].round().astype(int).values
    kappa_true = cohen_kappa_score(x, y, weights='quadratic')
    kappas = []
    for i in range(1000):
        idx = np.random.choice(len(x), len(x) * 3)
        kappas.append(cohen_kappa_score(x[idx], y[idx], weights='quadratic'))

    interval = np.percentile(kappas, [100 * alpha/2, 100 * (1 - alpha/2)])
    return kappa_true, interval


kappa_interval(annotations_df['Human ensemble'], annotations_df['GPT-4'])

(0.6905433111225012, array([0.60857698, 0.75873471]))

In [54]:
# Human-Human
pairings = [
    ('Human 1', 'Human 2'),
    ('Human 2', 'Human 3'),
    ('Human 1', 'Human 3'),
]
kappas = []
for p in pairings:
    kappa = kappa_interval(annotations_df[p[0]], annotations_df[p[1]])[0]
    kappas.append(kappa)
np.mean(kappas), np.std(kappas) / np.sqrt(3)

In [56]:
# Human-True
pairings = [
    ('Human 1', 'Human 2'),
    ('Human 2', 'Human 3'),
    ('Human 1', 'Human 3'),
]
kappas = []
for p in pairings:
    kappa = kappa_interval(annotations_df[p[0]], annotations_df[p[1]])[0]
    kappas.append(kappa)
np.mean(kappas), np.std(kappas) / np.sqrt(3)

(0.8395513143550085, 0.034147483194124015)

In [None]:
def r_confidence_interval(r, alpha, n):
    def _r_to_z(r):
        return math.log((1 + r) / (1 - r)) / 2.0

    def _z_to_r(z):
        e = math.exp(2 * z)
        return ((e - 1) / (e + 1))

    z = _r_to_z(r)
    se = 1.0 / math.sqrt(n - 3)
    z_crit = scipy.stats.norm.ppf(1 - alpha/2)  # 2-tailed z critical value

    lo = z - z_crit * se
    hi = z + z_crit * se

    # Return a sequence
    return (round(_z_to_r(lo), 2), round(_z_to_r(hi), 2))


print('interval human-LLM', r_confidence_interval(0.741484, 0.05, 123))
# print('interval annotators', np.mean([0.813302, 0.850014, 0.935228]), r_confidence_interval(
#     np.mean([0.813302, 0.850014, 0.935228]), 0.05, 123))
# print('interval each human-LLM',
#       [r_confidence_interval(x, 0.05, 123) for x in [0.845929, 0.915878, 0.941113]])
# print('interval human1-human2',
#       [r_confidence_interval(0.813302, 0.05, 123)])

In [None]:
vals = corr_df.loc['Mean human reviewer', [
    'Human reviewer 1', 'Human reviewer 2', 'Human reviewer 3']]
print('Human ensemble vs human', vals.mean(), vals.sem(ddof=0))

In [None]:
x1 = annotations_df['LLM ensemble'][notna]
x2 = annotations_df['Human ensemble'][notna]

# calculate pearson correlation and error of the corr
corr = np.corrcoef(x1, x2)[0, 1]
err = np.sqrt((1 - corr**2) / (len(x1) - 2))
print('LLM ensemble vs human ensemble corr', corr, 'err', err)

In [None]:
# plot with some jitter
def jitter(values, j, min=1, max=5):
    values = values + np.random.uniform(-j, j, values.shape)
    return np.clip(values, min, max)


x = annotations_df['Human ensemble']
y = annotations_df['LLM ensemble']
plt.plot(jitter(x, 0.15), jitter(y, 0.15), 'o', alpha=0.5)
plt.xlabel('Annotator ensemble sentiment score')
plt.ylabel('LLM ensemble sentiment score')

plt.plot([1, 5], [1, 5], 'k--')

In [None]:
print('Fraction of time LLM is more extreme than human',
      (np.abs(y[notna] - 3) > np.abs(x[notna] - 3)).mean())
print('Fraction of time LLM reverses polarity',
      (((x >= 4) & (y <= 2)) | ((x <= 2) & (y >= 4)))[notna].mean()
      )