In [None]:
import datasets 
import matplotlib.pyplot as plt
import mpl_lego as mplego
import numpy as np
import pandas as pd
import simpledorff

from mpl_lego.colorbar import append_colorbar_to_axis
from mpl_lego.labels import bold_text, apply_subplot_labels
from hate_target import keys
from scipy.stats import bootstrap

%matplotlib inline

In [None]:
# Switch on mpl_lego's LaTeX plotting style before any figure is created.
# NOTE(review): presumably this enables LaTeX text rendering and therefore needs
# a working LaTeX installation on the machine running the notebook — confirm.
mplego.style.use_latex_style()

In [None]:
# Drop the final entry of each list and order the remaining groups
# alphabetically by column key.
#
# Keys and display labels are sorted *together* so that label i always
# describes column i. Sorting the two lists independently only stays aligned
# when both happen to share the same alphabetical order.
# NOTE(review): assumes keys.target_groups and keys.target_labels are parallel
# lists (entry i of one corresponds to entry i of the other) — confirm.
_group_label_pairs = sorted(
    zip(keys.target_groups[:-1], keys.target_labels[:-1]),
    key=lambda pair: pair[0],
)
target_groups = [group for group, _ in _group_label_pairs]
# Kept as an array so it can be fancy-indexed with an ordering later on.
target_labels = np.array([label for _, label in _group_label_pairs])
n_targets = len(target_groups)

In [None]:
# Download (or read from the local cache) the 'binary' configuration of the
# Measuring Hate Speech dataset and convert its train split to a DataFrame.
dataset = datasets.load_dataset(
    'ucberkeley-dlab/measuring-hate-speech',
    'binary',
)
data = dataset['train'].to_pandas()

# Keep only the comment/annotator identifiers and the target-group columns.
targets = data[['comment_id', 'annotator_id', *sorted(target_groups)]]

In [None]:
# Target columns cast to integers, one row per (comment, annotator) pair.
votes = targets.astype({key: 'int' for key in target_groups})

# For every comment, the mean of each target column over that comment's rows,
# i.e. the fraction of its annotations that marked the group.
agreement = data[['comment_id'] + target_groups].groupby('comment_id').mean()

# A comment counts as targeting a group when at least half of its annotations
# marked that group.
is_target = (agreement >= 0.5).astype('int')

# Number of comments per target group (Series indexed by target column name).
n_comments = is_target.sum()

# Positions that order the groups from fewest to most comments. The sort runs
# on the raw values so the result is a plain integer array of positions rather
# than a Series that still carries the target-column labels as its index.
sorted_idx = np.argsort(n_comments.to_numpy())

In [None]:
def _krippendorff_alpha(group):
    """Return Krippendorff's alpha for one target column of `targets`.

    'comment_id' and 'annotator_id' are passed as the experiment and annotator
    columns; `group` names the column holding the annotated value.
    """
    return simpledorff.calculate_krippendorffs_alpha_for_df(
        targets, 'comment_id', 'annotator_id', group)


# One reliability score per target group, in the same order as `target_groups`.
alphas = [_krippendorff_alpha(group) for group in target_groups]

In [None]:
# Three-panel summary figure with a shared y-axis (one row per target group):
#   left   - number of comments flagged for the group,
#   middle - distribution of per-comment annotator agreement,
#   right  - Krippendorff's alpha for the group.
# Rows are ordered by comment count, the most frequent group sitting at y = 0.
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)
plt.subplots_adjust(wspace=0.07)

# `sorted_idx` holds *positions*, so normalise it to a plain integer array and
# apply it only through positional indexers. Writing `n_comments[sorted_idx]`
# would pass integer keys to a Series with string labels, which only works via
# pandas' deprecated positional fallback.
order = np.asarray(sorted_idx)
y_positions = np.arange(n_targets)

# Left panel: comment counts, flipped so the largest count lands at y = 0.
axes[0].barh(y=y_positions,
             width=np.flip(n_comments.iloc[order].to_numpy()),
             color='lightgrey',
             edgecolor='black')
axes[0].set_xticks([0, 5000, 10000, 15000])

# Middle panel: one horizontal violin per group. The data are supplied in
# ascending-count order, so the positions are reversed to match the bar panels.
violins = axes[1].violinplot(
    dataset=[agreement.iloc[:, idx] for idx in order],
    positions=np.flip(y_positions),
    vert=False,
    bw_method=0.35,
    showmeans=True,
    widths=0.65)

for body in violins['bodies']:
    body.set_facecolor('gray')
    body.set_edgecolor('black')
    body.set_linewidth(1.5)

# Line elements of the violins: (key, edge colour, line width).
violin_line_styles = (
    ('cmeans', 'red', 2),
    ('cbars', 'black', 1),
    ('cmins', 'black', 1.5),
    ('cmaxes', 'black', 1.5),
)
for part, edgecolor, linewidth in violin_line_styles:
    violins[part].set_edgecolor(edgecolor)
    violins[part].set_linewidth(linewidth)

# The y-axis is shared, so the ticks and labels set here apply to every panel.
axes[1].set_yticks(y_positions)
axes[1].set_yticklabels(bold_text(np.flip(target_labels[order])))

# Right panel: inter-annotator reliability, in the same row order as the left.
axes[2].barh(y=y_positions,
             width=np.flip(np.asarray(alphas)[order]),
             color='lightgrey',
             edgecolor='black')
axes[2].set_xlim([0, 1])
axes[2].set_xticks([0, 0.25, 0.50, 0.75, 1.0])

# Both bar panels get vertical grid lines drawn behind the bars.
for bar_ax in (axes[0], axes[2]):
    bar_ax.grid(axis='x')
    bar_ax.set_axisbelow(True)

for ax in axes:
    ax.tick_params(labelsize=15)
# Group names on the leftmost panel are slightly larger than the numeric ticks.
axes[0].tick_params(axis='y', labelsize=17)

axes[0].set_xlabel(bold_text("Number of Comments"), fontsize=17)
axes[1].set_xlabel(bold_text('Proportion of Annotators\nIdentifying Target'), fontsize=17)
axes[2].set_xlabel(bold_text(r"Krippendorff's $\alpha$"), fontsize=17)

apply_subplot_labels(axes, bold=True, x=-0.04, size=23)
plt.savefig('figure4.pdf', bbox_inches='tight')