## Task B: Category Detection



In [11]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [13]:
%load_ext autoreload
%autoreload 2

from hatedetection import load_datasets
import glob
import json

train_dataset, dev_dataset, test_dataset = load_datasets(add_body=True)

no_context_evals = []
context_evals = []
full_context_evals = []

for path in glob.glob("../evaluations/non-context-category*"):
    with open(path) as f:
        obj = json.load(f)
        obj["file"] = path
        no_context_evals.append(obj)

for path in glob.glob("../evaluations/context-category*"):
    with open(path) as f:
        obj = json.load(f)
        obj["file"] = path
        context_evals.append(obj)


for path in glob.glob("../evaluations/title-body-category*"):
    with open(path) as f:
        obj = json.load(f)
        obj["file"] = path
        full_context_evals.append(obj)

print(f"We have {len(full_context_evals)} full context evaluations")
print(f"We have {len(context_evals)} context evaluations")
print(f"We have {len(no_context_evals)} no context evaluations")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
We have 9 full context evaluations
We have 16 context evaluations
We have 16 no context evaluations


In [14]:
import pandas as pd

metric_columns = [
    'eval_calls_f1', 'eval_women_f1', 'eval_lgbti_f1', 'eval_racism_f1',
    'eval_class_f1', 'eval_politics_f1', 'eval_disabled_f1',
    'eval_appearance_f1', 'eval_criminal_f1', 'eval_mean_f1',
    'eval_mean_precision', 'eval_mean_recall'    
]



df_full_context_evals = pd.DataFrame([
    {**{"file": evaluation["file"]}, **evaluation["metrics"]} for evaluation in full_context_evals
])

df_context_evals = pd.DataFrame([
    {**{"file": evaluation["file"]}, **evaluation["metrics"]} for evaluation in context_evals
])

df_no_context_evals = pd.DataFrame([
    {**{"file": evaluation["file"]}, **evaluation["metrics"]} for evaluation in no_context_evals
])

full_context_df = pd.DataFrame({
    "full context mean": df_full_context_evals[metric_columns].mean(), 
    "full context std": df_full_context_evals[metric_columns].std()})

context_df = pd.DataFrame({
    "context mean": df_context_evals[metric_columns].mean(), 
    "context std": df_context_evals[metric_columns].std()
})

no_context_df = pd.DataFrame({
    "no context mean": df_no_context_evals[metric_columns].mean(), 
    "no context std": df_no_context_evals[metric_columns].std()})


result_df = pd.concat([full_context_df, context_df, no_context_df], axis=1)

result_df

Unnamed: 0,full context mean,full context std,context mean,context std,no context mean,no context std
eval_calls_f1,0.801411,0.019641,0.801637,0.009916,0.784165,0.008949
eval_women_f1,0.713182,0.011052,0.672225,0.014997,0.652158,0.010933
eval_lgbti_f1,0.859784,0.010172,0.842527,0.020611,0.590471,0.017874
eval_racism_f1,0.939435,0.009133,0.942906,0.004489,0.862699,0.004899
eval_class_f1,0.738182,0.015776,0.726768,0.012011,0.593249,0.01106
eval_politics_f1,0.777379,0.008716,0.752979,0.00736,0.717805,0.014555
eval_disabled_f1,0.793302,0.014691,0.750196,0.028966,0.786369,0.015904
eval_appearance_f1,0.890847,0.011963,0.878756,0.009601,0.844621,0.004387
eval_criminal_f1,0.910827,0.004219,0.901019,0.007573,0.744453,0.008515
eval_mean_f1,0.824928,0.005597,0.807668,0.00625,0.730666,0.003954


In [55]:
def print_mean(row, context, ):
    mean = row[context + " mean"]
    std = row[context + " std"]
    return f"{mean:.3f} +- {std:.3f}"

display_df = pd.DataFrame()

for context in ["full context", "context", "no context"]:
    display_df[context] = result_df.apply(lambda x: print_mean(x, context), axis=1)


display_df["metrics"] = display_df.index.map(lambda x: " ".join(x.split("_")[1:]))
display_df.reset_index(inplace=True)
del display_df["index"]
display_df.set_index("metrics", inplace=True)
print(display_df.to_latex(escape=False, longtable=False))

\begin{tabular}{llll}
\toprule
{} &    full context &         context &      no context \\
metrics        &                 &                 &                 \\
\midrule
calls f1       &  0.801 +- 0.020 &  0.802 +- 0.010 &  0.784 +- 0.009 \\
women f1       &  0.713 +- 0.011 &  0.672 +- 0.015 &  0.652 +- 0.011 \\
lgbti f1       &  0.860 +- 0.010 &  0.843 +- 0.021 &  0.590 +- 0.018 \\
racism f1      &  0.939 +- 0.009 &  0.943 +- 0.004 &  0.863 +- 0.005 \\
class f1       &  0.738 +- 0.016 &  0.727 +- 0.012 &  0.593 +- 0.011 \\
politics f1    &  0.777 +- 0.009 &  0.753 +- 0.007 &  0.718 +- 0.015 \\
disabled f1    &  0.793 +- 0.015 &  0.750 +- 0.029 &  0.786 +- 0.016 \\
appearance f1  &  0.891 +- 0.012 &  0.879 +- 0.010 &  0.845 +- 0.004 \\
criminal f1    &  0.911 +- 0.004 &  0.901 +- 0.008 &  0.744 +- 0.009 \\
mean f1        &  0.825 +- 0.006 &  0.808 +- 0.006 &  0.731 +- 0.004 \\
mean precision &  0.852 +- 0.006 &  0.853 +- 0.007 &  0.786 +- 0.004 \\
mean recall    &  0.801 +- 0.006 &  

In [20]:
result_df.apply(lambda row: print(row), axis=1)

full context mean    0.801411
full context std     0.019641
context mean         0.801637
context std          0.009916
no context mean      0.784165
no context std       0.008949
Name: eval_calls_f1, dtype: float64
full context mean    0.713182
full context std     0.011052
context mean         0.672225
context std          0.014997
no context mean      0.652158
no context std       0.010933
Name: eval_women_f1, dtype: float64
full context mean    0.859784
full context std     0.010172
context mean         0.842527
context std          0.020611
no context mean      0.590471
no context std       0.017874
Name: eval_lgbti_f1, dtype: float64
full context mean    0.939435
full context std     0.009133
context mean         0.942906
context std          0.004489
no context mean      0.862699
no context std       0.004899
Name: eval_racism_f1, dtype: float64
full context mean    0.738182
full context std     0.015776
context mean         0.726768
context std          0.012011
no context mean

eval_calls_f1          None
eval_women_f1          None
eval_lgbti_f1          None
eval_racism_f1         None
eval_class_f1          None
eval_politics_f1       None
eval_disabled_f1       None
eval_appearance_f1     None
eval_criminal_f1       None
eval_mean_f1           None
eval_mean_precision    None
eval_mean_recall       None
dtype: object

In [16]:



print(result_df.to_latex())

\begin{tabular}{lrrrrrr}
\toprule
{} &  full context mean &  full context std &  context mean &  context std &  no context mean &  no context std \\
\midrule
eval\_calls\_f1       &           0.801411 &          0.019641 &      0.801637 &     0.009916 &         0.784165 &        0.008949 \\
eval\_women\_f1       &           0.713182 &          0.011052 &      0.672225 &     0.014997 &         0.652158 &        0.010933 \\
eval\_lgbti\_f1       &           0.859784 &          0.010172 &      0.842527 &     0.020611 &         0.590471 &        0.017874 \\
eval\_racism\_f1      &           0.939435 &          0.009133 &      0.942906 &     0.004489 &         0.862699 &        0.004899 \\
eval\_class\_f1       &           0.738182 &          0.015776 &      0.726768 &     0.012011 &         0.593249 &        0.011060 \\
eval\_politics\_f1    &           0.777379 &          0.008716 &      0.752979 &     0.007360 &         0.717805 &        0.014555 \\
eval\_disabled\_f1    &           0.79