## Contextualized model

Let's check where the context really helps

In [22]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [23]:
%load_ext autoreload
%autoreload 2

from hatedetection import load_datasets
import glob
import json

train_dataset, dev_dataset, test_dataset = load_datasets(add_body=True)

no_context_evals = []
context_evals = []

for path in glob.glob("../evaluations/non-context-category*"):
    with open(path) as f:
        obj = json.load(f)
        obj["file"] = path
        no_context_evals.append(obj)

for path in glob.glob("../evaluations/context-category*"):
    with open(path) as f:
        obj = json.load(f)
        obj["file"] = path
        context_evals.append(obj)

print(f"We have {len(context_evals)} context evaluations")
print(f"We have {len(no_context_evals)} no context evaluations")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
We have 7 context evaluations
We have 11 no context evaluations


In [24]:
import pandas as pd

df_context_evals = pd.DataFrame([
    {**{"file": evaluation["file"]}, **evaluation["metrics"]} for evaluation in context_evals
])

df_context_evals

Unnamed: 0,file,eval_loss,eval_calls_f1,eval_women_f1,eval_lgbti_f1,eval_racism_f1,eval_class_f1,eval_politics_f1,eval_disabled_f1,eval_appearance_f1,...,eval_runtime,eval_samples_per_second,init_mem_cpu_alloc_delta,init_mem_gpu_alloc_delta,init_mem_cpu_peaked_delta,init_mem_gpu_peaked_delta,test_mem_cpu_alloc_delta,test_mem_gpu_alloc_delta,test_mem_cpu_peaked_delta,test_mem_gpu_peaked_delta
0,../evaluations/context-category-3.json,0.151657,0.790528,0.698225,0.839237,0.946281,0.725275,0.757974,0.776,0.871585,...,20.092,89.439,51854,0,18258,0,407566,0,301620,189069824
1,../evaluations/context-category-6.json,0.15031,0.812057,0.681188,0.861789,0.953464,0.731884,0.757576,0.750988,0.884038,...,13.7372,130.813,51854,0,18258,0,407565,0,301676,189069824
2,../evaluations/context-category-4.json,0.152175,0.794918,0.659751,0.862637,0.944099,0.698182,0.74856,0.722892,0.872825,...,14.5786,123.263,51854,0,18258,0,407567,0,301676,189069824
3,../evaluations/context-category-1.json,0.149099,0.784906,0.670732,0.853261,0.942029,0.727941,0.767176,0.742188,0.889481,...,17.348,103.586,51854,0,18258,0,407488,0,301676,189069824
4,../evaluations/context-category-5.json,0.146769,0.80212,0.657084,0.875,0.939611,0.741573,0.762452,0.71875,0.862691,...,13.8276,129.958,51854,0,18258,0,407383,0,301739,189069824
5,../evaluations/context-category-2.json,0.150904,0.810714,0.684647,0.839779,0.942149,0.741818,0.750958,0.742188,0.88172,...,22.4385,80.086,51854,0,18258,0,407439,0,301676,189069824
6,../evaluations/context-category-0.json,0.153173,0.792593,0.666667,0.816438,0.942268,0.723247,0.741573,0.725806,0.890374,...,20.4946,87.682,51854,0,18258,0,407512,0,301729,189069824


In [25]:
metric_columns = [
    'eval_calls_f1', 'eval_women_f1', 'eval_lgbti_f1', 'eval_racism_f1',
    'eval_class_f1', 'eval_politics_f1', 'eval_disabled_f1',
    'eval_appearance_f1', 'eval_criminal_f1', 'eval_mean_f1',
    'eval_mean_precision', 'eval_mean_recall'    
]

context_cat_df = pd.DataFrame({"context mean": df_context_evals[metric_columns].mean(), "context std": df_context_evals[metric_columns].std()})

context_cat_df

Unnamed: 0,context mean,context std
eval_calls_f1,0.798262,0.010342
eval_women_f1,0.674042,0.014751
eval_lgbti_f1,0.849734,0.019496
eval_racism_f1,0.944272,0.004543
eval_class_f1,0.727131,0.014743
eval_politics_f1,0.755181,0.008728
eval_disabled_f1,0.73983,0.019882
eval_appearance_f1,0.878959,0.010256
eval_criminal_f1,0.904045,0.005744
eval_mean_f1,0.80794,0.005546


In [26]:
import pandas as pd

df_no_context_evals = pd.DataFrame([
    {**{"file": evaluation["file"]}, **evaluation["metrics"]} for evaluation in no_context_evals
])

no_context_cat_df = pd.DataFrame({
    "no context mean": df_no_context_evals[metric_columns].mean(), 
    "no context std": df_no_context_evals[metric_columns].std()})
no_context_cat_df

Unnamed: 0,no context mean,no context std
eval_calls_f1,0.784228,0.008607
eval_women_f1,0.64918,0.01039
eval_lgbti_f1,0.586006,0.01778
eval_racism_f1,0.862981,0.005573
eval_class_f1,0.592604,0.013013
eval_politics_f1,0.71968,0.014439
eval_disabled_f1,0.790039,0.011361
eval_appearance_f1,0.843687,0.003843
eval_criminal_f1,0.746758,0.008376
eval_mean_f1,0.730574,0.004333


In [29]:
result_df = pd.concat([context_cat_df, no_context_cat_df], axis=1)

result_df

Unnamed: 0,context mean,context std,no context mean,no context std
eval_calls_f1,0.798262,0.010342,0.784228,0.008607
eval_women_f1,0.674042,0.014751,0.64918,0.01039
eval_lgbti_f1,0.849734,0.019496,0.586006,0.01778
eval_racism_f1,0.944272,0.004543,0.862981,0.005573
eval_class_f1,0.727131,0.014743,0.592604,0.013013
eval_politics_f1,0.755181,0.008728,0.71968,0.014439
eval_disabled_f1,0.73983,0.019882,0.790039,0.011361
eval_appearance_f1,0.878959,0.010256,0.843687,0.003843
eval_criminal_f1,0.904045,0.005744,0.746758,0.008376
eval_mean_f1,0.80794,0.005546,0.730574,0.004333
