In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import mpl_lego as mplego
import pandas as pd

from hatespeech.keys import *
from hatespeech import utils

%matplotlib inline

In [2]:
# Location of the cleaned Qualtrics IRT rollout export (feather file).
# The literal is split only for line length; adjacent string literals are
# concatenated, so the resulting path is unchanged.
data_path = (
    "~/data/hatespeech/"
    "clean_qualtrics_irt_rollout.feather"
)

In [3]:
# Load the raw annotation table and expose the 'violence_phys' item under the
# shorter name 'violence'.
data = (
    pd.read_feather(data_path)
    .rename(columns={'violence_phys': 'violence'})
)

In [4]:
# Start the output frame from the identifier columns plus every column named
# in `items` (star-imported from hatespeech.keys). The copy keeps later column
# additions from touching `data`.
new_data = data.loc[:, ['comment_id', 'labeler_id'] + items].copy()

In [5]:
# Carry the comment text over unchanged.
new_data['raw_text'] = data.loc[:, 'raw_text']

In [6]:
# Target race
# Each indicator is True where the corresponding source column is non-null.
# Pairs are (output suffix, key into target_race_to_col); only "other" differs.
# Loop names are underscore-prefixed to avoid shadowing anything brought in by
# the star import from hatespeech.keys.
for _suffix, _key in (
    ('asian', 'asian'),
    ('black', 'black'),
    ('latinx', 'latinx'),
    ('middle_eastern', 'middle_eastern'),
    ('native_american', 'native_american'),
    ('pacific_islander', 'pacific_islander'),
    ('white', 'white'),
    ('other', 'other_race'),
):
    new_data[f'target_race_{_suffix}'] = data[target_race_to_col[_key]].notna()

# Companion "<other column>_text" field, copied as-is.
_other_col = target_race_to_col['other_race']
new_data['target_race_other_text'] = data[_other_col + '_text']

# Row-wise OR over the columns listed in target_race_cols.
new_data['target_race'] = new_data[target_race_cols].any(axis=1)

In [7]:
# Target religion
# Each indicator is True where the corresponding source column is non-null.
# Output suffixes match the target_religion_to_col keys one-to-one.
# Loop names are underscore-prefixed to avoid shadowing star-imported names.
for _group in (
    'atheist',
    'buddhist',
    'christian',
    'hindu',
    'jewish',
    'mormon',
    'muslim',
    'other',
):
    new_data[f'target_religion_{_group}'] = data[target_religion_to_col[_group]].notna()

# Companion "<other column>_text" field, copied as-is.
_other_col = target_religion_to_col['other']
new_data['target_religion_other_text'] = data[_other_col + '_text']

# Row-wise OR over the columns listed in target_religion_cols.
new_data['target_religion'] = new_data[target_religion_cols].any(axis=1)

In [8]:
# Target origin
# Each indicator is True where the corresponding source column is non-null.
# Output suffixes match the target_origin_to_col keys one-to-one.
# Loop names are underscore-prefixed to avoid shadowing star-imported names.
for _group in (
    'immigrant',
    'migrant_worker',
    'specific_country',
    'undocumented',
    'other',
):
    new_data[f'target_origin_{_group}'] = data[target_origin_to_col[_group]].notna()

# Companion "<other column>_text" field, copied as-is.
_other_col = target_origin_to_col['other']
new_data['target_origin_other_text'] = data[_other_col + '_text']

# Row-wise OR over the columns listed in target_origin_cols.
new_data['target_origin'] = new_data[target_origin_cols].any(axis=1)

In [9]:
# Target gender
# Each indicator is True where the corresponding source column is non-null.
# Pairs are (output suffix, key into target_gender_to_col); note the key
# 'non-binary' is hyphenated and the "other" key is 'other_gender'.
# Loop names are underscore-prefixed to avoid shadowing star-imported names.
for _suffix, _key in (
    ('men', 'men'),
    ('non_binary', 'non-binary'),
    ('transgender_men', 'transgender_men'),
    ('transgender_unspecified', 'transgender_unspecified'),
    ('transgender_women', 'transgender_women'),
    ('women', 'women'),
    ('other', 'other_gender'),
):
    new_data[f'target_gender_{_suffix}'] = data[target_gender_to_col[_key]].notna()

# Companion "<other column>_text" field, copied as-is.
_other_col = target_gender_to_col['other_gender']
new_data['target_gender_other_text'] = data[_other_col + '_text']

# Row-wise OR over the columns listed in target_gender_cols.
new_data['target_gender'] = new_data[target_gender_cols].any(axis=1)

In [10]:
# Target sexuality
# Each indicator is True where the corresponding source column is non-null.
# Output suffixes match the target_sexuality_to_col keys one-to-one.
# Loop names are underscore-prefixed to avoid shadowing star-imported names.
for _group in ('bisexual', 'gay', 'lesbian', 'straight', 'other'):
    new_data[f'target_sexuality_{_group}'] = data[target_sexuality_to_col[_group]].notna()

# Companion "<other column>_text" field, copied as-is.
_other_col = target_sexuality_to_col['other']
new_data['target_sexuality_other_text'] = data[_other_col + '_text']

# Row-wise OR over the columns listed in target_sexuality_cols.
new_data['target_sexuality'] = new_data[target_sexuality_cols].any(axis=1)

In [11]:
# Target age
# Each indicator is True where the corresponding source column is non-null.
# Output suffixes match the target_age_to_col keys one-to-one.
# Loop names are underscore-prefixed to avoid shadowing star-imported names.
for _group in (
    'children',
    'teenagers',
    'young_adults',
    'middle_aged',
    'seniors',
    'other',
):
    new_data[f'target_age_{_group}'] = data[target_age_to_col[_group]].notna()

# Companion "<other column>_text" field, copied as-is.
_other_col = target_age_to_col['other']
new_data['target_age_other_text'] = data[_other_col + '_text']

# Row-wise OR over the columns listed in target_age_cols.
new_data['target_age'] = new_data[target_age_cols].any(axis=1)

In [12]:
# Target disability
# Each indicator is True where the corresponding source column is non-null.
# Output suffixes match the target_disability_to_col keys one-to-one.
# Loop names are underscore-prefixed to avoid shadowing star-imported names.
for _group in (
    'physical',
    'cognitive',
    'neurological',
    'visually_impaired',
    'hearing_impaired',
    'unspecific',
    'other',
):
    new_data[f'target_disability_{_group}'] = data[target_disability_to_col[_group]].notna()

# Companion "<other column>_text" field, copied as-is.
_other_col = target_disability_to_col['other']
new_data['target_disability_other_text'] = data[_other_col + '_text']

# Row-wise OR over the columns listed in target_disability_cols.
new_data['target_disability'] = new_data[target_disability_cols].any(axis=1)

In [13]:
# Target political group
# Each indicator is True where the corresponding source column is non-null.
# Output suffixes match the target_politics_to_col keys one-to-one.
# Loop names are underscore-prefixed to avoid shadowing star-imported names.
for _group in (
    'alt_right',
    'communist',
    'conservative',
    'democrat',
    'green_party',
    'leftist',
    'liberal',
    'libertarian',
    'republican',
    'socialist',
    'other',
):
    new_data[f'target_politics_{_group}'] = data[target_politics_to_col[_group]].notna()

# Companion "<other column>_text" field, copied as-is.
_other_col = target_politics_to_col['other']
new_data['target_politics_other_text'] = data[_other_col + '_text']

# Row-wise OR over the columns listed in target_politics_cols.
new_data['target_politics'] = new_data[target_politics_cols].any(axis=1)

In [14]:
# Annotator columns with only one value allowed
# Each triple is (output suffix, source column name, code table). The code
# table is passed through swap_key_val and the result is used as the lookup
# for Series.map (presumably swap_key_val inverts the mapping -- TODO confirm
# against hatespeech.keys).
# Loop names are underscore-prefixed to avoid shadowing star-imported names.
for _suffix, _source_col, _codes in (
    ('gender', annotator_gender_col, annotator_gender),
    ('trans', annotator_trans_col, annotator_trans),
    ('educ', annotator_educ_col, annotator_educ),
    ('income', annotator_income_col, annotator_income),
    ('ideology', annotator_ideology_col, annotator_ideology),
):
    new_data[f'annotator_{_suffix}'] = data[_source_col].map(swap_key_val(_codes))

In [15]:
# Annotator race
# Each indicator is True where the corresponding source column is non-null.
# Pairs are (output suffix, key into annotator_race_to_col); only "other" differs.
# Loop names are underscore-prefixed to avoid shadowing star-imported names.
for _suffix, _key in (
    ('asian', 'asian'),
    ('black', 'black'),
    ('latinx', 'latinx'),
    ('middle_eastern', 'middle_eastern'),
    ('native_american', 'native_american'),
    ('pacific_islander', 'pacific_islander'),
    ('white', 'white'),
    ('other', 'other_race'),
):
    new_data[f'annotator_race_{_suffix}'] = data[annotator_race_to_col[_key]].notna()

# Companion "<other column>_text" field, copied as-is.
_other_col = annotator_race_to_col['other_race']
new_data['annotator_race_other_text'] = data[_other_col + '_text']

In [19]:
# Annotator age
# Run every raw value of the age column through annotator_age_converter
# (star-imported from hatespeech.keys) and store the result.
new_data['annotator_age'] = (
    data.loc[:, annotator_age_col]
    .apply(annotator_age_converter)
)

In [16]:
# Annotator religion
# Each indicator is True where the corresponding source column is non-null.
# Output suffixes match the annotator_religion_to_col keys one-to-one.
# Loop names are underscore-prefixed to avoid shadowing star-imported names.
for _group in (
    'atheist',
    'buddhist',
    'christian',
    'hindu',
    'jewish',
    'mormon',
    'muslim',
    'nothing',
    'other',
):
    new_data[f'annotator_religion_{_group}'] = data[annotator_religion_to_col[_group]].notna()

# Companion "<other column>_text" field, copied as-is.
_other_col = annotator_religion_to_col['other']
new_data['annotator_religion_other_text'] = data[_other_col + '_text']

In [17]:
# Annotator sexuality
# Each indicator is True where the corresponding source column is non-null.
# Source columns are looked up in annotator_sexual_orientation_to_col, while
# the output columns use the shorter 'annotator_sexuality_' prefix.
# Loop names are underscore-prefixed to avoid shadowing star-imported names.
for _group in ('bisexual', 'gay', 'straight', 'other'):
    new_data[f'annotator_sexuality_{_group}'] = (
        data[annotator_sexual_orientation_to_col[_group]].notna()
    )

# Companion "<other column>_text" field, copied as-is.
_other_col = annotator_sexual_orientation_to_col['other']
new_data['annotator_sexuality_other_text'] = data[_other_col + '_text']