In [1]:
import os
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

info_fp = '../data/hateful_memes/info.csv'
fine_grained_data_dir = '../data/hateful_memes_finegrained'

# EDA

In [2]:
split = 'train'
fp = f'{fine_grained_data_dir}/{split}.json'
fine_grained_df = pd.read_json(fp, lines=True)
fine_grained_df.head()

Unnamed: 0,id,set_name,img,text,gold_hate,gold_pc,gold_attack,pc,attack
0,42953,train,img/42953.png,its their character not their color that matters,[not_hateful],[pc_empty],[attack_empty],,
1,23058,train,img/23058.png,don't be afraid to love again everyone is not ...,[not_hateful],[pc_empty],[attack_empty],,
2,13894,train,img/13894.png,putting bows on your pet,[not_hateful],[pc_empty],[attack_empty],,
3,37408,train,img/37408.png,i love everything and everybody! except for sq...,[not_hateful],[pc_empty],[attack_empty],,
4,82403,train,img/82403.png,"everybody loves chocolate chip cookies, even h...",[not_hateful],[pc_empty],[attack_empty],,


In [3]:
fine_grained_df.tail()

Unnamed: 0,id,set_name,img,text,gold_hate,gold_pc,gold_attack,pc,attack
8495,10423,train,img/10423.png,nobody wants to hang auschwitz me,[hateful],[religion],[mocking],"[[religion], [religion], [religion]]","[[mocking], [mocking], [mocking]]"
8496,98203,train,img/98203.png,when god grants you a child after 20 years of ...,[hateful],[nationality],[dehumanizing],"[[nationality], [nationality], [religion]]","[[dehumanizing], [inciting_violence], []]"
8497,36947,train,img/36947.png,gays on social media: equality! body positivit...,[hateful],[sex],[exclusion],"[[sex], [sex], [sex]]","[[exclusion], [exclusion], [exclusion]]"
8498,16492,train,img/16492.png,having a bad day? you could be a siamese twin ...,[hateful],"[sex, disability]",[inferiority],"[[sex, disability], [sex, disability], [sex, d...","[[], [inferiority], [inferiority]]"
8499,15937,train,img/15937.png,i hate muslims too they take their religion to...,[hateful],[religion],"[inferiority, contempt]","[[religion], [religion], [religion]]","[[inferiority, contempt], [inferiority, contem..."


In [4]:
(fine_grained_df['gold_pc'].apply(lambda x: len(x)) > 1).sum()

389

In [5]:
(fine_grained_df['gold_attack'].apply(lambda x: len(x)) > 1).sum()

327

In [6]:
mlb = MultiLabelBinarizer()
transformed = mlb.fit_transform(fine_grained_df['gold_attack'])
print(transformed.shape)
print(mlb.classes_)

(8500, 8)
['attack_empty' 'contempt' 'dehumanizing' 'exclusion' 'inciting_violence'
 'inferiority' 'mocking' 'slurs']


In [7]:
def find_unique_labels(lists, empty_replacement):

     if isinstance(lists, list):
          return list({item for lst in lists for item in lst})
     return [empty_replacement]

In [8]:
(fine_grained_df['pc'].apply(find_unique_labels, empty_replacement='pc_empty').apply(lambda x: len(x)) > 1).sum()

1077

In [9]:
(fine_grained_df['attack'].apply(find_unique_labels, empty_replacement='attack_empty').apply(lambda x: len(x)) > 1).sum()

1476

# Transform data

In [10]:
def find_unique_labels(lists, empty_replacement):

     if isinstance(lists, list):
          return list({item for lst in lists for item in lst})
     return [empty_replacement]

In [11]:
splits = ['train', 'dev_seen', 'dev_unseen']
fine_grained_dfs = []
for split in splits:
    fp = f'{fine_grained_data_dir}/{split}.json'
    fine_grained_df = pd.read_json(fp, lines=True)
    fine_grained_dfs.append(fine_grained_df)
fine_grained_df = pd.concat(fine_grained_dfs)

mlb_pc, mlb_attack = MultiLabelBinarizer(), MultiLabelBinarizer()
mlb_pc.fit(fine_grained_df['gold_pc'])
mlb_attack.fit(fine_grained_df['gold_attack'])

MultiLabelBinarizer()

In [12]:
splits = ['train', 'dev_seen', 'dev_unseen']
fine_grained_dfs = []
for split in splits:
    fp = f'{fine_grained_data_dir}/{split}.json'
    fine_grained_df = pd.read_json(fp, lines=True)
    # transform 'pc' like 'gold_pc'
    fine_grained_df['pc'] = fine_grained_df['pc'].apply(find_unique_labels, empty_replacement='pc_empty')
    # transform 'attack' like 'gold_attack'
    fine_grained_df['attack'] = fine_grained_df['attack'].apply(find_unique_labels, empty_replacement='attack_empty')
    # binarize 'gold_pc' and 'gold_attack'
    new_cols = [x+'_gold_pc' for x in mlb_pc.classes_]
    fine_grained_df[new_cols] = mlb_pc.transform(fine_grained_df['gold_pc'])
    new_cols = [x+'_gold_attack' for x in mlb_attack.classes_]
    fine_grained_df[new_cols] = mlb_attack.transform(fine_grained_df['gold_attack'])
    # binarize 'pc' and 'attack'
    new_cols = [x+'_pc' for x in mlb_pc.classes_]
    fine_grained_df[new_cols] = mlb_pc.transform(fine_grained_df['pc'])
    new_cols = [x+'_attack' for x in mlb_attack.classes_]
    fine_grained_df[new_cols] = mlb_attack.transform(fine_grained_df['attack'])

    fine_grained_dfs.append(fine_grained_df)

fine_grained_df = pd.concat(fine_grained_dfs)
print(fine_grained_df.shape)
fine_grained_df.head()
    

(9540, 37)




Unnamed: 0,id,set_name,img,text,gold_hate,gold_pc,gold_attack,pc,attack,disability_gold_pc,...,religion_pc,sex_pc,attack_empty_attack,contempt_attack,dehumanizing_attack,exclusion_attack,inciting_violence_attack,inferiority_attack,mocking_attack,slurs_attack
0,42953,train,img/42953.png,its their character not their color that matters,[not_hateful],[pc_empty],[attack_empty],[pc_empty],[attack_empty],0,...,0,0,1,0,0,0,0,0,0,0
1,23058,train,img/23058.png,don't be afraid to love again everyone is not ...,[not_hateful],[pc_empty],[attack_empty],[pc_empty],[attack_empty],0,...,0,0,1,0,0,0,0,0,0,0
2,13894,train,img/13894.png,putting bows on your pet,[not_hateful],[pc_empty],[attack_empty],[pc_empty],[attack_empty],0,...,0,0,1,0,0,0,0,0,0,0
3,37408,train,img/37408.png,i love everything and everybody! except for sq...,[not_hateful],[pc_empty],[attack_empty],[pc_empty],[attack_empty],0,...,0,0,1,0,0,0,0,0,0,0
4,82403,train,img/82403.png,"everybody loves chocolate chip cookies, even h...",[not_hateful],[pc_empty],[attack_empty],[pc_empty],[attack_empty],0,...,0,0,1,0,0,0,0,0,0,0


In [13]:
(fine_grained_df['gold_pc'] == fine_grained_df['pc']).mean()

0.8805031446540881

In [14]:
(fine_grained_df['gold_attack'] == fine_grained_df['attack']).mean()

0.8276729559748428

In [15]:
cols_to_remove = ['img', 'text', 'gold_hate']
fine_grained_df = fine_grained_df.drop(columns=cols_to_remove)
fine_grained_df = fine_grained_df.rename(columns={'set_name':'split'})
print(fine_grained_df.shape)
fine_grained_df.head()

(9540, 34)


Unnamed: 0,id,split,gold_pc,gold_attack,pc,attack,disability_gold_pc,nationality_gold_pc,pc_empty_gold_pc,race_gold_pc,...,religion_pc,sex_pc,attack_empty_attack,contempt_attack,dehumanizing_attack,exclusion_attack,inciting_violence_attack,inferiority_attack,mocking_attack,slurs_attack
0,42953,train,[pc_empty],[attack_empty],[pc_empty],[attack_empty],0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,23058,train,[pc_empty],[attack_empty],[pc_empty],[attack_empty],0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
2,13894,train,[pc_empty],[attack_empty],[pc_empty],[attack_empty],0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
3,37408,train,[pc_empty],[attack_empty],[pc_empty],[attack_empty],0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,82403,train,[pc_empty],[attack_empty],[pc_empty],[attack_empty],0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [16]:
fine_grained_df.columns

Index(['id', 'split', 'gold_pc', 'gold_attack', 'pc', 'attack',
       'disability_gold_pc', 'nationality_gold_pc', 'pc_empty_gold_pc',
       'race_gold_pc', 'religion_gold_pc', 'sex_gold_pc',
       'attack_empty_gold_attack', 'contempt_gold_attack',
       'dehumanizing_gold_attack', 'exclusion_gold_attack',
       'inciting_violence_gold_attack', 'inferiority_gold_attack',
       'mocking_gold_attack', 'slurs_gold_attack', 'disability_pc',
       'nationality_pc', 'pc_empty_pc', 'race_pc', 'religion_pc', 'sex_pc',
       'attack_empty_attack', 'contempt_attack', 'dehumanizing_attack',
       'exclusion_attack', 'inciting_violence_attack', 'inferiority_attack',
       'mocking_attack', 'slurs_attack'],
      dtype='object')

In [17]:
info_df = pd.read_csv(info_fp)
print(info_df.shape)
info_df.head()

(10000, 8)


Unnamed: 0,id,img,label,text,split,text_idx,pseudo_text_idx,pseudo_img_idx
0,42953,img/42953.png,0,its their character not their color that matters,train,0,4901,0
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...,train,1,3620,3486
2,13894,img/13894.png,0,putting bows on your pet,train,2,0,3956
3,37408,img/37408.png,0,i love everything and everybody! except for sq...,train,3,4451,4313
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h...",train,4,5609,1


In [18]:
# merge fine_grained_df with info_df
info_df = pd.merge(info_df, fine_grained_df, on=['id', 'split'], how='left')
print(info_df.shape)
info_df.tail()

(10000, 40)


Unnamed: 0,id,img,label,text,split,text_idx,pseudo_text_idx,pseudo_img_idx,gold_pc,gold_attack,...,religion_pc,sex_pc,attack_empty_attack,contempt_attack,dehumanizing_attack,exclusion_attack,inciting_violence_attack,inferiority_attack,mocking_attack,slurs_attack
9995,3869,img/03869.png,0,a mother's love for the child is a divine thing,test_seen,8041,4783,4628,,,...,,,,,,,,,,
9996,23817,img/23817.png,0,sea monkeys,test_seen,1103,367,6246,,,...,,,,,,,,,,
9997,56280,img/56280.png,0,little miss muffet sat on her tuffet,test_seen,8042,2518,2395,,,...,,,,,,,,,,
9998,29384,img/29384.png,0,they're in a row,test_seen,8043,6811,1609,,,...,,,,,,,,,,
9999,34127,img/34127.png,0,that feeling when you win a fifa game after be...,test_seen,8044,2908,1632,,,...,,,,,,,,,,


In [19]:
float_cols = info_df.select_dtypes(float).columns
info_df[float_cols] = info_df.select_dtypes(float).astype('Int64')
info_df.head()

Unnamed: 0,id,img,label,text,split,text_idx,pseudo_text_idx,pseudo_img_idx,gold_pc,gold_attack,...,religion_pc,sex_pc,attack_empty_attack,contempt_attack,dehumanizing_attack,exclusion_attack,inciting_violence_attack,inferiority_attack,mocking_attack,slurs_attack
0,42953,img/42953.png,0,its their character not their color that matters,train,0,4901,0,[pc_empty],[attack_empty],...,0,0,1,0,0,0,0,0,0,0
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...,train,1,3620,3486,[pc_empty],[attack_empty],...,0,0,1,0,0,0,0,0,0,0
2,13894,img/13894.png,0,putting bows on your pet,train,2,0,3956,[pc_empty],[attack_empty],...,0,0,1,0,0,0,0,0,0,0
3,37408,img/37408.png,0,i love everything and everybody! except for sq...,train,3,4451,4313,[pc_empty],[attack_empty],...,0,0,1,0,0,0,0,0,0,0
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h...",train,4,5609,1,[pc_empty],[attack_empty],...,0,0,1,0,0,0,0,0,0,0


In [20]:
info_df['split'].value_counts()

train        8500
test_seen    1000
dev_seen      500
Name: split, dtype: int64

In [21]:
info_df.describe()

Unnamed: 0,id,label,text_idx,pseudo_text_idx,pseudo_img_idx,disability_gold_pc,nationality_gold_pc,pc_empty_gold_pc,race_gold_pc,religion_gold_pc,...,religion_pc,sex_pc,attack_empty_attack,contempt_attack,dehumanizing_attack,exclusion_attack,inciting_violence_attack,inferiority_attack,mocking_attack,slurs_attack
count,10000.0,10000.0,10000.0,10000.0,10000.0,9000.0,9000.0,9000.0,9000.0,9000.0,...,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0
mean,49934.6871,0.3756,3808.3432,2996.1323,2878.3948,0.030778,0.039,0.638778,0.120667,0.130333,...,0.139333,0.113333,0.638556,0.058778,0.190667,0.022778,0.079556,0.136889,0.055444,0.031222
std,28537.752855,0.484302,2370.330576,2297.623341,2267.02814,0.172725,0.193606,0.480381,0.325758,0.336689,...,0.346313,0.317017,0.480446,0.235221,0.392849,0.149203,0.270619,0.343749,0.228858,0.173927
min,1235.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25728.0,0.0,1685.75,920.0,830.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,49739.0,0.0,3681.5,2498.5,2351.5,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,74387.0,1.0,5843.25,4998.25,4851.25,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,98764.0,1.0,8044.0,7498.0,7351.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
info_df.to_csv(info_fp.replace('info', 'info_fine_grained'), index=False)

In [24]:
pc_columns = [col for col in info_df.columns if col.endswith('_pc') and not 'gold' in col]
attack_columns = [col for col in info_df.columns if col.endswith('_attack') and not 'gold' in col]
fine_grained_labels = pc_columns + attack_columns
print(fine_grained_labels)

['disability_pc', 'nationality_pc', 'pc_empty_pc', 'race_pc', 'religion_pc', 'sex_pc', 'attack_empty_attack', 'contempt_attack', 'dehumanizing_attack', 'exclusion_attack', 'inciting_violence_attack', 'inferiority_attack', 'mocking_attack', 'slurs_attack']


In [27]:
with open('fine_grained_labels.txt', 'w') as file:
    file.writelines([line+'\n' for line in fine_grained_labels])