In [62]:
import pandas as pd
import os 
import numpy as np
import json

## Extraction of the hatebase vocabulary

This notebook extracts a set of words from the Hatebase database (https://hatebase.org/). The data is spread across 16 different JSON files, corresponding to 16 unique API requests sent to the Hatebase server; the only parameter specified in those requests was that the words be in English.
From the JSON response files, we extract the `term` (a string with an offensive word) and a set of binary indicator parameters providing information on the group the word is attacking (a term can have more than one of them set). The parameters are:
`"is_about_nationality", "is_about_ethnicity", "is_about_religion", "is_about_gender", "is_about_sexual_orientation", "is_about_disability", "is_about_class".`

In [71]:
# Response files are named "<prefix><NN><extension>", e.g. ./hatebase/response01.json.
hatebase_path = "./hatebase/response"
json_extension = ".json"

# Zero-padded identifiers "01" .. "16", one per saved API response.
number_list = ["{:02d}".format(i) for i in range(1, 17)]

# Terms, in the order they are encountered across the response files.
hate_list = []

# Indicator fields copied from every entry; they become the matrix columns, in this order.
features = [
    "is_about_nationality",
    "is_about_ethnicity",
    "is_about_religion",
    "is_about_gender",
    "is_about_sexual_orientation",
    "is_about_disability",
    "is_about_class"
]

# Rows are accumulated in a list and converted once at the end, so the matrix
# always has exactly one row per extracted term instead of relying on a
# hard-coded row count that breaks as soon as the response files change.
feature_rows = []

for response_number in number_list:
    path_to_file = hatebase_path + response_number + json_extension
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(path_to_file, encoding="utf-8") as file:
        data = json.load(file)
    # The file is already closed here; only the parsed payload is needed.
    for entry in data['result']:
        feature_rows.append([entry[name] for name in features])
        hate_list.append(str(entry['term']))

# Float matrix of shape (number of terms, number of features). The explicit
# reshape keeps the result two-dimensional even when no entry was found.
one_hot_features = np.array(feature_rows, dtype=float).reshape(
    len(feature_rows), len(features)
)

# Number of entries processed; kept for any later cell that reads it.
row = len(hate_list)

print(one_hot_features.shape)
print(len(hate_list))
            

(1556, 7)
1556


In [75]:
# Start from the indicator matrix, one column per entry of `features` ...
out_df = pd.DataFrame(one_hot_features, columns=features)

# ... then place the term itself in front as the leading "Term" column.
out_df.insert(0, "Term", hate_list)

# Preview the first rows of the assembled table.
out_df.head(5)

Unnamed: 0,Term,is_about_nationality,is_about_ethnicity,is_about_religion,is_about_gender,is_about_sexual_orientation,is_about_disability,is_about_class
0,mudslime,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,muslimette,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,sand apes,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,sand ape,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,porki,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [76]:
# Output locations: the bare term list (JSON array) and the full table (CSV).
path_to_outlist = "./hatebase/hatelist.txt"
path_to_outdf = "./hatebase/hateframe.csv"

# Serialise the list of terms straight into the text file.
with open(path_to_outlist, 'w') as outfile:
    json.dump(hate_list, outfile)

# Write the terms together with their indicator columns.
out_df.to_csv(path_to_outdf)
