In [1]:
import os
import csv
import pickle

## Load the dataset

In [2]:
data_dir = "../data"
hurtlex_file = "revised_hurtlex.tsv"

# Read the whole tab-separated lexicon into memory: one dict per data row,
# keyed by the column names taken from the header line.
# newline='' hands line-ending handling to the csv module (required by the csv
# documentation so that quoted fields containing newlines are parsed correctly).
# The explicit encoding keeps decoding independent of the platform locale;
# this assumes the TSV is UTF-8 encoded — TODO confirm.
with open(os.path.join(data_dir, hurtlex_file), newline='', encoding='utf-8') as infile:
    lexicon_list = list(csv.DictReader(infile, delimiter='\t'))

In [3]:
# Inspect the column names available on the first lexicon row.
lexicon_list[0].keys()

dict_keys(['pos', 'category', 'lemma', 'offensiveness_score'])

In [4]:
def create_lexicon_dict(lexicon_list):
    """Map each lemma in the lexicon to its numeric offensiveness score.

    Args:
        lexicon_list: iterable of row mappings, each providing a 'lemma' key
            and an 'offensiveness_score' key.

    Returns:
        dict mapping lemma -> float score. Rows whose score is missing (None),
        empty, or not parseable as a number are stored with 0.0. When a lemma
        occurs in several rows, the last row wins.
    """
    lexicon_dict = {}
    for item in lexicon_list:
        try:
            score = float(item['offensiveness_score'])
        except (TypeError, ValueError):
            # TypeError: the field is None (csv.DictReader pads short rows with None).
            # ValueError: the field is an empty or non-numeric string.
            score = 0.0
        lexicon_dict[item['lemma']] = score
    return lexicon_dict

# Build the lemma -> score lookup table from the rows loaded above.
lexicon_dict = create_lexicon_dict(lexicon_list)

In [5]:
# Spot-check: look up the score stored for one lemma (None if the lemma is absent).
lexicon_dict.get("zavorra")

1.9338529209592425

## Annotate Documents

We assign each document an offensiveness score equal to the sum of the lexicon scores of all lemmas in the document; lemmas that do not appear in the lexicon contribute 0.

We use the text already lemmatized with the Stanza pipeline.

In [6]:
# Location of the pre-processed training set; `results_dir` is also where the
# final CSV of scores is written.
results_dir = "../results"
pickle_file = "stanza_proc_train.pkl"

# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted run of the preprocessing step.
with open(os.path.join(results_dir, pickle_file), 'rb') as infile:
    # presumably a sequence of dicts with 'id' and 'proc_text' keys (that is
    # how later cells index it) — TODO confirm against the preprocessing code
    train_set = pickle.load(infile)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Exploratory sample: grab the processed text of one training document
# (index 100 is arbitrary and requires at least 101 documents).
# NOTE(review): `doc` is not read by any later cell visible here.
doc = train_set[100]['proc_text']

In [8]:
def get_offensiveness_score(document, lexicon=None):
    """Sum the lexicon scores of every lemma in a processed document.

    Args:
        document: mapping whose 'proc_text' value exposes ``iter_words()``,
            yielding objects with a ``lemma`` attribute (presumably a Stanza
            document — verify against the preprocessing step).
        lexicon: optional lemma -> score mapping. When omitted, the
            notebook-level ``lexicon_dict`` is used, so existing
            single-argument calls behave as before.

    Returns:
        The sum of the scores of all words; lemmas that are not in the
        lexicon (including a lemma of None) contribute 0.
    """
    if lexicon is None:
        # Resolved at call time so the function always sees the current
        # notebook-level lexicon rather than a snapshot taken at definition.
        lexicon = lexicon_dict
    return sum(
        lexicon.get(word.lemma, 0)
        for word in document['proc_text'].iter_words()
    )

In [9]:
# Score every training document, preserving the order of train_set.
scores = [get_offensiveness_score(entry) for entry in train_set]

In [10]:
import pandas as pd

# One row per training document: indexed by the document id, with a single
# column holding the summed lexicon score.
document_ids = [entry['id'] for entry in train_set]
scores_df = pd.DataFrame(
    data=scores,
    index=document_ids,
    columns=['offensiveness_score'],
)
scores_df.tail()

Unnamed: 0,offensiveness_score
9340,0.0
9121,1.752922
8549,0.0
9240,0.0
8000,0.0


In [11]:
# Write the per-document scores into the results directory.
offensiveness_csv = os.path.join(results_dir, 'offensiveness_train.csv')
scores_df.to_csv(offensiveness_csv)