In [1]:
import csv
import os
import pickle

import pandas as pd

## Load the dataset

In [2]:
data_dir = "../data"
hurtlex_file = "revised_hurtlex.tsv"

# newline='' lets the csv module handle line endings itself, as its
# documentation requires for file objects. The explicit encoding keeps
# non-ASCII lemmas from depending on the platform default
# (assumes the lexicon file is UTF-8 -- TODO confirm).
with open(os.path.join(data_dir, hurtlex_file), encoding='utf-8', newline='') as infile:
    # One dict per TSV row, keyed by the names found in the header line.
    lexicon_list = list(csv.DictReader(infile, delimiter='\t'))

In [3]:
# Inspect the field names available on a lexicon row (taken from the first row).
lexicon_list[0].keys()

dict_keys(['pos', 'category', 'lemma', 'offensiveness_score'])

In [4]:
# The 'category' field is a ';'-separated string; a plain split can leave
# empty fragments, which get_categories filters out.
lexicon_list[10]['category'].split(';')

['qas', 'ddf', '']

In [5]:
def get_score(item):
    """Return the offensiveness score of a lexicon row as a float.

    Args:
        item: Mapping with an 'offensiveness_score' entry (a string as read
            from the TSV, or None when the field is missing from the row).

    Returns:
        The parsed score, or 0 when the value is missing (None) or cannot be
        parsed as a number (e.g. an empty string).
    """
    try:
        return float(item['offensiveness_score'])
    except (TypeError, ValueError):
        # TypeError: value is None; ValueError: empty or malformed string.
        return 0

In [6]:
def get_categories(item):
    """Return the list of category codes attached to a lexicon row.

    Args:
        item: Mapping with a 'category' entry holding a ';'-separated string
            (or None when the field is missing from the row).

    Returns:
        The category codes with surrounding whitespace removed, in their
        original order. Empty and whitespace-only fragments are dropped.
    """
    raw = item['category'] or ''
    # Strip first, then filter, so fragments like ' ' do not survive as ''.
    stripped = (fragment.strip() for fragment in raw.split(';'))
    return [code for code in stripped if code]

In [14]:
# Every distinct category code that occurs anywhere in the lexicon.
{
    code
    for entry in lexicon_list
    for code in get_categories(entry)
}

{'an',
 'asf',
 'asm',
 'cds',
 'ddf',
 'ddp',
 'dfc',
 'dm',
 'dmc',
 'is',
 'mal',
 'mi',
 'min',
 'om',
 'op',
 'or',
 'pa',
 'pr',
 'ps',
 'qas',
 'rci',
 're',
 'svp'}

In [8]:
# Lookup table: lemma -> (score, categories).
# NOTE: if a lemma appears on several rows, the last row overwrites the
# earlier ones.
lexicon_dict = dict(
    (entry['lemma'], (get_score(entry), get_categories(entry)))
    for entry in lexicon_list
)

In [9]:
# Spot-check one lemma: yields its (score, categories) tuple, or None if the
# lemma is not in the lexicon.
lexicon_dict.get("zavorra")

(1.9338529209592425, ['qas'])

## Annotate Documents

### Global offensiveness score

We assign each document a global offensiveness score: the sum of the lexicon scores of all lemmas in the document (lemmas that are not in the lexicon contribute 0).

We use the text already lemmatized with the Stanza pipeline.

In [10]:
results_dir = "../results"
pickle_file = "stanza_proc_train.pkl"

# Load the pickled training set (presumably the Stanza-processed documents --
# confirm against the script that wrote this file).
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by a trusted pipeline.
with open(os.path.join(results_dir, pickle_file), 'rb') as infile:
    train_set = pickle.load(infile)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Keep one processed text at hand for ad-hoc inspection; the cells visible in
# this section do not read this variable afterwards.
doc = train_set[100]['proc_text']

In [22]:
def get_offensiveness_score(document, lexicon=None):
    """Sum the lexicon scores of every lemma in a processed document.

    Args:
        document: Mapping whose 'proc_text' value exposes ``iter_words()``;
            each yielded word must have a ``lemma`` attribute.
        lexicon: Optional mapping ``lemma -> (score, categories)``. When
            omitted, the notebook-level ``lexicon_dict`` is used.

    Returns:
        The total score. Lemmas absent from the lexicon contribute nothing,
        so a document with no matching lemma scores 0.
    """
    if lexicon is None:
        lexicon = lexicon_dict
    score = 0
    for word in document['proc_text'].iter_words():
        entry = lexicon.get(word.lemma)
        if entry is not None:
            score += entry[0]
    return score

In [18]:
# Global offensiveness score of every training document, in train_set order.
scores = [get_offensiveness_score(entry) for entry in train_set]

In [31]:
# One row per document, indexed by document id. pandas is imported in the
# notebook's top import cell rather than in the middle of the analysis.
scores_df = pd.DataFrame(scores, index=[doc['id'] for doc in train_set], columns=['offensiveness_score'])
# Preview the highest-scoring documents.
scores_df.sort_values('offensiveness_score', ascending=False).head()

Unnamed: 0,offensiveness_score
8482,30.823949
9180,25.760301
8705,25.418626
6946,23.854857
9201,23.581941


In [20]:
# Persist the per-document totals. The index (document ids) is written as the
# first CSV column, which has no header name.
scores_df.to_csv(os.path.join(results_dir, 'offensiveness_train.csv'))

### Offensiveness by category

Each offensive term in the HurtLex lexicon belongs to one or more categories.
For each document we compute a separate offensiveness score per category: a lemma's score is added to every category the lemma belongs to.

In [39]:
def get_offensiveness_score_by_cat(document, lexicon=None):
    """Accumulate a document's offensiveness score separately per category.

    Args:
        document: Mapping whose 'proc_text' value exposes ``iter_words()``;
            each yielded word must have a ``lemma`` attribute.
        lexicon: Optional mapping ``lemma -> (score, categories)``. When
            omitted, the notebook-level ``lexicon_dict`` is used.

    Returns:
        A dict ``category -> summed score``. A lemma's score is added to every
        category it belongs to. Categories that no scored lemma of the
        document belongs to are absent from the dict.
    """
    if lexicon is None:
        lexicon = lexicon_dict
    scores = {}
    for word in document['proc_text'].iter_words():
        entry = lexicon.get(word.lemma)
        if entry is None:
            continue
        score, categories = entry
        # Entries with a zero score contribute nothing and are skipped, so
        # they do not create a category key on their own.
        if not score:
            continue
        for cat in categories:
            scores[cat] = scores.get(cat, 0) + score
    return scores

In [42]:
# Per-category score dict of every training document, in train_set order.
scores = [get_offensiveness_score_by_cat(entry) for entry in train_set]

In [49]:
# One row per document and one column per category seen in the scores;
# categories a document never hits come out as NaN and are filled with 0.
scores_by_cat_df = (
    pd.DataFrame(scores, index=[entry['id'] for entry in train_set])
    .fillna(0)
)
scores_by_cat_df.columns

Index(['rci', 'ps', 'ddp', 'qas', 'dmc', 'cds', 're', 'svp', 'is', 'an', 'min',
       'op', 'dfc', 'pr', 'asf', 'mi', 'asm', 'ddf', 'pa', 'mal', 'or', 'om'],
      dtype='object')

In [50]:
# Persist the per-category scores. The index (document ids) is written as the
# first CSV column, which has no header name.
scores_by_cat_df.to_csv(os.path.join(results_dir, 'offensiveness_by_cat_train.csv'))