In [1]:
import pymongo
import csv
import json
from random import randint
from bson.objectid import ObjectId
from nltk import agreement
import datetime
from json import JSONEncoder

# 1. Select messages based on classifiers
In this part we select messages that we will then label by hand. We select the messages in the following way:

* In total we select 700 messages
* There should not be any duplicates
* For each of the seven classifiers (Perspective API, Germeval 18, Germeval 19, Germeval 1819, Hasoc 2020, Hasoc 1920, Covid 2021) we select 50 OFFENSIVE messages and 50 NEUTRAL messages

In [10]:
# Connect to the MongoDB server on localhost, port 27017 (the URI carries no credentials).
client = pymongo.MongoClient("mongodb://localhost:27017/")

In [11]:
# Handle to the `messages` collection of the `telegram` database; used by every query below.
messages_collection = client.telegram.messages


def get_german_messages():
    """Load all messages whose `language` field is "de" and whose text is set.

    Only the id, the text / link-preview fields and the stored classifier
    outputs are projected. The whole result is materialised as a list in memory.

    Returns:
        list[dict]: the matching documents with a non-None `text`.
    """
    projected_fields = (
        "_id", "text", "link_preview", "link_preview_title",
        "hate_germeval_18", "hate_germeval_19",
        "hate_germeval_1819", "toxicity_score_preprocessed", "toxicity_score",
        "hate_hasoc_2020", "hate_hasoc_1920", "hate_covid_2021",
    )
    projection = {field: 1 for field in projected_fields}
    cursor = messages_collection.find({"language": "de"}, projection)

    german_messages = []
    for document in cursor:
        # Messages without text cannot be annotated, so drop them here.
        if document['text'] is not None:
            german_messages.append(document)
    return german_messages

In [6]:
messages = get_german_messages()
print(len(messages))

5439691


As the different classifiers use different label names, we define a helper function that converts these labels into OFFENSE and NEUTRAL.

In [4]:
# Unified binary labels that every classifier-specific label is mapped onto.
OFFENSE, NEUTRAL = 1, 0

In [5]:
def get_label(msg, classifier):
    """Map the stored prediction of `classifier` for `msg` onto OFFENSE / NEUTRAL.

    Args:
        msg: message document (dict) holding the classifier outputs.
        classifier: one of "perspective_api", "hate_germeval_18",
            "hate_germeval_19", "hate_germeval_1819", "hate_hasoc_2020",
            "hate_hasoc_1920", "hate_covid_2021".

    Returns:
        OFFENSE or NEUTRAL.

    Raises:
        ValueError: if `classifier` is not one of the supported names
            (previously this case silently returned None).
        KeyError: if `msg` lacks the field the classifier needs.
    """
    if classifier == "perspective_api":
        # Prefer the score stored for the preprocessed text when it exists.
        if "toxicity_score_preprocessed" in msg:
            score = msg['toxicity_score_preprocessed']
        else:
            score = msg['toxicity_score']
        return OFFENSE if score > 0.5 else NEUTRAL

    # Each classifier family uses its own name for the offensive class.
    offensive_name_by_classifier = {
        "hate_germeval_18": 'OFFENSE',
        "hate_germeval_19": 'OFFENSE',
        "hate_germeval_1819": 'OFFENSE',
        "hate_hasoc_2020": 'HOF',
        "hate_hasoc_1920": 'HOF',
        "hate_covid_2021": 'abusive',
    }
    if classifier not in offensive_name_by_classifier:
        raise ValueError("Unknown classifier: " + repr(classifier))

    predicted = msg[classifier]['label']
    return OFFENSE if predicted == offensive_name_by_classifier[classifier] else NEUTRAL

If there is a link preview and/or a link preview title, then we want to append those at the end of the message. To do this we define another helper function that appends the link preview (title) if available.

In [6]:
def get_full_text(msg):
    """Return the message text followed by its link preview title / body, if any.

    Args:
        msg: message document (dict). `text` is required and must be a string;
            `link_preview_title` and `link_preview` are optional and may be
            missing or None.

    Returns:
        str: the text, with each available preview part appended under its
        own "+++" separator line.
    """
    text = msg['text']
    # .get() because a projected document does not contain fields the stored
    # document lacks; an absent field is treated the same as None.
    preview_title = msg.get('link_preview_title')
    preview = msg.get('link_preview')
    if preview_title is not None:
        text += '\n+++ LINK PREVIEW TITLE ++++++++++++++++++++++++++\n' + preview_title
    if preview is not None:
        text += '\n+++ LINK PREVIEW ++++++++++++++++++++++++++\n' + preview + '\n++++++++++++++++++++++++++++++++++++++++++++++++++++'
    return text

Now we select 50 offensive and 50 neutral messages for each classifier.

In [10]:
# Names of the classifiers to sample for; each is passed to get_label().
classifiers = [
    "perspective_api",
    "hate_germeval_18",
    "hate_germeval_19",
    "hate_germeval_1819",
    "hate_hasoc_2020",
    "hate_hasoc_1920",
    "hate_covid_2021",
]
# Collects (full_text, message_id) pairs across all classifiers.
selected_messages = set()

In [11]:
# Number of messages to draw per classifier and per label.
samples_per_label = 50

# `selected_messages` holds (text, _id) tuples, so a bare text can never be
# "in" it. Track the texts separately so duplicates are really excluded.
selected_texts = {text for text, _ in selected_messages}

# NOTE: no random seed is set, so every run draws a different sample.
for classifier in classifiers:
    # How many messages are still needed for each label of this classifier.
    remaining = {OFFENSE: samples_per_label, NEUTRAL: samples_per_label}
    while any(remaining.values()):
        msg = messages[randint(0, len(messages) - 1)]
        label = get_label(msg, classifier)
        if remaining.get(label, 0) == 0:
            # Quota for this label is already filled.
            continue
        text = get_full_text(msg)
        if text in selected_texts:
            # Same text was already selected (by this or another classifier).
            continue
        selected_texts.add(text)
        selected_messages.add((text, msg['_id']))
        remaining[label] -= 1

Each selected message is saved in its own text file. The name of the text file is the id of the message, so that we can later query the corresponding message again.

In [13]:
# Write every selected message to messages_for_annotation/<message id>.txt.
for msg, _id in selected_messages:
    # `with` closes the file even if the write fails; explicit UTF-8 so that
    # non-ASCII message text is written the same way on every platform.
    with open("messages_for_annotation/" + str(_id) + ".txt", "w", encoding="utf-8") as f:
        f.write(msg)

# 2. Select messages according to topic model
Additionally, we select 450 messages according to a topic model. We did not do this ourselves but received a list of the selected message ids. Now we have to read that list, select the corresponding messages and save them.

In [10]:
# Read the id list; every CSV row becomes a one-element list of strings.
with open('ID_collection_topic_model.csv', newline='') as f:
    rows = csv.reader(f)
    next(rows, None)  # drop the first row (presumably the column header)
    ids = list(rows)

print(ids)

[['605356bdfb9ea2ab4ac179aa'], ['60551b84210cd4ce6eb7ef3f'], ['605614484a7c0d0e24d9fb3c'], ['6054f325077685f1acb7d574'], ['6057cf17732b5380b43bc614'], ['6051ff616216bf3ffd44fda1'], ['605201fce0a38c0d01451e64'], ['6054f325077685f1acb7d589'], ['60534ed22e37b25760c11a4e'], ['605201b85c9a117ec7451a9b'], ['6051ff1d6216bf3ffd44fa41'], ['605205493e09b72d0a45445e'], ['605201dc7fbae3865545234a'], ['6055eec4f7099c2a96d9fa67'], ['605940615d6e5dfe12366588'], ['60521a50e22eda1ba8454f6e'], ['60595738fa0b4206ba366ea2'], ['605209fd207466b93f456e9a'], ['6059efed35bdf11df0366ba8'], ['6057d903fe5b029d853bcbee'], ['605201227fbae386554514d1'], ['60538610ed2127b15fc10322'], ['6055e050f6bbcd7175da20de'], ['60563d43c6a1cc1c7cd9ea24'], ['60534eff2e37b25760c11c30'], ['60522f1d97d4fc578a452ffc'], ['605770b143cddce45a60d396'], ['6059ff158f0c73b185366d21'], ['60552ce03df9ed0b73b8443c'], ['60537bbf06a5ab4d73c0fe76'], ['605208a5207466b93f455a98'], ['605527cdea54dee495b7f024'], ['6054d5939c8a27168dc336d9'], ['60550b0

In [19]:
# Look up every id from the topic-model list and keep the messages that have text.
selected_messages = []
for row in ids:
    if not row:
        # csv.reader yields an empty list for a blank line.
        continue
    _id = row[0]
    msg = messages_collection.find_one(
        {'_id': ObjectId(_id)},
        {"_id": 1, "text": 1, "link_preview": 1, "link_preview_title": 1},
    )
    if msg is None:
        # find_one returns None when no document matches; report it instead of crashing.
        print('No message found for id ' + _id)
        continue
    if msg['text'] is not None:
        selected_messages.append((get_full_text(msg), msg['_id']))

In [20]:
print(len(selected_messages))

449


In [21]:
# Write every topic-model message to sampled_messages_topic_model/<message id>.txt.
for msg, _id in selected_messages:
    # `with` closes the file even if the write fails; explicit UTF-8 so that
    # non-ASCII message text is written the same way on every platform.
    with open("sampled_messages_topic_model/" + str(_id) + ".txt", "w", encoding="utf-8") as f:
        f.write(msg)

# 3. Analyze results of the sample round
We labeled 50 messages in a sample round to see whether the inter-rater reliability is reasonably good. In this part we analyze the results of this sample round and calculate Krippendorff's alpha.

In [23]:
# Load the exported sample-round annotations; `with` closes the file handle.
with open('sample_round_results.json') as f:
    data = json.load(f)
print(len(data))

50


In [24]:
print(data)

[{'content': 'https://cloud.kili-technology.com/api/label/v2/files?id=933825df-fc22-4e1a-8966-fe1e61355703', 'externalId': '605a06cede26ef3a9b36650e.txt', 'id': 'ckql4dq8301kk0lrqf3i03go9', 'jsonMetadata': {}, 'labels': [{'author': {'email': 'maximilian.wich@tum.de', 'id': 'ckql085ur05cc0kpm2e3f0mk1', 'name': 'maximilian.wich@tum.de'}, 'createdAt': '2021-07-01T16:26:33.474Z', 'isLatestLabelForUser': True, 'jsonResponse': 'NEUTRAL', 'labelType': 'DEFAULT', 'modelName': None, 'skipped': False}, {'author': {'email': 'jeremias.bohn@tum.de', 'id': 'ckql0eeh0000r0kqh59l9h8vw', 'name': 'jeremias.bohn@tum.de'}, 'createdAt': '2021-07-01T16:34:05.004Z', 'isLatestLabelForUser': True, 'jsonResponse': 'NEUTRAL', 'labelType': 'DEFAULT', 'modelName': None, 'skipped': False}, {'author': {'email': 'ge46rev@mytum.de', 'id': 'ckql0dvoi00280kqg68v67voe', 'name': 'ge46rev@mytum.de'}, 'createdAt': '2021-07-01T19:24:53.437Z', 'isLatestLabelForUser': True, 'jsonResponse': 'NEUTRAL', 'labelType': 'DEFAULT', 'm

In [28]:
# Build (annotator, item, label) triples, the input format of AnnotationTask.
assignments = []
for entry in data:
    # externalId is "<message id>.txt"; strip the 4-character extension.
    message_id = entry['externalId'][:-4]
    for vote in entry['labels']:
        annotator = vote['author']['id']
        # Keep only each annotator's latest, non-skipped label.
        if not vote['isLatestLabelForUser'] or vote['skipped']:
            continue
        assignments.append((annotator, message_id, vote['jsonResponse']))
            

In [29]:
print(len(assignments))

245


In [30]:
# Inter-annotator agreement over the collected (annotator, item, label) triples.
ratingtask = agreement.AnnotationTask(data=assignments)
alpha = ratingtask.alpha()
print(f"Krippendorff's alpha: {alpha}")

Krippendorff's alpha: 0.6315102040816327


# 4. Get gold labels from annotation results
Now we will analyze the annotation results and set the gold labels for the selected messages. The gold label will be the label with the majority vote. If there is a tie, then the message will be relabeled in another labeling round.

In [17]:
# Label strings as they appear in the annotation export (`jsonResponse`).
neutral_label, offensive_label = 'NEUTRAL', 'OFFENSIVE_ABUSIVE'

In [15]:
def save_label(msg_id, label):
    """Write `label` into the `gold_label` field of the message with id `msg_id`.

    Args:
        msg_id: message id as a hex string (converted to an ObjectId).
        label: the gold label to store.
    """
    messages_collection.update_one(
        {'_id': ObjectId(msg_id)},
        {"$set": {"gold_label": label}},
    )

In [18]:
def analyze_annotation_results(annotation_data):
    """Set the majority-vote gold label for each annotated message.

    For every entry, each annotator's latest non-skipped label is counted.
    If one label has more votes, it is saved to the database via save_label();
    if neutral and offensive votes are equal, nothing is saved.

    Args:
        annotation_data: list of exported annotation entries, each with an
            `externalId` ("<message id>.txt") and a list of `labels`.

    Returns:
        list[str]: ids of the messages whose vote ended in a tie.
    """
    tied_ids = []
    for entry in annotation_data:
        message_id = entry['externalId'][:-4]
        votes = [
            vote['jsonResponse']
            for vote in entry['labels']
            if vote['isLatestLabelForUser'] and not vote['skipped']
        ]
        n_neutral = votes.count(neutral_label)
        n_offensive = votes.count(offensive_label)

        if n_neutral == n_offensive:
            tied_ids.append(message_id)
        elif n_neutral > n_offensive:
            save_label(message_id, neutral_label)
        else:
            save_label(message_id, offensive_label)
    return tied_ids

First we will look at the 50 messages from the sample round.

In [4]:
# Load the exported sample-round annotations; `with` closes the file handle.
with open('sample_round_results.json') as f:
    data = json.load(f)
print(len(data))

50


In [10]:
indifferent_messages = analyze_annotation_results(data)
print(len(indifferent_messages))

0


There is no tie, so we do not need to relabel any message from the sample round.
Next we analyze the 1099 messages from the main round.

In [7]:
# Load the exported main-round annotations; `with` closes the file handle.
with open('annotation_results.json') as f:
    data = json.load(f)
print(len(data))

1099


In [13]:
indifferent_messages = analyze_annotation_results(data)
print(len(indifferent_messages))

126


We have in total 126 messages with a tie. These messages get labeled in a third round.

In [11]:
# Export every tied message to third_annotation_round/<message id>.txt.
for _id in indifferent_messages:
    msg = messages_collection.find_one(
        {'_id': ObjectId(_id)},
        {"_id": 1, "text": 1, "link_preview": 1, "link_preview_title": 1},
    )
    # Check the text BEFORE opening the file: the previous order left an empty
    # file and an unclosed handle behind whenever a message was skipped.
    if msg['text'] is None:
        continue
    with open("third_annotation_round/" + str(_id) + ".txt", "w", encoding="utf-8") as f:
        f.write(get_full_text(msg))

Let us analyze the results of the third annotation round

In [13]:
# Load the exported third-round annotations; `with` closes the file handle.
with open('third_round_results.json') as f:
    data = json.load(f)
print(len(data))

126


In [19]:
indifferent_messages = analyze_annotation_results(data)
print(len(indifferent_messages))

8


Let's have a closer look at those eight messages...

In [20]:
# Print the messages that are still tied after the third round.
# This cell only inspects them. The previous version also opened
# third_annotation_round/<id>.txt with "w+", which truncated the exported
# file without writing anything and never closed the handle.
for _id in indifferent_messages:
    msg = messages_collection.find_one(
        {'_id': ObjectId(_id)},
        {"_id": 1, "text": 1, "link_preview": 1, "link_preview_title": 1},
    )
    if msg['text'] is not None:
        print(_id + '\n' + get_full_text(msg) + '\n\n\n')

60535dbb1c2b21aba4c0ea0d
Zu diesem Video folgende Überlegungen:
1. In den Herkunftsländern der Migranten ist es sicherlich wärmer.
2. Was hat man eigentlich zu fordern, wenn man illegal in ein Land eindringt?
Es wird immer so getan, als würden diese Menschen ungeheurem Leid ausgesetzt sein. Niemand hat sie gezwungen, ihre Heimat zu verlassen und illegal in andere Länder einzufallen.
Mehr unter: 
t.me/nachrichtenportal



605613debd7aaf419bd9f091
http://www.guidograndt.de/2020/09/10/kollegenbeitrag-riesen-zoff-um-querdenker-demo-in-wien-dem-zerreissen-einer-paedo-fahne-eine-richtigstellung-mit-fotos-videos/?fbclid=IwAR2MrG6jKm_ghx_bcjomKm47BO0hGLfRhoJTT1TjkjaqfHnL_5H-tqXR1Ac
+++ LINK PREVIEW TITLE ++++++++++++++++++++++++++
KOLLEGENBEITRAG: „Riesen-Zoff um Querdenker-Demo in Wien & dem Zerreißen einer Pädo-Fahne!“ – Eine Richtigstellung! (Mit Fotos…
+++ LINK PREVIEW ++++++++++++++++++++++++++
EIN KOLLEGENBEITRAG VON MANUEL C. MITTAS (Recherchenetzwerk Satanismus & rituelle Gewalt) Schau

We discussed those eight messages as a group and decided on one label for each, which we set here manually.

In [16]:
# Messages that were given the neutral label by hand.
for neutral_message_id in (
    '60535dbb1c2b21aba4c0ea0d',
    '605613debd7aaf419bd9f091',
    '605730e0d81900375dc99873',
    '6052025f92c67d3aac451d9c',
    '6054916d92c52865064d6a33',
    '60521862e22eda1ba8453b2c',
    '60538740ed2127b15fc115fd',
):
    save_label(neutral_message_id, neutral_label)

In [17]:
# The one manually decided message that gets the offensive label.
save_label( '605770263bb6af174c60c6bc', offensive_label)

# 5. Write Data to JSON Format
In order to reuse the dataset that we have created, we store the 1149 messages with the gold label and the raw annotations in a file using the JSON format.

In [18]:
# Container for the exported dataset; message records are appended below.
data = {'messages': []}

In [19]:
# Maps annotation-tool author ids to pseudonyms ('annotator_1', 'annotator_2', ...).
# Shared across calls so an annotator keeps the same pseudonym in every round.
authors = {}

def prepare_for_json(annotation_data):
    """Build exportable message records from one round of annotation results.

    For each entry the message is fetched from the database, its `_id` is
    replaced by the plain id string, and the latest non-skipped label of each
    annotator is stored under `raw_annotations`, keyed by pseudonym. Entries
    whose neutral and offensive votes are tied are left out.

    Side effect: new annotators are registered in the module-level `authors`.

    Args:
        annotation_data: list of exported annotation entries.

    Returns:
        list[dict]: the prepared message records.
    """
    projection = {"_id": 1, "text": 1, "link_preview": 1, "link_preview_title": 1,
                  "channel_name": 1, "datetime": 1, "post_id": 1, "gold_label": 1}
    prepared = []
    for entry in annotation_data:
        message_id = entry['externalId'][:-4]
        msg = messages_collection.find_one({'_id': ObjectId(message_id)}, projection)
        msg['_id'] = message_id
        msg['raw_annotations'] = {}

        tally = {neutral_label: 0, offensive_label: 0}
        for label_entry in entry['labels']:
            if not label_entry['isLatestLabelForUser'] or label_entry['skipped']:
                continue
            label = label_entry['jsonResponse']
            if label in tally:
                tally[label] += 1
            # Register the annotator on first sight, then record the label
            # under the pseudonym.
            pseudonym = authors.setdefault(
                label_entry['author']['id'], 'annotator_' + str(len(authors) + 1)
            )
            msg['raw_annotations'][pseudonym] = label

        if tally[neutral_label] != tally[offensive_label]:
            prepared.append(msg)
    return prepared

First the 50 sample round messages

In [20]:
# Load the exported sample-round annotations; `with` closes the file handle.
with open('sample_round_results.json') as f:
    sample_round_data = json.load(f)
print(len(sample_round_data))

50


In [21]:
data['messages'].extend(prepare_for_json(sample_round_data))

In [22]:
print(len(data['messages']))

50


Next the 1099 messages from the main annotation round

In [23]:
# Load the exported main-round annotations; `with` closes the file handle.
with open('annotation_results.json') as f:
    main_annotation_data = json.load(f)
print(len(main_annotation_data))

1099


In [24]:
data['messages'].extend(prepare_for_json(main_annotation_data))
print(len(data['messages']))

1023


Next the 126 messages that had to be relabeled in a third round due to a tie

In [25]:
# Load the exported third-round annotations; `with` closes the file handle.
with open('third_round_results.json') as f:
    third_round_data = json.load(f)
print(len(third_round_data))

126


In [26]:
data['messages'].extend(prepare_for_json(third_round_data))
print(len(data['messages']))

1141


Now we have to manually add the eight messages that we have discussed in person

In [27]:
# The eight messages whose label was decided by hand.
ids = [
    '60535dbb1c2b21aba4c0ea0d',
    '605613debd7aaf419bd9f091',
    '605730e0d81900375dc99873',
    '6052025f92c67d3aac451d9c',
    '6054916d92c52865064d6a33',
    '60521862e22eda1ba8453b2c',
    '60538740ed2127b15fc115fd',
    '605770263bb6af174c60c6bc',
]
# Annotation-tool author ids of the two annotators the manual label is recorded for.
daniel_bartmann_id = "ckqjnxpqy00he0ku30wexbe5l"
maximilian_wich_id = "ckql085ur05cc0kpm2e3f0mk1"

manual_projection = {"_id": 1, "text": 1, "link_preview": 1, "link_preview_title": 1,
                     "channel_name": 1, "datetime": 1, "post_id": 1, "gold_label": 1}
for _id in ids:
    msg = messages_collection.find_one({'_id': ObjectId(_id)}, manual_projection)
    msg['_id'] = _id
    # Both annotators get the stored gold label as their raw annotation.
    msg['raw_annotations'] = {
        authors[daniel_bartmann_id]: msg['gold_label'],
        authors[maximilian_wich_id]: msg['gold_label'],
    }
    data['messages'].append(msg)

print(len(data['messages']))

1149


In [28]:
class DateTimeEncoder(JSONEncoder):
    """JSON encoder that serialises date / datetime objects as ISO 8601 strings."""

    def default(self, obj):
        """Return a JSON-serialisable replacement for `obj`.

        Raises:
            TypeError: for any type other than date / datetime (raised by the
                base class). Previously such values fell through, returned
                None and were silently written as `null`.
        """
        if isinstance(obj, (datetime.date, datetime.datetime)):
            return obj.isoformat()
        return super().default(obj)

# Serialise the whole dataset first, then write it out in one go.
serialized_dataset = json.dumps(data, indent=4, cls=DateTimeEncoder)
with open('annotated_dataset.txt', 'w') as outfile:
    outfile.write(serialized_dataset)

# 6. Calculate Krippendorff's Alpha over whole Annotations

In [29]:
# Reload the exported dataset from disk; `with` closes the file handle.
with open('annotated_dataset.txt') as f:
    data = json.load(f)
print(data)



In [30]:
# One (annotator, item, label) triple per raw annotation in the exported dataset.
assignments = [
    (author, msg['_id'], label)
    for msg in data['messages']
    for author, label in msg['raw_annotations'].items()
]

In [31]:
# Inter-annotator agreement over all raw annotations of the final dataset.
ratingtask = agreement.AnnotationTask(data=assignments)
alpha = ratingtask.alpha()
print(f"Krippendorff's alpha: {alpha}")

Krippendorff's alpha: 0.7387221322705194
