In [1]:
import json
from bson import ObjectId
import pandas as pd
import pymongo
import bson

# Part A - Get data from database

### 1. Connect to database collection "tasks"

In [2]:
# local
databaseURL = "mongodb://localhost:27017/annotation?readPreference=primary&appname=MongoDB%20Compass&directConnection=true&ssl=false"
# staging
# databaseURL = "mongodb+srv://rshallam:d0nqzmT0ly7kugX1@annotation-iq5d1.mongodb.net/annotation?retryWrites=true&w=majority"
# production
# databaseURL = "mongodb+srv://behavannoserver:UsbThVH9VOIrOSBL@cluster0.k8tlu.mongodb.net/annotation?retryWrites=true&w=majority"

client = pymongo.MongoClient(databaseURL)
db = client["annotation"]
Task = db["tasks"]
Experiment = db["experiments"]
User = db["users"]

### 2. Import label ambiguity data

In [3]:
exptId = "6347a0186941add14c07d2fb" # replace experiment id here
id_ = bson.objectid.ObjectId(exptId)

expt = Experiment.find_one({"_id": id_})

# Fetch all task
tasks = []
for task in Task.find({
    "experiment": id_,
}):
    tasks.append(task)

print("Number of tasks in experiment: ", len(tasks))
# print("Task IDs:")
# for task in tasks:
#     print(task["_id"])

Number of tasks in experiment:  2


### 3. Load task data into dataframe

In [4]:
def getTokens(task):
    tokens = []
    for sent in task["sents"]:
        tokens = [*tokens, *sent["words"]]
    return tokens

In [5]:
# Get data for each subject
raw = {}
subjects = []
raw["tokens"] = []
for index,task in enumerate(tasks):
    for sub in task["subjects"]:
        subjectKey = sub["username"]
        subjects.append(subjectKey)
        rtKey = subjectKey + "_RTs"
        tagKey = subjectKey + "_tags"
        if(rtKey not in raw.keys()):
            raw[rtKey] = []
        if(tagKey not in raw.keys()):
            raw[tagKey] = []
        for data in sub["data"]:
            raw[rtKey] = [*raw[rtKey], *data["wordRTs"]]
            raw[tagKey] = [*raw[tagKey], *data["wordTags"]]
    raw["tokens"] = [*raw["tokens"], *getTokens(task)]




# names of annotators
names = list(set(subjects))
names_RTs = [name + "_RTs" for name in names]
names_tags = [name + "_tags" for name in names]


rawDf = pd.DataFrame(raw)
# Convert RT to seconds
for name in names_RTs:
    rawDf[name] = rawDf[name]/1000

print("Total tokens: ", rawDf.shape[1])
rawDf.tail()

Total tokens:  5


Unnamed: 0,tokens,din_ann1_RTs,din_ann1_tags,din_ann2_RTs,din_ann2_tags
79,अत्‍यंत,0.526,ADP,0.584,DET
80,ही,0.407,CCONJ,0.472,ADP
81,आकर्षक,0.697,ADP,0.654,ADJ
82,है,0.83,CCONJ,0.772,PUNCT
83,।,0.637,DET,0.709,X


In [33]:
# t = tasks[0]
# data = t["subjects"][0]
# # print(data)
# # print(len(raw["tokens"]))
# print(raw)
# print(len(raw['din_ann1_RTs']))
# print(len(raw['din_ann1_tags']))

{'tokens': ['Die', 'registrasiesertifikaat', 'of', 'Vorm', 'C', ':', 'Customary', 'marriage', 'certificate', 'sal', 'aan', 'jou', 'uitgereik', 'word', '.', 'Die', 'naam', 'van', 'die', 'aansoeker', 'of', 'die', 'geregistreerde', 'naam', 'van', 'die', 'entiteit', ',', 'en', 'nié', 'die', 'handelsnaam', 'nie', ',', 'moet', 'in', 'hierdie', 'veld', 'ingevul', 'word', '.', 'Hierdie', 'regsplig', 'om', 'te', 'onderhou', 'word', '"', 'die', 'onderhoudsplig', '"', 'genoem', '.', 'Alle', 'misstowwe', 'wat', 'in', 'Suid-Afrika', 'ingevoer', ',', 'vervaardig', ',', 'geproduseer', 'of', 'verkoop', 'word', ',', 'moet', 'geregistreer', 'wees', 'by', 'die', 'Registrateur', 'van', 'Wet', '36', 'van', '1947', '.'], 'din_ann1_RTs': [535, 672, 477, 465, 576, 523, 1519, 301, 1200, 1174, 707, 611, 919, 460, 402, 2358, 1489, 1159, 1130, 1029, 902, 810, 760, 488, 1068, 700, 648, 509, 603, 1004, 594, 1481, 765, 1010, 603, 990, 695, 627, 700, 1117, 610], 'din_ann1_tags': ['NUM', 'ADJ', 'ADP', 'DET', 'SCONJ', 

### # Part B - Read ambiguity values and upload

In [6]:
for user in names:
    # Load label ambiguity into dictionary
    labelAmbiguity = {}
    df = pd.read_csv(user + ".csv")
    for idx, row in df.iterrows():
        labelAmbiguity[row["Tag"]] = row["Avrg_RT/Freq"]

    # Upload label ambiguity to the database
    filter = { "username": user }
    newvalues = { "$set": { "labelAmbiguity":labelAmbiguity } }

    result = User.update_one(filter, newvalues)
    if (result.modified_count):
        print("Updated label ambiguity in user: ", user)

df

Updated label ambiguity in user:  din_ann1
Updated label ambiguity in user:  din_ann2


Unnamed: 0,Tag,Avrg_RT,Frequency,Is subset?,Avrg_RT/Freq
0,ADJ,1.768366,1496,No,0.001182
1,ADP,1.531838,667,No,0.002297
2,ADV,1.947291,798,Yes,0.00244
3,AUX,1.899927,684,Yes,0.002778
4,CCONJ,5.690474,1778,No,0.0032
5,DET,2.673196,408,No,0.006552
6,INTJ,2.04368,297,No,0.006881
7,NOUN,7.000827,921,No,0.007601
8,NUM,2.643218,285,Yes,0.009274
9,PART,1.764244,156,No,0.011309
