In [1]:
import json
import os
import random

import bson
import numpy as np
import pandas as pd
import pymongo
import scipy.stats as ss
from bson import ObjectId
from sklearn.preprocessing import MinMaxScaler

# Part A - Get data from database

### 1. Connect to database collection "tasks"

In [2]:
# The connection string is read from the ANNOTATION_DB_URL environment variable
# so that credentials are never stored in the notebook. Example (shell):
#   export ANNOTATION_DB_URL="mongodb+srv://<user>:<password>@<host>/annotation?retryWrites=true&w=majority"
# When the variable is unset, the local development database is used.
# SECURITY: staging/production passwords were previously written in this cell;
# they should be treated as compromised and rotated.
databaseURL = os.environ.get(
    "ANNOTATION_DB_URL",
    "mongodb://localhost:27017/annotation?readPreference=primary&appname=MongoDB%20Compass&directConnection=true&ssl=false",
)
client = pymongo.MongoClient(databaseURL)
db = client["annotation"]
# Collection handles used by the rest of the notebook.
Task = db["tasks"]
Experiment = db["experiments"]

### 2. Fetch tasks of experiment from database
#### Specify the id of the experiment to extract in `exptId`

In [3]:
# Experiment to extract -- replace the id below with the one you need.
exptId = "6336bc5fdb4c9799afdc50bb"
id_ = bson.objectid.ObjectId(exptId)

# Look up the experiment document itself.
expt = Experiment.find_one({"_id": id_})

# Materialise every task that references this experiment.
tasks = list(Task.find({"experiment": id_}))

print("Number of tasks in experiment: ", len(tasks))
# print("Task IDs:")
# for task in tasks:
#     print(task["_id"])

Number of tasks in experiment:  2


### 3. Load task data into dataframe

In [4]:
def getTokens(task):
    """Return all words of a task as one flat list, in sentence order.

    Parameters
    ----------
    task : mapping
        Must provide ``task["sents"]``, an iterable of mappings that each
        carry a ``"words"`` iterable.

    Returns
    -------
    list
        A new list holding every word of every sentence; empty when the task
        has no sentences.
    """
    # Single pass: avoids rebuilding the accumulated list for each sentence.
    return [word for sent in task["sents"] for word in sent["words"]]

In [5]:
# Collect, across all tasks, one "<username>_RTs" and one "<username>_tags"
# column per annotator plus a shared "tokens" column.
raw = {"tokens": []}
subjects = []
for task in tasks:
    for sub in task["subjects"]:
        subjectKey = sub["username"]
        subjects.append(subjectKey)
        # setdefault creates the column on first sight of the annotator;
        # extend appends in place instead of rebuilding the list each time.
        rtValues = raw.setdefault(subjectKey + "_RTs", [])
        tagValues = raw.setdefault(subjectKey + "_tags", [])
        for data in sub["data"]:
            rtValues.extend(data["wordRTs"])
            tagValues.extend(data["wordTags"])
    raw["tokens"].extend(getTokens(task))

# Names of annotators, de-duplicated in first-seen order. A plain set() would
# give an arbitrary order that can differ between kernel sessions, which in
# turn reorders every derived column list below.
names = list(dict.fromkeys(subjects))
names_RTs = [name + "_RTs" for name in names]
names_tags = [name + "_tags" for name in names]

# NOTE: pd.DataFrame requires all columns to have the same length, i.e. every
# annotator must have data for every token.
rawDf = pd.DataFrame(raw)
# Convert RT to seconds (divides by 1000; presumably the stored RTs are
# milliseconds -- confirm against the annotation server).
for name in names_RTs:
    rawDf[name] = rawDf[name] / 1000


print("Total tokens: ", rawDf.shape[0])
rawDf.tail()

Total tokens:  76


Unnamed: 0,tokens,din_ann1_RTs,din_ann1_tags,din_ann2_RTs,din_ann2_tags
71,teen,0.499,CCONJ,1.212,ADJ
72,2014,0.547,NUM,0.605,NUM
73,te,0.593,NUM,0.927,CCONJ
74,verhoog,0.525,X,0.653,PUNCT
75,.,0.857,PUNCT,0.982,X


# Part B - Model and get output

### 1. Remove outliers based on RT threshold and standard deviation cutoff

In [6]:
# Work on a copy so rawDf keeps the untouched reading times.
dfClean = rawDf.copy(deep=True)

# Cutoffs: absolute RT ceiling and z-score limit.
threshold = 90
z_thresh = 3

for name in names_RTs:
    # Step 1: blank out (NaN) reading times at or above the absolute ceiling.
    below_cutoff = rawDf[name].where(rawDf[name] < threshold)
    # Step 2: blank out values whose z-score, computed over the remaining
    # non-NaN values, is not strictly inside the limit.
    z_scores = ss.zscore(below_cutoff, nan_policy='omit')
    dfClean[name] = below_cutoff.where(np.abs(z_scores) < z_thresh)

### 2. Calculate scaled [0-1] values of log(RT) 

In [7]:
dfScaled = dfClean.copy(deep=True)
min_max_scaler = MinMaxScaler()
name_logRTs = [f"{name}_logRTs" for name in names]
name_scaled = [f"{name}_scaled" for name in names]

# Log-transform the cleaned RTs, then min-max scale each annotator's column.
dfScaled[name_logRTs] = np.log(dfScaled[names_RTs])
dfScaled[name_scaled] = min_max_scaler.fit_transform(dfScaled[name_logRTs])

# Keep only the columns needed downstream (the log columns are dropped here).
column_order = ["tokens"] + names_RTs + name_scaled + names_tags
dfScaled = dfScaled[column_order]
dfScaled

Unnamed: 0,tokens,din_ann1_RTs,din_ann2_RTs,din_ann1_scaled,din_ann2_scaled,din_ann1_tags,din_ann2_tags
0,Soos,0.892,0.527,0.655213,0.350833,ADJ,ADJ
1,die,0.842,0.896,0.618791,0.764531,X,X
2,agbare,0.845,0.543,0.621036,0.374146,PUNCT,PUNCT
3,lede,0.863,0.604,0.634345,0.457133,NUM,NUM
4,weet,0.686,0.722,0.489416,0.596231,CCONJ,CCONJ
...,...,...,...,...,...,...,...
71,teen,0.499,1.212,0.288461,1.000000,CCONJ,ADJ
72,2014,0.547,0.605,0.346450,0.458422,NUM,NUM
73,te,0.593,0.927,0.397433,0.791044,NUM,CCONJ
74,verhoog,0.525,0.653,0.320531,0.517934,X,PUNCT


### 3. Tag-specific Scaling
##### a. Outlier removal and scaling

In [8]:
def removeOutliers(dfRaw, threshold=90, z_thresh=3):
    """Return a copy of ``dfRaw`` with outliers in its first column set to NaN.

    Two filters are applied, in order, to the first column only:

    1. values that are not strictly below ``threshold`` become NaN;
    2. of the remaining values, those whose absolute z-score is at least
       ``z_thresh`` become NaN.

    Parameters
    ----------
    dfRaw : pandas.DataFrame
        Frame whose first column holds the values to clean. Not modified.
    threshold : float, optional
        Absolute ceiling for a value to be kept (default 90).
    z_thresh : float, optional
        Z-score limit for a value to be kept (default 3).

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``dfRaw`` with outliers in the first column replaced by NaN.
    """
    dfCleaned = dfRaw.copy(deep=True)
    if dfCleaned.empty:
        # Nothing to filter; also avoids calling zscore on an empty array.
        return dfCleaned

    rt_col = dfCleaned.columns[0]
    values = dfCleaned[rt_col]
    values = values.where(values < threshold)

    z_scores = ss.zscore(values.to_numpy(dtype=float), nan_policy='omit')
    # Drop only values with a *defined* z-score beyond the limit. A group with
    # a single observation (or identical values) has undefined (NaN) z-scores;
    # testing `abs(z) < z_thresh` would be False there and wrongly discard
    # perfectly valid values.
    is_outlier = np.abs(z_scores) >= z_thresh
    dfCleaned[rt_col] = values.where(~is_outlier)
    return dfCleaned

def normalize(dfRaw):
    """Return a copy of ``dfRaw`` with an added ``"tagNorm"`` column.

    ``"tagNorm"`` is the natural log of the first column, min-max scaled with
    a freshly fitted ``MinMaxScaler``. The input frame is not modified.
    """
    scaler = MinMaxScaler()
    result = dfRaw.copy(deep=True)

    first_col = result.columns[0]
    result["tagNorm"] = np.log(result[first_col])
    # The scaler expects 2-D input, hence the double brackets.
    result["tagNorm"] = scaler.fit_transform(result[["tagNorm"]])
    return result

In [9]:
# Start from rawDf (outliers not yet removed) rather than dfScaled, so that
# outliers are determined separately within each tag group.
dfTags = rawDf.copy(deep=True)

# One groupby object per annotator, grouped on that annotator's tag column.
groupsByTags = [dfTags.groupby(tag_column) for tag_column in names_tags]

# annotatorTagsDf holds one dictionary per annotator: key is a tag, value is a
# dataframe with that tag's RT column and its scaled value ("tagNorm").
annotatorTagsDf = [{} for _ in names]

for rt_column, grouped, per_tag in zip(names_RTs, groupsByTags, annotatorTagsDf):
    for tag in grouped.groups.keys():
        tag_rts = grouped.get_group(tag)[[rt_column]]
        per_tag[tag] = normalize(removeOutliers(tag_rts))


##### b. Merge normalized tag-specific complexity back into the table

In [10]:
name_scaled_tag = [name + "_tagRTScaled" for name in names]

# Concatenate the per-tag frames into one frame per annotator, keeping only
# the scaled column (renamed to "<username>_tagRTScaled") in token order.
annotatorDf = []
for index, tagGroup in enumerate(annotatorTagsDf):
    concatenatedDf = (
        pd.concat(list(tagGroup.values()))[["tagNorm"]]
        .rename(columns={"tagNorm": name_scaled_tag[index]})
        .sort_index()
    )
    annotatorDf.append(concatenatedDf)

# Merge tag-normalized data back into the dataframe on the token index.
# A left join keeps every token of dfScaled: a token absent from an
# annotator's frame (e.g. its tag was missing, so groupby skipped it) gets NaN
# instead of being dropped. Dropping rows would shift the positional
# token-to-sentence mapping used when the output is written back in Part C.
df = dfScaled.copy(deep=True)
for annoDf in annotatorDf:
    df = df.join(annoDf, how="left")
df.shape[0]

76

### Example output

In [11]:
def model(row, scaled):
    """Example model: token length plus the value stored under ``scaled``.

    ``row`` must support ``row["tokens"]`` and ``row[scaled]``.
    """
    token_length = len(row["tokens"])
    return token_length + row[scaled]

In [12]:
# One "<username>_output" column per annotator, computed row by row from that
# annotator's tag-scaled RT column.
outputColumns = [f"{name}_output" for name in names]
for column, scaled_column in zip(outputColumns, name_scaled_tag):
    df[column] = df.apply(model, axis=1, args=(scaled_column,))
df

Unnamed: 0,tokens,din_ann1_RTs,din_ann2_RTs,din_ann1_scaled,din_ann2_scaled,din_ann1_tags,din_ann2_tags,din_ann1_tagRTScaled,din_ann2_tagRTScaled,din_ann1_output,din_ann2_output
0,Soos,0.892,0.527,0.655213,0.350833,ADJ,ADJ,0.655213,0.334113,4.655213,4.334113
1,die,0.842,0.896,0.618791,0.764531,X,X,0.649895,0.876691,3.649895,3.876691
2,agbare,0.845,0.543,0.621036,0.374146,PUNCT,PUNCT,0.761504,0.027804,6.761504,6.027804
3,lede,0.863,0.604,0.634345,0.457133,NUM,NUM,0.691421,0.542723,4.691421,4.542723
4,weet,0.686,0.722,0.489416,0.596231,CCONJ,CCONJ,1.000000,0.734986,5.000000,4.734986
...,...,...,...,...,...,...,...,...,...,...,...
71,teen,0.499,1.212,0.288461,1.000000,CCONJ,ADJ,0.479935,1.000000,4.479935,5.000000
72,2014,0.547,0.605,0.346450,0.458422,NUM,NUM,0.267825,0.544254,4.267825,4.544254
73,te,0.593,0.927,0.397433,0.791044,NUM,CCONJ,0.342839,1.000000,2.342839,3.000000
74,verhoog,0.525,0.653,0.320531,0.517934,X,PUNCT,0.000000,0.239464,7.000000,7.239464


# Part C - Insert output values of each annotator into database

### 1. Describe output column. 
#### One column per annotator

In [13]:
# Every missing value (removed outliers) becomes the marker string "outlier".
df = df.fillna(value="outlier")
# Columns written back to the database: one per annotator.
outputColumns = [f"{name}_tagRTScaled" for name in names]
outputColumns

['din_ann1_tagRTScaled', 'din_ann2_tagRTScaled']

#### Check that tasks are loaded

In [14]:
# List the ids of the loaded tasks so they can be checked before writing.
print("Tasks to be updated")
for taskID in (task["_id"] for task in tasks):
    print(taskID)

Tasks to be updated
6336bda913464cfd4108f6c6
6336bdaa13464cfd4108f6c7


### 2. Convert output columns into task and sentence level data structure

In [15]:
# output[i] belongs to names[i] and maps task id -> {"model": [...]}, where
# "model" holds one list of values per sentence of that annotator in that task.
output = []
for subject, outputColumn in zip(names, outputColumns):
    outputValues = df[outputColumn].values.tolist()
    # Position of the next unconsumed value. The column is laid out in the
    # same task/sentence order in which the raw data was loaded, so it is
    # consumed sequentially. A cursor avoids re-copying the remaining list
    # after every sentence.
    cursor = 0

    data = {}
    for task in tasks:
        taskID = task["_id"]
        sentenceValues = []
        data[taskID] = {"model": sentenceValues}
        for sub in task["subjects"]:
            if sub["username"] != subject:
                continue
            for sentData in sub["data"]:
                count = len(sentData["wordRTs"])
                sentenceValues.append(outputValues[cursor:cursor + count])
                cursor += count
    output.append(data)

### 3. Sanity check: the number of tokens in the output is the same as in the input

In [16]:
# Compare, sentence by sentence, the number of output values with the number
# of input reading times. `flag` is initialised once, before the loops, so a
# mismatch for ANY annotator is reported (resetting it per annotator would let
# the last annotator's result hide earlier mismatches) and it is defined even
# when there are no annotators.
flag = 0
for idx, subject in enumerate(names):
    for task in tasks:
        taskID = task["_id"]
        for sub in task["subjects"]:
            if sub["username"] != subject:
                continue
            for index, sentData in enumerate(sub["data"]):
                count = len(sentData["wordRTs"])
                if count != len(output[idx][taskID]["model"][index]):
                    flag = 1
                    print("mismatch in size in task: ", taskID)
if flag == 0:
    print("Sanity maintained")
else:
    print("Error: check if output column has same number of tokens as input")

Sanity maintained


### 4. Push output to each task in DB

In [17]:
# Write each annotator's values into the matching subject entry of every task.
for idx, subject in enumerate(names):
    for taskId, normalizedData in output[idx].items():
        # Named `query` rather than `filter` to avoid shadowing the builtin.
        query = {
            "_id": bson.objectid.ObjectId(taskId),
            "subjects": {"$elemMatch": {"username": subject}},
        }
        # The positional `$` targets the subject entry matched by the query.
        newvalues = {"$set": {"subjects.$.normalizedData": normalizedData}}
        result = Task.update_one(query, newvalues)
        if result.modified_count:
            print("Updated task: ", taskId)
        elif result.matched_count == 0:
            # Nothing matched: the task or this annotator's entry was not found.
            print("No matching task/subject for: ", taskId, subject)
        print(result.matched_count, result.modified_count)

Updated task:  6336bda913464cfd4108f6c6
1 1
Updated task:  6336bdaa13464cfd4108f6c7
1 1
Updated task:  6336bda913464cfd4108f6c6
1 1
Updated task:  6336bdaa13464cfd4108f6c7
1 1
