# Task Preparation and Inserion

In [1]:
import pprint
import json
import glob
import random
import pymongo
import bson

### 1. Prepare connection to MongoDb database

In [2]:
# local
databaseURL = "mongodb://localhost:27017/?readPreference=primary&appname=MongoDB%20Compass&directConnection=true&ssl=false"

# production
# databaseURL = "mongodb+srv://behavannoserver:UsbThVH9VOIrOSBL@cluster0.k8tlu.mongodb.net/annotation?retryWrites=true&w=majority"
client = pymongo.MongoClient(databaseURL)
db = client["bbat"]
colExpt = db["experiments"] # reference to collection "experiments"
colTasks = db["tasks"] # reference to collection "tasks"

### 2. Get path of all treebank json files in datasets directory

The root name is considered the name of the treebank. eg. UD_Afrikaans-AfriBooms is a treebank
All the files inside that treebank are considered its documents. eg. The treebank UD_Afrikaans-AfriBooms contains three documents: af_afribooms-ud-dev, af_afribooms-ud-test, and af_afribooms-ud-train

In [3]:
# files = glob.glob("**/*.json", recursive=True) 
# remove ud_afrikaans later
files = glob.glob("treebanks/UD_Hindi-HDTB/*.json", recursive=True) 
files

['treebanks/UD_Hindi-HDTB\\hi_hdtb-ud-dev.json',
 'treebanks/UD_Hindi-HDTB\\hi_hdtb-ud-test.json',
 'treebanks/UD_Hindi-HDTB\\hi_hdtb-ud-train.json']

### 3. Create a key-value pair with treebank name as the key and its array of documents as the value

In [4]:
treebanks = {}

for file in files:
    with open(file) as f:
        data = json.load(f)
        treebank = data["dir"][0]

        if treebank in treebanks:
            treebanks[treebank].append(data)
        else:
            treebanks[treebank] = [data]            

print("Treebanks stored in dictionary: ", ', '.join(list(treebanks.keys())) )

Treebanks stored in dictionary:  UD_Hindi-HDTB


### 4. Function chunks(list, n) splits a list into smaller chunks containing n elements

In [5]:
# split list into sublist of equal parts of n and remainder
def chunks(sents, n):
    rem = len(sents) % n
    for i in range(0,len(sents)-rem, n):
        yield sents[i:i+n]
    
    if (rem != 0):
        rest = sents[-rem:]
        yield rest

In [14]:
# Example chunks fn
a = list(range(99))
for i in chunks(a, 7):
    print(i)

[0, 1, 2, 3, 4, 5, 6]
[7, 8, 9, 10, 11, 12, 13]
[14, 15, 16, 17, 18, 19, 20]
[21, 22, 23, 24, 25, 26, 27]
[28, 29, 30, 31, 32, 33, 34]
[35, 36, 37, 38, 39, 40, 41]
[42, 43, 44, 45, 46, 47, 48]
[49, 50, 51, 52, 53, 54, 55]
[56, 57, 58, 59, 60, 61, 62]
[63, 64, 65, 66, 67, 68, 69]
[70, 71, 72, 73, 74, 75, 76]
[77, 78, 79, 80, 81, 82, 83]
[84, 85, 86, 87, 88, 89, 90]
[91, 92, 93, 94, 95, 96, 97]
[98]


### 5. Task Preparation
Each sentence in a treebank is prepared as an object containing information about it and stored in the array sents. <br>
The sents array is then shuffled to randomize sentence order.<br> 
The sents array is then partitioned into tasks of size task_size(3) and relevant keys are attached to it<br>

In [6]:
task_size = 3 # Maximum number of sentences in each task

tasks = []
for treebank in treebanks:
    
    sents = []
    for doc in treebanks[treebank]:
        for sentence in doc["sentences"]:
            sent = {}
            sent["file"] = doc["_id"] #
            sent["sentId"] = sentence["metadata"]["sent_id"]
            sent["text"] = sentence["metadata"]["text"]
            sent["flag"] = sentence["metadata"]["flag"]
            words = list(map(lambda x: x["form"], sentence["token"]))
            sent["words"] = words
            sents.append(sent)

    random.shuffle(sents) # remove to preserve sentence order
    
    #Partition sents into chunks for task
    
    for chunk in chunks(sents, task_size):
        task = {}
        task["sents"] = chunk    
        task["subjects"] = []
        task["adjudicators"] = []
        task["treebank"] = treebank
        tasks.append(task)
        
print("Number of tasks prepared: ", len(tasks))

Number of tasks prepared:  223


### 6. (Optional) Uncomment and run to wipe all tasks and subject data
#### Warning: permanently removes task data from database

In [5]:
# # DELETE!
# query = {}
# docs = colTasks.delete_many(query)
# pprint.pprint(docs.raw_result)

{'n': 1098, 'ok': 1.0}


### 7. Insert tasks into database
Each task has to be associated with an experiment. Create an experiment in the tool and get the id from the database.

In [7]:
exptId="636f94c9693092cab0dcc169" #example id

# tasks[0:num] , where num is the number of tasks
# you want to upload to database
for i, task in enumerate(tasks[0:2]):
    task["experiment"] = bson.objectid.ObjectId(exptId)
    result = colTasks.insert_one(task)
    print("\rUploaded", i+1, "/" , len(tasks), end="")

Uploaded 2 / 223

### 8. (Optional) insert default UPOS tags to experiment

In [8]:
tags = [
	{
		"tag":"ADJ",
		"description":"adjective"
	},
	{
		"tag":"ADP",
		"description":"adposition"
	},
	{
		"tag":"ADV",
		"description":"adverb"
	},
	{
		"tag":"AUX",
		"description":"auxiliary"
	},
	{
		"tag":"CCONJ",
		"description":"coordinating conjunction"
	},
	{
		"tag":"DET",
		"description":"determiner"
	},
	{
		"tag":"INTJ",
		"description":"interjection"
	},
	{
		"tag":"NOUN",
		"description":"noun"
	},
	{
		"tag":"NUM",
		"description":"numeral"
	},
	{
		"tag":"PART",
		"description":"particle"
	},
	{
		"tag":"PRON",
		"description":"pronoun"
	},
	{
		"tag":"PROPN",
		"description":"proper noun"
	},
	{
		"tag":"PUNCT",
		"description":"punctuation"
	},
	{
		"tag":"SCONJ",
		"description":"subordinating conjunction"
	},
	{
		"tag":"SYM",
		"description":"symbol"
	},
	{
		"tag":"VERB",
		"description":"verb"
	},
	{
		"tag":"X",
		"description":"other"
	}
]

for i, tag in enumerate(tags):
    tags[i]["enabled"] = True
    
result = colExpt.find_one_and_update(
    {"_id": bson.objectid.ObjectId(exptId)},
    {'$set' : {'tags': tags}}
)
print(result)

{'_id': ObjectId('636f94c9693092cab0dcc169'), 'title': 'Default', 'experimenter': ObjectId('636f94c9693092cab0dcc168'), 'parameters': {'numShared': 2, 'numSharedAdju': 1, 'scaling': 'model', 'highlightBoundaries': [], 'highlights': [], 'matchMismatch': True, 'annConfidence': False, 'sentenceDiff': False}, 'subjects': [{'id': ObjectId('636f94df693092cab0dcc16e'), 'username': 'din_ann1', 'role': 'annotator'}, {'id': ObjectId('636f94ea693092cab0dcc175'), 'username': 'din_ann2', 'role': 'annotator'}, {'id': ObjectId('636f94f2693092cab0dcc17c'), 'username': 'din_adj', 'role': 'adjudicator'}], 'tags': [], '__v': 3}
