## Extract data from database and store in JSON and conllu format

In [1]:
import pymongo
import pprint 
import json
import glob
import bson
import os
from bson.json_util import dumps # serialize ObjectId to store as json

### 1. Connect to database collection "tasks"

In [25]:
# Connect to the "annotation" database and expose its two collections.
#
# The connection string contains a username and password, so it is read from
# the environment rather than being stored in the notebook. Expected form:
#   mongodb+srv://<user>:<password>@<host>/annotation?retryWrites=true&w=majority
# NOTE(review): the password that was previously embedded here has been
# committed in plain text and should be rotated.
databaseURL = os.environ.get("ANNOTATION_MONGODB_URI")
if not databaseURL:
    raise RuntimeError(
        "Set the ANNOTATION_MONGODB_URI environment variable to the "
        "MongoDB connection string before running this notebook."
    )

client = pymongo.MongoClient(databaseURL)
db = client["annotation"]
Task = db["tasks"]  # one document per annotation task
Experiment = db["experiments"]  # one document per experiment

## 2. Fetch task data from database
#### Specify the id of the experiment to extract in `exptId`
The cell below fetches three lists of tasks for that experiment:
1. all tasks
2. tasks that have been adjudicated at least once
3. tasks that have been annotated at least once
<br>


In [26]:
exptId = "5efdd7340aa8c40bf405aa38" # replace experiment id here
id_ = bson.objectid.ObjectId(exptId)

# The experiment document itself.
expt = Experiment.find_one({"_id": id_})

# Query filters: every task of the experiment, then the subsets in which at
# least one adjudicator / subject entry has `completed` set to True.
in_experiment = {"experiment": id_}
with_adjudication = {"experiment": id_, "adjudicators.completed": True}
with_annotation = {"experiment": id_, "subjects.completed": True}

# The three lists are filled with explicit loops (rather than list(...)) so
# that the loop variable `task` remains bound after this cell, exactly as in
# the original notebook.
allTasks = []
for task in Task.find(in_experiment):
    allTasks.append(task)

adjudicated = []
for task in Task.find(with_adjudication):
    adjudicated.append(task)

annotated = []
for task in Task.find(with_annotation):
    annotated.append(task)

# Summary counts.
print("Number of tasks in experiment: ", len(allTasks))
print("Tasks adjudicated at least once in experiment: ", len(adjudicated))
print("Tasks annotated at least once in experiment: ", len(annotated))

Number of tasks in experiment:  25
Tasks adjudicated at least once in experiment:  2
Tasks annotated at least once in experiment:  5


# 3. JSON outputs

### 3.1 Write experiment metadata into JSON file
directory structure: `data/<exptId>/experiment_metadata.json`

In [24]:
# Write the experiment document to data/<exptId>/experiment_metadata.json.
directory = "data/" + exptId
os.makedirs(directory, exist_ok=True)

# Serialize the experiment document (`expt`), not the leftover loop variable
# `task`, which would put a task into the experiment metadata file.
# bson's `dumps` handles ObjectId values; round-tripping through json.loads
# yields plain Python data that json.dump can pretty-print.
json_serialized = json.loads(dumps(expt))
with open(directory + "/experiment_metadata.json", "w", encoding="utf-8") as out_file:
    json.dump(json_serialized, out_file, indent=4, separators=(',', ': '))

### 3.2 Copy snapshot of tasks collection into JSON file
directory structure: `data/<exptId>/JSON_tasks_snapshot/<taskId>.json`

In [18]:
# Dump every task document of the experiment to
# data/<exptId>/JSON_tasks_snapshot/<taskId>.json.
directory = "data/" + exptId + "/JSON_tasks_snapshot/"
os.makedirs(directory, exist_ok=True)

for task in allTasks:
    task_id = str(task["_id"])  # avoid shadowing the builtin `id`
    # bson's `dumps` handles ObjectId values; json.loads turns the result back
    # into plain data so json.dump can apply the indentation settings.
    json_serialized = json.loads(dumps(task))
    # The context manager closes (and flushes) each file as soon as it is written.
    with open(directory + task_id + ".json", "w", encoding="utf-8") as out_file:
        json.dump(json_serialized, out_file, indent=4, separators=(',', ': '))

### 3.3 Write annotated data into JSON file
directory structure: `data/<exptId>/JSON_annotators/<taskId>/<username>.json`

In [21]:
# For every annotated task, write each completed subject entry to
# data/<exptId>/JSON_annotators/<taskId>/<username>.json.
directory = "data/" + exptId + "/JSON_annotators/"
os.makedirs(directory, exist_ok=True)

for task in annotated:
    # Per-task output folder; the path does not depend on the subject.
    sub_directory = directory + str(task["_id"]) + "/"

    for sub in task["subjects"]:
        if not sub["completed"]:
            continue

        # Created lazily so that a folder only appears once there is a file for it.
        os.makedirs(sub_directory, exist_ok=True)

        # NOTE(review): the entry is passed to json.dump directly, unlike the
        # task snapshot which goes through bson's `dumps`; presumably subject
        # entries hold only JSON-native values - confirm.
        # The context manager closes (and flushes) the file after each write.
        with open(sub_directory + sub["username"] + ".json", "w", encoding="utf-8") as out_file:
            json.dump(sub, out_file, indent=4, separators=(',', ': '))

### 3.4 Write adjudicated data into JSON file
directory structure: `data/<exptId>/JSON_adjudicated/<taskId>/<username>.json`

In [22]:
# For every adjudicated task, write each completed adjudicator entry to
# data/<exptId>/JSON_adjudicated/<taskId>/<username>.json.
directory = "data/" + exptId + "/JSON_adjudicated/"
os.makedirs(directory, exist_ok=True)

for task in adjudicated:
    # Per-task output folder; the path does not depend on the adjudicator.
    sub_directory = directory + str(task["_id"]) + "/"

    for sub in task["adjudicators"]:
        if not sub["completed"]:
            continue

        # Created lazily so that a folder only appears once there is a file for it.
        os.makedirs(sub_directory, exist_ok=True)

        # NOTE(review): the entry is passed to json.dump directly, unlike the
        # task snapshot which goes through bson's `dumps`; presumably
        # adjudicator entries hold only JSON-native values - confirm.
        # The context manager closes (and flushes) the file after each write.
        with open(sub_directory + sub["username"] + ".json", "w", encoding="utf-8") as out_file:
            json.dump(sub, out_file, indent=4, separators=(',', ': '))

# 4. CONLLU OUTPUTS

### 4.1 Write conllu files for annotators who have finished annotating the task
directory structure: `data/<exptId>/conllu_annotators/<taskId>/<username>.conllu`

In [10]:
# One .conllu file per completed subject of every annotated task:
# data/<exptId>/conllu_annotators/<taskId>/<username>.conllu
directory = "data/" + exptId + "/conllu_annotators/"
if not os.path.exists(directory):
    os.makedirs(directory)

for task in annotated:
    task_dir = directory + str(task["_id"]) + "/"

    for sub in task["subjects"]:
        if not sub["completed"]:
            continue

        if not os.path.exists(task_dir):
            os.makedirs(task_dir)

        with open(task_dir + sub["username"] + ".conllu", "w", encoding="utf-8") as f:
            # The subject's data list is indexed in parallel with the task's sentences.
            for sent_index, sent in enumerate(task["sents"]):
                data = sub["data"][sent_index]

                # Sentence metadata header lines.
                f.write("".join(("# sent_id = ", sent["sentId"], "\n")))
                f.write("".join(("# text = ", sent["text"], "\n")))

                # One line per token: 1-based position, word, tag (space separated).
                for position, (word, tag) in enumerate(zip(sent["words"], data["wordTags"]), start=1):
                    f.write(" ".join((str(position), word, tag)) + "\n")

                # A blank line terminates the sentence.
                f.write("\n")

### 4.2 Write conllu files for adjudicators who have finished their task
directory structure: `data/<exptId>/conllu_adjudicators/<taskId>/<username>.conllu`

In [11]:
# One .conllu file per completed adjudicator of every adjudicated task:
# data/<exptId>/conllu_adjudicators/<taskId>/<username>.conllu
directory = "data/" + exptId + "/conllu_adjudicators/"
if not os.path.exists(directory):
    os.makedirs(directory)

for task in adjudicated:
    task_dir = directory + str(task["_id"]) + "/"

    # Tasks without an "adjudicators" field are skipped.
    if "adjudicators" not in task:
        continue

    for sub in task["adjudicators"]:
        if not sub["completed"]:
            continue

        if not os.path.exists(task_dir):
            os.makedirs(task_dir)

        with open(task_dir + sub["username"] + ".conllu", "w", encoding="utf-8") as f:
            # The adjudicator's data list is indexed in parallel with the task's sentences.
            for sent_index, sent in enumerate(task["sents"]):
                data = sub["data"][sent_index]

                # Sentence metadata header lines.
                f.write("".join(("# sent_id = ", sent["sentId"], "\n")))
                f.write("".join(("# text = ", sent["text"], "\n")))

                # One line per token: 1-based position, word, tag (space separated).
                for position, (word, tag) in enumerate(zip(sent["words"], data["wordTags"]), start=1):
                    f.write(" ".join((str(position), word, tag)) + "\n")

                # A blank line terminates the sentence.
                f.write("\n")