In [1]:
import os
import re
import csv
import json
import math

import pandas as pd
   
from collections import Counter

In [2]:
def scan(basedir, phase='', doc_type=''):
    """
    Walk ``basedir`` and index the files found in matching directories.

    A directory matches when the trailing components of its path equal the
    components of ``os.path.join(phase, doc_type)``. The comparison is made on
    whole path components, so a directory such as ``pre_treino_1/Documentos``
    does not match ``phase='treino_1'``. Empty ``phase`` / ``doc_type`` values
    are ignored; when both are empty every directory under ``basedir`` matches.

    Parameters
    ----------
    basedir : str
        Directory to walk recursively.
    phase : str, optional
        Leading part of the directory suffix to look for.
    doc_type : str, optional
        Trailing part of the directory suffix to look for.

    Returns
    -------
    dict
        Maps each file name to the list of paths (``os.path.join(root, name)``)
        of every file with that name found in a matching directory.
    """

    suffix = os.path.join(phase, doc_type)
    # Split the wanted suffix into path components; drop the empty / '.'
    # pieces that normalisation leaves behind for empty or trailing parts.
    wanted = [
        part
        for part in os.path.normpath(suffix).split(os.sep)
        if part not in ('', '.')
    ] if suffix else []

    output = {}

    for root, dirs, leaves in os.walk(basedir):
        # Sorting in place makes os.walk visit sub-directories in a
        # deterministic order, so the path lists are reproducible.
        dirs.sort()

        if wanted:
            parts = os.path.normpath(root).split(os.sep)
            if parts[-len(wanted):] != wanted:
                continue

        for leaf in sorted(leaves):
            output.setdefault(leaf, []).append(os.path.join(root, leaf))

    return output

In [3]:
def open_document_and_get_initial_tokens(filename, encoding='utf-8'):
    """
    Read a ``;``-delimited file and return every row after the first one.

    The first row is skipped (presumably a header row - confirm against the
    data files). The returned rows are later used to decide which annotation
    was the most agreed upon between peers.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to ``'utf-8'`` so the
        result does not depend on the platform's default encoding.

    Returns
    -------
    list of list of str
        One list of fields per row, excluding the first row. Empty when the
        file has zero or one row.
    """

    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for file objects passed to csv.reader.
    with open(filename, 'r', encoding=encoding, newline='') as fh:
        csvreader = csv.reader(fh, delimiter=';')
        next(csvreader, None)  # skip the first row; no error on an empty file
        output = list(csvreader)

    return output

In [4]:
# Index the files under ./mock that live in a treino_1/Documentos directory.
data = scan(
    basedir='./mock',
    phase='treino_1',
    doc_type='Documentos',
)

In [5]:
# Load the rows of one document as written by each annotator.
D = {}
document = '20150302_RE_861115_305513530.ner.csv'
for annotator_file in data[document]:
    # The annotator id is taken to be the first directory below './mock'
    # (the base directory that was scanned). Using relpath and os.sep avoids
    # depending on a '/' separator or on the depth of the base path.
    relative_path = os.path.relpath(annotator_file, './mock')
    annotator_id = relative_path.split(os.sep)[0]
    D[annotator_id] = open_document_and_get_initial_tokens(annotator_file)

In [6]:
# Build one single-row frame per annotator (index = annotator id, columns =
# tokens, values = tags), then concatenate them all at once. Concatenating
# inside the loop would re-copy the accumulated frame on every iteration.
annotator_frames = []
for annotator_id, payload in D.items():
    # Each payload row is unpacked as a (token, tag) pair.
    tokens, tags = zip(*payload)
    annotator_frames.append(
        pd.DataFrame(data=[tags], columns=tokens, index=[annotator_id])
    )

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(annotator_frames, axis=0) if annotator_frames else pd.DataFrame()

In [7]:
# Show every row for the columns at positions 2120 to 2145.
df.iloc[:, 2120:2146]

Unnamed: 0,de,segurança,concedido,.,Ofensa,ao,art,..1,5º,",",...,..2,Precedentes,..3,É,nula,a,decisão,do,Tribunal,de.1
161704902,O,O,O,O,O,O,B_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,...,I_Ref. Legislativa,O,O,O,O,O,O,O,O,O
171300018,O,O,O,O,O,O,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,...,I_Ref. Legislativa,O,O,O,O,O,O,O,O,O
171300011,O,O,O,O,O,O,O,O,O,O,...,O,O,O,O,O,O,O,O,O,O


In [8]:
# Replace every column with its majority tag, or 'O' when no tag is chosen by
# more than half of the annotators. `Counter` comes from the notebook's
# import cell.

# One row per annotator, so the row count is the number of voters.
number_of_annotators = len(df.index)
# Strict majority: more than half of the annotators must agree.
majority_threshold = number_of_annotators // 2 + 1

# Columns are addressed by position because the labels are tokens, which may
# repeat within a document.
for idx in range(df.shape[1]):
    counter = Counter(df.iloc[:, idx].values)
    majority_vote = [tag for tag, votes in counter.items() if votes >= majority_threshold]
    # NOTE: this overwrites the annotators' original tags in `df`.
    df.iloc[:, idx] = majority_vote[0] if majority_vote else 'O'

In [9]:
# Show every row for the columns at positions 2126 to 2145.
df.iloc[:, 2126:2146]

Unnamed: 0,art,.,5º,",",LIV,e,LV,",.1",da,CF,..1,Precedentes,..2,É,nula,a,decisão,do,Tribunal,de
161704902,O,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,O,O,O,O,O,O,O,O,O
171300018,O,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,O,O,O,O,O,O,O,O,O
171300011,O,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,I_Ref. Legislativa,O,O,O,O,O,O,O,O,O
