## Core modules

In [1]:
import os
import sys 
import django

## Django/Doccano setup and imports

In [2]:
# Make the Doccano Django project (two directories up, under app/) importable.
sys.path.append(os.path.abspath('../../app'))
# Use the project's settings module unless DJANGO_SETTINGS_MODULE is already set.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings')
# Initialise Django before the model imports that follow.
django.setup()

from django.contrib.auth.models import User
from django.shortcuts import get_object_or_404
from server.models import Project, SequenceLabelingProject,\
Label, SequenceAnnotation, Document

### Annotators identifiers

In [None]:
# Load the annotators CSV as a list of raw lines (line endings are kept).
with open('../creation-pipeline/data/annotators.old.csv', 'r') as annotators_file:
    content = list(annotators_file)

In [None]:
# Annotator identifiers come from the second CSV column; the first row is
# skipped (presumably a header -- confirm against the file).
# Blank lines are ignored so a trailing empty line does not raise IndexError,
# and surrounding whitespace/newlines are stripped from each identifier.
identifiers = [
    line.split(',')[1].strip()
    for line in content[1:]
    if line.strip()
]

### Annotations per user

In [None]:
# Number of sequence annotations per annotator.
# Only the first project that lists the annotator as a member is considered.
count = {}
for annotator in identifiers:
    # Look the project up by the username string itself instead of passing a
    # User instance to a username lookup.
    project = Project.objects.filter(users__username=annotator).first()
    if project is None:
        # Unknown annotator or no project: report zero rather than failing
        # with AttributeError on a missing project.
        count[annotator] = 0
        continue
    # Follow the annotation -> document -> project relation in one query.
    count[annotator] = SequenceAnnotation.objects.filter(document__project=project).count()
count

### Annotations by label

In [None]:
# Number of 'Precedente' annotations per annotator.
# Only the first project that lists the annotator as a member is considered.
count = {}
for annotator in identifiers:
    # Look the project up by the username string itself instead of passing a
    # User instance to a username lookup.
    project = Project.objects.filter(users__username=annotator).first()
    label_instance = None
    if project is not None:
        label_instance = Label.objects.filter(text='Precedente', project=project).first()
    if label_instance is None:
        # No project, or the project does not define the label: nothing to count.
        count[annotator] = 0
        continue
    count[annotator] = SequenceAnnotation.objects.filter(
        document__project=project,
        label=label_instance,
    ).count()
count

### Truncate Documents > 10K chars with no annotations

* All projects whose name ends with \[PRATICA_ETAPA1\]
* All documents for those projects
* All annotations so far for those documents
* Create two sets (all_document_ids, document_ids_with_annotations)
* Difference between sets
* Filter len(documents.text) > 10k
* Truncate each remaining document: document.text[:10000]
* Update these documents
* Persist

In [None]:
# Annotator whose documents are inspected in this section (None if the username is unknown).
user = (
    User.objects
    .filter(username='181300008')
    .first()
)

In [None]:
# Projects of `user` whose name ends with the '[PRATICA_ETAPA1]' suffix.
projects = Project.objects.filter(
    users__username=user,
    name__endswith='[PRATICA_ETAPA1]',
)

In [None]:
# Sanity check: how many projects matched the filter.
projects.count()

In [None]:
# Display the matched projects.
projects

In [None]:
# Every document that belongs to one of the selected projects.
documents = Document.objects.filter(project__in=projects)

In [None]:
# Sanity check: how many documents the selected projects hold.
documents.count()

In [None]:
# Every sequence annotation made on the selected documents.
annotations = SequenceAnnotation.objects.filter(document__in=documents)

In [None]:
# Display the annotations found so far.
annotations

In [None]:
# Documents that already have at least one annotation.
# Read the foreign-key column through values_list instead of touching
# `ann.document` for every annotation, which loads one Document per row.
documents_with_annotations = Document.objects.filter(
    id__in=annotations.values_list('document_id', flat=True)
)

In [None]:
# Set difference: selected documents minus those that already have annotations.
documents_without_annotation = documents.difference(documents_with_annotations)

In [None]:
# Truncate long, not-yet-annotated documents while keeping their final line.
# NOTE(review): the section heading mentions 10K characters, but the cutoff
# applied here is 15000 -- confirm which one is intended.
MAX_DOCUMENT_CHARS = 15000

for document in documents_without_annotation:
    text = document.text
    if len(text) <= MAX_DOCUMENT_CHARS:
        # Short enough already (a text of exactly the limit would be rewritten unchanged).
        continue

    # The last line of the text is carried over to the truncated version.
    id_line = text.rsplit('\n', 1)[-1]

    head = text[:MAX_DOCUMENT_CHARS]
    cut = head.rfind('\n')
    if cut == -1:
        # No line break inside the limit: dropping the "partial last line"
        # would erase the whole body, so leave this document untouched.
        continue

    # Drop the (possibly cut-off) last line, then append the preserved final line.
    document.text = '%s\n%s' % (head[:cut], id_line)
    document.save()

### Filtering documents by last line id

id: 20170906_ADI_5491_312653776 ; id: 20170801_ADI_5327_312309333 ; id: 20110825_MS_28447_629807

user: vinicius

In [None]:
# Annotator whose documents are inspected in this section (None if the username is unknown).
user = (
    User.objects
    .filter(username='181300040')
    .first()
)

In [None]:
# Projects of `user` whose name ends with the '[PRATICA_ETAPA1]' suffix.
projects = Project.objects.filter(
    users__username=user,
    name__endswith='[PRATICA_ETAPA1]',
)

In [None]:
# Display the matched projects.
projects

In [None]:
# Every document that belongs to one of the selected projects.
documents = Document.objects.filter(project__in=projects)

In [None]:
# Text length of the first selected document whose last line is the given id.
matching_document = documents.filter(text__endswith='\nid: 20110825_MS_28447_629807').first()
len(matching_document.text)

### all annotations

In [3]:
# All practice-stage projects: name starts with 'Documentos' and ends with '[PRATICA_ETAPA1]'.
projects = Project.objects.filter(
    name__startswith='Documentos',
    name__endswith='[PRATICA_ETAPA1]',
)

In [4]:
# Every document that belongs to one of the selected projects.
documents = Document.objects.filter(project__in=projects)

In [5]:
# The 'Precedente' and 'Doutrina' labels defined in the selected projects.
labels = Label.objects.filter(
    text__in=['Precedente', 'Doutrina'],
    project__in=projects,
)

In [6]:
# Annotations on the selected documents that carry one of the selected labels.
annotations = SequenceAnnotation.objects.filter(
    label__in=labels,
    document__in=documents,
)

In [None]:
# NOTE(review): `project` is not assigned anywhere in this section; this cell
# relies on whatever value a loop cell left in the kernel, so it fails on a
# fresh "Restart & Run All".
# Lists the project's members other than the 'admin' and 'carla' accounts.
project.users.exclude(username__in=['admin', 'carla'])

In [29]:
import hashlib

# Annotated spans keyed by the SHA-256 hex digest of their text; filled by the loop below.
DS = {}

def parse_id_elasticsearch_from_text(text):
    """Return the id stored on the last line of a document's text.

    The last line is whatever follows the final newline (the whole text if
    there is none); the id is whatever follows the last ': ' on that line
    (the whole line if the separator is absent).
    """
    _, _, final_line = text.rpartition('\n')
    _, _, identifier = final_line.rpartition(': ')
    return identifier

# Group the selected annotations by the SHA-256 of their stripped text span.
# Each DS entry records the span text, its label, a version number and every
# (username, document id, (start, end)) occurrence of that span.
for annotation in annotations:

    document = annotation.document
    project = document.project
    # The annotator is taken to be the first project member other than the
    # 'admin' and 'carla' accounts (presumably non-annotator accounts -- confirm).
    user = project.users.exclude(username__in=['admin', 'carla']).first()
    # A project holding only the excluded accounts has no such member; record
    # None instead of crashing on `user.username`.
    username = user.username if user is not None else None

    text = document.text
    term = text[annotation.start_offset:annotation.end_offset].strip()

    id_ = hashlib.sha256(term.encode('utf-8')).hexdigest()
    id_elasticsearch = parse_id_elasticsearch_from_text(text)
    occurrence = (username, id_elasticsearch, (annotation.start_offset, annotation.end_offset))

    if id_ not in DS:
        # The first annotation seen for a span fixes its label and text.
        # NOTE(review): the key is the text alone, so the same span annotated
        # with a different label later keeps the first label.
        DS[id_] = {'present_in': [],
                   'label': annotation.label.text,
                   'text': term,
                   'version': 1}
    DS[id_]['present_in'].append(occurrence)

In [30]:
import json

# Pretty-print the grouped annotations as JSON for inspection.
print(json.dumps(DS, indent=4))

{
    "d4624f6b5ef83e5f083a0c8aaa9b5cabfcf14f2849cacb9c3382de81030a86cb": {
        "present_in": [
            [
                "181300028",
                "20180621_RE_965048_314646389",
                [
                    1725,
                    1746
                ]
            ]
        ],
        "label": "Precedente",
        "text": "S\u00famulas n\u00bas 282 e 356",
        "version": 1
    },
    "f0fd6fc1a6c9a3033af65871a1e4b54eb70f3bd2f2816ceb03e037a4141a317e": {
        "present_in": [
            [
                "181300028",
                "20180621_RE_965048_314646389",
                [
                    1933,
                    1957
                ]
            ]
        ],
        "label": "Precedente",
        "text": "S\u00famulas 282 e 356 do STF",
        "version": 1
    },
    "9eaa8738945f964d4d2836e97ce0322baf5548524396310cb3c412c27d2a31f8": {
        "present_in": [
            [
                "181300028",
                "20180621_RE_965048_3

### Persist data into elasticsearch index

In [31]:
from elasticsearch import Elasticsearch, RequestsHttpConnection

# Elasticsearch client for the annotations index.
# Credentials are read from the environment (ES_USER / ES_PASSWORD) so they are
# never stored in the notebook; the password previously committed here should
# be rotated. ES_HOST / ES_PORT optionally override the default endpoint.
# NOTE(review): verify_certs=False disables TLS certificate validation; it is
# kept to preserve current behaviour -- enable it once the host's certificate
# can be verified.
connection = Elasticsearch([{'host': os.environ.get('ES_HOST', 'aplcldrjvpr0017.acad.fgv.br'),
                             'port': int(os.environ.get('ES_PORT', 9200))}],
                           connection_class=RequestsHttpConnection,
                           http_auth=(os.environ['ES_USER'], os.environ['ES_PASSWORD']),
                           use_ssl=True,
                           verify_certs=False,
                           timeout=180)

In [32]:
# Check that the cluster is reachable with the configured connection.
connection.ping()



True

In [34]:
connection.index?

In [33]:
# Dry run of the indexing step: print the body that would be sent for each span.
# The label becomes the (lower-cased) doc_type and is left out of the body.
for id_, payload in DS.items():
    doc_type = payload['label'].lower()
    # Build a copy without the label instead of popping it from DS, so DS
    # stays intact and this cell can be re-run without a KeyError.
    body = {key: value for key, value in payload.items() if key != 'label'}
    print(json.dumps(body, indent=4))
    # connection.index(index='annotations', doc_type=doc_type, id=id_, body=body)

{
    "present_in": [
        [
            "181300028",
            "20180621_RE_965048_314646389",
            [
                1725,
                1746
            ]
        ]
    ],
    "text": "S\u00famulas n\u00bas 282 e 356",
    "version": 1
}
{
    "present_in": [
        [
            "181300028",
            "20180621_RE_965048_314646389",
            [
                1933,
                1957
            ]
        ]
    ],
    "text": "S\u00famulas 282 e 356 do STF",
    "version": 1
}
{
    "present_in": [
        [
            "181300028",
            "20180621_RE_965048_314646389",
            [
                2194,
                2277
            ]
        ]
    ],
    "text": "AI n\u00ba 551.533/MG-AgR, Segunda Turma, Relator o Ministro Gilmar Mendes, DJ de 3/3/06",
    "version": 1
}
{
    "present_in": [
        [
            "181300028",
            "20180621_RE_965048_314646389",
            [
                3521,
                3612
            ]
        