In [24]:
import pysolr
from requests.auth import HTTPBasicAuth
import credentials
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
import pickle
import json
from itertools import chain
import os
import re

In [3]:
# Load the project-specific abbreviation collection from a pickle file.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only use this with a trusted abbrev_list.pkl.
with open('abbrev_list.pkl','rb') as f:
    abbrev_list = pickle.load(f)
# Pre-trained Portuguese Punkt sentence tokenizer from the NLTK data directory.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
# Add the extra abbreviations to the tokenizer's known abbreviation set.
# This reaches into the private `_params` attribute, so it may break across
# NLTK versions. Presumably the entries are stored the way Punkt expects
# (lowercase, without the trailing period) — TODO confirm.
sentence_tokenizer._params.abbrev_types.update(abbrev_list)

In [11]:
# Solr client using HTTP basic authentication. The URL and the login are read
# from the local `credentials` module rather than being written in the notebook.
solr_client = pysolr.Solr(
    credentials.solr_url,
    auth= HTTPBasicAuth(
        credentials.solr_login['username'],
        credentials.solr_login['password']
    )
)
# Connectivity check; the trailing semicolon suppresses the cell's output.
solr_client.ping();

In [44]:
def get_cleaned_lines(text):
    """Split ``text`` into lines, trimmed of surrounding whitespace.

    Lines that are empty or contain only whitespace are dropped.

    Args:
        text: The string to split (any line separator understood by
            ``str.splitlines``).

    Returns:
        A list with the stripped, non-empty lines in their original order.
    """
    cleaned = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            cleaned.append(stripped)
    return cleaned

In [35]:
def get_tokenized_text(text):
    """Return ``text`` reformatted with one sentence per line.

    The text is first split into stripped, non-empty lines; each line is then
    sentence-tokenized on its own (so a sentence never spans an original line
    break) and all sentences are joined with ``'\\n'``.

    Args:
        text: Raw document content as a string.

    Returns:
        A single string containing one sentence per line; ``''`` when the
        input has no non-blank lines.
    """
    # The previous version also called ``sentence_tokenizer.train(text)`` here.
    # ``PunktSentenceTokenizer.train`` only *returns* freshly learned
    # parameters and does not install them on the tokenizer, so the discarded
    # call was expensive work with no effect on the output. It is removed;
    # the pre-trained Portuguese parameters plus the custom abbreviations
    # remain the ones in use, exactly as before.
    lines = get_cleaned_lines(text)
    sentences = chain.from_iterable(
        sentence_tokenizer.tokenize(line) for line in lines
    )
    return '\n'.join(sentences)

In [15]:
def get_doc_text(query):
    """Fetch the first Solr document matching ``query`` and unify its type field.

    The search is restricted to the ``jurisprudencia`` and ``parecer_tecnico``
    collections and requests only the fields needed for the export.

    Args:
        query: Solr query string, e.g. ``'numero_sei:123'``.

    Returns:
        The document dict returned by Solr, with the two source type fields
        (``descricao_tipo_documento`` / ``descricao_tipo_documento_pj``)
        removed and replaced by a single ``tipo_documento`` key holding the
        first truthy one (``None`` when neither is set).

    Raises:
        IndexError: If the query matches no document.
    """
    type_fields = ('descricao_tipo_documento', 'descricao_tipo_documento_pj')
    query_args = {
        "fl": ['numero_sei', 'conteudo', *type_fields],
        "fq": 'colecao:jurisprudencia OR colecao:parecer_tecnico',
        "rows": 1
    }
    res = solr_client.search(query, **query_args)
    if not res.docs:
        # Same exception type as the former bare ``res.docs[0]``, but with a
        # message that tells which query came back empty.
        raise IndexError(f'no Solr document found for query {query!r}')
    doc = res.docs[0]

    # A document may carry only one of the two type fields (the lookup below
    # already tolerated that), so pop with a default instead of raising
    # KeyError on the missing one.
    primary_type, fallback_type = (doc.pop(field, None) for field in type_fields)
    doc['tipo_documento'] = primary_type or fallback_type

    return doc

In [49]:
# Read the SEI document numbers to export, one per line. Surrounding
# whitespace is stripped and blank lines are skipped: an empty entry would
# otherwise build the malformed Solr query `numero_sei:` further down.
with open('new_docs.txt','r') as f:
    new_docs = [
        entry
        for entry in (line.strip() for line in f.read().splitlines())
        if entry
    ]

In [17]:
# Directory that receives the exported per-document JSON files and all.jsonl.
# NOTE(review): absolute, machine-specific path — adjust before running on
# another machine.
folder_path = '/home/gpaiva/Documents/CADE/Annotation/Dataset/new50'

In [41]:
# Value stored under the 'origem' key of every exported document.
origem = 'SEI_CADE'

In [59]:
# Export every requested document twice: as its own `<n_sei>.json` file and
# as one line of the combined `all.jsonl`.
os.makedirs(folder_path, exist_ok=True)  # the output folder may not exist yet
all_path = os.path.join(folder_path, 'all.jsonl')

# The combined file is opened once in write mode (which truncates the result
# of any previous run) instead of being truncated and then re-opened in
# append mode for every document.
# UTF-8 is requested explicitly: ensure_ascii=False writes accented
# characters verbatim, so relying on the platform's default encoding could
# raise UnicodeEncodeError or produce files that are not valid UTF-8 JSON.
with open(all_path, 'w', encoding='utf-8') as all_file:
    for n_sei in new_docs:
        doc = get_doc_text(f'numero_sei:{n_sei}')
        # Swap the raw content for its one-sentence-per-line version.
        prepared_text = get_tokenized_text(doc.pop('conteudo'))
        doc['data'] = prepared_text
        doc['origem'] = origem
        doc['n_sei'] = doc.pop('numero_sei')

        # Serialize once and reuse the same text for both outputs.
        serialized = json.dumps(doc, ensure_ascii=False)
        filename = os.path.join(folder_path, f'{n_sei}.json')
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(serialized)
        all_file.write(serialized + '\n')