# Preprocessamento:

+ Criar variável que identifique o anotador

+ Mover tags que começam com token ' ' (vazio)

+ Remover linhas com '\n' seguidos

+ REGEX:
    + Garantir letra e números onde tamanho for maior que 1.
    + Passar múltiplos símbolos para outra linha. Exemplo:  §3º -->  § \n 3 \n º
    + Remover pontuação de centenas dos números. Exemplo: 12.200 --> 12200

+ Visualização das sentenças com displacy (from spacy import displacy) 

+ Incluir POS tagging.

+ Tranformar o dado $x_i$ em um $x'_i$ que incorpora os 2 últimos e próximos tokens.

# Classificador

+ Visualização: Separar o conjunto de test em 2 ou 3 arquivos e visualizar o que o modelo classificou e o que os anotadores classificaram (separar por id do anotador).

+ Parâmetros utilizados no classificador.

+ Analisar o formato dos dados que tem maior acerto e menor acerto tambem.

+ Para criar um contexto no erro imprimir 10 palavras antes e depois de dois erros.


# Instruções para os anotadores:

+ Atentar à marcação de tags que envolve espaço 

+ Atentar para não incluir espaço no início da Tag

# Organização do diretório: 

+ Manter toda a análise em somente um diretório

+ Formato do diretório com os datasets: /resources/dataset/

+ Notebooks:
    + '01 - Processamento.ipynb'
        + Gerar 'treino.csv' e teste.csv' para processamento
    + '02 - [CRF].ipynb' - Criando o modelo
        + Usar arquivo dos dados preprocessados gerado pelo notebook 1.
        + Gerar modelo (xxx.model)
    + '03 - Metricas.ipynb'

In [1]:
import os
import re 
import glob
import time
import numpy as np
import pandas as pd
from collections import Counter

import scipy.stats
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import HashingVectorizer

In [2]:
# Encontra todos os csv's dentro das pastas de 'mock'

extension = 'csv'
all_filenames = [i for i in glob.glob('mock/*/**/***/****.{}'.format(extension))]

In [3]:
all_filenames[:3]

['mock/171300044/[PRATICA_ETAPA_1]/Documentos/20090520_Rcl_5133_601151.ner.csv',
 'mock/171300044/[PRATICA_ETAPA_1]/Documentos/20090520_Rcl_4879_603324.ner.csv',
 'mock/171300044/[PRATICA_ETAPA_1]/Documentos/20130318_RE_542863_128236196.ner.csv']

# Preprocessamento 

In [4]:
# Cria uma tag de inicio e fim de arquivo em cada 'csv' antes de apendar todos eles.

frames = []
for all_files in all_filenames:
    df = pd.read_csv(all_files,delimiter=';', na_values='NaN') # Lê o arquivo
    df['Tag'].iloc[0] , df['Tag'].iloc[-1] = ['INICIO_ARQ', 'FIM_ARQ'] # Altera a primeira e ultima Tag desse csv
    frames.append(df) # Adiciona esse dataframe no 'dataframe maior'
    
combined_csv = pd.concat(frames).reset_index(drop=True)
combined_csv.to_csv("combined_csv.csv",index=False,encoding='utf-8') # Cria um arquivo com todas as anotações.
combined_csv['Token'] = combined_csv['Token'].astype('str')

In [5]:
combined_csv.head(15), combined_csv.tail(20)

(            Token         Tag
 0          ~~e»to  INICIO_ARQ
 1                           O
 2          c;JJ;~           O
 3               .           O
 4             ~wa           O
 5               /           O
 6                           O
 7            pfi~           O
 8               /           O
 9             <'d           O
 10                          O
 11             \n           O
 12             \n           O
 13  Coordenadoria           O
 14                          O,
                                  Token       Tag
 713274                                         O
 713275                         outubro         O
 713276                                         O
 713277                              de         O
 713278                                         O
 713279                            2018         O
 713280                               .         O
 713281                                         O
 713282                              \n         O
 71

In [6]:
print("Número de linhas dos arquivos concatenados:", len(combined_csv['Tag']))

Número de linhas dos arquivos concatenados: 713294


In [7]:
combined_csv[-10:] # Conferindo se o index foi resetado

Unnamed: 0,Token,Tag
713284,Ministro,B_Pessoa
713285,,I_Pessoa
713286,GILMAR,I_Pessoa
713287,,I_Pessoa
713288,MENDES,I_Pessoa
713289,\n,I_Pessoa
713290,id,O
713291,:,O
713292,,O
713293,20181018_HC_162764_15338864359,FIM_ARQ


# Encontra parágrafo duplo no arquivo. Uma opção de separar por sentenças.

In [8]:
a_df = combined_csv #Simplifica o nome do arquivo para a função nao ficar grande demais.
starts = a_df[a_df['Token']=='\n'].index & a_df[a_df['Token'].shift(-1)=='\n'].index #Identifica os paragrafos duplos
print(u'Padrões(sentenças) encontrados:', len(starts))

Padrões(sentenças) encontrados: 7729


In [9]:
%%time

combined_csv['Sentence #'] = 'Sentence'

combined_csv['Sentence #'][:starts[0]+2] = 'Sentence %d'%(1) # Primeira sentença
combined_csv['Sentence #'][starts[-1]+2:] = 'Sentence %d'%(len(starts)+1) # Última sentença

for i in range(1,len(starts)):
    combined_csv['Sentence #'][starts[i-1]+2:starts[i]+2] = 'Sentence %d'%(i+1) 

combined_csv.head(), combined_csv.tail()

CPU times: user 3min 14s, sys: 36.8 ms, total: 3min 14s
Wall time: 3min 14s


(    Token         Tag  Sentence #
 0  ~~e»to  INICIO_ARQ  Sentence 1
 1                   O  Sentence 1
 2  c;JJ;~           O  Sentence 1
 3       .           O  Sentence 1
 4     ~wa           O  Sentence 1,
                                  Token       Tag     Sentence #
 713289                              \n  I_Pessoa  Sentence 7730
 713290                              id         O  Sentence 7730
 713291                               :         O  Sentence 7730
 713292                                         O  Sentence 7730
 713293  20181018_HC_162764_15338864359   FIM_ARQ  Sentence 7730)

In [10]:
# Número de sentenças
len(combined_csv['Sentence #'].unique())

7730

# Atualiza a Tag que termina com 'Doutrinador' para 'Doutrina'

In [11]:
combined_csv.Tag.unique()

array(['INICIO_ARQ', 'O', 'B_Precedente', 'I_Precedente', 'B_Pessoa',
       'I_Pessoa', 'B_Ref. Legislativa', 'I_Ref. Legislativa',
       'B_Doutrina', 'I_Doutrina', 'FIM_ARQ', 'B_Doutrinador',
       'I_Doutrinador'], dtype=object)

In [12]:
# Strip o final 'dor' de todo o DataFrame (formato extremo)
# combined_csv.Tag = combined_csv.Tag.str.rstrip('dor')

In [13]:
# indices de onde a tag ocorre
indx = combined_csv[combined_csv.Tag.str.endswith('Doutrinador')].index.values

### Rodar apenas uma vez

In [14]:
%%time
# Demorado e custoso
for i in range(len(indx)):
    combined_csv.Tag.iloc[indx[i]] = combined_csv.Tag.iloc[indx[i]].rstrip('dor')

CPU times: user 2min 34s, sys: 12 ms, total: 2min 34s
Wall time: 2min 34s


In [15]:
combined_csv.Tag.unique()

array(['INICIO_ARQ', 'O', 'B_Precedente', 'I_Precedente', 'B_Pessoa',
       'I_Pessoa', 'B_Ref. Legislativa', 'I_Ref. Legislativa',
       'B_Doutrina', 'I_Doutrina', 'FIM_ARQ'], dtype=object)

# Remove enter duplo depois de criar as sentenças

In [16]:
# Teste para ver os casoso onde ocorre enter duplo.
for i in range(len(starts)):
    print(combined_csv.iloc[starts[i]:starts[i]+2][:5])

   Token Tag  Sentence #
11    \n   O  Sentence 1
12    \n   O  Sentence 1
   Token Tag  Sentence #
47    \n   O  Sentence 2
48    \n   O  Sentence 2
   Token Tag  Sentence #
61    \n   O  Sentence 3
62    \n   O  Sentence 3
   Token Tag  Sentence #
67    \n   O  Sentence 4
68    \n   O  Sentence 4
   Token Tag  Sentence #
71    \n   O  Sentence 5
72    \n   O  Sentence 5
   Token           Tag  Sentence #
81    \n  I_Precedente  Sentence 6
82    \n             O  Sentence 6
    Token Tag  Sentence #
174    \n   O  Sentence 7
175    \n   O  Sentence 7
    Token Tag  Sentence #
209    \n   O  Sentence 8
210    \n   O  Sentence 8
    Token Tag  Sentence #
225    \n   O  Sentence 9
226    \n   O  Sentence 9
    Token Tag   Sentence #
240    \n   O  Sentence 10
241    \n   O  Sentence 10
    Token Tag   Sentence #
248    \n   O  Sentence 11
249    \n   O  Sentence 11
    Token           Tag   Sentence #
268    \n  I_Precedente  Sentence 12
269    \n  I_Precedente  Sentence 12
    Token    

7320    \n  I_Pessoa  Sentence 155
     Token Tag    Sentence #
7366    \n   O  Sentence 156
7367    \n   O  Sentence 156
     Token Tag    Sentence #
7402    \n   O  Sentence 157
7403    \n   O  Sentence 157
     Token Tag    Sentence #
7416    \n   O  Sentence 158
7417    \n   O  Sentence 158
     Token Tag    Sentence #
7424    \n   O  Sentence 159
7425    \n   O  Sentence 159
     Token           Tag    Sentence #
7432    \n  I_Precedente  Sentence 160
7433    \n             O  Sentence 160
     Token Tag    Sentence #
7509    \n   O  Sentence 161
7510    \n   O  Sentence 161
     Token Tag    Sentence #
7532    \n   O  Sentence 162
7533    \n   O  Sentence 162
     Token Tag    Sentence #
7548    \n   O  Sentence 163
7549    \n   O  Sentence 163
     Token Tag    Sentence #
7571    \n   O  Sentence 164
7572    \n   O  Sentence 164
     Token Tag    Sentence #
7592    \n   O  Sentence 165
7593    \n   O  Sentence 165
     Token Tag    Sentence #
7611    \n   O  Sentence 166
7612   

      Token                 Tag    Sentence #
17231    \n  I_Ref. Legislativa  Sentence 315
17232    \n  I_Ref. Legislativa  Sentence 315
      Token Tag    Sentence #
17249    \n   O  Sentence 316
17250    \n   O  Sentence 316
      Token Tag    Sentence #
17269    \n   O  Sentence 317
17270    \n   O  Sentence 317
      Token Tag    Sentence #
17283    \n   O  Sentence 318
17284    \n   O  Sentence 318
      Token Tag    Sentence #
17288    \n   O  Sentence 319
17289    \n   O  Sentence 319
      Token Tag    Sentence #
17365    \n   O  Sentence 320
17366    \n   O  Sentence 320
      Token Tag    Sentence #
17435    \n   O  Sentence 321
17436    \n   O  Sentence 321
      Token Tag    Sentence #
17519    \n   O  Sentence 322
17520    \n   O  Sentence 322
      Token Tag    Sentence #
17667    \n   O  Sentence 323
17668    \n   O  Sentence 323
      Token Tag    Sentence #
17751    \n   O  Sentence 324
17752    \n   O  Sentence 324
      Token Tag    Sentence #
17855    \n   O  Sente

      Token Tag    Sentence #
31353    \n   O  Sentence 476
31354    \n   O  Sentence 476
      Token Tag    Sentence #
31854    \n   O  Sentence 477
31855    \n   O  Sentence 477
      Token Tag    Sentence #
31945    \n   O  Sentence 478
31946    \n   O  Sentence 478
      Token Tag    Sentence #
31992    \n   O  Sentence 479
31993    \n   O  Sentence 479
      Token Tag    Sentence #
32585    \n   O  Sentence 480
32586    \n   O  Sentence 480
      Token Tag    Sentence #
32660    \n   O  Sentence 481
32661    \n   O  Sentence 481
      Token Tag    Sentence #
32684    \n   O  Sentence 482
32685    \n   O  Sentence 482
      Token Tag    Sentence #
33027    \n   O  Sentence 483
33028    \n   O  Sentence 483
      Token Tag    Sentence #
33071    \n   O  Sentence 484
33072    \n   O  Sentence 484
      Token                 Tag    Sentence #
33534    \n  I_Ref. Legislativa  Sentence 485
33535    \n  I_Ref. Legislativa  Sentence 485
      Token Tag    Sentence #
33561    \n   O  Sente

      Token Tag    Sentence #
43257    \n   O  Sentence 636
43258    \n   O  Sentence 636
      Token Tag    Sentence #
43371    \n   O  Sentence 637
43372    \n   O  Sentence 637
      Token Tag    Sentence #
43521    \n   O  Sentence 638
43522    \n   O  Sentence 638
      Token Tag    Sentence #
43534    \n   O  Sentence 639
43535    \n   O  Sentence 639
      Token Tag    Sentence #
43560    \n   O  Sentence 640
43561    \n   O  Sentence 640
      Token Tag    Sentence #
43644    \n   O  Sentence 641
43645    \n   O  Sentence 641
      Token Tag    Sentence #
43723    \n   O  Sentence 642
43724    \n   O  Sentence 642
      Token Tag    Sentence #
43843    \n   O  Sentence 643
43844    \n   O  Sentence 643
      Token Tag    Sentence #
43921    \n   O  Sentence 644
43922    \n   O  Sentence 644
      Token Tag    Sentence #
43984    \n   O  Sentence 645
43985    \n   O  Sentence 645
      Token Tag    Sentence #
44065    \n   O  Sentence 646
44066    \n   O  Sentence 646
      Toke

      Token Tag    Sentence #
59145    \n   O  Sentence 796
59146    \n   O  Sentence 796
      Token Tag    Sentence #
59277    \n   O  Sentence 797
59278    \n   O  Sentence 797
      Token Tag    Sentence #
59385    \n   O  Sentence 798
59386    \n   O  Sentence 798
      Token           Tag    Sentence #
59393    \n  I_Precedente  Sentence 799
59394    \n             O  Sentence 799
      Token Tag    Sentence #
59451    \n   O  Sentence 800
59452    \n   O  Sentence 800
      Token Tag    Sentence #
59455    \n   O  Sentence 801
59456    \n   O  Sentence 801
      Token Tag    Sentence #
59552    \n   O  Sentence 802
59553    \n   O  Sentence 802
      Token           Tag    Sentence #
59558    \n  I_Precedente  Sentence 803
59559    \n             O  Sentence 803
      Token Tag    Sentence #
59562    \n   O  Sentence 804
59563    \n   O  Sentence 804
      Token Tag    Sentence #
59584    \n   O  Sentence 805
59585    \n   O  Sentence 805
      Token Tag    Sentence #
59593    \

      Token Tag    Sentence #
71388    \n   O  Sentence 954
71389    \n   O  Sentence 954
      Token Tag    Sentence #
71743    \n   O  Sentence 955
71744    \n   O  Sentence 955
      Token Tag    Sentence #
71885    \n   O  Sentence 956
71886    \n   O  Sentence 956
      Token Tag    Sentence #
71953    \n   O  Sentence 957
71954    \n   O  Sentence 957
      Token Tag    Sentence #
72111    \n   O  Sentence 958
72112    \n   O  Sentence 958
      Token Tag    Sentence #
72118    \n   O  Sentence 959
72119    \n   O  Sentence 959
      Token Tag    Sentence #
72166    \n   O  Sentence 960
72167    \n   O  Sentence 960
      Token Tag    Sentence #
72177    \n   O  Sentence 961
72178    \n   O  Sentence 961
      Token Tag    Sentence #
72193    \n   O  Sentence 962
72194    \n   O  Sentence 962
      Token       Tag    Sentence #
72201    \n  I_Pessoa  Sentence 963
72202    \n         O  Sentence 963
      Token Tag    Sentence #
72205    \n   O  Sentence 964
72206    \n   O  Sente

      Token Tag     Sentence #
86057    \n   O  Sentence 1114
86058    \n   O  Sentence 1114
      Token Tag     Sentence #
86089    \n   O  Sentence 1115
86090    \n   O  Sentence 1115
      Token Tag     Sentence #
86125    \n   O  Sentence 1116
86126    \n   O  Sentence 1116
      Token Tag     Sentence #
86178    \n   O  Sentence 1117
86179    \n   O  Sentence 1117
      Token Tag     Sentence #
86187    \n   O  Sentence 1118
86188    \n   O  Sentence 1118
      Token Tag     Sentence #
86192    \n   O  Sentence 1119
86193    \n   O  Sentence 1119
      Token Tag     Sentence #
86264    \n   O  Sentence 1120
86265    \n   O  Sentence 1120
      Token Tag     Sentence #
86353    \n   O  Sentence 1121
86354    \n   O  Sentence 1121
      Token Tag     Sentence #
86403    \n   O  Sentence 1122
86404    \n   O  Sentence 1122
      Token Tag     Sentence #
86532    \n   O  Sentence 1123
86533    \n   O  Sentence 1123
      Token Tag     Sentence #
86679    \n   O  Sentence 1124
86680   

       Token Tag     Sentence #
104480    \n   O  Sentence 1270
104481    \n   O  Sentence 1270
       Token Tag     Sentence #
104820    \n   O  Sentence 1271
104821    \n   O  Sentence 1271
       Token Tag     Sentence #
104934    \n   O  Sentence 1272
104935    \n   O  Sentence 1272
       Token Tag     Sentence #
105077    \n   O  Sentence 1273
105078    \n   O  Sentence 1273
       Token Tag     Sentence #
105121    \n   O  Sentence 1274
105122    \n   O  Sentence 1274
       Token Tag     Sentence #
105209    \n   O  Sentence 1275
105210    \n   O  Sentence 1275
       Token Tag     Sentence #
105385    \n   O  Sentence 1276
105386    \n   O  Sentence 1276
       Token Tag     Sentence #
105422    \n   O  Sentence 1277
105423    \n   O  Sentence 1277
       Token Tag     Sentence #
105604    \n   O  Sentence 1278
105605    \n   O  Sentence 1278
       Token Tag     Sentence #
106252    \n   O  Sentence 1279
106253    \n   O  Sentence 1279
       Token Tag     Sentence #
106328  

       Token           Tag     Sentence #
123461    \n  I_Precedente  Sentence 1430
123462    \n  I_Precedente  Sentence 1430
       Token Tag     Sentence #
123572    \n   O  Sentence 1431
123573    \n   O  Sentence 1431
       Token Tag     Sentence #
123592    \n   O  Sentence 1432
123593    \n   O  Sentence 1432
       Token Tag     Sentence #
123610    \n   O  Sentence 1433
123611    \n   O  Sentence 1433
       Token Tag     Sentence #
123679    \n   O  Sentence 1434
123680    \n   O  Sentence 1434
       Token Tag     Sentence #
123715    \n   O  Sentence 1435
123716    \n   O  Sentence 1435
       Token Tag     Sentence #
123915    \n   O  Sentence 1436
123916    \n   O  Sentence 1436
       Token Tag     Sentence #
124007    \n   O  Sentence 1437
124008    \n   O  Sentence 1437
       Token Tag     Sentence #
124047    \n   O  Sentence 1438
124048    \n   O  Sentence 1438
       Token Tag     Sentence #
124143    \n   O  Sentence 1439
124144    \n   O  Sentence 1439
       Tok

134244    \n   O  Sentence 1588
       Token Tag     Sentence #
134357    \n   O  Sentence 1589
134358    \n   O  Sentence 1589
       Token Tag     Sentence #
134378    \n   O  Sentence 1590
134379    \n   O  Sentence 1590
       Token Tag     Sentence #
134397    \n   O  Sentence 1591
134398    \n   O  Sentence 1591
       Token Tag     Sentence #
134430    \n   O  Sentence 1592
134431    \n   O  Sentence 1592
       Token Tag     Sentence #
134555    \n   O  Sentence 1593
134556    \n   O  Sentence 1593
       Token Tag     Sentence #
134682    \n   O  Sentence 1594
134683    \n   O  Sentence 1594
       Token Tag     Sentence #
134806    \n   O  Sentence 1595
134807    \n   O  Sentence 1595
       Token Tag     Sentence #
134933    \n   O  Sentence 1596
134934    \n   O  Sentence 1596
       Token Tag     Sentence #
135056    \n   O  Sentence 1597
135057    \n   O  Sentence 1597
       Token Tag     Sentence #
135070    \n   O  Sentence 1598
135071    \n   O  Sentence 1598
       T

       Token Tag     Sentence #
148729    \n   O  Sentence 1747
148730    \n   O  Sentence 1747
       Token Tag     Sentence #
148749    \n   O  Sentence 1748
148750    \n   O  Sentence 1748
       Token Tag     Sentence #
148757    \n   O  Sentence 1749
148758    \n   O  Sentence 1749
       Token Tag     Sentence #
148787    \n   O  Sentence 1750
148788    \n   O  Sentence 1750
       Token Tag     Sentence #
148818    \n   O  Sentence 1751
148819    \n   O  Sentence 1751
       Token Tag     Sentence #
148889    \n   O  Sentence 1752
148890    \n   O  Sentence 1752
       Token Tag     Sentence #
148906    \n   O  Sentence 1753
148907    \n   O  Sentence 1753
       Token Tag     Sentence #
149001    \n   O  Sentence 1754
149002    \n   O  Sentence 1754
       Token Tag     Sentence #
149110    \n   O  Sentence 1755
149111    \n   O  Sentence 1755
       Token Tag     Sentence #
149116    \n   O  Sentence 1756
149117    \n   O  Sentence 1756
       Token Tag     Sentence #
149148  

       Token Tag     Sentence #
164984    \n   O  Sentence 1907
164985    \n   O  Sentence 1907
       Token Tag     Sentence #
165023    \n   O  Sentence 1908
165024    \n   O  Sentence 1908
       Token Tag     Sentence #
165031    \n   O  Sentence 1909
165032    \n   O  Sentence 1909
       Token Tag     Sentence #
165039    \n   O  Sentence 1910
165040    \n   O  Sentence 1910
       Token           Tag     Sentence #
165082    \n  I_Precedente  Sentence 1911
165083    \n  I_Precedente  Sentence 1911
       Token           Tag     Sentence #
165151    \n  I_Precedente  Sentence 1912
165152    \n  I_Precedente  Sentence 1912
       Token Tag     Sentence #
165200    \n   O  Sentence 1913
165201    \n   O  Sentence 1913
       Token Tag     Sentence #
165315    \n   O  Sentence 1914
165316    \n   O  Sentence 1914
       Token Tag     Sentence #
165320    \n   O  Sentence 1915
165321    \n   O  Sentence 1915
       Token Tag     Sentence #
165442    \n   O  Sentence 1916
165443    \n

       Token Tag     Sentence #
174132    \n   O  Sentence 2067
174133    \n   O  Sentence 2067
       Token           Tag     Sentence #
174231    \n  I_Precedente  Sentence 2068
174232    \n  I_Precedente  Sentence 2068
       Token Tag     Sentence #
174309    \n   O  Sentence 2069
174310    \n   O  Sentence 2069
       Token Tag     Sentence #
174455    \n   O  Sentence 2070
174456    \n   O  Sentence 2070
       Token         Tag     Sentence #
174476    \n  I_Doutrina  Sentence 2071
174477    \n  I_Doutrina  Sentence 2071
       Token         Tag     Sentence #
174560    \n  I_Doutrina  Sentence 2072
174561    \n  I_Doutrina  Sentence 2072
       Token Tag     Sentence #
174575    \n   O  Sentence 2073
174576    \n   O  Sentence 2073
       Token Tag     Sentence #
174645    \n   O  Sentence 2074
174646    \n   O  Sentence 2074
       Token         Tag     Sentence #
174746    \n  I_Doutrina  Sentence 2075
174747    \n  I_Doutrina  Sentence 2075
       Token         Tag     Sente

       Token Tag     Sentence #
184964    \n   O  Sentence 2224
184965    \n   O  Sentence 2224
       Token Tag     Sentence #
185073    \n   O  Sentence 2225
185074    \n   O  Sentence 2225
       Token Tag     Sentence #
185102    \n   O  Sentence 2226
185103    \n   O  Sentence 2226
       Token Tag     Sentence #
185367    \n   O  Sentence 2227
185368    \n   O  Sentence 2227
       Token Tag     Sentence #
185392    \n   O  Sentence 2228
185393    \n   O  Sentence 2228
       Token Tag     Sentence #
185540    \n   O  Sentence 2229
185541    \n   O  Sentence 2229
       Token Tag     Sentence #
185613    \n   O  Sentence 2230
185614    \n   O  Sentence 2230
       Token Tag     Sentence #
185690    \n   O  Sentence 2231
185691    \n   O  Sentence 2231
       Token Tag     Sentence #
185779    \n   O  Sentence 2232
185780    \n   O  Sentence 2232
       Token Tag     Sentence #
185840    \n   O  Sentence 2233
185841    \n   O  Sentence 2233
       Token Tag     Sentence #
185954  

198241    \n   O  Sentence 2382
       Token Tag     Sentence #
198393    \n   O  Sentence 2383
198394    \n   O  Sentence 2383
       Token Tag     Sentence #
198577    \n   O  Sentence 2384
198578    \n   O  Sentence 2384
       Token Tag     Sentence #
198644    \n   O  Sentence 2385
198645    \n   O  Sentence 2385
       Token Tag     Sentence #
198711    \n   O  Sentence 2386
198712    \n   O  Sentence 2386
       Token Tag     Sentence #
198909    \n   O  Sentence 2387
198910    \n   O  Sentence 2387
       Token Tag     Sentence #
198936    \n   O  Sentence 2388
198937    \n   O  Sentence 2388
       Token Tag     Sentence #
199007    \n   O  Sentence 2389
199008    \n   O  Sentence 2389
       Token Tag     Sentence #
199059    \n   O  Sentence 2390
199060    \n   O  Sentence 2390
       Token Tag     Sentence #
199116    \n   O  Sentence 2391
199117    \n   O  Sentence 2391
       Token Tag     Sentence #
199185    \n   O  Sentence 2392
199186    \n   O  Sentence 2392
       T

       Token Tag     Sentence #
211953    \n   O  Sentence 2545
211954    \n   O  Sentence 2545
       Token Tag     Sentence #
211961    \n   O  Sentence 2546
211962    \n   O  Sentence 2546
       Token Tag     Sentence #
211965    \n   O  Sentence 2547
211966    \n   O  Sentence 2547
       Token Tag     Sentence #
212054    \n   O  Sentence 2548
212055    \n   O  Sentence 2548
       Token Tag     Sentence #
212281    \n   O  Sentence 2549
212282    \n   O  Sentence 2549
       Token Tag     Sentence #
212351    \n   O  Sentence 2550
212352    \n   O  Sentence 2550
       Token Tag     Sentence #
212405    \n   O  Sentence 2551
212406    \n   O  Sentence 2551
       Token Tag     Sentence #
212413    \n   O  Sentence 2552
212414    \n   O  Sentence 2552
       Token Tag     Sentence #
212440    \n   O  Sentence 2553
212441    \n   O  Sentence 2553
       Token Tag     Sentence #
212530    \n   O  Sentence 2554
212531    \n   O  Sentence 2554
       Token Tag     Sentence #
212811  

       Token Tag     Sentence #
226928    \n   O  Sentence 2702
226929    \n   O  Sentence 2702
       Token         Tag     Sentence #
226969    \n  I_Doutrina  Sentence 2703
226970    \n  I_Doutrina  Sentence 2703
       Token Tag     Sentence #
227206    \n   O  Sentence 2704
227207    \n   O  Sentence 2704
       Token Tag     Sentence #
227435    \n   O  Sentence 2705
227436    \n   O  Sentence 2705
       Token Tag     Sentence #
227529    \n   O  Sentence 2706
227530    \n   O  Sentence 2706
       Token Tag     Sentence #
227694    \n   O  Sentence 2707
227695    \n   O  Sentence 2707
       Token Tag     Sentence #
227717    \n   O  Sentence 2708
227718    \n   O  Sentence 2708
       Token Tag     Sentence #
227774    \n   O  Sentence 2709
227775    \n   O  Sentence 2709
       Token Tag     Sentence #
227886    \n   O  Sentence 2710
227887    \n   O  Sentence 2710
       Token Tag     Sentence #
228066    \n   O  Sentence 2711
228067    \n   O  Sentence 2711
       Token Tag

       Token Tag     Sentence #
240151    \n   O  Sentence 2864
240152    \n   O  Sentence 2864
       Token Tag     Sentence #
240160    \n   O  Sentence 2865
240161    \n   O  Sentence 2865
       Token Tag     Sentence #
240166    \n   O  Sentence 2866
240167    \n   O  Sentence 2866
       Token Tag     Sentence #
240180    \n   O  Sentence 2867
240181    \n   O  Sentence 2867
       Token Tag     Sentence #
240190    \n   O  Sentence 2868
240191    \n   O  Sentence 2868
       Token Tag     Sentence #
240277    \n   O  Sentence 2869
240278    \n   O  Sentence 2869
       Token Tag     Sentence #
240320    \n   O  Sentence 2870
240321    \n   O  Sentence 2870
       Token Tag     Sentence #
240384    \n   O  Sentence 2871
240385    \n   O  Sentence 2871
       Token Tag     Sentence #
240420    \n   O  Sentence 2872
240421    \n   O  Sentence 2872
       Token Tag     Sentence #
240513    \n   O  Sentence 2873
240514    \n   O  Sentence 2873
       Token Tag     Sentence #
240558  

       Token Tag     Sentence #
251132    \n   O  Sentence 3024
251133    \n   O  Sentence 3024
       Token Tag     Sentence #
251155    \n   O  Sentence 3025
251156    \n   O  Sentence 3025
       Token Tag     Sentence #
251397    \n   O  Sentence 3026
251398    \n   O  Sentence 3026
       Token Tag     Sentence #
251563    \n   O  Sentence 3027
251564    \n   O  Sentence 3027
       Token Tag     Sentence #
251670    \n   O  Sentence 3028
251671    \n   O  Sentence 3028
       Token Tag     Sentence #
251792    \n   O  Sentence 3029
251793    \n   O  Sentence 3029
       Token Tag     Sentence #
251987    \n   O  Sentence 3030
251988    \n   O  Sentence 3030
       Token Tag     Sentence #
252058    \n   O  Sentence 3031
252059    \n   O  Sentence 3031
       Token Tag     Sentence #
252172    \n   O  Sentence 3032
252173    \n   O  Sentence 3032
       Token Tag     Sentence #
252299    \n   O  Sentence 3033
252300    \n   O  Sentence 3033
       Token Tag     Sentence #
252342  

268463    \n   O  Sentence 3185
       Token Tag     Sentence #
268478    \n   O  Sentence 3186
268479    \n   O  Sentence 3186
       Token Tag     Sentence #
268486    \n   O  Sentence 3187
268487    \n   O  Sentence 3187
       Token Tag     Sentence #
268496    \n   O  Sentence 3188
268497    \n   O  Sentence 3188
       Token Tag     Sentence #
268513    \n   O  Sentence 3189
268514    \n   O  Sentence 3189
       Token Tag     Sentence #
268595    \n   O  Sentence 3190
268596    \n   O  Sentence 3190
       Token Tag     Sentence #
268599    \n   O  Sentence 3191
268600    \n   O  Sentence 3191
       Token Tag     Sentence #
268751    \n   O  Sentence 3192
268752    \n   O  Sentence 3192
       Token Tag     Sentence #
268833    \n   O  Sentence 3193
268834    \n   O  Sentence 3193
       Token Tag     Sentence #
268909    \n   O  Sentence 3194
268910    \n   O  Sentence 3194
       Token Tag     Sentence #
268989    \n   O  Sentence 3195
268990    \n   O  Sentence 3195
       T

       Token Tag     Sentence #
282213    \n   O  Sentence 3344
282214    \n   O  Sentence 3344
       Token Tag     Sentence #
283084    \n   O  Sentence 3345
283085    \n   O  Sentence 3345
       Token Tag     Sentence #
283168    \n   O  Sentence 3346
283169    \n   O  Sentence 3346
       Token Tag     Sentence #
283188    \n   O  Sentence 3347
283189    \n   O  Sentence 3347
       Token Tag     Sentence #
283216    \n   O  Sentence 3348
283217    \n   O  Sentence 3348
       Token Tag     Sentence #
283369    \n   O  Sentence 3349
283370    \n   O  Sentence 3349
       Token Tag     Sentence #
283453    \n   O  Sentence 3350
283454    \n   O  Sentence 3350
       Token Tag     Sentence #
283556    \n   O  Sentence 3351
283557    \n   O  Sentence 3351
       Token Tag     Sentence #
283731    \n   O  Sentence 3352
283732    \n   O  Sentence 3352
       Token Tag     Sentence #
283872    \n   O  Sentence 3353
283873    \n   O  Sentence 3353
       Token Tag     Sentence #
283941  

       Token Tag     Sentence #
302053    \n   O  Sentence 3508
302054    \n   O  Sentence 3508
       Token Tag     Sentence #
302325    \n   O  Sentence 3509
302326    \n   O  Sentence 3509
       Token Tag     Sentence #
302395    \n   O  Sentence 3510
302396    \n   O  Sentence 3510
       Token Tag     Sentence #
302445    \n   O  Sentence 3511
302446    \n   O  Sentence 3511
       Token Tag     Sentence #
302524    \n   O  Sentence 3512
302525    \n   O  Sentence 3512
       Token Tag     Sentence #
302629    \n   O  Sentence 3513
302630    \n   O  Sentence 3513
       Token Tag     Sentence #
302833    \n   O  Sentence 3514
302834    \n   O  Sentence 3514
       Token       Tag     Sentence #
302855    \n  I_Pessoa  Sentence 3515
302856    \n  I_Pessoa  Sentence 3515
       Token Tag     Sentence #
302876    \n   O  Sentence 3516
302877    \n   O  Sentence 3516
       Token Tag     Sentence #
302963    \n   O  Sentence 3517
302964    \n   O  Sentence 3517
       Token Tag     S

       Token Tag     Sentence #
322383    \n   O  Sentence 3666
322384    \n   O  Sentence 3666
       Token Tag     Sentence #
322448    \n   O  Sentence 3667
322449    \n   O  Sentence 3667
       Token Tag     Sentence #
322512    \n   O  Sentence 3668
322513    \n   O  Sentence 3668
       Token Tag     Sentence #
322588    \n   O  Sentence 3669
322589    \n   O  Sentence 3669
       Token Tag     Sentence #
322673    \n   O  Sentence 3670
322674    \n   O  Sentence 3670
       Token Tag     Sentence #
322756    \n   O  Sentence 3671
322757    \n   O  Sentence 3671
       Token Tag     Sentence #
322911    \n   O  Sentence 3672
322912    \n   O  Sentence 3672
       Token Tag     Sentence #
322997    \n   O  Sentence 3673
322998    \n   O  Sentence 3673
       Token Tag     Sentence #
323110    \n   O  Sentence 3674
323111    \n   O  Sentence 3674
       Token Tag     Sentence #
323259    \n   O  Sentence 3675
323260    \n   O  Sentence 3675
       Token Tag     Sentence #
323579  

341700    \n   O  Sentence 3827
       Token Tag     Sentence #
341733    \n   O  Sentence 3828
341734    \n   O  Sentence 3828
       Token Tag     Sentence #
341839    \n   O  Sentence 3829
341840    \n   O  Sentence 3829
       Token Tag     Sentence #
342027    \n   O  Sentence 3830
342028    \n   O  Sentence 3830
       Token Tag     Sentence #
342175    \n   O  Sentence 3831
342176    \n   O  Sentence 3831
       Token Tag     Sentence #
342317    \n   O  Sentence 3832
342318    \n   O  Sentence 3832
       Token Tag     Sentence #
342459    \n   O  Sentence 3833
342460    \n   O  Sentence 3833
       Token Tag     Sentence #
342586    \n   O  Sentence 3834
342587    \n   O  Sentence 3834
       Token Tag     Sentence #
342907    \n   O  Sentence 3835
342908    \n   O  Sentence 3835
       Token Tag     Sentence #
342947    \n   O  Sentence 3836
342948    \n   O  Sentence 3836
       Token Tag     Sentence #
343329    \n   O  Sentence 3837
343330    \n   O  Sentence 3837
       T

       Token Tag     Sentence #
362384    \n   O  Sentence 3986
362385    \n   O  Sentence 3986
       Token Tag     Sentence #
362472    \n   O  Sentence 3987
362473    \n   O  Sentence 3987
       Token Tag     Sentence #
362677    \n   O  Sentence 3988
362678    \n   O  Sentence 3988
       Token Tag     Sentence #
363126    \n   O  Sentence 3989
363127    \n   O  Sentence 3989
       Token Tag     Sentence #
363407    \n   O  Sentence 3990
363408    \n   O  Sentence 3990
       Token Tag     Sentence #
363503    \n   O  Sentence 3991
363504    \n   O  Sentence 3991
       Token Tag     Sentence #
363719    \n   O  Sentence 3992
363720    \n   O  Sentence 3992
       Token Tag     Sentence #
363788    \n   O  Sentence 3993
363789    \n   O  Sentence 3993
       Token Tag     Sentence #
364025    \n   O  Sentence 3994
364026    \n   O  Sentence 3994
       Token Tag     Sentence #
364138    \n   O  Sentence 3995
364139    \n   O  Sentence 3995
       Token Tag     Sentence #
364147  

       Token Tag     Sentence #
372226    \n   O  Sentence 4146
372227    \n   O  Sentence 4146
       Token           Tag     Sentence #
372234    \n  I_Precedente  Sentence 4147
372235    \n  I_Precedente  Sentence 4147
       Token Tag     Sentence #
372311    \n   O  Sentence 4148
372312    \n   O  Sentence 4148
       Token Tag     Sentence #
372334    \n   O  Sentence 4149
372335    \n   O  Sentence 4149
       Token Tag     Sentence #
372350    \n   O  Sentence 4150
372351    \n   O  Sentence 4150
       Token Tag     Sentence #
372373    \n   O  Sentence 4151
372374    \n   O  Sentence 4151
       Token Tag     Sentence #
372394    \n   O  Sentence 4152
372395    \n   O  Sentence 4152
       Token Tag     Sentence #
372413    \n   O  Sentence 4153
372414    \n   O  Sentence 4153
       Token Tag     Sentence #
372440    \n   O  Sentence 4154
372441    \n   O  Sentence 4154
       Token Tag     Sentence #
372447    \n   O  Sentence 4155
372448    \n   O  Sentence 4155
       Tok

       Token Tag     Sentence #
382261    \n   O  Sentence 4306
382262    \n   O  Sentence 4306
       Token Tag     Sentence #
382312    \n   O  Sentence 4307
382313    \n   O  Sentence 4307
       Token Tag     Sentence #
382428    \n   O  Sentence 4308
382429    \n   O  Sentence 4308
       Token           Tag     Sentence #
382440    \n  I_Precedente  Sentence 4309
382441    \n             O  Sentence 4309
       Token Tag     Sentence #
382451    \n   O  Sentence 4310
382452    \n   O  Sentence 4310
       Token Tag     Sentence #
382455    \n   O  Sentence 4311
382456    \n   O  Sentence 4311
       Token Tag     Sentence #
382459    \n   O  Sentence 4312
382460    \n   O  Sentence 4312
       Token Tag     Sentence #
382483    \n   O  Sentence 4313
382484    \n   O  Sentence 4313
       Token Tag     Sentence #
382487    \n   O  Sentence 4314
382488    \n   O  Sentence 4314
       Token Tag     Sentence #
382638    \n   O  Sentence 4315
382639    \n   O  Sentence 4315
       Tok

       Token Tag     Sentence #
393786    \n   O  Sentence 4467
393787    \n   O  Sentence 4467
       Token Tag     Sentence #
393790    \n   O  Sentence 4468
393791    \n   O  Sentence 4468
       Token Tag     Sentence #
393798    \n   O  Sentence 4469
393799    \n   O  Sentence 4469
       Token Tag     Sentence #
393808    \n   O  Sentence 4470
393809    \n   O  Sentence 4470
       Token Tag     Sentence #
393816    \n   O  Sentence 4471
393817    \n   O  Sentence 4471
       Token Tag     Sentence #
393950    \n   O  Sentence 4472
393951    \n   O  Sentence 4472
       Token Tag     Sentence #
394194    \n   O  Sentence 4473
394195    \n   O  Sentence 4473
       Token Tag     Sentence #
394277    \n   O  Sentence 4474
394278    \n   O  Sentence 4474
       Token Tag     Sentence #
394293    \n   O  Sentence 4475
394294    \n   O  Sentence 4475
       Token Tag     Sentence #
394305    \n   O  Sentence 4476
394306    \n   O  Sentence 4476
       Token Tag     Sentence #
394309  

404233    \n   O  Sentence 4629
       Token Tag     Sentence #
404700    \n   O  Sentence 4630
404701    \n   O  Sentence 4630
       Token Tag     Sentence #
404755    \n   O  Sentence 4631
404756    \n   O  Sentence 4631
       Token Tag     Sentence #
404863    \n   O  Sentence 4632
404864    \n   O  Sentence 4632
       Token Tag     Sentence #
404899    \n   O  Sentence 4633
404900    \n   O  Sentence 4633
       Token Tag     Sentence #
404907    \n   O  Sentence 4634
404908    \n   O  Sentence 4634
       Token           Tag     Sentence #
404917    \n  I_Precedente  Sentence 4635
404918    \n  I_Precedente  Sentence 4635
       Token Tag     Sentence #
404980    \n   O  Sentence 4636
404981    \n   O  Sentence 4636
       Token Tag     Sentence #
405003    \n   O  Sentence 4637
405004    \n   O  Sentence 4637
       Token Tag     Sentence #
405019    \n   O  Sentence 4638
405020    \n   O  Sentence 4638
       Token Tag     Sentence #
405033    \n   O  Sentence 4639
405034    

       Token Tag     Sentence #
411794    \n   O  Sentence 4787
411795    \n   O  Sentence 4787
       Token Tag     Sentence #
411807    \n   O  Sentence 4788
411808    \n   O  Sentence 4788
       Token Tag     Sentence #
411833    \n   O  Sentence 4789
411834    \n   O  Sentence 4789
       Token Tag     Sentence #
411917    \n   O  Sentence 4790
411918    \n   O  Sentence 4790
       Token Tag     Sentence #
411996    \n   O  Sentence 4791
411997    \n   O  Sentence 4791
       Token Tag     Sentence #
412116    \n   O  Sentence 4792
412117    \n   O  Sentence 4792
       Token Tag     Sentence #
412194    \n   O  Sentence 4793
412195    \n   O  Sentence 4793
       Token Tag     Sentence #
412257    \n   O  Sentence 4794
412258    \n   O  Sentence 4794
       Token Tag     Sentence #
412338    \n   O  Sentence 4795
412339    \n   O  Sentence 4795
       Token Tag     Sentence #
412458    \n   O  Sentence 4796
412459    \n   O  Sentence 4796
       Token Tag     Sentence #
412617  

       Token Tag     Sentence #
425055    \n   O  Sentence 4947
425056    \n   O  Sentence 4947
       Token Tag     Sentence #
425227    \n   O  Sentence 4948
425228    \n   O  Sentence 4948
       Token Tag     Sentence #
425533    \n   O  Sentence 4949
425534    \n   O  Sentence 4949
       Token Tag     Sentence #
425633    \n   O  Sentence 4950
425634    \n   O  Sentence 4950
       Token Tag     Sentence #
425646    \n   O  Sentence 4951
425647    \n   O  Sentence 4951
       Token Tag     Sentence #
425831    \n   O  Sentence 4952
425832    \n   O  Sentence 4952
       Token Tag     Sentence #
425986    \n   O  Sentence 4953
425987    \n   O  Sentence 4953
       Token Tag     Sentence #
426040    \n   O  Sentence 4954
426041    \n   O  Sentence 4954
       Token Tag     Sentence #
426221    \n   O  Sentence 4955
426222    \n   O  Sentence 4955
       Token Tag     Sentence #
426234    \n   O  Sentence 4956
426235    \n   O  Sentence 4956
       Token Tag     Sentence #
426321  

       Token       Tag     Sentence #
439688    \n  I_Pessoa  Sentence 5106
439689    \n         O  Sentence 5106
       Token Tag     Sentence #
439696    \n   O  Sentence 5107
439697    \n   O  Sentence 5107
       Token           Tag     Sentence #
439710    \n  I_Precedente  Sentence 5108
439711    \n  I_Precedente  Sentence 5108
       Token Tag     Sentence #
439736    \n   O  Sentence 5109
439737    \n   O  Sentence 5109
       Token Tag     Sentence #
439744    \n   O  Sentence 5110
439745    \n   O  Sentence 5110
       Token Tag     Sentence #
439753    \n   O  Sentence 5111
439754    \n   O  Sentence 5111
       Token Tag     Sentence #
439774    \n   O  Sentence 5112
439775    \n   O  Sentence 5112
       Token Tag     Sentence #
439831    \n   O  Sentence 5113
439832    \n   O  Sentence 5113
       Token Tag     Sentence #
439835    \n   O  Sentence 5114
439836    \n   O  Sentence 5114
       Token Tag     Sentence #
439854    \n   O  Sentence 5115
439855    \n   O  Senten

       Token Tag     Sentence #
456076    \n   O  Sentence 5264
456077    \n   O  Sentence 5264
       Token Tag     Sentence #
456118    \n   O  Sentence 5265
456119    \n   O  Sentence 5265
       Token Tag     Sentence #
456381    \n   O  Sentence 5266
456382    \n   O  Sentence 5266
       Token Tag     Sentence #
456726    \n   O  Sentence 5267
456727    \n   O  Sentence 5267
       Token Tag     Sentence #
456817    \n   O  Sentence 5268
456818    \n   O  Sentence 5268
       Token Tag     Sentence #
456827    \n   O  Sentence 5269
456828    \n   O  Sentence 5269
       Token Tag     Sentence #
456919    \n   O  Sentence 5270
456920    \n   O  Sentence 5270
       Token Tag     Sentence #
457107    \n   O  Sentence 5271
457108    \n   O  Sentence 5271
       Token Tag     Sentence #
457469    \n   O  Sentence 5272
457470    \n   O  Sentence 5272
       Token Tag     Sentence #
457479    \n   O  Sentence 5273
457480    \n   O  Sentence 5273
       Token Tag     Sentence #
457770  

       Token Tag     Sentence #
476467    \n   O  Sentence 5423
476468    \n   O  Sentence 5423
       Token Tag     Sentence #
476551    \n   O  Sentence 5424
476552    \n   O  Sentence 5424
       Token Tag     Sentence #
476672    \n   O  Sentence 5425
476673    \n   O  Sentence 5425
       Token Tag     Sentence #
476695    \n   O  Sentence 5426
476696    \n   O  Sentence 5426
       Token Tag     Sentence #
476756    \n   O  Sentence 5427
476757    \n   O  Sentence 5427
       Token Tag     Sentence #
476857    \n   O  Sentence 5428
476858    \n   O  Sentence 5428
       Token Tag     Sentence #
476908    \n   O  Sentence 5429
476909    \n   O  Sentence 5429
       Token Tag     Sentence #
477037    \n   O  Sentence 5430
477038    \n   O  Sentence 5430
       Token Tag     Sentence #
477171    \n   O  Sentence 5431
477172    \n   O  Sentence 5431
       Token Tag     Sentence #
477233    \n   O  Sentence 5432
477234    \n   O  Sentence 5432
       Token Tag     Sentence #
477243  

496212    \n   O  Sentence 5582
       Token Tag     Sentence #
496221    \n   O  Sentence 5583
496222    \n   O  Sentence 5583
       Token Tag     Sentence #
496296    \n   O  Sentence 5584
496297    \n   O  Sentence 5584
       Token Tag     Sentence #
496311    \n   O  Sentence 5585
496312    \n   O  Sentence 5585
       Token Tag     Sentence #
496335    \n   O  Sentence 5586
496336    \n   O  Sentence 5586
       Token Tag     Sentence #
496339    \n   O  Sentence 5587
496340    \n   O  Sentence 5587
       Token Tag     Sentence #
496434    \n   O  Sentence 5588
496435    \n   O  Sentence 5588
       Token Tag     Sentence #
496541    \n   O  Sentence 5589
496542    \n   O  Sentence 5589
       Token Tag     Sentence #
496702    \n   O  Sentence 5590
496703    \n   O  Sentence 5590
       Token Tag     Sentence #
496847    \n   O  Sentence 5591
496848    \n   O  Sentence 5591
       Token Tag     Sentence #
496861    \n   O  Sentence 5592
496862    \n   O  Sentence 5592
       T

514311    \n   O  Sentence 5742
       Token Tag     Sentence #
514376    \n   O  Sentence 5743
514377    \n   O  Sentence 5743
       Token Tag     Sentence #
514458    \n   O  Sentence 5744
514459    \n   O  Sentence 5744
       Token Tag     Sentence #
514657    \n   O  Sentence 5745
514658    \n   O  Sentence 5745
       Token Tag     Sentence #
514689    \n   O  Sentence 5746
514690    \n   O  Sentence 5746
       Token Tag     Sentence #
514702    \n   O  Sentence 5747
514703    \n   O  Sentence 5747
       Token Tag     Sentence #
514803    \n   O  Sentence 5748
514804    \n   O  Sentence 5748
       Token Tag     Sentence #
515041    \n   O  Sentence 5749
515042    \n   O  Sentence 5749
       Token Tag     Sentence #
515136    \n   O  Sentence 5750
515137    \n   O  Sentence 5750
       Token Tag     Sentence #
515309    \n   O  Sentence 5751
515310    \n   O  Sentence 5751
       Token Tag     Sentence #
515368    \n   O  Sentence 5752
515369    \n   O  Sentence 5752
       T

       Token Tag     Sentence #
530009    \n   O  Sentence 5900
530010    \n   O  Sentence 5900
       Token Tag     Sentence #
530039    \n   O  Sentence 5901
530040    \n   O  Sentence 5901
       Token Tag     Sentence #
530072    \n   O  Sentence 5902
530073    \n   O  Sentence 5902
       Token Tag     Sentence #
530149    \n   O  Sentence 5903
530150    \n   O  Sentence 5903
       Token Tag     Sentence #
530165    \n   O  Sentence 5904
530166    \n   O  Sentence 5904
       Token Tag     Sentence #
530282    \n   O  Sentence 5905
530283    \n   O  Sentence 5905
       Token Tag     Sentence #
530397    \n   O  Sentence 5906
530398    \n   O  Sentence 5906
       Token Tag     Sentence #
530948    \n   O  Sentence 5907
530949    \n   O  Sentence 5907
       Token Tag     Sentence #
530973    \n   O  Sentence 5908
530974    \n   O  Sentence 5908
       Token Tag     Sentence #
531426    \n   O  Sentence 5909
531427    \n   O  Sentence 5909
       Token           Tag     Sentence 

       Token         Tag     Sentence #
543816    \n  I_Doutrina  Sentence 6058
543817    \n  I_Doutrina  Sentence 6058
       Token         Tag     Sentence #
543837    \n  I_Doutrina  Sentence 6059
543838    \n  I_Doutrina  Sentence 6059
       Token         Tag     Sentence #
543860    \n  I_Doutrina  Sentence 6060
543861    \n  I_Doutrina  Sentence 6060
       Token         Tag     Sentence #
543882    \n  I_Doutrina  Sentence 6061
543883    \n  I_Doutrina  Sentence 6061
       Token         Tag     Sentence #
543902    \n  I_Doutrina  Sentence 6062
543903    \n  I_Doutrina  Sentence 6062
       Token         Tag     Sentence #
543928    \n  I_Doutrina  Sentence 6063
543929    \n  I_Doutrina  Sentence 6063
       Token         Tag     Sentence #
543947    \n  I_Doutrina  Sentence 6064
543948    \n  I_Doutrina  Sentence 6064
       Token Tag     Sentence #
543993    \n   O  Sentence 6065
543994    \n   O  Sentence 6065
       Token                 Tag     Sentence #
544078    \n  I_

       Token Tag     Sentence #
556451    \n   O  Sentence 6218
556452    \n   O  Sentence 6218
       Token Tag     Sentence #
556550    \n   O  Sentence 6219
556551    \n   O  Sentence 6219
       Token Tag     Sentence #
556663    \n   O  Sentence 6220
556664    \n   O  Sentence 6220
       Token Tag     Sentence #
556705    \n   O  Sentence 6221
556706    \n   O  Sentence 6221
       Token Tag     Sentence #
556728    \n   O  Sentence 6222
556729    \n   O  Sentence 6222
       Token Tag     Sentence #
556755    \n   O  Sentence 6223
556756    \n   O  Sentence 6223
       Token Tag     Sentence #
556814    \n   O  Sentence 6224
556815    \n   O  Sentence 6224
       Token Tag     Sentence #
556944    \n   O  Sentence 6225
556945    \n   O  Sentence 6225
       Token Tag     Sentence #
557043    \n   O  Sentence 6226
557044    \n   O  Sentence 6226
       Token Tag     Sentence #
557209    \n   O  Sentence 6227
557210    \n   O  Sentence 6227
       Token Tag     Sentence #
557284  

       Token Tag     Sentence #
570527    \n   O  Sentence 6378
570528    \n   O  Sentence 6378
       Token Tag     Sentence #
570774    \n   O  Sentence 6379
570775    \n   O  Sentence 6379
       Token Tag     Sentence #
570966    \n   O  Sentence 6380
570967    \n   O  Sentence 6380
       Token Tag     Sentence #
571043    \n   O  Sentence 6381
571044    \n   O  Sentence 6381
       Token Tag     Sentence #
571413    \n   O  Sentence 6382
571414    \n   O  Sentence 6382
       Token Tag     Sentence #
571465    \n   O  Sentence 6383
571466    \n   O  Sentence 6383
       Token Tag     Sentence #
571473    \n   O  Sentence 6384
571474    \n   O  Sentence 6384
       Token           Tag     Sentence #
571489    \n  I_Precedente  Sentence 6385
571490    \n             O  Sentence 6385
       Token Tag     Sentence #
571499    \n   O  Sentence 6386
571500    \n   O  Sentence 6386
       Token Tag     Sentence #
571641    \n   O  Sentence 6387
571642    \n   O  Sentence 6387
       Tok

       Token Tag     Sentence #
585712    \n   O  Sentence 6539
585713    \n   O  Sentence 6539
       Token Tag     Sentence #
585978    \n   O  Sentence 6540
585979    \n   O  Sentence 6540
       Token Tag     Sentence #
586125    \n   O  Sentence 6541
586126    \n   O  Sentence 6541
       Token Tag     Sentence #
586231    \n   O  Sentence 6542
586232    \n   O  Sentence 6542
       Token Tag     Sentence #
586405    \n   O  Sentence 6543
586406    \n   O  Sentence 6543
       Token Tag     Sentence #
586450    \n   O  Sentence 6544
586451    \n   O  Sentence 6544
       Token Tag     Sentence #
586535    \n   O  Sentence 6545
586536    \n   O  Sentence 6545
       Token Tag     Sentence #
586613    \n   O  Sentence 6546
586614    \n   O  Sentence 6546
       Token Tag     Sentence #
586704    \n   O  Sentence 6547
586705    \n   O  Sentence 6547
       Token Tag     Sentence #
586757    \n   O  Sentence 6548
586758    \n   O  Sentence 6548
       Token Tag     Sentence #
587019  

       Token Tag     Sentence #
604599    \n   O  Sentence 6696
604600    \n   O  Sentence 6696
       Token Tag     Sentence #
604639    \n   O  Sentence 6697
604640    \n   O  Sentence 6697
       Token Tag     Sentence #
604801    \n   O  Sentence 6698
604802    \n   O  Sentence 6698
       Token Tag     Sentence #
605074    \n   O  Sentence 6699
605075    \n   O  Sentence 6699
       Token Tag     Sentence #
605314    \n   O  Sentence 6700
605315    \n   O  Sentence 6700
       Token Tag     Sentence #
605355    \n   O  Sentence 6701
605356    \n   O  Sentence 6701
       Token Tag     Sentence #
605362    \n   O  Sentence 6702
605363    \n   O  Sentence 6702
       Token Tag     Sentence #
605378    \n   O  Sentence 6703
605379    \n   O  Sentence 6703
       Token       Tag     Sentence #
605386    \n  I_Pessoa  Sentence 6704
605387    \n         O  Sentence 6704
       Token Tag     Sentence #
605390    \n   O  Sentence 6705
605391    \n   O  Sentence 6705
       Token Tag     S

       Token Tag     Sentence #
621142    \n   O  Sentence 6854
621143    \n   O  Sentence 6854
       Token Tag     Sentence #
621206    \n   O  Sentence 6855
621207    \n   O  Sentence 6855
       Token Tag     Sentence #
621273    \n   O  Sentence 6856
621274    \n   O  Sentence 6856
       Token Tag     Sentence #
621371    \n   O  Sentence 6857
621372    \n   O  Sentence 6857
       Token Tag     Sentence #
621571    \n   O  Sentence 6858
621572    \n   O  Sentence 6858
       Token Tag     Sentence #
621671    \n   O  Sentence 6859
621672    \n   O  Sentence 6859
       Token Tag     Sentence #
621779    \n   O  Sentence 6860
621780    \n   O  Sentence 6860
       Token Tag     Sentence #
621858    \n   O  Sentence 6861
621859    \n   O  Sentence 6861
       Token                 Tag     Sentence #
621892    \n  I_Ref. Legislativa  Sentence 6862
621893    \n  I_Ref. Legislativa  Sentence 6862
       Token Tag     Sentence #
621912    \n   O  Sentence 6863
621913    \n   O  Senten

       Token Tag     Sentence #
639470    \n   O  Sentence 7012
639471    \n   O  Sentence 7012
       Token Tag     Sentence #
639575    \n   O  Sentence 7013
639576    \n   O  Sentence 7013
       Token Tag     Sentence #
639771    \n   O  Sentence 7014
639772    \n   O  Sentence 7014
       Token Tag     Sentence #
639968    \n   O  Sentence 7015
639969    \n   O  Sentence 7015
       Token Tag     Sentence #
639986    \n   O  Sentence 7016
639987    \n   O  Sentence 7016
       Token Tag     Sentence #
639996    \n   O  Sentence 7017
639997    \n   O  Sentence 7017
       Token Tag     Sentence #
640127    \n   O  Sentence 7018
640128    \n   O  Sentence 7018
       Token Tag     Sentence #
640281    \n   O  Sentence 7019
640282    \n   O  Sentence 7019
       Token Tag     Sentence #
640461    \n   O  Sentence 7020
640462    \n   O  Sentence 7020
       Token Tag     Sentence #
640714    \n   O  Sentence 7021
640715    \n   O  Sentence 7021
       Token Tag     Sentence #
640858  

       Token Tag     Sentence #
660639    \n   O  Sentence 7174
660640    \n   O  Sentence 7174
       Token Tag     Sentence #
661021    \n   O  Sentence 7175
661022    \n   O  Sentence 7175
       Token Tag     Sentence #
661186    \n   O  Sentence 7176
661187    \n   O  Sentence 7176
       Token Tag     Sentence #
661277    \n   O  Sentence 7177
661278    \n   O  Sentence 7177
       Token Tag     Sentence #
661422    \n   O  Sentence 7178
661423    \n   O  Sentence 7178
       Token Tag     Sentence #
661669    \n   O  Sentence 7179
661670    \n   O  Sentence 7179
       Token Tag     Sentence #
661750    \n   O  Sentence 7180
661751    \n   O  Sentence 7180
       Token Tag     Sentence #
661966    \n   O  Sentence 7181
661967    \n   O  Sentence 7181
       Token Tag     Sentence #
662032    \n   O  Sentence 7182
662033    \n   O  Sentence 7182
       Token Tag     Sentence #
662274    \n   O  Sentence 7183
662275    \n   O  Sentence 7183
       Token Tag     Sentence #
662354  

       Token Tag     Sentence #
677849    \n   O  Sentence 7334
677850    \n   O  Sentence 7334
       Token Tag     Sentence #
677855    \n   O  Sentence 7335
677856    \n   O  Sentence 7335
       Token           Tag     Sentence #
677876    \n  I_Precedente  Sentence 7336
677877    \n             O  Sentence 7336
       Token Tag     Sentence #
677886    \n   O  Sentence 7337
677887    \n   O  Sentence 7337
       Token Tag     Sentence #
677979    \n   O  Sentence 7338
677980    \n   O  Sentence 7338
       Token Tag     Sentence #
678087    \n   O  Sentence 7339
678088    \n   O  Sentence 7339
       Token Tag     Sentence #
678318    \n   O  Sentence 7340
678319    \n   O  Sentence 7340
       Token Tag     Sentence #
678523    \n   O  Sentence 7341
678524    \n   O  Sentence 7341
       Token Tag     Sentence #
678941    \n   O  Sentence 7342
678942    \n   O  Sentence 7342
       Token Tag     Sentence #
679010    \n   O  Sentence 7343
679011    \n   O  Sentence 7343
       Tok

693917    \n   O  Sentence 7494
       Token Tag     Sentence #
693959    \n   O  Sentence 7495
693960    \n   O  Sentence 7495
       Token Tag     Sentence #
694030    \n   O  Sentence 7496
694031    \n   O  Sentence 7496
       Token Tag     Sentence #
694056    \n   O  Sentence 7497
694057    \n   O  Sentence 7497
       Token Tag     Sentence #
694066    \n   O  Sentence 7498
694067    \n   O  Sentence 7498
       Token Tag     Sentence #
694170    \n   O  Sentence 7499
694171    \n   O  Sentence 7499
       Token Tag     Sentence #
694176    \n   O  Sentence 7500
694177    \n   O  Sentence 7500
       Token Tag     Sentence #
694190    \n   O  Sentence 7501
694191    \n   O  Sentence 7501
       Token Tag     Sentence #
694204    \n   O  Sentence 7502
694205    \n   O  Sentence 7502
       Token Tag     Sentence #
694315    \n   O  Sentence 7503
694316    \n   O  Sentence 7503
       Token Tag     Sentence #
694448    \n   O  Sentence 7504
694449    \n   O  Sentence 7504
       T

       Token Tag     Sentence #
706065    \n   O  Sentence 7653
706066    \n   O  Sentence 7653
       Token Tag     Sentence #
706210    \n   O  Sentence 7654
706211    \n   O  Sentence 7654
       Token Tag     Sentence #
706312    \n   O  Sentence 7655
706313    \n   O  Sentence 7655
       Token Tag     Sentence #
706375    \n   O  Sentence 7656
706376    \n   O  Sentence 7656
       Token Tag     Sentence #
706659    \n   O  Sentence 7657
706660    \n   O  Sentence 7657
       Token Tag     Sentence #
706821    \n   O  Sentence 7658
706822    \n   O  Sentence 7658
       Token Tag     Sentence #
706970    \n   O  Sentence 7659
706971    \n   O  Sentence 7659
       Token Tag     Sentence #
707121    \n   O  Sentence 7660
707122    \n   O  Sentence 7660
       Token Tag     Sentence #
707356    \n   O  Sentence 7661
707357    \n   O  Sentence 7661
       Token Tag     Sentence #
707421    \n   O  Sentence 7662
707422    \n   O  Sentence 7662
       Token Tag     Sentence #
707481  

In [17]:
# Cria um array com as posições a serem retiradas.
pos = []
for i in range(len(starts)):
    pos.append(starts[i])
    pos.append(starts[i]+1)
pos[:5]

[11, 12, 47, 48, 61]

In [18]:
# Remove as linhas do dataframe e reseta os índices.
combined_csv = combined_csv.drop(pos).reset_index(drop=True)

In [19]:
# Confirma se a remoção foi bem sucedida.
combined_csv.head(15)

Unnamed: 0,Token,Tag,Sentence #
0,~~e»to,INICIO_ARQ,Sentence 1
1,,O,Sentence 1
2,c;JJ;~,O,Sentence 1
3,.,O,Sentence 1
4,~wa,O,Sentence 1
5,/,O,Sentence 1
6,,O,Sentence 1
7,pfi~,O,Sentence 1
8,/,O,Sentence 1
9,<'d,O,Sentence 1


# Mover B_  com Token vazio para linha abaixo

In [20]:
# 'begins' identifica as situações onde a Tag começa com 'B_' e o Token é vazio, uma situação onde 
# o anotador começou a marcação de um espaço vazio gerando a inconsistância.

begins = combined_csv[(combined_csv['Token']==' ') & 
                      (combined_csv['Tag'].str.startswith('B_'))].index.values

In [21]:
begins[0], len (begins)

(3654, 153)

In [22]:
%%time
# Rodar apenas uma vez
for i in range(len(begins)):
    combined_csv.Tag.iloc[begins[i]+1] = combined_csv.Tag.iloc[begins[i]] #Acertar a Tag do Token para começar
                                                                          #sem espaço
    combined_csv.Tag.iloc[begins[i]] = 'O' #Marca o espaço vazio como 'O'

CPU times: user 5.21 s, sys: 0 ns, total: 5.21 s
Wall time: 5.21 s


In [23]:
i, n = 1, 10
combined_csv.iloc[begins[i]-n+7:begins[i]+n]

Unnamed: 0,Token,Tag,Sentence #
5199,redação,O,Sentence 114
5200,,O,Sentence 114
5201,da,O,Sentence 114
5202,,O,Sentence 114
5203,L,B_Ref. Legislativa,Sentence 114
5204,.,I_Ref. Legislativa,Sentence 114
5205,,I_Ref. Legislativa,Sentence 114
5206,10358/01,I_Ref. Legislativa,Sentence 114
5207,,I_Ref. Legislativa,Sentence 114
5208,abrange,O,Sentence 114


# Tratar marcações que incluem vírgula no final da marcação

In [41]:
df_teste = combined_csv.copy()

In [52]:
# df_teste.iloc[2].shift(+1)

df_teste['Token'].shift(-1)

0                                       
1                                 c;JJ;~
2                                      .
3                                    ~wa
4                                      /
                       ...              
697831                                id
697832                                 :
697833                                  
697834    20181018_HC_162764_15338864359
697835                               NaN
Name: Token, Length: 697836, dtype: object

In [51]:
df_teste.iloc[2]

Token             c;JJ;~
Tag                    O
Sentence #    Sentence 1
Name: 2, dtype: object

In [114]:
# Índice das posições onde ocorre o fim da marcação em uma vírgula
inx = df_teste[(df_teste.Tag.str.startswith('I')) & (df_teste.Token == ',') & (df_teste.Tag.shift(-1) == 'O')].index.values

In [145]:
# Conferindo a ocorrência
i = 9
df_teste.iloc[inx[i]-5:inx[i]+10]

Unnamed: 0,Token,Tag,Sentence #
2497,,I_Precedente,Sentence 71
2498,n,I_Precedente,Sentence 71
2499,.,I_Precedente,Sentence 71
2500,,I_Precedente,Sentence 71
2501,2006.38.00.744462-0,I_Precedente,Sentence 71
2502,",",I_Precedente,Sentence 71
2503,,O,Sentence 71
2504,com,O,Sentence 71
2505,,O,Sentence 71
2506,fundamento,O,Sentence 71


In [146]:
df_teste = df_teste.drop(inx).reset_index(drop=True)

In [148]:
# Conferindo se funcionou
i = 9
df_teste.iloc[inx[i]-20:inx[i]+15]

Unnamed: 0,Token,Tag,Sentence #
2482,,O,Sentence 71
2483,autos,O,Sentence 71
2484,,O,Sentence 71
2485,do,O,Sentence 71
2486,,O,Sentence 71
2487,Processo,B_Precedente,Sentence 71
2488,,I_Precedente,Sentence 71
2489,n,I_Precedente,Sentence 71
2490,.,I_Precedente,Sentence 71
2491,,I_Precedente,Sentence 71


In [149]:
combined_csv = df_teste.copy()

In [152]:
combined_csv[(combined_csv.Tag.str.startswith('B')) & (combined_csv.Token == ' ')]

Unnamed: 0,Token,Tag,Sentence #


In [154]:
list(combined_csv.Tag.unique())

['INICIO_ARQ',
 'O',
 'B_Precedente',
 'I_Precedente',
 'B_Pessoa',
 'I_Pessoa',
 'B_Ref. Legislativa',
 'I_Ref. Legislativa',
 'B_Doutrina',
 'I_Doutrina',
 'FIM_ARQ']

# IMPORTANTE: Esse último passo é necessário para salvar todas as alterações feitas no preprocessamento.

In [155]:
# Salvando o processamento feito nos dados
combined_csv.to_csv("preprocessados.csv",index=False,encoding='utf-8')

# DUMP

# Tratar caracteres especiais

In [156]:
df_teste = combined_csv.copy()

In [157]:
# Identifica os tokens que começam com '§' e são maiores que 1 pois queremos pegar os casos onde
# '§' está associado ao numero do paragrafo.
espec_carac = list(df_teste[(df_teste.Token.str.startswith('§')) & (df_teste.Token.str.len()>1)].Token.unique())
espec_carac

['§2º', '§4º', '§§', '§3º', '§1º', '§1°', '§7º']

In [158]:
df_teste[df_teste.Token.str.match(espec_carac[0])]

Unnamed: 0,Token,Tag,Sentence #
15765,§2º,I_Ref. Legislativa,Sentence 310
282334,§2º,B_Ref. Legislativa,Sentence 3412
288581,§2º,I_Ref. Legislativa,Sentence 3464
345473,§2º,I_Ref. Legislativa,Sentence 3924
444852,§2º,I_Ref. Legislativa,Sentence 5267
574861,§2º,I_Ref. Legislativa,Sentence 6563
615705,§2º,B_Ref. Legislativa,Sentence 6948
621949,§2º,I_Ref. Legislativa,Sentence 7000


## Teste com duas sentenças

In [159]:
df_senten = df_teste[df_teste['Sentence #'] == 'Sentence 310'].append(df_teste[df_teste['Sentence #'] == 'Sentence 1589'])

df_senten.reset_index(drop=True,inplace=True)

In [160]:
for i in range(len(df_senten)):
    if any(carac in df_senten.Token.iloc[i] for carac in espec_carac) == True:
        line = df_senten.iloc[i]
        splt = list(line.Token)
        
        if line.Tag.startswith('I_'):
            if len(line.Token) == 2:
                #Para tamanho 2 temos que transformar uma linha em duas
                line0 = line.copy()
                line0.Token = splt[0]
                line1 = line.copy()
                line1.Token = splt[1]
                # Transforma em DataFrame
                line0 = pd.DataFrame(line0).transpose()
                line1 = pd.DataFrame(line1).transpose()

                df_senten1 = pd.concat([df_senten.iloc[:i], line0, line1, df_senten.iloc[i+1:]]).reset_index(drop=True)
            
            if len(line.Token) == 3:
                #Para tamanho 3 temos que transformar uma linha em três
                line0 = line.copy()
                line0.Token = splt[0]
                line1 = line.copy()
                line1.Token = splt[1]
                line2 = line.copy()
                line2.Token = splt[2]

                # Transforma em DataFrame
                line0 = pd.DataFrame(line0).transpose()
                line1 = pd.DataFrame(line1).transpose()
                line2 = pd.DataFrame(line2).transpose()

                df_senten1 = pd.concat([df_senten.iloc[:i], line0, line1, line2, df_senten.iloc[i+1:]]).reset_index(drop=True)

        if line.Tag.startswith('B_'):
            if len(line.Token) == 2:
                #Para tamanho 2 temos que transformar uma linha em duas
                line0 = line.copy()
                line0.Token = splt[0]
                line1 = line.copy()
                line1.Token = splt[1]
                line1.Tag = line.Tag.replace('B_','I_')

                # Transforma em DataFrame
                line0 = pd.DataFrame(line0).transpose()
                line1 = pd.DataFrame(line1).transpose()

                df_senten1 = pd.concat([df_senten.iloc[:i], line0, line1, df_senten.iloc[i+1:]]).reset_index(drop=True)
            if len(line.Token) == 3:
                #Para tamanho 3 temos que transformar uma linha em três
                line0 = line.copy()
                line0.Token = splt[0]
                line1 = line.copy()
                line1.Token = splt[1]
                line1.Tag = line.Tag.replace('B_','I_')
                line2 = line.copy()
                line2.Token = splt[2]
                line2.Tag = line.Tag.replace('B_','I_')

                # Transforma em DataFrame
                line0 = pd.DataFrame(line0).transpose()
                line1 = pd.DataFrame(line1).transpose()
                line2 = pd.DataFrame(line2).transpose()

                df_senten1 = pd.concat([df_senten.iloc[:i], line0, line1, line2, df_senten.iloc[i+1:]]).reset_index(drop=True)

In [161]:
(df_senten1.tail(10))

Unnamed: 0,Token,Tag,Sentence #
289,§,I_Ref. Legislativa,Sentence 1589
290,4,I_Ref. Legislativa,Sentence 1589
291,º,I_Ref. Legislativa,Sentence 1589
292,",",I_Ref. Legislativa,Sentence 1589
293,,I_Ref. Legislativa,Sentence 1589
294,do,I_Ref. Legislativa,Sentence 1589
295,,I_Ref. Legislativa,Sentence 1589
296,CPC,I_Ref. Legislativa,Sentence 1589
297,.,I_Ref. Legislativa,Sentence 1589
298,,O,Sentence 1589


In [162]:
df_senten.tail(10)

Unnamed: 0,Token,Tag,Sentence #
287,",",I_Ref. Legislativa,Sentence 1589
288,,I_Ref. Legislativa,Sentence 1589
289,§4º,I_Ref. Legislativa,Sentence 1589
290,",",I_Ref. Legislativa,Sentence 1589
291,,I_Ref. Legislativa,Sentence 1589
292,do,I_Ref. Legislativa,Sentence 1589
293,,I_Ref. Legislativa,Sentence 1589
294,CPC,I_Ref. Legislativa,Sentence 1589
295,.,I_Ref. Legislativa,Sentence 1589
296,,O,Sentence 1589


## Para o conjunto de teste (com todas as sentenças)

In [39]:
for i in range(len(df_teste)):
    if any(carac in df_teste.Token.iloc[i] for carac in espec_carac) == True:
        line = df_teste.iloc[i]
        splt = list(line.Token)
        
        if line.Tag.startswith('I_'):
            if len(line.Token) == 2:
                #Para tamanho 2 temos que transformar uma linha em duas
                line0 = line.copy()
                line0.Token = splt[0]
                line1 = line.copy()
                line1.Token = splt[1]
                # Transforma em DataFrame
                line0 = pd.DataFrame(line0).transpose()
                line1 = pd.DataFrame(line1).transpose()

                df_senten1 = pd.concat([df_teste.iloc[:i], line0, line1, df_teste.iloc[i+1:]]).reset_index(drop=True)
            
            if len(line.Token) == 3:
                #Para tamanho 3 temos que transformar uma linha em três
                line0 = line.copy()
                line0.Token = splt[0]
                line1 = line.copy()
                line1.Token = splt[1]
                line2 = line.copy()
                line2.Token = splt[2]

                # Transforma em DataFrame
                line0 = pd.DataFrame(line0).transpose()
                line1 = pd.DataFrame(line1).transpose()
                line2 = pd.DataFrame(line2).transpose()

                df_senten1 = pd.concat([df_teste.iloc[:i], line0, line1, line2, df_teste.iloc[i+1:]]).reset_index(drop=True)

        if line.Tag.startswith('B_'):
            if len(line.Token) == 2:
                #Para tamanho 2 temos que transformar uma linha em duas
                line0 = line.copy()
                line0.Token = splt[0]
                line1 = line.copy()
                line1.Token = splt[1]
                line1.Tag = line.Tag.replace('B_','I_')

                # Transforma em DataFrame
                line0 = pd.DataFrame(line0).transpose()
                line1 = pd.DataFrame(line1).transpose()

                df_senten1 = pd.concat([df_teste.iloc[:i], line0, line1, df_teste.iloc[i+1:]]).reset_index(drop=True)
            if len(line.Token) == 3:
                #Para tamanho 3 temos que transformar uma linha em três
                line0 = line.copy()
                line0.Token = splt[0]
                line1 = line.copy()
                line1.Token = splt[1]
                line1.Tag = line.Tag.replace('B_','I_')
                line2 = line.copy()
                line2.Token = splt[2]
                line2.Tag = line.Tag.replace('B_','I_')

                # Transforma em DataFrame
                line0 = pd.DataFrame(line0).transpose()
                line1 = pd.DataFrame(line1).transpose()
                line2 = pd.DataFrame(line2).transpose()

                df_senten1 = pd.concat([df_teste.iloc[:i], line0, line1, line2, df_teste.iloc[i+1:]]).reset_index(drop=True)

In [40]:
df_senten1[(df_senten1.Token.str.startswith('§')) & (df_senten1.Token.str.len()>1)]

Unnamed: 0,Token,Tag,Sentence #
15818,§2º,I_Ref. Legislativa,Sentence 310
131173,§4º,I_Ref. Legislativa,Sentence 1589
149864,§§,I_Ref. Legislativa,Sentence 1789
150253,§§,B_Ref. Legislativa,Sentence 1790
178152,§4º,I_Ref. Legislativa,Sentence 2196
178238,§3º,I_Ref. Legislativa,Sentence 2197
179601,§4º,I_Ref. Legislativa,Sentence 2216
183944,§1º,B_Ref. Legislativa,Sentence 2272
184458,§3º,I_Ref. Legislativa,Sentence 2276
186230,§3º,I_Ref. Legislativa,Sentence 2287


In [32]:
df_senten1

Unnamed: 0,Token,Tag,Sentence #
0,~~e»to,INICIO_ARQ,Sentence 1
1,,O,Sentence 1
2,c;JJ;~,O,Sentence 1
3,.,O,Sentence 1
4,~wa,O,Sentence 1
...,...,...,...
697833,\n,I_Pessoa,Sentence 7730
697834,id,O,Sentence 7730
697835,:,O,Sentence 7730
697836,,O,Sentence 7730


# Olhando para a predição de cada X

In [None]:
X_test[0][0]['word.lower()'], y_pred[0][0], y_test[0][0]

In [None]:
# Cria dataframe com o que o modelo predisse para análise.

result = []
for i in range(len(y_test)):
    df = pd.DataFrame(zip([X_test[i][j]['word.lower()'] for j in range(len(X_test[i]))], y_test[i], y_pred[i]))
#     df = pd.DataFrame(zip(X_test[i], y_test[i], y_pred[i]))
    result.append(df)
    
result = pd.concat(result)
result.reset_index(inplace=True, drop=True)

In [None]:
# combined_csv = pd.concat(frames)
result.to_csv("resultado.csv", index=False, encoding='utf-8')

In [None]:
result.shape

In [None]:
result.columns = ['X_test','y_test', 'y_pred']
result.head()

In [None]:
result[result['y_test'] != result['y_pred']]

In [None]:
print('Proporção de erros:',result[result.y_test != result.y_pred].shape[0] / result.shape[0])

----------------------------------------------------------------------------------------

In [None]:
df_res = pd.read_csv('mock/resultado.csv')
df_combined = pd.read_csv('mock/combined_csv.csv')

In [None]:
df_res.head()

In [None]:
df_res.shape

In [None]:
df_erros = df_res[df_res['y_test'] != df_res['y_pred']]
df_erros.head()

In [None]:
df_erros[(df_erros['X_test']==' ') & (df_erros['y_test'].str.startswith('B_'))]
indices = df_erros[(df_erros['X_test']==' ') & (df_erros['y_test'].str.startswith('B_'))].index.values

In [None]:
n = 10
pos = indices[1]

print('Retorna %d linhas antes e depois do ocorrido na linha %d para tag que começa com B_'%(n,pos)),df_res.iloc[pos-n:pos+n]

In [None]:
df_res

In [None]:
for i in indices:
    df.iloc[pos-n:pos+n]

# Remove pontuação dos números

In [33]:
5=4

SyntaxError: can't assign to literal (<ipython-input-33-d9863e100aa5>, line 1)

In [None]:
# Criando um DataFrame onde todas os tokens possuem pontuação 
df_pont = combined_csv[(combined_csv.Token.str.contains("""[.]""")) & (combined_csv.Token.str.len()>1)]

In [None]:
df_pont.shape

In [None]:
# Tokens de tamanho maior que 1.
df_pont.head(30)

In [None]:
# combined_csv.iloc[2722-3:2722+5]
df_pont[(df_pont.Tag.str.endswith('Precedente'))]

In [None]:
df_pont.Tag.value_counts()

In [None]:
# combined_csv.applymap(np.isreal)
# ord(combined_csv.Token)#<128
# combined_csv.Token[:10]
ord(combined_csv.Token.iloc[0])

In [None]:
import string
alphabet = string.ascii_letters#+string.punctuation
alphabet 

In [None]:
regexPattern = re.compile(alphabet)

In [None]:
combined_csv.Token = combined_csv.Token.astype('str')

In [None]:
combined_csv.iloc[[3,5]]

In [None]:
regexPattern.findall(combined_csv.Token.iloc[i])

In [None]:
# combined_csv.Token[combined_csv.Token.str.contains(alphabet) == False]

# regexPattern.findall(combined_csv.Token)

l = []
for i in range(len(combined_csv.Token)):
#     l.append(regexPattern.findall(combined_csv.Token.iloc[i]).index)
    indx = regexPattern.findall(combined_csv.Token.iloc[i]).index.values
    l.append(indx)
l

In [None]:
alphabet = string.ascii_letters
mainStr = combined_csv.Token.iloc[0]

# Create a regex pattern to match character 's'
regexPattern = re.compile(alphabet)
 
# Iterate over all the matches of regex pattern
iteratorOfMatchObs = regexPattern.finditer(mainStr)
indexPositions = []
count = 0
for matchObj in iteratorOfMatchObs:
    indexPositions.append(matchObj.start())
    count = count + 1
 
# print("Occurrence Count of character 's' : ", count)
print("Index Positions of 's' are : ", indexPositions)


In [None]:
l = []
for i in range(len(combined_csv)):
    iteratorOfMatchObs = regexPattern.finditer(combined_csv.Token.iloc[i])
    indexPositions = []
    count = 0
    for matchObj in iteratorOfMatchObs:
        indexPositions.append(matchObj.start())
        count = count + 1
    l.append(indexPositions)

In [None]:
l.count([])

In [None]:
regexPattern.findall(combined_csv.Token.iloc[0]).index

In [None]:
# combined_csv[combined_csv.Token.str.contains("""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~123456789]""")]

# combined_csv[(combined_csv.Token.str.contains("""[./]"""))& (combined_csv.Token.str.len() > 1)]

In [None]:
# No meio do preprocessamento foi identificado tokens que terminam com 'adv' e estão colados ao nome do
# advogado referente ao caso.
# Tokens que terminan com 'adv'.

adv = combined_csv[combined_csv.Token.str.endswith('ADV')]#.reset_index(drop=True)
adv.head()#, adv.shape

In [None]:
# Desconfiança que o nome do advogado está agregado à palavra 'adv'

adv[adv.Token.str.len() >3]
adv_pos = adv[adv.Token.str.len() >3].index.values

k = 2 # Vê as linhas antes e depois do k-ésimo ocorrido .
combined_csv.iloc[adv_pos[k]-3:adv_pos[k]+3] # Olhando para as linhas anteriores e posteriores o acontecimento.
# combined_csv.iloc[adv_pos[0]].Token[-3:] #Separando a parte 'adv'

In [None]:
print("Quantidade de vezes que esse caso ocorre em todos os arquivos:",len(adv_pos))