In [1]:
import re
import json
import os

In [25]:
import pandas as pd

In [2]:
%load_ext autoreload
%autoreload 2

In [14]:
from legal_document_splitter import *

In [4]:
# Directory (relative to the working directory) that holds the input JSON read
# below and receives the pickle written at the end of the notebook.
DATA_PATH="data/"

In [5]:
# Load the referenced legal documents: a JSON object whose keys are source file
# names (e.g. "Lei nº 9.250.txt") and whose values are passed to the text
# splitters further down.
json_normas_path = DATA_PATH + 'normas_full.json'

# Decode explicitly as UTF-8. The keys contain accented characters, and without
# an explicit encoding open() uses the platform locale, which can mis-decode or
# raise on non-UTF-8 systems.
with open(json_normas_path, 'r', encoding='utf-8') as normas_file:
    normas_json = json.load(normas_file)

# Document values only, in the same order as the keys of normas_json.
normas = list(normas_json.values())

In [6]:
# Show the keys of the loaded JSON, i.e. the source file name of each document.
normas_json.keys()

dict_keys(['Ato Declaratório Cosar nº 47.txt', 'Ato Declaratório Interpretativo RFB nº 1.txt', 'Ato Declaratório Interpretativo SRF nº 2.txt', 'Ato Declaratório Interpretativo SRF nº 24.txt', 'Ato Declaratório Interpretativo SRF nº 5.txt', 'Ato Declaratório PGFN nº 1.txt', 'Ato Declaratório PGFN nº 13.txt', 'Ato Declaratório PGFN nº 14.txt', 'Ato Declaratório PGFN nº 2.txt', 'Ato Declaratório PGFN nº 3.txt', 'Ato Declaratório PGFN nº 4.txt', 'Ato Declaratório PGFN nº 5.txt', 'Ato Declaratório PGFN nº 6 DE 2006.txt', 'Ato Declaratório PGFN nº 6 DE 2008.txt', 'Ato Declaratório PGFN nº 9.txt', 'Ato Declaratório RFB nº 16.txt', 'Ato Declaratório RFB nº 18.txt', 'Ato Declaratório RFB nº 3.txt', 'Ato Declaratório SRF nº 14.txt', 'Ato Declaratório SRF nº 16.txt', 'Ato Declaratório SRF nº 22.txt', 'Ato Declaratório SRF nº 26.txt', 'Ato Declaratório SRF nº 28.txt', 'Ato Declaratório SRF nº 3.txt', 'Ato Declaratório SRF nº 48.txt', 'Ato Declaratório SRF nº 7.txt', 'Ato Declaratório SRF nº 8.txt'

In [7]:
# Hierarchy-aware splitter: uses LEGISLATION_SPLITTING_HIERARCHY_BRAZIL as its
# separators, each one interpreted as a regular expression.
hierarchical_text_splitter = HierachicalRecursiveCharacterTextSplitter(
    chunk_size=1000,     # maximum chunk size, in characters
    chunk_overlap=100,   # overlap between chunks, to preserve context
    apply_chunk_size=6,  # separator index from which the chunk-size check is applied
    separators=LEGISLATION_SPLITTING_HIERARCHY_BRAZIL,
    is_separator_regex=True,
)

In [18]:
# Plain recursive character splitter. The processing loop falls back to it when
# the hierarchical split of a document yields a passage above the token limit.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,  # overlap between chunks, to preserve context
    chunk_size=1000,    # maximum chunk size, in characters
)

In [15]:
# Token counter used below for per-passage token statistics
# (`passages_statistics`).
# NOTE(review): the argument looks like a Hugging Face model identifier whose
# tokenizer is used for counting — confirm in legal_document_splitter.
tokens_counter = tokensCounter("stjiris/bert-large-portuguese-cased-legal-tsdae")

### Process (segmentation and flattening) all the referenced documents

In [16]:
from tqdm import tqdm

In [29]:
# Largest number of tokens a hierarchically-split passage may have. If any
# passage of a document exceeds it, the hierarchical split is taken to have
# failed for that document and the plain text splitter is used instead.
# NOTE(review): 512 presumably mirrors the input limit of the model given to
# tokensCounter — confirm.
MAX_PASSAGE_TOKENS = 512

applied_splitter = []        # one {'filename', 'splitter'} record per document
passages_referred_docs = []  # flat list of "<label>: <passage text>" strings

for filename, doc in tqdm(normas_json.items(), desc="Processing all referred legal documents"):
    # Remove the ".txt" extension from the file name.
    doc_name = filename.replace(".txt", "")

    # First try the hierarchical splitter, then flatten its nested result.
    # flatten_passages_hierarchy fills `passages` in place; each entry is read
    # below through its 'path' and 'passage' keys.
    chunks = hierarchical_text_splitter.split_text(doc)
    passages = []
    flatten_passages_hierarchy(chunks, "", passages)

    # Decide whether the hierarchical split is usable:
    #   * no passages at all -> nothing to measure, fall back directly rather
    #     than asking for statistics of an empty list;
    #   * otherwise, fall back when any passage is longer than the token limit.
    if passages:
        total_statistics = tokens_counter.passages_statistics(
            [passage['passage'] for passage in passages]
        )
        use_text_splitter = total_statistics['max_tokens'] > MAX_PASSAGE_TOKENS
    else:
        use_text_splitter = True

    if use_text_splitter:
        # Route the message through tqdm instead of print(), so it is not
        # written over the active progress bar.
        tqdm.write(f">>> Applying regular text splitter in file {filename}.")

        chunks = text_splitter.split_text(doc)
        applied_splitter.append({'filename': filename,
                                 'splitter': 'text'})

        # Label each chunk with the document name only (no hierarchy path).
        passages_referred_docs.extend(
            "{}: {}".format(doc_name, passage) for passage in chunks
        )
    else:
        applied_splitter.append({'filename': filename,
                                 'splitter': 'hierarchical'})

        # Label each passage with the document name plus its hierarchy path.
        passages_referred_docs.extend(
            "{}: {}".format(doc_name + "_" + passage['path'], passage['passage'])
            for passage in passages
        )

  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
Processing all referred legal documents:   5%|██████                                                                                                          | 18/332 [00:00<00:01, 159.66it/s]

>>> Applying regular text splitter in file Ato Declaratório PGFN nº 1.txt.
>>> Applying regular text splitter in file Ato Declaratório PGFN nº 6 DE 2006.txt.
>>> Applying regular text splitter in file Ato Declaratório PGFN nº 9.txt.
>>> Applying regular text splitter in file Ato Declaratório RFB nº 3.txt.
>>> Applying regular text splitter in file Decreto nº 21.177.txt.


Processing all referred legal documents:  13%|██████████████▎                                                                                                  | 42/332 [00:01<00:08, 35.19it/s]

>>> Applying regular text splitter in file Decreto nº 3.000.txt.
>>> Applying regular text splitter in file Decreto nº 5.128.txt.
>>> Applying regular text splitter in file Decreto nº 50.656.txt.
>>> Applying regular text splitter in file Decreto nº 52.288.txt.
>>> Applying regular text splitter in file Decreto nº 56.435.txt.
>>> Applying regular text splitter in file Decreto nº 57.784.txt.
>>> Applying regular text splitter in file Decreto nº 57.942.txt.
>>> Applying regular text splitter in file Decreto nº 59.308.txt.
>>> Applying regular text splitter in file Decreto nº 59.309.txt.


Processing all referred legal documents:  15%|████████████████▋                                                                                                | 49/332 [00:01<00:07, 35.87it/s]

>>> Applying regular text splitter in file Decreto nº 59.566.txt.
>>> Applying regular text splitter in file Decreto nº 61.078.txt.
>>> Applying regular text splitter in file Decreto nº 62.125.txt.
>>> Applying regular text splitter in file Decreto nº 67.542.txt.
>>> Applying regular text splitter in file Decreto nº 70.951.txt.
>>> Applying regular text splitter in file Decreto nº 71.733.txt.
>>> Applying regular text splitter in file Decreto nº 75.102.txt.


Processing all referred legal documents:  17%|██████████████████▋                                                                                              | 55/332 [00:01<00:07, 35.81it/s]

>>> Applying regular text splitter in file Decreto nº 8.289.txt.
>>> Applying regular text splitter in file Decreto nº 8.624.txt.
>>> Applying regular text splitter in file Decreto nº 85.306.txt.
>>> Applying regular text splitter in file Decreto nº 86.084.txt.
>>> Applying regular text splitter in file Decreto nº 86.365.txt.
>>> Applying regular text splitter in file Decreto nº 87.563.txt.
>>> Applying regular text splitter in file Instrução Normativa RFB nº 1.037.txt.


Processing all referred legal documents:  19%|█████████████████████▍                                                                                           | 63/332 [00:01<00:06, 38.88it/s]

>>> Applying regular text splitter in file Instrução Normativa RFB nº 1.131.txt.
>>> Applying regular text splitter in file Instrução Normativa RFB nº 1.500.txt.
>>> Applying regular text splitter in file Instrução Normativa RFB nº 1.548.txt.


Processing all referred legal documents:  20%|███████████████████████▏                                                                                         | 68/332 [00:02<00:10, 24.62it/s]

>>> Applying regular text splitter in file Instrução Normativa RFB nº 1.585.txt.


Processing all referred legal documents:  23%|██████████████████████████▏                                                                                      | 77/332 [00:02<00:10, 25.21it/s]

>>> Applying regular text splitter in file Instrução Normativa RFB nº 1.717.txt.
>>> Applying regular text splitter in file Instrução Normativa RFB nº 107.txt.
>>> Applying regular text splitter in file Instrução Normativa RFB nº 2.055.txt.
>>> Applying regular text splitter in file Instrução Normativa RFB nº 2.060.txt.


Processing all referred legal documents:  24%|███████████████████████████▌                                                                                     | 81/332 [00:02<00:09, 26.81it/s]

>>> Applying regular text splitter in file Instrução Normativa RFB nº 2.172.txt.
>>> Applying regular text splitter in file Instrução Normativa RFB nº 208.txt.


Processing all referred legal documents:  27%|██████████████████████████████▉                                                                                  | 91/332 [00:03<00:09, 25.28it/s]

>>> Applying regular text splitter in file Instrução Normativa RFB nº 256.txt.
>>> Applying regular text splitter in file Instrução Normativa RFB nº 588.txt.
>>> Applying regular text splitter in file Instrução Normativa RFB nº 67.txt.
>>> Applying regular text splitter in file Instrução Normativa RFB nº 81.txt.


Processing all referred legal documents:  31%|███████████████████████████████████                                                                             | 104/332 [00:03<00:06, 34.17it/s]

>>> Applying regular text splitter in file Lei nº 1.510.txt.
>>> Applying regular text splitter in file Lei nº 10.522.txt.
>>> Applying regular text splitter in file Lei nº 10.559.txt.
>>> Applying regular text splitter in file Lei nº 10.741.txt.


Processing all referred legal documents:  33%|████████████████████████████████████▊                                                                           | 109/332 [00:03<00:08, 25.51it/s]

>>> Applying regular text splitter in file Lei nº 10.833.txt.
>>> Applying regular text splitter in file Lei nº 10.865.txt.


Processing all referred legal documents:  34%|██████████████████████████████████████                                                                          | 113/332 [00:03<00:11, 19.77it/s]

>>> Applying regular text splitter in file Lei nº 109.txt.
>>> Applying regular text splitter in file Lei nº 11.033.txt.


Processing all referred legal documents:  35%|███████████████████████████████████████▏                                                                        | 116/332 [00:04<00:13, 16.59it/s]

>>> Applying regular text splitter in file Lei nº 11.196.txt.


Processing all referred legal documents:  37%|█████████████████████████████████████████▊                                                                      | 124/332 [00:04<00:11, 18.70it/s]

>>> Applying regular text splitter in file Lei nº 11.437.txt.
>>> Applying regular text splitter in file Lei nº 11.524.txt.
>>> Applying regular text splitter in file Lei nº 11.727.txt.


Processing all referred legal documents:  38%|██████████████████████████████████████████▊                                                                     | 127/332 [00:04<00:10, 19.23it/s]

>>> Applying regular text splitter in file Lei nº 11.941.txt.
>>> Applying regular text splitter in file Lei nº 11.945.txt.


Processing all referred legal documents:  39%|████████████████████████████████████████████▏                                                                   | 131/332 [00:05<00:11, 17.64it/s]

>>> Applying regular text splitter in file Lei nº 12.249.txt.
>>> Applying regular text splitter in file Lei nº 12.350.txt.
>>> Applying regular text splitter in file Lei nº 12.431.txt.


Processing all referred legal documents:  41%|█████████████████████████████████████████████▉                                                                  | 136/332 [00:05<00:11, 16.92it/s]

>>> Applying regular text splitter in file Lei nº 12.469.txt.
>>> Applying regular text splitter in file Lei nº 12.594.txt.


Processing all referred legal documents:  42%|██████████████████████████████████████████████▌                                                                 | 138/332 [00:05<00:12, 15.52it/s]

>>> Applying regular text splitter in file Lei nº 12.715.txt.
>>> Applying regular text splitter in file Lei nº 12.794.txt.


Processing all referred legal documents:  43%|███████████████████████████████████████████████▉                                                                | 142/332 [00:05<00:12, 15.28it/s]

>>> Applying regular text splitter in file Lei nº 12.973.txt.
>>> Applying regular text splitter in file Lei nº 123.txt.


Processing all referred legal documents:  43%|████████████████████████████████████████████████▌                                                               | 144/332 [00:06<00:17, 10.99it/s]

>>> Applying regular text splitter in file Lei nº 13.043.txt.
>>> Applying regular text splitter in file Lei nº 13.097.txt.


Processing all referred legal documents:  46%|███████████████████████████████████████████████████▌                                                            | 153/332 [00:07<00:14, 12.54it/s]

>>> Applying regular text splitter in file Lei nº 13.149.txt.
>>> Applying regular text splitter in file Lei nº 14.119.txt.
>>> Applying regular text splitter in file Lei nº 14.286.txt.


Processing all referred legal documents:  49%|██████████████████████████████████████████████████████▋                                                         | 162/332 [00:07<00:08, 18.94it/s]

>>> Applying regular text splitter in file Lei nº 167.txt.
>>> Applying regular text splitter in file Lei nº 2.579.txt.
>>> Applying regular text splitter in file Lei nº 4.242.txt.
>>> Applying regular text splitter in file Lei nº 4.504.txt.


Processing all referred legal documents:  50%|████████████████████████████████████████████████████████                                                        | 166/332 [00:07<00:10, 16.13it/s]

>>> Applying regular text splitter in file Lei nº 4.506.txt.
>>> Applying regular text splitter in file Lei nº 4.591.txt.
>>> Applying regular text splitter in file Lei nº 4.862.txt.
>>> Applying regular text splitter in file Lei nº 4.886.txt.
>>> Applying regular text splitter in file Lei nº 5.172.txt.


Processing all referred legal documents:  52%|█████████████████████████████████████████████████████████▋                                                      | 171/332 [00:08<00:16,  9.53it/s]

>>> Applying regular text splitter in file Lei nº 5.452.txt.
>>> Applying regular text splitter in file Lei nº 5.809.txt.
>>> Applying regular text splitter in file Lei nº 5.844.txt.


Processing all referred legal documents:  52%|██████████████████████████████████████████████████████████▎                                                     | 173/332 [00:08<00:17,  9.14it/s]

>>> Applying regular text splitter in file Lei nº 6.015.txt.


Processing all referred legal documents:  54%|████████████████████████████████████████████████████████████▍                                                   | 179/332 [00:09<00:13, 10.95it/s]

>>> Applying regular text splitter in file Lei nº 6.404.txt.
>>> Applying regular text splitter in file Lei nº 7.713.txt.


Processing all referred legal documents:  55%|█████████████████████████████████████████████████████████████▋                                                  | 183/332 [00:09<00:11, 12.48it/s]

>>> Applying regular text splitter in file Lei nº 8.036.txt.
>>> Applying regular text splitter in file Lei nº 8.069.txt.


Processing all referred legal documents:  56%|██████████████████████████████████████████████████████████████▍                                                 | 185/332 [00:09<00:11, 12.83it/s]

>>> Applying regular text splitter in file Lei nº 8.112.txt.
>>> Applying regular text splitter in file Lei nº 8.313.txt.


Processing all referred legal documents:  58%|████████████████████████████████████████████████████████████████▍                                               | 191/332 [00:10<00:08, 16.19it/s]

>>> Applying regular text splitter in file Lei nº 8.383.txt.
>>> Applying regular text splitter in file Lei nº 8.685.txt.
>>> Applying regular text splitter in file Lei nº 8.852.txt.


Processing all referred legal documents:  60%|██████████████████████████████████████████████████████████████████▊                                             | 198/332 [00:10<00:07, 17.77it/s]

>>> Applying regular text splitter in file Lei nº 8.891.txt.
>>> Applying regular text splitter in file Lei nº 8.981.txt.


Processing all referred legal documents:  61%|████████████████████████████████████████████████████████████████████▏                                           | 202/332 [00:10<00:07, 16.87it/s]

>>> Applying regular text splitter in file Lei nº 9.096.txt.
>>> Applying regular text splitter in file Lei nº 9.249.txt.


Processing all referred legal documents:  62%|█████████████████████████████████████████████████████████████████████▍                                          | 206/332 [00:11<00:07, 17.60it/s]

>>> Applying regular text splitter in file Lei nº 9.250.txt.


Processing all referred legal documents:  63%|██████████████████████████████████████████████████████████████████████▏                                         | 208/332 [00:11<00:12, 10.07it/s]

>>> Applying regular text splitter in file Lei nº 9.430.txt.


Processing all referred legal documents:  64%|███████████████████████████████████████████████████████████████████████▏                                        | 211/332 [00:11<00:12,  9.72it/s]

>>> Applying regular text splitter in file Lei nº 9.504.txt.
>>> Applying regular text splitter in file Lei nº 9.532.txt.


Processing all referred legal documents:  65%|████████████████████████████████████████████████████████████████████████▊                                       | 216/332 [00:12<00:09, 11.90it/s]

>>> Applying regular text splitter in file Lei nº 9.615.txt.
>>> Applying regular text splitter in file Lei nº 9.779.txt.
>>> Applying regular text splitter in file Lei nº 9250.txt.


Processing all referred legal documents:  67%|██████████████████████████████████████████████████████████████████████████▉                                     | 222/332 [00:12<00:07, 15.24it/s]

>>> Applying regular text splitter in file Medida Provisória nº 2.158.txt.
>>> Applying regular text splitter in file Medida Provisória nº 2.228.txt.


Processing all referred legal documents:  69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 230/332 [00:12<00:04, 23.63it/s]

>>> Applying regular text splitter in file Medida Provisória nº 252.txt.
>>> Applying regular text splitter in file Nota PGFN nº 1.1042017.txt.
>>> Applying regular text splitter in file Nota PGFN nº 1.5492012.txt.
>>> Applying regular text splitter in file Nota PGFNCRJ nº 1.0402015.txt.
>>> Applying regular text splitter in file Nota PGFNCRJ nº 9812015.txt.
>>> Applying regular text splitter in file Nota SEI nº 482018CRJPGACETPGFN-MF.txt.
>>> Applying regular text splitter in file Parecer PGFN nº 1.8882008.txt.
>>> Applying regular text splitter in file Parecer PGFN nº 21182011.txt.
>>> Applying regular text splitter in file Parecer PGFN nº 26832008.txt.


Processing all referred legal documents:  71%|███████████████████████████████████████████████████████████████████████████████▎                                | 235/332 [00:12<00:03, 28.72it/s]

>>> Applying regular text splitter in file Parecer PGFN nº 7012016.txt.
>>> Applying regular text splitter in file Parecer PGFNCRJ Nº 22712013.txt.
>>> Applying regular text splitter in file Parecer SEI nº 1102018CRJPGACETPGFN-MF.txt.
>>> Applying regular text splitter in file Portaria nº 277.txt.


Processing all referred legal documents:  79%|████████████████████████████████████████████████████████████████████████████████████████▍                       | 262/332 [00:14<00:03, 22.37it/s]

>>> Applying regular text splitter in file RIR2018.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 100.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 105.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 115.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 123.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 134.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 135.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 138.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 14.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 140.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 144.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 147.txt.
>>> Applying regular text splitter in file Solução de 

Processing all referred legal documents:  93%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 309/332 [00:14<00:00, 64.16it/s]

>>> Applying regular text splitter in file Solução de Consulta Cosit nº 206.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 209.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 211.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 214.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 240.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 256.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 313.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 354.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 356.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 48.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 6.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 60.txt.
>>> Applying regular text splitt

Processing all referred legal documents: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 332/332 [00:14<00:00, 22.42it/s]

>>> Applying regular text splitter in file Solução de Consulta Cosit nº 79.txt.
>>> Applying regular text splitter in file Solução de Consulta Cosit nº 82.txt.





In [30]:
# Tabulate the per-document records collected by the loop
# (columns: 'filename', 'splitter').
splitter_df = pd.DataFrame(applied_splitter)

In [31]:
# Number of documents handled by each splitter ('hierarchical' vs 'text').
splitter_df.groupby("splitter").count()

Unnamed: 0_level_0,filename
splitter,Unnamed: 1_level_1
hierarchical,171
text,161


In [32]:
# Total number of passages produced across all documents.
len(passages_referred_docs)

20464

In [28]:
# NOTE(review): repeat of the `In [31]` cell. This cell's execution count (28)
# is lower than that of the processing loop (29), so the output saved with it
# (250 / 82) comes from an earlier run and disagrees with the 171 / 161 shown
# by `In [31]`. Re-run after the loop, or remove this cell.
splitter_df.groupby("splitter").count()

Unnamed: 0_level_0,filename
splitter,Unnamed: 1_level_1
hierarchical,250
text,82


In [23]:
# NOTE(review): repeat of the `In [32]` cell. Execution count 23 predates the
# processing loop (29), so the saved output (24281) is from an earlier run and
# disagrees with the 20464 shown by `In [32]`. Re-run or remove this cell.
len(passages_referred_docs)

24281

#### Check data statistics

In [24]:
# Token statistics over every final passage (document label included).
# NOTE(review): execution count 24 predates the processing loop (29), so the
# saved output reflects an earlier run; re-run after the loop. The saved
# 'max_tokens' (1007) is above the 512 threshold used to pick the splitter,
# which suggests the fallback splitter can still emit over-long passages —
# confirm on a fresh run.
tokens_counter.passages_statistics(passages_referred_docs)

{'max_tokens': 1007,
 'min_tokens': 18,
 'mean_tokens': 165.15407108438697,
 'std_tokens': 105.98080524276165,
 'skewness_tokens': 1.5930594614675477,
 'kurtosis_tokens': 6.1454833961628985,
 'median_tokens': 160,
 'max_token_passage': 3220}

In [21]:
# NOTE(review): repeat of the statistics cell directly above, with an even
# older execution count (21). Its saved output differs from the one above and
# does not describe the current `passages_referred_docs`. Re-run or remove.
tokens_counter.passages_statistics(passages_referred_docs)

{'max_tokens': 966,
 'min_tokens': 19,
 'mean_tokens': 202.92464816262705,
 'std_tokens': 94.56423146322919,
 'skewness_tokens': 0.8993632242460677,
 'kurtosis_tokens': 5.14506011498575,
 'median_tokens': 218.0,
 'max_token_passage': 11027}

In [33]:
import pickle

In [34]:
# Write the flattened passages list to disk as a pickle (binary mode), using
# the highest pickle protocol this interpreter supports.
with open(DATA_PATH + "chunks_normas_new_20250228.pkl", mode="wb") as pkl_out:
    pickle.dump(
        passages_referred_docs,
        pkl_out,
        protocol=pickle.HIGHEST_PROTOCOL,
    )