Builds the LegalQA dataset by extracting question–answer pairs from the text of PDF files downloaded from: https://www.gov.br/receitafederal/pt-br/centrais-de-conteudo/publicacoes/perguntas-e-respostas

In [28]:
from PyPDF2 import PdfReader
from pathlib import Path
import pandas as pd
import re
from re import search

In [29]:
# Lift every pandas display limit (rows, columns, total width, cell width) so
# that extracted questions and answers are shown in full when inspected.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)

In [30]:
# Start from an empty dataframe; the QA records extracted from each pdf file
# are concatenated onto it by the cells below.
df = pd.DataFrame(data=None)

In [31]:
# Reading ITR 2022 pdf file
#
# Parsing rules applied to the extracted text, line by line:
#   * a line starting with a three-digit number followed by " —" opens a new
#     question (and stores the previous question/answer pair);
#   * a 'Retorno ao sumário' line and the line right after it are dropped,
#     unless that next line opens a new question;
#   * any other line containing '?' is appended to the question and closes it;
#   * remaining lines extend the question while it is still open, otherwise
#     they extend the answer.

filename = './rawdata/ITR2022.pdf'
skipto = 12  # zero-based index of the first page that is parsed
qa = []

# read file
reader = PdfReader(filename)
# get total number of pages
num_pages = len(reader.pages)
print('File %s has %d pages' % (filename, num_pages))

question = ''
answer = ''
stillquestion = False  # True while the current question has not reached its '?'

# question headings: "000 —" ... "226 —" at the start of a line
qnum = "|".join("^%03d —" % i for i in range(227))

for idx in range(skipto, num_pages):
    lines = reader.pages[idx].extract_text().split('\n')
    lineidx = 0
    while lineidx < len(lines):
        line = lines[lineidx]
        if search(qnum, line):
            if question:
                qa.append({'question': question, 'answer': answer})
            # maxsplit=1: keep any further '—' that belongs to the question text
            question = line.split('—', 1)[1]
            answer = ''
            # always recompute the flag, so an unterminated previous question
            # cannot make this question swallow its own answer
            stillquestion = '?' not in line
        elif 'Retorno ao sumário' in line:
            lineidx += 1
            if lineidx >= len(lines):
                break
            if search(qnum, lines[lineidx]):
                # the following line is a question heading: process it normally
                continue
            # otherwise the increment below skips the following line as well
        elif '?' in line:
            question = question + ' ' + line
            stillquestion = False
        elif stillquestion:
            question = question + ' ' + line
        else:
            answer = answer + ' ' + line
        lineidx += 1

# store the last pair, but never an empty record when nothing was found
if question:
    qa.append({'question': question, 'answer': answer})

print('we have a total of %d questions and answers for the ITR 2022 pdf file' % len(qa))
df_temp = pd.DataFrame(qa)

# ignore_index avoids repeated index labels in the accumulated dataframe
df = pd.concat([df, df_temp], ignore_index=True)

File ./rawdata/ITR2022.pdf has 81 pages
we have a total of 226 questions and answers for the ITR 2022 pdf file


In [32]:
# (rows, columns) of the accumulated dataframe after the ITR 2022 cell
df.shape

(226, 2)

In [33]:
# Reading DIRF 2023 pdf file
#
# Questions are numbered "<section>.<n>". A section title line switches the
# pattern used to recognise the question headings of that section; the title
# line itself is not added to any question or answer.

filename = './rawdata/Dirf2023.pdf'
skipto = 7  # zero-based index of the first page that is parsed
qa = []

# section title (compared verbatim against each extracted line) -> number of questions
section_sizes = {'1 INFORMAÇÕES GERAIS': 3,
                 '2 PRAZO DE ENTREGA': 3,
                 '3 DECLARANTES': 4,
                 '4 RENDIMENTOS': 3,
                 '5 RENDIMENTOS ISENTOS': 5,
                 '6 REMESSA PARA O EXTERIOR': 3,
                 '7 PREVIDÊNCIA': 4,
                 '8 PLANO PRIVADO DE ASSISTÊNCIA À SAÚDE – COLETIVO EMPRESARIAL': 4,
                 '9 PREENCHIMENTO': 8,
                 '10 RENDIMENTOS RECEBIDOS ACUMULADAMENTE': 6,
                 '11 RENDIMENTOS PAGOS ÀS ENTIDADES IMUNES/ISENTAS – IN RFB 1.234/2012': 2,
                 '12 COMPROVANTE DE RENDIMENTOS': 7,
                 '13 PROGRAMA GERADOR DA DECLARAÇÃO': 5,
                 '14 MULTA POR ATRASO NA ENTREGA': 3,
                 '15 RETIFICAÇÃO DE DECLARAÇÃO': 6,
                 '16 LEIAUTE': 3,
                 '17 IMPORTAÇÃO': 3,
                 '18 TRANSMISSÃO': 5}

# section title -> regex for that section's question headings, for example
# "^(1\.1|1\.2|1\.3) ". The dot is escaped so that only a literal '.' between
# section and question number is accepted.
titles_to_skip = {
    title: "^(%s) " % "|".join(
        re.escape("%s.%d" % (title.split(' ', 1)[0], n)) for n in range(1, count + 1))
    for title, count in section_sizes.items()
}

# read file
reader = PdfReader(filename)
# get total number of pages
num_pages = len(reader.pages)
print('File %s has %d pages' % (filename, num_pages))

question = ''
answer = ''
stillquestion = False  # True while the current question has not reached its '?'
# No section has been seen yet. Initialised here so the cell does not depend on
# a `qnum` left behind by a previously executed cell.
qnum = None

for idx in range(skipto, num_pages):
    lines = reader.pages[idx].extract_text().split('\n')
    for line in lines:
        if line in titles_to_skip:
            qnum = titles_to_skip[line]
            continue

        heading = re.match(qnum, line) if qnum else None
        if heading:
            if question:
                qa.append({'question': question, 'answer': answer})
            # question text is whatever follows the "<section>.<n> " prefix
            question = line[heading.end():].strip()
            answer = ''
            # always recompute the flag, so an unterminated previous question
            # cannot make this question swallow its own answer
            stillquestion = '?' not in line
        elif '?' in line:
            question = question + ' ' + line
            stillquestion = False
        elif stillquestion:
            question = question + ' ' + line
        else:
            answer = answer + ' ' + line

# store the last pair, but never an empty record when nothing was found
if question:
    qa.append({'question': question, 'answer': answer})

print('we have a total of %d questions and answers for the DIRF 2023 pdf file' % len(qa))
df_temp = pd.DataFrame(qa)

# ignore_index avoids repeated index labels in the accumulated dataframe
df = pd.concat([df, df_temp], ignore_index=True)

File ./rawdata/Dirf2023.pdf has 45 pages
we have a total of 77 questions and answers for the DIRF 2023 pdf file


In [34]:
# (rows, columns) of the accumulated dataframe after the DIRF 2023 cell
df.shape

(303, 2)

In [35]:
# Reading DCTFWeb 2022 pdf file
#
# Questions are numbered "<section>.<n>". A section title line switches the
# pattern used to recognise the question headings of that section; the title
# line itself is not added to any question or answer.

filename = './rawdata/dctfweb.pdf'
skipto = 2  # zero-based index of the first page that is parsed
qa = []

# section title (compared verbatim against each extracted line) -> regex for
# the question headings of that section.
# NOTE(review): several alternatives carry four leading spaces and the titles
# contain irregular spacing; presumably they mirror the text exactly as
# extracted from the pdf - confirm before normalising them.
titles_to_skip = {'1 – PAGAMENTO: DARF, SALDO A PAGAR, ABATIMENTO, AJUSTE NO SISTAD  ': "^(1.1|1.2|1.3|1.4|1.5|1.6|1.7|1.8|1.9|1.10|1.11|1.12|1.13|    1.14|1.15|1.16|1.17|    1.18|1.19|1.20|1.21|    1.22|1.23|1.24|1.25) ",
                  '2 – INTEGRAÇÃO DA DCTFWEB COM ESOCIAL E EFD -REINF, SUSPENSÃO  ': "^(2.1|2.2|    2.3|2.4|2.5|2.6|2.7|2.8|2.9|2.10) ",
                  '3 – COMPENSAÇÃO E RESTITUIÇÃO  ': "^(3.1|3.2|3.3|3.4|3.5|3.6|3.7|3.8|3.9|3.10|    3.11|3.12|    3.13|3.14) ",
                  '4 – IRRF E FGTS  ': "^(4.1|4.2|4.3) ",
                  '    5 – OUTROS ASSUN TOS RELACIONADOS À DCTFWEB  ': "^(5.1|5.2|5.3|5.4|5.5|5.6|    5.7|5.8|5.9) "}
# The '.' in the patterns above is meant literally; escape it so that it cannot
# match an arbitrary character.
titles_to_skip = {title: pattern.replace('.', r'\.')
                  for title, pattern in titles_to_skip.items()}

# read file
reader = PdfReader(filename)
# get total number of pages
num_pages = len(reader.pages)
print('File %s has %d pages' % (filename, num_pages))

question = ''
answer = ''
stillquestion = False  # True while the current question has not reached its '?'
# No section has been seen yet. Initialised here so the cell does not depend on
# a `qnum` left behind by a previously executed cell.
qnum = None

for idx in range(skipto, num_pages):
    lines = reader.pages[idx].extract_text().split('\n')

    if idx == 2:
        # drop the first 35 extracted lines of page index 2
        lines = lines[35:]

    lineidx = 0
    while lineidx < len(lines):
        line = lines[lineidx]
        if 'Perguntas e Respostas da DCTFWeb – março /2022' in line:
            # skip this header line together with the line that follows it
            lineidx += 2
            continue

        if line in titles_to_skip:
            qnum = titles_to_skip[line]
        else:
            heading = re.match(qnum, line) if qnum else None
            if heading:
                if question:
                    qa.append({'question': question, 'answer': answer})
                # question text is whatever follows the numbered prefix
                question = line[heading.end():].strip()
                answer = ''
                # always recompute the flag, so an unterminated previous
                # question cannot make this question swallow its own answer
                stillquestion = '?' not in line
            elif '?' in line:
                question = question + ' ' + line
                stillquestion = False
            elif 'previdenciários pelo sistema SisFIES.' in line:
                # this line closes a question even though it has no '?'
                question = question + ' ' + line
                stillquestion = False
            elif stillquestion:
                question = question + ' ' + line
            else:
                answer = answer + ' ' + line
        lineidx += 1

# store the last pair, but never an empty record when nothing was found
if question:
    qa.append({'question': question, 'answer': answer})

print('we have a total of %d questions and answers for the DCTFWeb 2022 pdf file' % len(qa))
df_temp = pd.DataFrame(qa)

# ignore_index avoids repeated index labels in the accumulated dataframe
df = pd.concat([df, df_temp], ignore_index=True)

File ./rawdata/dctfweb.pdf has 19 pages
we have a total of 61 questions and answers for the DCTFWeb 2022 pdf file


In [36]:
# (rows, columns) of the accumulated dataframe after the DCTFWeb 2022 cell
df.shape

(364, 2)

In [37]:
# Reading DEREX pdf file
# QUESTÕES RELACIONADAS À DECLARAÇÃO SOBRE A UTILIZAÇÃO DE RECURSOS EM MOEDA ESTRANGEIRA DECORRENTES DO RECEBIMENTO DE EXPORTAÇÕES (DEREX)
#
# Parsing rules: a line starting with "<number>. " followed by an uppercase
# letter opens a question; a line starting with "R.:" opens its answer; every
# other line extends whichever of the two is currently open.

filename = './rawdata/derex.pdf'
skipto = 0  # zero-based index of the first page that is parsed
qa = []

# read file
reader = PdfReader(filename)
# get total number of pages
num_pages = len(reader.pages)
print('File %s has %d pages' % (filename, num_pages))

question = ''
answer = ''
stillquestion = True  # True while lines belong to the question, False for the answer

# Question heading: one of the numbers 0-17, a literal '.', a space, and then
# (checked with a lookahead, not consumed) an uppercase letter.
numbers = "|".join(str(i) for i in range(18))
qnum = r"^(?:%s)\. (?=[A-Z])" % numbers

for idx in range(skipto, num_pages):
    pagetext = reader.pages[idx].extract_text()

    # drop the first three characters of every page
    # NOTE(review): presumably a page-number prefix - confirm against the pdf
    lines = pagetext[3:].split('\n')
    if idx == 0:
        # drop the first 13 extracted lines of the first page
        lines = lines[13:]

    for line in lines:
        heading = re.match(qnum, line)
        if heading:
            if question:
                qa.append({'question': question, 'answer': answer})
            # question text is whatever follows the "<number>. " prefix
            question = line[heading.end():].strip()
            answer = ''
            stillquestion = True
        elif re.match(r'R\.:', line):
            # maxsplit=1: keep any later 'R.:' that is part of the answer text
            answer = answer + ' ' + line.split('R.:', 1)[1]
            stillquestion = False
        elif stillquestion:
            question = question + ' ' + line
        else:
            answer = answer + ' ' + line

# store the last pair, but never an empty record when nothing was found
if question:
    qa.append({'question': question, 'answer': answer})

print('we have a total of %d questions and answers for the DEREX pdf file' % len(qa))
df_temp = pd.DataFrame(qa)

# ignore_index avoids repeated index labels in the accumulated dataframe
df = pd.concat([df, df_temp], ignore_index=True)


File ./rawdata/derex.pdf has 8 pages
we have a total of 17 questions and answers for the DEREX pdf file


In [38]:
# (rows, columns) of the accumulated dataframe after the DEREX cell
df.shape

(381, 2)

In [39]:
# Reading salex pdf file
# QUESTÕES RELACIONADAS À COMPROVAÇÃO DO INGRESSO DOS RECURSOS PROVENIENTES DAS EXPORTAÇÕES (Lei nº 11.371/2006)
#
# Parsing rules: a line starting with "<number>. " followed by an uppercase
# letter opens a question; a line starting with "R.:" opens its answer; every
# other line extends whichever of the two is currently open.

filename = './rawdata/salex.pdf'
skipto = 0  # zero-based index of the first page that is parsed
qa = []

# read file
reader = PdfReader(filename)
# get total number of pages
num_pages = len(reader.pages)
print('File %s has %d pages' % (filename, num_pages))

question = ''
answer = ''
stillquestion = True  # True while lines belong to the question, False for the answer

# Question heading: one of the numbers 0-14, a literal '.', a space, and then
# (checked with a lookahead, not consumed) an uppercase letter.
# NOTE(review): the recorded run reports 15 records while the numbers only go
# up to 14 - confirm whether a question numbered 15 exists and must be covered.
numbers = "|".join(str(i) for i in range(15))
qnum = r"^(?:%s)\. (?=[A-Z])" % numbers

for idx in range(skipto, num_pages):
    lines = reader.pages[idx].extract_text().split('\n')
    if idx == 0:
        # drop the first 10 extracted lines of the first page
        lines = lines[10:]
    else:
        # drop the first extracted line of every other page
        lines = lines[1:]

    for line in lines:
        heading = re.match(qnum, line)
        if heading:
            if question:
                qa.append({'question': question, 'answer': answer})
            # question text is whatever follows the "<number>. " prefix
            question = line[heading.end():].strip()
            answer = ''
            stillquestion = True
        elif re.match(r'R\.:', line):
            # maxsplit=1: keep any later 'R.:' that is part of the answer text
            answer = answer + ' ' + line.split('R.:', 1)[1]
            stillquestion = False
        elif stillquestion:
            question = question + ' ' + line
        else:
            answer = answer + ' ' + line

# store the last pair, but never an empty record when nothing was found
if question:
    qa.append({'question': question, 'answer': answer})

print('we have a total of %d questions and answers for the salex pdf file' % len(qa))
df_temp = pd.DataFrame(qa)

# ignore_index avoids repeated index labels in the accumulated dataframe
df = pd.concat([df, df_temp], ignore_index=True)

File ./rawdata/salex.pdf has 4 pages
we have a total of 15 questions and answers for the salex pdf file


In [40]:
# (rows, columns) of the accumulated dataframe after the salex cell
df.shape

(396, 2)

In [41]:
# Persist the accumulated question/answer pairs. The file keeps a .csv name but
# its fields are separated by tabs, and the dataframe index is left out.
df.to_csv(
    'LegalQA_dataset.csv',
    sep='\t',
    index=False,
)