Install _googletrans_. The pinned _alpha_ version (3.1.0a0) corrects some communication issues with the Google Translate service.

In [1]:
# Optional one-time setup: uncomment the line below to install the pinned googletrans release.
#! pip install googletrans==3.1.0a0

Install _pdfplumber_, required for parsing the original .pdf file.

In [2]:
# Optional one-time setup: uncomment the line below to install pdfplumber.
#! pip install pdfplumber

Import required libraries

In [3]:
import numpy as np
from os import getcwd, chdir, mkdir, listdir
from os.path import join
import googletrans
import re
import requests
import pdfplumber

Download the regulation from www.ANAC.gov.br

In [4]:
# URL of the regulation PDF to download.
url = 'https://www.anac.gov.br/assuntos/legislacao/legislacao-1/rbha-e-rbac/rbac/rbac-121/@@display-file/arquivo_norma/RBAC121EMD12.pdf'

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being saved as the PDF.
r = requests.get(url, timeout=60)
r.raise_for_status()

# Create the output folder in the current directory on the first run.
if 'RBAC' not in listdir():
    mkdir('RBAC')

working_directory = getcwd()

# Write through an explicit path rather than chdir()-ing into the folder and
# back: if the write failed between the two chdir() calls, the kernel would be
# left in 'RBAC' and every later relative path in the notebook would break.
with open(join(working_directory, 'RBAC', 'RBAC121.pdf'), 'wb') as file:
    file.write(r.content)

Parse the .pdf file and save to .txt

In [5]:
filepath = join('RBAC', 'RBAC121.pdf')

with pdfplumber.open(filepath) as pdf:
    # Only page indices 9..242 are extracted (presumably to skip front matter
    # and trailing pages -- TODO confirm). To parse the whole document, use
    # range(len(pdf.pages)) instead.
    # extract_text() may yield None for a page without text, so fall back to
    # an empty string instead of failing on str + None.
    text = ''.join(
        pdf.pages[page].extract_text() or ''
        for page in range(9, 243)
    )

# NOTE(review): the text is written with the platform default encoding; the
# cell that reads RBAC121.txt back must use the same encoding.
filepath = join('RBAC', 'RBAC121.txt')
with open(filepath, 'w') as file:
    file.write(text)

Create list of individual sections

In [6]:
filepath = join('RBAC', 'RBAC121.txt')

# Raw strings keep the regex escapes (\w, \., \d) intact; in a normal string
# literal they are invalid escape sequences and trigger a warning.
# Lines starting with any of these markers are dropped (presumably page
# header/footer boilerplate from the PDF -- TODO confirm).
skip_line = re.compile(r' Data da emissão|Data de vigência|Origem: SPO')
starts_with_word = re.compile(r'\w')
# A section starts with "121.<1-4 digits>" followed by two spaces.
section_heading = re.compile(r'^121\.\d{1,4}  ')

# One string per section: the heading line plus every kept line that follows it.
RBAC121 = []

with open(filepath) as file:
    for line in file:
        # Skip boilerplate and any line that does not begin with a word character
        # (this includes blank lines).
        if skip_line.match(line) or not starts_with_word.match(line):
            continue
        if section_heading.match(line):
            RBAC121.append(line)
        elif RBAC121:
            # Continuation of the current section; text before the first
            # heading is discarded.
            RBAC121[-1] += line

Create lists containing each section's translation and translation status

In [7]:
# Per-section bookkeeping, one entry per element of RBAC121:
#   RBAC121_lang       -- 'pt' until the section has been translated
#   RBAC121_en         -- translated text (empty string until translated)
#   RBAC121_extra_data -- extra data returned with the translation (empty string until translated)
RBAC121_lang = ['pt'] * len(RBAC121)
RBAC121_en = [''] * len(RBAC121)
RBAC121_extra_data = [''] * len(RBAC121)

Translate each section using the Google Translate service. Re-run this cell every few hours until all sentences are translated:

In [13]:
translator = googletrans.Translator()

try:
    for section, source_text in enumerate(RBAC121):
        # Sections already marked 'en' were translated on a previous run of
        # this cell; only the remaining ones are sent to the service.
        if RBAC121_lang[section] != 'pt':
            continue
        translation = translator.translate(source_text, dest='en', src='pt')
        RBAC121_en[section] = translation.text
        RBAC121_extra_data[section] = translation.extra_data
        # Mark the section as done only when the response reports Portuguese
        # as the original language; otherwise it stays 'pt' and is retried on
        # the next run.
        if translation.extra_data['original-language'] == 'pt':
            RBAC121_lang[section] = 'en'
finally:
    # Always report progress, even when the service raises part-way through,
    # so it is clear how much is left before the cell is re-run. The exception
    # itself still propagates.
    print('Translated sentences: {} out of {}'.format(RBAC121_lang.count('en'), len(RBAC121_lang)))

Translated sentences: 360 out of 360


Save the full translation to a .txt file.

In [17]:
filepath = join('RBAC', 'RBAC121_en.txt')

# Write every translated section followed by an <EOS> marker line, so that
# section boundaries can be recovered from the flat text file.
with open(filepath, 'w', encoding='utf-8') as file:
    file.writelines(translated + '\n<EOS>\n' for translated in RBAC121_en)