# Setup

Import libraries

In [1]:
import numpy as np
import tensorflow as tf
from os import chdir, getcwd, listdir, mkdir
from os.path import join
import requests
from xml.etree import ElementTree as ET
import re
import pandas as pd
import time
import pkg_resources

HOME = getcwd()

In [2]:
package_list = [pkg for pkg in pkg_resources.working_set]

if 'googletrans' not in [pkg.key for pkg in package_list]:
    ! pip install googletrans==3.1.0a0
import googletrans
    
if 'pdfplumber' not in [pkg.key for pkg in package_list]:
    ! pip install pdfplumber
import pdfplumber

## FAR Part 121

Download the Title 14 eCFR XML file from the govinfo.gov bulk data repository

In [None]:
URL = 'https://www.govinfo.gov/bulkdata/ECFR/title-14/ECFR-title14.xml'

# Build the target folder from HOME instead of chdir-ing into it, so a failed
# download cannot leave the kernel stranded inside 'Regulations'.
regulations_dir = join(HOME, 'Regulations')
if 'Regulations' not in listdir(HOME):
    mkdir(regulations_dir)

r = requests.get(URL)
# Raise on an HTTP error rather than silently saving an error page as the XML file.
r.raise_for_status()

with open(join(regulations_dir, 'ECFR-title14.xml'), 'wb') as file:
    file.write(r.content)

# Later cells use paths relative to HOME, so make sure it is the working directory.
chdir(HOME)

Identify the XML node corresponding to Part 121 and collect its section nodes

In [None]:
filepath = join('Regulations', 'ECFR-title14.xml')
tree = ET.parse(filepath)
root = tree.getroot()

# Look for the DIV5 element whose N attribute is '121'. `.get` is used so a
# DIV5 without an N attribute is skipped instead of raising KeyError. When
# several elements match, the last one wins.
root121 = None
for element in root.iter('DIV5'):
    if element.get('N') == '121':
        root121 = element

# Fail here with a clear message instead of a NameError further down.
if root121 is None:
    raise ValueError("No DIV5 element with N='121' found in " + filepath)

# Every DIV8 element below the Part 121 node, in document order.
section_nodes = list(root121.iter('DIV8'))

Save file with individual sections:

In [None]:
titles = []
sections = []

# Raw string: in a normal literal '\w' is an invalid escape sequence.
word_re = re.compile(r'\w')

for section in section_nodes:
    titles.append(section.attrib['N'])
    # Keep only the text chunks that contain at least one word character and
    # glue them together in document order.
    sections.append(''.join(chunk for chunk in section.itertext() if word_re.search(chunk)))

# One row per section: the node's N attribute and its concatenated text.
filename = join(HOME, 'Regulations', 'FAR_Part121_sections.xlsx')
pd.DataFrame({'title': titles, 'requirement': sections}).to_excel(filename)

Test the splitting method by printing one section to the screen:

In [None]:
SECTION_NUMBER = 155

# Patterns are raw strings so the regex escapes are not treated as (invalid)
# string escapes, and the dot after "121" is escaped so it matches only a period.
section_re = re.compile(r'^§ 121\.\d*')    # section heading such as "§ 121.1"
paragraph_re = re.compile(r'^\([a-h]\)')   # paragraph marker "(a)" to "(h)"
item_re = re.compile(r'^\(\d*\)')          # numbered item marker such as "(1)"
blank_re = re.compile(r'^\s+$')            # whitespace-only chunk
bracket_re = re.compile(r'\[.*\]')         # chunk that starts with a "[...]" note


def _print_node(title, text, separator=True):
    """Print one requirement node: its identifier, its text and an optional rule."""
    print(title + '\n')
    print(text)
    if separator:
        print('----------')


section, paragraph, item = '', '', ''
section_id, paragraph_id, item_id = '', '', ''
current_level = ''

for line in section_nodes[SECTION_NUMBER].itertext():
    # Section heading: start over with a fresh section, paragraph and item.
    match = section_re.match(line)
    if match:
        current_level = 'section'
        section_id = match.group(0)
        section = line + '\n'
        paragraph, item, paragraph_id, item_id = '', '', '', ''
        continue

    # New paragraph: the node collected so far is complete, so print it first.
    match = paragraph_re.match(line)
    if match:
        if current_level in ('paragraph', 'item'):
            _print_node(section_id + paragraph_id + item_id, section + paragraph + item)
        paragraph_id = match.group(0)
        paragraph = line
        item, item_id = '', ''
        current_level = 'paragraph'
        continue

    # New item: print the previous item; a paragraph intro is not printed alone,
    # it is carried as the prefix of each of its items.
    match = item_re.match(line)
    if match:
        if current_level == 'item':
            _print_node(section_id + paragraph_id + item_id, section + paragraph + item)
        item_id = match.group(0)
        item = line
        current_level = 'item'
        continue

    # Whitespace-only chunks and bracketed notes are ignored.
    if blank_re.match(line) or bracket_re.match(line):
        continue

    # Anything else is continuation text for the level currently being built.
    if current_level == 'section':
        section += line
    elif current_level == 'paragraph':
        paragraph += line
    elif current_level == 'item':
        item += line

# The last node has no successor to trigger its output, so print it here.
_print_node(section_id + paragraph_id + item_id, section + paragraph + item, separator=False)

Create lists of titles and requirements:

In [None]:
requirement_titles = []
requirement_texts = []

# Patterns are raw strings so the regex escapes are not treated as (invalid)
# string escapes, and the dot after "121" is escaped so it matches only a period.
section_re = re.compile(r'^§ 121\.\d*')    # section heading such as "§ 121.1"
paragraph_re = re.compile(r'^\([a-h]\)')   # paragraph marker "(a)" to "(h)"
item_re = re.compile(r'^\(\d*\)')          # numbered item marker such as "(1)"
blank_re = re.compile(r'^\s+$')            # whitespace-only chunk
bracket_re = re.compile(r'\[.*\]')         # chunk that starts with a "[...]" note

for sec in section_nodes:

    # Text and identifier accumulated for each level of the current node.
    section, paragraph, item = '', '', ''
    section_id, paragraph_id, item_id = '', '', ''
    current_level = ''

    for line in sec.itertext():
        # Section heading: start over with a fresh section, paragraph and item.
        match = section_re.match(line)
        if match:
            current_level = 'section'
            section_id = match.group(0)
            section = line + '\n'
            paragraph, item, paragraph_id, item_id = '', '', '', ''
            continue

        # New paragraph: the node collected so far is complete, so store it first.
        match = paragraph_re.match(line)
        if match:
            if current_level in ('paragraph', 'item'):
                requirement_titles.append(section_id + paragraph_id + item_id)
                requirement_texts.append(section + paragraph + item)
            paragraph_id = match.group(0)
            paragraph = line
            item, item_id = '', ''
            current_level = 'paragraph'
            continue

        # New item: store the previous item; a paragraph intro is not stored
        # alone, it is carried as the prefix of each of its items.
        match = item_re.match(line)
        if match:
            if current_level == 'item':
                requirement_titles.append(section_id + paragraph_id + item_id)
                requirement_texts.append(section + paragraph + item)
            item_id = match.group(0)
            item = line
            current_level = 'item'
            continue

        # Whitespace-only chunks and bracketed notes are ignored.
        if blank_re.match(line) or bracket_re.match(line):
            continue

        # Anything else is continuation text for the level currently being built.
        if current_level == 'section':
            section += line
        elif current_level == 'paragraph':
            paragraph += line
        elif current_level == 'item':
            item += line

    # The last node of each section has no successor to trigger its storage.
    requirement_titles.append(section_id + paragraph_id + item_id)
    requirement_texts.append(section + paragraph + item)

Save excel file to disk:

In [None]:
filepath = join(HOME, 'Regulations', 'FAR_Part121_nodes.xlsx')

# Pair every identifier with its text, one row per requirement node.
node_rows = list(zip(requirement_titles, requirement_texts))
df = pd.DataFrame(node_rows, columns=['title', 'requirement'])

# Rows whose identifier is the empty string are dropped before saving.
has_title = df['title'] != ''
df = df.loc[has_title]
df.to_excel(filepath)

## RBAC 121

Download the pdf file from www.ANAC.gov.br

In [3]:
url = 'https://www.anac.gov.br/assuntos/legislacao/legislacao-1/rbha-e-rbac/rbac/rbac-121/@@display-file/arquivo_norma/RBAC121EMD12.pdf'

r = requests.get(url)
# Raise on an HTTP error rather than silently saving an error page as the PDF.
r.raise_for_status()

# Build the target folder from HOME instead of chdir-ing into it, so a failure
# while writing cannot leave the kernel stranded inside 'Regulations'.
regulations_dir = join(HOME, 'Regulations')
if 'Regulations' not in listdir(HOME):
    mkdir(regulations_dir)

with open(join(regulations_dir, 'RBAC121.pdf'), 'wb') as file:
    file.write(r.content)

# Later cells assume HOME is the working directory.
chdir(HOME)

Parse the .pdf file and save to .txt

In [4]:
filepath = join(HOME, 'Regulations', 'RBAC121.pdf')
chdir(HOME)

# Zero-based page window: skip the table of contents and stop before the annexes.
FIRST_PAGE, END_PAGE = 9, 243

with pdfplumber.open(filepath) as pdf:
    # extract_text() can return None for a page without text, which would make
    # string concatenation fail; such pages contribute an empty string instead.
    # Slicing also tolerates a document shorter than END_PAGE pages.
    text = ''.join(page.extract_text() or '' for page in pdf.pages[FIRST_PAGE:END_PAGE])

filepath = join(HOME, 'Regulations', 'RBAC121.txt')
with open(filepath, 'w') as file:
    file.write(text)

### Save version with individual sections

Create list of individual sections

In [5]:
filepath = join(HOME, 'Regulations', 'RBAC121.txt')

requirements = []
titles = []

# Lines starting with one of these literal prefixes are dropped. Lines starting
# with 'SUBPARTE ' or '[RESERVADO]' are deliberately NOT filtered here.
SKIPPED_PREFIXES = (' Data da emissão', 'Data de vigência', 'Origem: SPO')

# Raw strings keep the regex escapes out of Python's string-escape handling.
section_start_re = re.compile(r'^121\.\d{1,4}  ')   # "121.<number>" followed by two spaces
word_re = re.compile(r'\w')
subpart_tail_re = re.compile(r'\nSUBPARTE \w*\s\n.*$', flags=re.DOTALL)

with open(filepath) as file:
    for line in file:
        # Skip the listed prefixes and lines without any word character.
        if line.startswith(SKIPPED_PREFIXES) or not word_re.search(line):
            continue
        if section_start_re.match(line):
            # A new section starts on this line.
            requirements.append(line)
        elif requirements:
            # Continuation of the current section; text that appears before
            # the first section heading is discarded.
            requirements[-1] += line

# The title is whatever precedes the first double space of the section text.
titles = [req.split('  ')[0] for req in requirements]
# Cut everything from a trailing "SUBPARTE ..." heading to the end of the section.
requirements = [subpart_tail_re.sub('', req) for req in requirements]

filepath = join(HOME, 'Regulations', 'RBAC121_sections.xlsx')
pd.DataFrame({'title': titles, 'requirement': requirements}).to_excel(filepath)

### Save split version

Split sections into paragraphs and items and save to excel

In [116]:
filepath = join(HOME, 'Regulations', 'RBAC121_sections.xlsx')
df_sections = pd.read_excel(filepath, dtype=str)[['title', 'requirement']]

section_titles, section_texts = [], []
item_titles, item_texts = [], []

# Patterns are raw strings so the regex escapes are not treated as (invalid)
# string escapes, and the dot after "121" is escaped so it matches only a period.
section_re = re.compile(r'^121\.\d*')      # line starting with "121.<digits>"
paragraph_re = re.compile(r'^\([a-h]\)')   # paragraph marker "(a)" to "(h)"
item_re = re.compile(r'^\(\d*\)')          # numbered item marker such as "(1)"
blank_re = re.compile(r'^\s+$')            # whitespace-only line
bracket_re = re.compile(r'\[.*\]')         # line that starts with a "[...]" note

for row in df_sections.itertuples(index=False):
    req = row.requirement
    section, paragraph, item = '', '', ''
    section_id, paragraph_id, item_id = '', '', ''
    current_level = ''

    # A line is this section's heading only if it starts with the section's own
    # number. The number is escaped (it contains a dot) and must not be followed
    # by another digit, so "121.1" does not claim a line starting "121.15".
    own_number_re = re.compile(re.escape(req.split(' ')[0]) + r'(?!\d)')

    for line in req.split('\n'):
        # Section heading: start over with a fresh section, paragraph and item.
        match = section_re.match(line)
        if match and own_number_re.match(line):
            current_level = 'section'
            section_id = match.group(0)
            section = line + '\n'
            paragraph, item, paragraph_id, item_id = '', '', '', ''
            continue

        # New paragraph: the node collected so far is complete, so store it first.
        match = paragraph_re.match(line)
        if match:
            if current_level in ('paragraph', 'item'):
                section_titles.append(section_id + paragraph_id + item_id)
                section_texts.append(section + paragraph + item)
            paragraph_id = match.group(0)
            paragraph = line
            item, item_id = '', ''
            current_level = 'paragraph'
            continue

        # New item: store the previous item; a paragraph intro is not stored
        # alone, it is carried as the prefix of each of its items.
        match = item_re.match(line)
        if match:
            if current_level == 'item':
                section_titles.append(section_id + paragraph_id + item_id)
                section_texts.append(section + paragraph + item)
            item_id = match.group(0)
            item = line
            current_level = 'item'
            continue

        # Whitespace-only lines and bracketed notes are ignored.
        if blank_re.match(line) or bracket_re.match(line):
            continue

        # Anything else is continuation text for the level currently being built.
        if current_level == 'section':
            section += line
        elif current_level == 'paragraph':
            paragraph += line
        elif current_level == 'item':
            item += line

    # The last node of each section has no successor to trigger its storage.
    section_titles.append(section_id + paragraph_id + item_id)
    section_texts.append(section + paragraph + item)

In [119]:
# One row per requirement node: its composed identifier and its full text.
filepath = join(HOME, 'Regulations', 'RBAC121_nodes.xlsx')
node_columns = {'title': section_titles, 'requirement': section_texts}
df_nodes = pd.DataFrame(node_columns)
df_nodes.to_excel(filepath)

### Translation

Create lists containing each section translation and translation status

In [123]:
# Reload the per-section texts from the workbook as a plain Python list.
filepath = join(HOME, 'Regulations', 'RBAC121_sections.xlsx')
sections_frame = pd.read_excel(filepath)
requirements = sections_frame['requirement'].tolist()

True

In [12]:
# Three parallel lists with one slot per section:
n_sections = len(requirements)
RBAC121_lang = ['pt'] * n_sections        # language status; starts as 'pt' for every section
RBAC121_en = [''] * n_sections            # translated text, empty until filled in
RBAC121_extra_data = [''] * n_sections    # extra data returned with each translation

Translate each section using the Google Translate service. The loop makes repeated passes over the sections that are still untranslated, waiting `wait_delay` seconds between passes:

In [13]:
# Translator client shared by every translation request in the loop below.
translator = googletrans.Translator()
wait_delay = 2 * 60 * 60 # seconds to sleep between translation passes (2 hours)

In [None]:
# Upper bound on the number of passes, so the loop cannot run forever when some
# sections never come back marked as translated.
MAX_PASSES = 12

wait_bool = False
# If a translation workbook is already on disk, no translation is attempted.
translation_ready = 'RBAC121_sections_en.xlsx' in listdir(join(HOME, 'Regulations'))
passes_done = 0

while not translation_ready and passes_done < MAX_PASSES:

    if wait_bool: # do not wait on the first time
        time.sleep(wait_delay)
    wait_bool = True
    passes_done += 1

    for index, requirement in enumerate(requirements):
        # Sections already marked 'en' are left untouched.
        if RBAC121_lang[index] != 'pt':
            continue
        translation = translator.translate(requirement, dest='en', src='pt')
        RBAC121_en[index] = translation.text
        RBAC121_extra_data[index] = translation.extra_data
        # A section is marked as done only when the response reports 'pt' as
        # the original language; otherwise it is retried on the next pass.
        if translation.extra_data['original-language'] == 'pt':
            RBAC121_lang[index] = 'en'

    translated_count = RBAC121_lang.count('en')
    print('Translated sentences: {} out of {}'.format(translated_count, len(RBAC121_lang)))
    translation_ready = translated_count == len(RBAC121_lang)

# Report the sections that are still pending when the pass limit was reached.
if not translation_ready and passes_done >= MAX_PASSES:
    pending = [index for index, lang in enumerate(RBAC121_lang) if lang == 'pt']
    print('Stopped after {} passes; untranslated section indices: {}'.format(passes_done, pending))

Translated sentences: 0 out of 360
Translated sentences: 132 out of 360
Translated sentences: 259 out of 360
Translated sentences: 358 out of 360
Translated sentences: 358 out of 360


Save the full translation to an .xlsx file.

In [None]:
# Anchor the path at HOME, like the other save cells, so the output location
# does not depend on the current working directory.
filepath = join(HOME, 'Regulations', 'RBAC121_sections_en.xlsx')
# One row per section: title, original text and its English translation.
df = pd.DataFrame({'title': titles, 'requirement': requirements, 'translation': RBAC121_en})
df.to_excel(filepath)

# Trash