# Setup

Import libraries

In [232]:
import numpy as np
import tensorflow as tf
from os import chdir, getcwd, listdir, mkdir
from os.path import join
import requests
from xml.etree import ElementTree as ET
import re
import pandas as pd

# Directory the kernel was started in; other cells join paths onto it and chdir back to it.
HOME = getcwd()

## RBAC 121

In [225]:
# Split the English RBAC 121 text file into one string per chunk.
# A line that starts with the literal marker "<EOS>" closes the current chunk.
RBAC121 = ['']

with open(join(HOME, 'RBAC', 'RBAC121_en.txt'), encoding='utf-8') as file:
    for line in file:
        # Plain prefix test: the former pattern '\<EOS\>' relied on the invalid
        # string escapes '\<' and '\>', which Python flags with a warning.
        if line.startswith('<EOS>'):
            RBAC121.append('')
        else:
            RBAC121[-1] += line

# Discard whatever follows the final marker (an empty chunk when the file ends
# with "<EOS>"). `del` avoids both echoing the value and overwriting IPython's `_`.
del RBAC121[-1]

NameError: name 'b' is not defined

In [253]:
# Separate each chunk into its leading token (the title) and the remaining text.
titles = []
requirements = []

for chunk in RBAC121:
    # Exactly one split on the first space: token before it / everything after it.
    heading, body = chunk.split(' ', maxsplit=1)
    titles.append(heading)
    # Flatten the text onto a single line.
    requirements.append(re.sub('\n', ' ', body))

In [481]:
# One row per parsed chunk: its title and its flattened text.
rows = list(zip(titles, requirements))
df = pd.DataFrame(rows, columns=['title', 'requirement'])

# Written to disk before the titles are normalised on the next line.
df.to_excel(join('RBAC', 'RBAC121_labelled.xlsx'))

# Replace commas with dots in the titles.
df['title'] = df['title'].map(lambda value: value.replace(',', '.'))

In [None]:
# (Stray cell: it held only the bare name `d`, which raised NameError on Run All.)

Use the labels of FAR Part 121 as a reference for the RBAC 121 sections:

In [482]:
# Load the labelled FAR Part 121 requirements, keeping only the columns used below.
part121 = pd.read_excel(join('FAR', 'ECFR-title14_labelled.xlsx'))[['title', 'requirement', 'label', 'tag']]

# Section number taken from the "§ 121.x" prefix of each requirement text.
# Raw string with an escaped dot: the former literal '^§ (121.\d*)' relied on the
# invalid string escape '\d' and let "." match any character.
section_pattern = re.compile(r'^§ (121\.\d*)')
# As before, a requirement whose text lacks the prefix raises IndexError here.
part121['section'] = part121['requirement'].map(lambda text: section_pattern.findall(text)[0])

In [483]:
# Count how often each label occurs within each section:
# rows are sections, columns are label values, missing combinations count as 0.
df_agg = part121.groupby(['section', 'label']).size().unstack(fill_value=0)

# Most frequent label per section (the first column wins a tie).
# idxmax returns the label value itself; the former np.argmax returned the column
# *position*, which equals the label only when the labels are exactly 0..k-1.
majority_label = df_agg.idxmax(axis=1)

df_agg['label'] = majority_label
# Expose the section number as a regular column as well as the index.
df_agg['title'] = df_agg.index

In [484]:
# Look up each title in df_agg's index; overlapping column names coming from
# df_agg receive the suffix "FAA".
joined = df.join(df_agg, on='title', rsuffix='FAA')
df_labelled = joined[['title', 'requirement', 'label']]

# Write the labelled table to RBAC/RBAC121_labelled.xlsx.
labelled_path = join('RBAC', 'RBAC121_labelled.xlsx')
df_labelled.to_excel(labelled_path)

## FAR Part 121

In [469]:
# Display the joined table restricted to the three columns of interest.
preview_columns = ['title', 'requirement', 'label']
df.join(df_agg, on='title', rsuffix='FAA')[preview_columns]

Unnamed: 0,title,requirement,label
0,121.1,"Applicability according to RBAC No. 119, which...",0.0
1,121.2,[Reserved] (Wording given by Resolution No. 52...,0.0
2,121.3,[Reserved],
3,121.4,Applicability of the rules for unauthorized op...,0.0
4,121.7,Definitions The following definitions apply to...,0.0
...,...,...,...
355,121.1223,[Reserved],
356,121.1225,Component “Operational security policy and obj...,
357,121.1227,“Operational security risk management” compone...,
358,121.1229,Component “guarantee of operational safety” in...,


RangeIndex(start=0, stop=360, step=1)

Download file

In [5]:
# Download the eCFR Title 14 bulk XML into <HOME>/FAR/ECFR-title14.xml.
URL = 'https://www.govinfo.gov/bulkdata/ECFR/title-14/ECFR-title14.xml'

# Build the target path from HOME instead of changing the working directory:
# the former chdir()/chdir(HOME) pair left the kernel inside FAR/ whenever the
# download failed, and its existence check looked at the cwd rather than HOME.
far_dir = join(HOME, 'FAR')
try:
    mkdir(far_dir)
except FileExistsError:
    pass  # directory already present

r = requests.get(URL, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as XML.
r.raise_for_status()

with open(join(far_dir, 'ECFR-title14.xml'), 'wb') as file:
    file.write(r.content)

Parse the XML file

In [490]:
# Parse the downloaded XML and locate the DIV5 element whose N attribute is "121".
filepath = join('FAR', 'ECFR-title14.xml')
tree = ET.parse(filepath)
root = tree.getroot()

# .get() tolerates DIV5 elements without an N attribute (attrib['N'] raised KeyError).
part_nodes = [node for node in root.iter('DIV5') if node.get('N') == '121']
if not part_nodes:
    # Previously root121 was silently left undefined and a later cell failed instead.
    raise ValueError('No DIV5 element with N="121" found in ' + filepath)

# Keep the last match, as the former loop did.
root121 = part_nodes[-1]

In [499]:
# Every DIV8 element at or below root121, in document order.
section_nodes = [node for node in root121.iter() if node.tag == 'DIV8']

Print the requirements of the first section to the screen:

In [600]:
# Walk the text fragments of the first section and print one block per requirement:
# the section heading and intro text, the current lettered paragraph and the
# current numbered item.

# Raw-string patterns: the former non-raw literals relied on invalid string escapes
# such as '\d' and '\('. The dot after "121" is escaped so it only matches ".".
section_re = re.compile(r'^§ 121\.\d*')     # section heading, e.g. "§ 121.1"
paragraph_re = re.compile(r'^\([a-h]\)')    # lettered paragraph, "(a)" to "(h)"
item_re = re.compile(r'^\(\d*\)')           # numbered item, e.g. "(1)"
blank_re = re.compile(r'^\s+$')             # whitespace-only fragment
bracket_re = re.compile(r'\[.*\]')          # fragment that starts with a bracketed note

section, paragraph, item = '', '', ''
section_id, paragraph_id, item_id = '', '', ''
current_level = ''

for line in section_nodes[0].itertext():
    # Test the levels in order of precedence; only the first hit is used.
    heading = section_re.match(line)
    lettered = None if heading else paragraph_re.match(line)
    numbered = None if (heading or lettered) else item_re.match(line)

    if heading:
        # New section: remember its heading and reset the lower levels.
        current_level = 'section'
        section_id = heading.group(0)
        section = line + '\n'
        paragraph, item, paragraph_id, item_id = '', '', '', ''
    elif lettered:
        # A new paragraph closes the previous paragraph/item, so print it first.
        if current_level == 'paragraph' or current_level == 'item':
            print(section_id + paragraph_id + item_id + '\n')
            print(section + paragraph + item)
            print('----------')
        paragraph_id = lettered.group(0)
        paragraph = line
        item, item_id = '', ''
        current_level = 'paragraph'
    elif numbered:
        # A new item closes the previous item only; the paragraph text stays as
        # the shared lead-in of its items.
        if current_level == 'item':
            print(section_id + paragraph_id + item_id + '\n')
            print(section + paragraph + item)
            print('----------')
        item_id = numbered.group(0)
        item = line
        current_level = 'item'
    elif blank_re.match(line) or bracket_re.match(line):
        # Ignore whitespace-only fragments and fragments starting with "[...]".
        pass
    else:
        # Continuation text belongs to whichever level is currently open.
        if current_level == 'section':
            section += line
        elif current_level == 'paragraph':
            paragraph += line
        elif current_level == 'item':
            item += line

# Print the requirement that was still open when the text ended.
print(section_id + paragraph_id + item_id + '\n')
print(section + paragraph + item)

§ 121.1(a)

§ 121.1   Applicability.
This part prescribes rules governing - 
(a) The domestic, flag, and supplemental operations of each person who holds or is required to hold an Air Carrier Certificate or Operating Certificate under part 119 of this chapter. 

----------
§ 121.1(b)

§ 121.1   Applicability.
This part prescribes rules governing - 
(b) Each person employed or used by a certificate holder conducting operations under this part including maintenance, preventive maintenance, and alteration of aircraft. 

----------
§ 121.1(c)

§ 121.1   Applicability.
This part prescribes rules governing - 
(c) Each person who applies for provisional approval of an Advanced Qualification Program curriculum, curriculum segment, or portion of a curriculum segment under SFAR No. 58 of 14 CFR part 121, and each person employed or used by an air carrier or commercial operator under this part to perform training, qualification, or evaluation functions under an Advanced Qualification Program unde

Create lists of titles and requirements:

In [604]:
# Raw-string patterns: the former non-raw literals relied on invalid string escapes
# such as '\d' and '\('. The dot after "121" is escaped so it only matches ".".
SECTION_RE = re.compile(r'^§ 121\.\d*')     # section heading, e.g. "§ 121.1"
PARAGRAPH_RE = re.compile(r'^\([a-h]\)')    # lettered paragraph, "(a)" to "(h)"
ITEM_RE = re.compile(r'^\(\d*\)')           # numbered item, e.g. "(1)"
BLANK_RE = re.compile(r'^\s+$')             # whitespace-only fragment
BRACKET_RE = re.compile(r'\[.*\]')          # fragment that starts with a bracketed note


def split_requirements(lines):
    """Split the text fragments of one section into individual requirements.

    Parameters
    ----------
    lines : iterable of str
        Text fragments of a single section, in document order.

    Returns
    -------
    (list of str, list of str)
        Requirement titles (section id + paragraph id + item id) and the matching
        requirement texts (section text + paragraph text + item text). At least
        one entry is always returned: whatever was open when the input ended.
    """
    titles, texts = [], []
    section, paragraph, item = '', '', ''
    section_id, paragraph_id, item_id = '', '', ''
    current_level = ''

    def flush():
        # Store the requirement that is currently open.
        titles.append(section_id + paragraph_id + item_id)
        texts.append(section + paragraph + item)

    for line in lines:
        # Test the levels in order of precedence; only the first hit is used.
        heading = SECTION_RE.match(line)
        lettered = None if heading else PARAGRAPH_RE.match(line)
        numbered = None if (heading or lettered) else ITEM_RE.match(line)

        if heading:
            # New section: remember its heading and reset the lower levels.
            current_level = 'section'
            section_id = heading.group(0)
            section = line + '\n'
            paragraph, item, paragraph_id, item_id = '', '', '', ''
        elif lettered:
            # A new paragraph closes the previous paragraph/item.
            if current_level == 'paragraph' or current_level == 'item':
                flush()
            paragraph_id = lettered.group(0)
            paragraph = line
            item, item_id = '', ''
            current_level = 'paragraph'
        elif numbered:
            # A new item closes the previous item only; the paragraph text stays
            # as the shared lead-in of its items.
            if current_level == 'item':
                flush()
            item_id = numbered.group(0)
            item = line
            current_level = 'item'
        elif BLANK_RE.match(line) or BRACKET_RE.match(line):
            # Ignore whitespace-only fragments and fragments starting with "[...]".
            pass
        elif current_level == 'section':
            section += line
        elif current_level == 'paragraph':
            paragraph += line
        elif current_level == 'item':
            item += line

    # Store the requirement that was still open when the text ended.
    flush()
    return titles, texts


requirement_titles = []
requirement_texts = []

for sec in section_nodes:
    sec_titles, sec_texts = split_requirements(sec.itertext())
    requirement_titles.extend(sec_titles)
    requirement_texts.extend(sec_texts)

In [605]:
# Show the sizes of both lists side by side.
len(requirement_titles), len(requirement_texts)

(2364, 2364)

Save the Excel file to disk:

In [608]:
# Export the parsed requirements to <HOME>/FAR/ECFR-title14.xlsx.
filepath = join(HOME, 'FAR', 'ECFR-title14.xlsx')

# Name the columns explicitly; the frame was previously exported with the default
# headers 0 and 1, unlike the 'title'/'requirement' columns used in the other cells.
pd.DataFrame(
    list(zip(requirement_titles, requirement_texts)),
    columns=['title', 'requirement'],
).to_excel(filepath)

# Trash

In [82]:
# Parse the XML and collect every DIV8 element whose N attribute starts with "§ 121.<digits>".
filepath = join('FAR', 'ECFR-title14.xml')
tree = ET.parse(filepath)
root = tree.getroot()

# Raw string with an escaped dot: the former literal '^§ 121.\d+' relied on the
# invalid string escape '\d' and let "." match any character.
section_number_re = re.compile(r'^§ 121\.\d+')

# .get('N', '') skips DIV8 elements without an N attribute instead of raising KeyError.
sections = [
    node for node in root.iter('DIV8')
    if section_number_re.match(node.get('N', ''))
]

In [85]:
# Inspect the XML attributes of the first collected section element.
sections[0].attrib

{'N': '§ 121.1', 'NODE': '14:3.0.1.1.7.1.2.1', 'TYPE': 'SECTION'}

In [97]:
# Print the attribute dictionary of the first section element and of each descendant.
first_section = sections[0]
for node in first_section.iter():
    print(node.attrib)

{'N': '§ 121.1', 'NODE': '14:3.0.1.1.7.1.2.1', 'TYPE': 'SECTION'}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{'TYPE': 'N'}


In [133]:
# Concatenate the text of the first section while tracking which level is being read.
section_text = ''
paragraph_text = ''
subitem_text = ''

position = 'section'

for text in sections[0].itertext():
    # Skip fragments that contain no word character.
    if re.search(r'\w', text):
        # Raw strings replace the former non-raw literals (invalid escapes '\d', '\s', '\w').
        if re.match(r'^§ 121\.\d*\s', text):
            position = 'section'
            section_text, paragraph_text, subitem_text = '', '', ''
        # The former pattern '^([a-z]*)' matched every string (its group may be empty);
        # literal parentheses, as in "(a)", were presumably intended.
        elif re.match(r'^\([a-z]+\)', text):
            position = 'paragraph'
            paragraph_text, subitem_text = '', ''
        # A dangling `elif` with no condition followed here; it was a SyntaxError
        # and has been removed.
        section_text += text

print(section_text)

§ 121.1   Applicability.This part prescribes rules governing - 
(a) The domestic, flag, and supplemental operations of each person who holds or is required to hold an Air Carrier Certificate or Operating Certificate under part 119 of this chapter. 
(b) Each person employed or used by a certificate holder conducting operations under this part including maintenance, preventive maintenance, and alteration of aircraft. 
(c) Each person who applies for provisional approval of an Advanced Qualification Program curriculum, curriculum segment, or portion of a curriculum segment under SFAR No. 58 of 14 CFR part 121, and each person employed or used by an air carrier or commercial operator under this part to perform training, qualification, or evaluation functions under an Advanced Qualification Program under SFAR No. 58 of 14 CFR part 121. 
(d) Nonstop Commercial Air Tours conducted for compensation or hire in accordance with § 119.1(e)(2) of this chapter must comply with drug and alcohol requi

In [126]:
# Quick check that re.match succeeds for the pattern '.*a' on the string 'fdsfa'.
probe = re.match('.*a', 'fdsfa')
if probe is not None:
    print('match!')

match!


In [75]:
# Print the .text of root121 and of every element below it.
for node in root121.iter():
    print(node.text)
    



§ 121.1500   SFAR No. 111 - Lavatory Oxygen Systems.
(a) 
Applicability.
(1) All operators of transport category airplanes that are required to comply with AD 2012-11-09, but only for airplanes on which the actions required by that AD have not been accomplished.

(2) Applicants for airworthiness certificates.

(3) Holders of production certificates.

(4) Applicants for type certificates, including changes to type certificates.

(b) 
Regulatory relief.
(1) A person described in paragraph (a) of this section may conduct flight operations and add airplanes to operations specifications with disabled lavatory oxygen systems, modified in accordance with FAA Airworthiness Directive 2011-04-09, subject to the following limitations:

(i) This relief is limited to regulatory compliance of lavatory oxygen systems.

(ii) Within 30 days of March 29, 2013, all oxygen masks must be removed from affected lavatories, and the mask stowage location must be reclosed.

(iii) Within 60 days of March 29, 2

<Element 'P' at 0x000001DFA4285A90>

In [60]:
# Show the N attribute of the element currently bound to root121.
root121.attrib['N']

'§ 121.1500'

In [31]:
# List the attributes and methods available on root121 (exploration aid).
dir(root121)

['__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'attrib',
 'clear',
 'extend',
 'find',
 'findall',
 'findtext',
 'get',
 'getchildren',
 'getiterator',
 'insert',
 'items',
 'iter',
 'iterfind',
 'itertext',
 'keys',
 'makeelement',
 'remove',
 'set',
 'tag',
 'tail',
 'text']

In [18]:
# Collect the N attribute ("§ 121.x") of every matching DIV8 element.
# The list is created here because nothing in the visible notebook defines it,
# so the cell raised NameError on a fresh kernel.
section_numbers = []

# Raw string with an escaped dot: the former literal '^§ 121.\d+' relied on the
# invalid string escape '\d' and let "." match any character.
section_number_re = re.compile(r'^§ 121\.\d+')

for element in root.iter():
    # stop when DoT part is reached
    if element.tag == 'DIV1' and element.get('N') == '4':
        break
    # if reached a new section
    if element.tag == 'DIV8' and section_number_re.match(element.get('N', '')):
        section_numbers.append(element.attrib['N'])

# Display how many sections were found.
len(section_numbers)
        

385

In [19]:
# Check which class the parsed root object is an instance of.
type(root)

xml.etree.ElementTree.Element

xml.etree.ElementTree.Element

In [8]:
# Parse the XML and build one space-joined string per DIV8 element whose
# N attribute contains "§ 121.".
filepath = join('FAR', 'ECFR-title14.xml')
tree = ET.parse(filepath)
root = tree.getroot()

# NOTE(review): `part121` is bound to a DataFrame in an earlier cell; this rebinds it to a list.
part121 = []
# NOTE(review): `atributes` is never filled in the visible code.
atributes = []

# iterate over each element of the xml tree and fill corresponding lists
for element in root.iter():
    # stop when DoT part is reached
    if element.tag == 'DIV1':
        if element.attrib['N'] == '4':
            break 
        # if reached a new section
    if element.tag == 'DIV8':
        if '§ 121.' in element.attrib['N']:
            section = []
            # keep only the text fragments that do not start with a newline
            # (the comprehension is used purely for its append side effect)
            [section.append(line) for line in element.itertext() if line[0] != '\n']
            part121.append(' '.join(section))