In [1]:
from bs4 import BeautifulSoup

In [2]:
tei_doc = 'sample.tei.xml'
with open(tei_doc, 'r') as tei:
    soup = BeautifulSoup(tei, 'lxml')

In [3]:
soup.title

<title level="a" type="main">Attention Is All You Need</title>

In [4]:
soup.title.getText()

'Attention Is All You Need'

In [5]:
soup.abstract

<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.0 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the bes

In [6]:
soup.abstract.getText(separator=' ', strip=True)

'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.0 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. * Equal contribution. Li

In [7]:
abstract_text = soup.abstract.getText(separator=' ', strip=True)
'movement' in abstract_text.lower(), 'ecology' in abstract_text.lower(), 'computer' in abstract_text.lower()

(False, False, False)

In [8]:
def read_tei(tei_file):
    with open(tei_file, 'r') as tei:
        soup = BeautifulSoup(tei, 'lxml')
        return soup
    raise RuntimeError('Cannot generate a soup from the input')

In [9]:
def elem_to_text(elem, default=''):
    if elem:
        return elem.getText()
    else:
        return default

In [10]:
from dataclasses import dataclass

@dataclass
class Person:
    firstname: str
    middlename: str
    surname: str

turing_author = Person(firstname='Alan', middlename='M', surname='Turing')

f"{turing_author.firstname} {turing_author.surname} authored many influential publications in computer science."


'Alan Turing authored many influential publications in computer science.'

In [11]:
class TEIFile(object):
    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._title = ''
        self._abstract = ''

    @property
    def doi(self):
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        else:
            return idno_elem.getText()

    @property
    def title(self):
        if not self._title:
            self._title = self.soup.title.getText()
        return self._title

    @property
    def abstract(self):
        if not self._abstract:
            abstract = self.soup.abstract.getText(separator=' ', strip=True)
            self._abstract = abstract
        return self._abstract

    @property
    def authors(self):
        authors_in_header = self.soup.analytic.find_all('author')

        result = []
        for author in authors_in_header:
            persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename", type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            person = Person(firstname, middlename, surname)
            result.append(person)
        return result
    
    @property
    def text(self):
        divs_text = []
        if not self._text:
            
            for div in self.soup.body.find_all("div")[1:]:
                # div is neither an appendix nor references, just plain text.
                if not div.get("type"):
                    div_text = div.get_text(separator=' ', strip=True)
                    divs_text.append(div_text)

            plain_text = " ".join(divs_text)
            self._text = divs_text
        return self._text

In [12]:
tei = TEIFile('sample.tei.xml')
f"The authors of the paper entitled '{tei.title}' are {tei.authors}"

"The authors of the paper entitled 'Attention Is All You Need' are [Person(firstname='Ashish', middlename='', surname='Vaswani'), Person(firstname='Google', middlename='', surname='Brain'), Person(firstname='Noam', middlename='', surname='Shazeer'), Person(firstname='Google', middlename='', surname='Brain'), Person(firstname='Niki', middlename='', surname='Parmar'), Person(firstname='Jakob', middlename='', surname='Uszkoreit'), Person(firstname='Llion', middlename='', surname='Jones'), Person(firstname='Aidan', middlename='N', surname='Gomez'), Person(firstname='Łukasz', middlename='', surname='Kaiser'), Person(firstname='Google', middlename='', surname='Brain'), Person(firstname='Illia', middlename='', surname='Polosukhin')]"

In [13]:
tei.text

['Introduction Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5] . Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [31, 21, 13] . Recurrent models typically factor computation along the symbol positions of the input and output sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden states h t , as a function of the previous hidden state h t−1 and the input for position t. This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples. Recent work has achieved significant improvements in computational efficiency through f

In [14]:
head_datas = [head.get_text() for head in soup.find_all('head')]
head_datas

[]

In [15]:
soup.find_all('h1')

[]

In [16]:
soup.find_all('div')[2]

<div xmlns="http://www.tei-c.org/ns/1.0">Background<p>The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU <ref target="#b19" type="bibr">[20]</ref>, ByteNet <ref target="#b14" type="bibr">[15]</ref> and ConvS2S <ref target="#b7" type="bibr">[8]</ref>, all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions. In these models, the number of operations required to relate signals from two arbitrary input or output positions grows in the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes it more difficult to learn dependencies between distant positions <ref target="#b10" type="bibr">[11]</ref>. In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as

In [17]:
divs_text=[]
for div in soup.body.find_all("div")[1:]:
    # div is neither an appendix nor references, just plain text.
    if not div.get("type"):
        div_text = div.get_text(separator=' ', strip=True)
        divs_text.append(div_text)
        #rint(divs_text)




In [18]:
div_text.split(' ')[0]

'Acknowledgements'

In [19]:
divs_text

['Introduction Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5] . Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [31, 21, 13] . Recurrent models typically factor computation along the symbol positions of the input and output sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden states h t , as a function of the previous hidden state h t−1 and the input for position t. This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples. Recent work has achieved significant improvements in computational efficiency through f

In [20]:
soup.find_all('div')[2]

<div xmlns="http://www.tei-c.org/ns/1.0">Background<p>The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU <ref target="#b19" type="bibr">[20]</ref>, ByteNet <ref target="#b14" type="bibr">[15]</ref> and ConvS2S <ref target="#b7" type="bibr">[8]</ref>, all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions. In these models, the number of operations required to relate signals from two arbitrary input or output positions grows in the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes it more difficult to learn dependencies between distant positions <ref target="#b10" type="bibr">[11]</ref>. In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as

In [21]:
soup.prettify()

'<?xml version="1.0" encoding="UTF-8"?>\n<html>\n <body>\n  <tei xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemalocation="http://www.tei-c.org/ns/1.0 /home/varun/Desktop/NLP/Projects/grobid-0.5.6/grobid-home/schemas/xsd/Grobid.xsd">\n   <teiheader xml:lang="en">\n    <filedesc>\n     <titlestmt>\n      <title level="a" type="main">\n       Attention Is All You Need\n      </title>\n     </titlestmt>\n     <publicationstmt>\n      <publisher>\n      </publisher>\n      <availability status="unknown">\n       <licence>\n       </licence>\n      </availability>\n     </publicationstmt>\n     <sourcedesc>\n      <biblstruct>\n       <analytic>\n        <author>\n         <persname xmlns="http://www.tei-c.org/ns/1.0">\n          <forename type="first">\n           Ashish\n          </forename>\n          <surname>\n           Vaswani\n          </surname>\n         </persname

In [22]:
tei_doc = 'sample.tei.xml'
with open(tei_doc, 'r') as tei:
    soup1 = BeautifulSoup(tei, 'xml')

In [23]:
soup1.find_all('head')

[<head n="1">Introduction</head>,
 <head n="2">Background</head>,
 <head n="3">Model Architecture</head>,
 <head n="3.1">Encoder and Decoder Stacks</head>,
 <head n="3.2">Attention</head>,
 <head n="3.2.1">Scaled Dot-Product Attention</head>,
 <head n="3.2.2">Multi-Head Attention</head>,
 <head n="3.2.3">Applications of Attention in our Model</head>,
 <head n="3.3">Position-wise Feed-Forward Networks</head>,
 <head n="3.4">Embeddings and Softmax</head>,
 <head n="3.5">Positional Encoding</head>,
 <head n="4">Why Self-Attention</head>,
 <head n="5">Training</head>,
 <head n="5.1">Training Data and Batching</head>,
 <head n="5.2">Hardware and Schedule</head>,
 <head n="5.3">Optimizer</head>,
 <head n="5.4">Regularization</head>,
 <head n="6">Results</head>,
 <head n="6.1">Machine Translation</head>,
 <head n="6.2">Model Variations</head>,
 <head n="7">Conclusion</head>,
 <head>Figure 1 :</head>,
 <head>Figure 2 :</head>,
 <head>•</head>,
 <head>P</head>,
 <head>Table 2 :</head>,
 <head>T

In [24]:
headings=[]
for div in soup1.body.find_all("head"):
    # div is neither an appendix nor references, just plain text.
    try:
        if not div.get("type"):
            div_text = div.get_text(separator=' ', strip=True)
            headings.append((div_text, div['n']))
        #rint(divs_text)
    except KeyError:
        headings.append(div_text)
        pass
headings

[('Introduction', '1'),
 ('Background', '2'),
 ('Model Architecture', '3'),
 ('Encoder and Decoder Stacks', '3.1'),
 ('Attention', '3.2'),
 ('Scaled Dot-Product Attention', '3.2.1'),
 ('Multi-Head Attention', '3.2.2'),
 ('Applications of Attention in our Model', '3.2.3'),
 ('Position-wise Feed-Forward Networks', '3.3'),
 ('Embeddings and Softmax', '3.4'),
 ('Positional Encoding', '3.5'),
 ('Why Self-Attention', '4'),
 ('Training', '5'),
 ('Training Data and Batching', '5.1'),
 ('Hardware and Schedule', '5.2'),
 ('Optimizer', '5.3'),
 ('Regularization', '5.4'),
 ('Results', '6'),
 ('Machine Translation', '6.1'),
 ('Model Variations', '6.2'),
 ('Conclusion', '7'),
 'Figure 1 :',
 'Figure 2 :',
 '•',
 'P',
 'Table 2 :',
 'Table 2',
 'Table 3 :']

In [30]:
d={}
j=0
for i in range(len(divs_text)):
    j=j+1
    if(type(headings[i])==tuple):
        if(headings[i][1].split('.')[0] not in d.keys()):
            d[headings[i][1].split('.')[0]]=[divs_text[i]]
        else:
            d[headings[i][1].split('.')[0]].append(divs_text[i])
    
    else:
        #print(divs_text[i])
        d[str(j)]=[divs_text[i]]
        
d

{'1': ['Introduction Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5] . Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [31, 21, 13] . Recurrent models typically factor computation along the symbol positions of the input and output sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden states h t , as a function of the previous hidden state h t−1 and the input for position t. This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples. Recent work has achieved significant improvements in computational efficiency thr

In [26]:
type((1,2))==tuple

True

In [27]:
for i in range(len(divs_text)):
    x=divs_text[i]
    y=headings[i][0]
    
    print(x.split(' ')[0:len(y.split(' '))])
    print(y)

['Introduction']
Introduction
['Background']
Background
['Model', 'Architecture']
Model Architecture
['Encoder', 'and', 'Decoder', 'Stacks']
Encoder and Decoder Stacks
['Attention']
Attention
['Scaled', 'Dot-Product', 'Attention']
Scaled Dot-Product Attention
['Multi-Head', 'Attention']
Multi-Head Attention
['Applications', 'of', 'Attention', 'in', 'our', 'Model']
Applications of Attention in our Model
['Position-wise', 'Feed-Forward', 'Networks']
Position-wise Feed-Forward Networks
['Embeddings', 'and', 'Softmax']
Embeddings and Softmax
['Positional', 'Encoding']
Positional Encoding
['Why', 'Self-Attention']
Why Self-Attention
['Training']
Training
['Training', 'Data', 'and', 'Batching']
Training Data and Batching
['Hardware', 'and', 'Schedule']
Hardware and Schedule
['Optimizer']
Optimizer
['Regularization']
Regularization
['Results']
Results
['Machine', 'Translation']
Machine Translation
['Model', 'Variations']
Model Variations
['Conclusion']
Conclusion
['Acknowledgements']
F


In [28]:
final=[]
for i in divs_text:

SyntaxError: unexpected EOF while parsing (<ipython-input-28-fa22775edf15>, line 2)

In [29]:
#d={}
if(2 not in d.keys()):
    d[2]=[1]
else:
    d[2].append(2)
d

{'1': ['Introduction Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5] . Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [31, 21, 13] . Recurrent models typically factor computation along the symbol positions of the input and output sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden states h t , as a function of the previous hidden state h t−1 and the input for position t. This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples. Recent work has achieved significant improvements in computational efficiency thr

In [None]:
#for div in soup1.body.find_all("head"):
#    print(type(div))
#    print(div.name)
#    print(div['n'])