In [1]:
from bs4 import BeautifulSoup

In [2]:
tei_doc = 'sample2.tei.xml'
with open(tei_doc, 'r') as tei:
    soup = BeautifulSoup(tei, 'lxml')

In [3]:
soup.title

<title level="a" type="main">Attention-based Deep Multiple Instance Learning</title>

In [4]:
soup.title.getText()

'Attention-based Deep Multiple Instance Learning'

In [5]:
soup.abstract

<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>Multiple instance learning (MIL) is a variation of supervised learning where a single class label is assigned to a bag of instances. In this paper, we state the MIL problem as learning the Bernoulli distribution of the bag label where the bag label probability is fully parameterized by neural networks. Furthermore, we propose a neural network-based permutation-invariant aggregation operator that corresponds to the attention mechanism. Notably, an application of the proposed attention-based operator provides insight into the contribution of each instance to the bag label. We show empirically that our approach achieves comparable performance to the best MIL methods on benchmark MIL datasets and it outperforms other methods on a MNIST-based MIL dataset and two real-life histopathology datasets without sacrificing interpretability.</p></div>
</abstract>

In [6]:
soup.abstract.getText(separator=' ', strip=True)

'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.0 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. * Equal contribution. Li

In [6]:
abstract_text = soup.abstract.getText(separator=' ', strip=True)
'movement' in abstract_text.lower(), 'ecology' in abstract_text.lower(), 'computer' in abstract_text.lower()

(False, False, False)

In [7]:
def read_tei(tei_file):
    with open(tei_file, 'r') as tei:
        soup = BeautifulSoup(tei, 'lxml')
        return soup
    raise RuntimeError('Cannot generate a soup from the input')

In [8]:
def elem_to_text(elem, default=''):
    if elem:
        return elem.getText()
    else:
        return default

In [9]:
from dataclasses import dataclass

@dataclass
class Person:
    firstname: str
    middlename: str
    surname: str

turing_author = Person(firstname='Alan', middlename='M', surname='Turing')

f"{turing_author.firstname} {turing_author.surname} authored many influential publications in computer science."


'Alan Turing authored many influential publications in computer science.'

In [10]:
class TEIFile(object):
    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._title = ''
        self._abstract = ''

    @property
    def doi(self):
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        else:
            return idno_elem.getText()

    @property
    def title(self):
        if not self._title:
            self._title = self.soup.title.getText()
        return self._title

    @property
    def abstract(self):
        if not self._abstract:
            abstract = self.soup.abstract.getText(separator=' ', strip=True)
            self._abstract = abstract
        return self._abstract

    @property
    def authors(self):
        authors_in_header = self.soup.analytic.find_all('author')

        result = []
        for author in authors_in_header:
            persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename", type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            person = Person(firstname, middlename, surname)
            result.append(person)
        return result
    
    @property
    def text(self):
        divs_text = []
        if not self._text:
            
            for div in self.soup.body.find_all("div")[1:]:
                # div is neither an appendix nor references, just plain text.
                if not div.get("type"):
                    div_text = div.get_text(separator=' ', strip=True)
                    divs_text.append(div_text)

            plain_text = " ".join(divs_text)
            self._text = divs_text
        return self._text

In [13]:
tei = TEIFile('sample2.tei.xml')
f"The authors of the paper entitled '{tei.title}' are {tei.authors}"

"The authors of the paper entitled 'Attention-based Deep Multiple Instance Learning' are [Person(firstname='Maximilian', middlename='', surname='Ilse'), Person(firstname='Jakub', middlename='M', surname='Tomczak'), Person(firstname='Max', middlename='', surname='Welling')]"

In [14]:
tei.text

['Introduction In typical machine learning problems like image classification it is assumed that an image clearly represents a category (a class). However, in many real-life applications multiple instances are observed and only a general statement of the category is given. This scenario is called multiple instance learning (MIL) (Dietterich et al., 1997; Maron & Lozano-Pérez, 1998) or, learning from weakly annotated data (Oquab et al., 2014) . The problem of weakly annotated data is especially apparent in medical imaging (Quellec et al., 2017 ) (e.g., computational pathology, mammography or CT lung screening) where an image is typically described by a single label (benign/malignant) or a Region Of Interest (ROI) is roughly given. MIL deals with a bag of instances for which a single class label is assigned. Hence, the main goal of MIL is to learn a Proceedings of the 35 th International Conference on Machine Learning, Stockholm, Sweden, PMLR 80, 2018 . Copyright 2018 by the author(s). m

In [15]:
head_datas = [head.get_text() for head in soup.find_all('head')]
head_datas

[]

In [16]:
soup.find_all('h1')

[]

In [17]:
soup.find_all('div')[2]

<div xmlns="http://www.tei-c.org/ns/1.0">Methodology</div>

In [18]:
divs_text=[]
for div in soup.body.find_all("div")[1:]:
    # div is neither an appendix nor references, just plain text.
    if not div.get("type"):
        div_text = div.get_text(separator=' ', strip=True)
        divs_text.append(div_text)
        #rint(divs_text)




In [19]:
div_text.split(' ')[0]

'Acknowledgements'

In [20]:
divs_text

['Introduction In typical machine learning problems like image classification it is assumed that an image clearly represents a category (a class). However, in many real-life applications multiple instances are observed and only a general statement of the category is given. This scenario is called multiple instance learning (MIL) (Dietterich et al., 1997; Maron & Lozano-Pérez, 1998) or, learning from weakly annotated data (Oquab et al., 2014) . The problem of weakly annotated data is especially apparent in medical imaging (Quellec et al., 2017 ) (e.g., computational pathology, mammography or CT lung screening) where an image is typically described by a single label (benign/malignant) or a Region Of Interest (ROI) is roughly given. MIL deals with a bag of instances for which a single class label is assigned. Hence, the main goal of MIL is to learn a Proceedings of the 35 th International Conference on Machine Learning, Stockholm, Sweden, PMLR 80, 2018 . Copyright 2018 by the author(s). m

In [21]:
soup.find_all('div')[2]

<div xmlns="http://www.tei-c.org/ns/1.0">Methodology</div>

In [18]:
soup.prettify()

'<?xml version="1.0" encoding="UTF-8"?>\n<html>\n <body>\n  <tei xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemalocation="http://www.tei-c.org/ns/1.0 /home/varun/Desktop/NLP/Projects/grobid-0.5.6/grobid-home/schemas/xsd/Grobid.xsd">\n   <teiheader xml:lang="en">\n    <filedesc>\n     <titlestmt>\n      <title level="a" type="main">\n       Attention Is All You Need\n      </title>\n     </titlestmt>\n     <publicationstmt>\n      <publisher>\n      </publisher>\n      <availability status="unknown">\n       <licence>\n       </licence>\n      </availability>\n     </publicationstmt>\n     <sourcedesc>\n      <biblstruct>\n       <analytic>\n        <author>\n         <persname xmlns="http://www.tei-c.org/ns/1.0">\n          <forename type="first">\n           Ashish\n          </forename>\n          <surname>\n           Vaswani\n          </surname>\n         </persname

In [27]:
tei_doc = 'sample2.tei.xml'
with open(tei_doc, 'r') as tei:
    soup1 = BeautifulSoup(tei, 'xml')

In [28]:
soup1.find_all('head')

[<head n="1.">Introduction</head>,
 <head n="2.">Methodology</head>,
 <head n="2.1.">Multiple instance learning (MIL)</head>,
 <head n="2.2.">MIL with Neural Networks</head>,
 <head n="2.3.">MIL pooling</head>,
 <head n="2.4.">Attention-based MIL pooling</head>,
 <head>Attention mechanism</head>,
 <head n="3.">Related work</head>,
 <head>MIL and attention</head>,
 <head n="4.">Experiments</head>,
 <head n="4.1.">Classical MIL datasets</head>,
 <head n="4.2.">MNIST-bags</head>,
 <head>Results and discussion</head>,
 <head n="4.3.">Histopathology datasets</head>,
 <head>METHOD</head>,
 <head n="5.">Conclusion</head>,
 <head n="6.1.">Deep MIL approaches</head>,
 <head n="6.2.">Code</head>,
 <head n="6.3.">Classical MIL datasets</head>,
 <head>Additional details In</head>,
 <head>Figure 1 .</head>,
 <head>Figure 2 .</head>,
 <head>Figure 3 .</head>,
 <head/>,
 <head>Figure 4 .</head>,
 <head>Table 3 .Figure 5 .</head>,
 <head>Figure 7 . 00017 Figure 8 .</head>,
 <head>Figure 10 .</head>,
 

In [29]:
headings=[]
for div in soup1.body.find_all("head"):
    # div is neither an appendix nor references, just plain text.
    try:
        if not div.get("type"):
            div_text = div.get_text(separator=' ', strip=True)
            headings.append((div_text, div['n']))
        #print(divs_text)
    except KeyError:
        headings.append(div_text)
        pass
    
headings

[('Introduction', '1.'),
 ('Methodology', '2.'),
 ('Multiple instance learning (MIL)', '2.1.'),
 ('MIL with Neural Networks', '2.2.'),
 ('MIL pooling', '2.3.'),
 ('Attention-based MIL pooling', '2.4.'),
 'Attention mechanism',
 ('Related work', '3.'),
 'MIL and attention',
 ('Experiments', '4.'),
 ('Classical MIL datasets', '4.1.'),
 ('MNIST-bags', '4.2.'),
 'Results and discussion',
 ('Histopathology datasets', '4.3.'),
 'METHOD',
 ('Conclusion', '5.'),
 ('Deep MIL approaches', '6.1.'),
 ('Code', '6.2.'),
 ('Classical MIL datasets', '6.3.'),
 'Additional details In',
 'Figure 1 .',
 'Figure 2 .',
 'Figure 3 .',
 '',
 'Figure 4 .',
 'Table 3 .Figure 5 .',
 'Figure 7 . 00017 Figure 8 .',
 'Figure 10 .',
 'Figure 11 .Figure 12 .',
 'Table 1 .',
 'Table 4 .',
 'Table 6 .',
 'Table 7 .',
 'Table 8 .',
 'Table 10 .',
 'Table 11 .',
 'Table 12 .Table 13 .',
 'Table 15 .',
 'Table 17 .']

In [30]:
d={}
j=0
for i in range(len(divs_text)):
    j=j+1
    if(type(headings[i])==tuple):
        if(headings[i][1].split('.')[0] not in d.keys()):
            x=divs_text[i].split(' ')[len(headings[i][0].split(' ')):]
            y=' '.join(x)
            d[headings[i][1].split('.')[0]]=[(headings[i][0], y)]
        else:
            x=divs_text[i].split(' ')[len(headings[i][0].split(' ')):]
            y=' '.join(x)
            d[headings[i][1].split('.')[0]].append((headings[i][0], y))
    
    else:
        #print(divs_text[i])
        d[str(j)]=[divs_text[i]]
        
d

{'1': [('Introduction',
   'In typical machine learning problems like image classification it is assumed that an image clearly represents a category (a class). However, in many real-life applications multiple instances are observed and only a general statement of the category is given. This scenario is called multiple instance learning (MIL) (Dietterich et al., 1997; Maron & Lozano-Pérez, 1998) or, learning from weakly annotated data (Oquab et al., 2014) . The problem of weakly annotated data is especially apparent in medical imaging (Quellec et al., 2017 ) (e.g., computational pathology, mammography or CT lung screening) where an image is typically described by a single label (benign/malignant) or a Region Of Interest (ROI) is roughly given. MIL deals with a bag of instances for which a single class label is assigned. Hence, the main goal of MIL is to learn a Proceedings of the 35 th International Conference on Machine Learning, Stockholm, Sweden, PMLR 80, 2018 . Copyright 2018 by the

In [26]:
type((1,2))==tuple

True

In [25]:
sentence = ['this','is','a','sentence']
'-'.join(sentence)


'this-is-a-sentence'

In [23]:
for i in range(len(divs_text)):
    x=divs_text[i]
    y=headings[i][0]
    
    print(x.split(' ')[0:len(y.split(' '))])
    print(y)

['Introduction']
Introduction
['Background']
Background
['Model', 'Architecture']
Model Architecture
['Encoder', 'and', 'Decoder', 'Stacks']
Encoder and Decoder Stacks
['Attention']
Attention
['Scaled', 'Dot-Product', 'Attention']
Scaled Dot-Product Attention
['Multi-Head', 'Attention']
Multi-Head Attention
['Applications', 'of', 'Attention', 'in', 'our', 'Model']
Applications of Attention in our Model
['Position-wise', 'Feed-Forward', 'Networks']
Position-wise Feed-Forward Networks
['Embeddings', 'and', 'Softmax']
Embeddings and Softmax
['Positional', 'Encoding']
Positional Encoding
['Why', 'Self-Attention']
Why Self-Attention
['Training']
Training
['Training', 'Data', 'and', 'Batching']
Training Data and Batching
['Hardware', 'and', 'Schedule']
Hardware and Schedule
['Optimizer']
Optimizer
['Regularization']
Regularization
['Results']
Results
['Machine', 'Translation']
Machine Translation
['Model', 'Variations']
Model Variations
['Conclusion']
Conclusion
['Acknowledgements']
F


In [28]:
final=[]
for i in divs_text:

SyntaxError: unexpected EOF while parsing (<ipython-input-28-fa22775edf15>, line 2)

In [29]:
#d={}
if(2 not in d.keys()):
    d[2]=[1]
else:
    d[2].append(2)
d

{'1': ['Introduction Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5] . Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [31, 21, 13] . Recurrent models typically factor computation along the symbol positions of the input and output sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden states h t , as a function of the previous hidden state h t−1 and the input for position t. This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples. Recent work has achieved significant improvements in computational efficiency thr

In [None]:
#for div in soup1.body.find_all("head"):
#    print(type(div))
#    print(div.name)
#    print(div['n'])