In [7]:
from bs4 import BeautifulSoup

In [72]:
tei_doc = "../papers-xmls/Learning_to_Optimize_Combinatorial_Functions.tei.xml"
with open(tei_doc, 'r') as tei:
    soup = BeautifulSoup(tei, 'lxml')
    

In [29]:
soup.title

<title level="a" type="main">Multiple Instance Learning Networks for Fine-Grained Sentiment Analysis</title>

In [63]:
soup.title.getText()

'Learning a Lexicon and Translation Model from Phoneme Lattices'

In [19]:
soup.abstract

<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>We consider the task of fine-grained sentiment analysis from the perspective of multiple instance learning (MIL). Our neural model is trained on document sentiment labels, and learns to predict the sentiment of text segments, i.e. sentences or elementary discourse units (EDUs), without segment-level supervision. We introduce an attention-based polarity scoring method for identifying positive and negative text snippets and a new dataset which we call SPOT (as shorthand for Segment-level POlariTy annotations) for evaluating MILstyle sentiment models like ours. Experimental results demonstrate superior performance against multiple baselines, whereas a judgement elicitation study shows that EDU-level opinion extraction produces more informative summaries than sentence-based alternatives.</p></div>
</abstract>

In [20]:
soup.abstract.getText(separator=' ', strip=True)

'We consider the task of fine-grained sentiment analysis from the perspective of multiple instance learning (MIL). Our neural model is trained on document sentiment labels, and learns to predict the sentiment of text segments, i.e. sentences or elementary discourse units (EDUs), without segment-level supervision. We introduce an attention-based polarity scoring method for identifying positive and negative text snippets and a new dataset which we call SPOT (as shorthand for Segment-level POlariTy annotations) for evaluating MILstyle sentiment models like ours. Experimental results demonstrate superior performance against multiple baselines, whereas a judgement elicitation study shows that EDU-level opinion extraction produces more informative summaries than sentence-based alternatives.'

In [7]:
abstract_text = soup.abstract.getText(separator=' ', strip=True)
'movement' in abstract_text.lower(), 'ecology' in abstract_text.lower(), 'computer' in abstract_text.lower()

(False, False, False)

In [21]:
def read_tei(tei_file):
    with open(tei_file, 'r') as tei:
        soup = BeautifulSoup(tei, 'lxml')
        return soup
    raise RuntimeError('Cannot generate a soup from the input')

In [22]:
def elem_to_text(elem, default=''):
    if elem:
        return elem.getText()
    else:
        return default

In [23]:
from dataclasses import dataclass

@dataclass
class Person:
    firstname: str
    middlename: str
    surname: str

turing_author = Person(firstname='Alan', middlename='M', surname='Turing')

f"{turing_author.firstname} {turing_author.surname} authored many influential publications in computer science."


'Alan Turing authored many influential publications in computer science.'

In [24]:
class TEIFile(object):
    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._title = ''
        self._abstract = ''

    @property
    def doi(self):
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        else:
            return idno_elem.getText()

    @property
    def title(self):
        if not self._title:
            self._title = self.soup.title.getText()
        return self._title

    @property
    def abstract(self):
        if not self._abstract:
            abstract = self.soup.abstract.getText(separator=' ', strip=True)
            self._abstract = abstract
        return self._abstract

    @property
    def authors(self):
        authors_in_header = self.soup.analytic.find_all('author')

        result = []
        for author in authors_in_header:
            persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename", type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            person = Person(firstname, middlename, surname)
            result.append(person)
        return result
    
    @property
    def text(self):
        divs_text = []
        if not self._text:
            
            for div in self.soup.body.find_all("div")[1:]:
                # div is neither an appendix nor references, just plain text.
                if not div.get("type"):
                    div_text = div.get_text(separator=' ', strip=True)
                    divs_text.append(div_text)

            plain_text = " ".join(divs_text)
            self._text = divs_text
        return self._text

In [12]:
tei = TEIFile('sample.tei.xml')
f"The authors of the paper entitled '{tei.title}' are {tei.authors}"

"The authors of the paper entitled 'Attention Is All You Need' are [Person(firstname='Ashish', middlename='', surname='Vaswani'), Person(firstname='Google', middlename='', surname='Brain'), Person(firstname='Noam', middlename='', surname='Shazeer'), Person(firstname='Google', middlename='', surname='Brain'), Person(firstname='Niki', middlename='', surname='Parmar'), Person(firstname='Jakob', middlename='', surname='Uszkoreit'), Person(firstname='Llion', middlename='', surname='Jones'), Person(firstname='Aidan', middlename='N', surname='Gomez'), Person(firstname='Łukasz', middlename='', surname='Kaiser'), Person(firstname='Google', middlename='', surname='Brain'), Person(firstname='Illia', middlename='', surname='Polosukhin')]"

In [13]:
tei.text

['Introduction Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5] . Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [31, 21, 13] . Recurrent models typically factor computation along the symbol positions of the input and output sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden states h t , as a function of the previous hidden state h t−1 and the input for position t. This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples. Recent work has achieved significant improvements in computational efficiency through f

In [14]:
head_datas = [head.get_text() for head in soup.find_all('head')]
head_datas

[]

In [15]:
soup.find_all('h1')

[]

In [16]:
soup.find_all('div')[2]

<div xmlns="http://www.tei-c.org/ns/1.0">Background<p>The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU <ref target="#b19" type="bibr">[20]</ref>, ByteNet <ref target="#b14" type="bibr">[15]</ref> and ConvS2S <ref target="#b7" type="bibr">[8]</ref>, all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions. In these models, the number of operations required to relate signals from two arbitrary input or output positions grows in the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes it more difficult to learn dependencies between distant positions <ref target="#b10" type="bibr">[11]</ref>. In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as

In [73]:
divs_text=[]
for div in soup.body.find_all("div")[1:]:
    # div is neither an appendix nor references, just plain text.
    if not div.get("type"):
        div_text = div.get_text(separator=' ', strip=True)
        divs_text.append(div_text)
        #rint(divs_text)




In [45]:
div_text.split(' ')[0]

'Acknowledgements'

In [74]:
divs_text

['Introduction Combinatorial optimization aims to optimize an objective function over a set of feasible solutions defined on a discrete space. Numerous real-life decision-making problems can be formulated as combinatorial optimization problems (Korte et al. 2012; Trevisan 2011) . In the last decade, development of time-efficient algorithms for combinatorial optimization problems paved the way for these algorithms to be widely utilized in industry, including, but not limited to, in resource allocation (Angalakudati et al. 2014 ), efficient energy scheduling (Ngueveu, Artigues, and Lopez 2016), price optimization (Ferreira, Lee, and Simchi-Levi 2015) , sales promotion planning (Cohen et al. 2017) , etc. The last decade has, in parallel, witnessed a tremendous growth in machine learning (ML) methods, which can produce very accurate predictions by leveraging historical and contextual data. In real-world applications, not all parameters of an optimization problem are known at the time of Co

In [17]:
soup.find_all('div')[2]

<div xmlns="http://www.tei-c.org/ns/1.0">Background<p>The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU <ref target="#b19" type="bibr">[20]</ref>, ByteNet <ref target="#b14" type="bibr">[15]</ref> and ConvS2S <ref target="#b7" type="bibr">[8]</ref>, all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions. In these models, the number of operations required to relate signals from two arbitrary input or output positions grows in the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes it more difficult to learn dependencies between distant positions <ref target="#b10" type="bibr">[11]</ref>. In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as

In [18]:
soup.prettify()

'<?xml version="1.0" encoding="UTF-8"?>\n<html>\n <body>\n  <tei xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemalocation="http://www.tei-c.org/ns/1.0 /home/varun/Desktop/NLP/Projects/grobid-0.5.6/grobid-home/schemas/xsd/Grobid.xsd">\n   <teiheader xml:lang="en">\n    <filedesc>\n     <titlestmt>\n      <title level="a" type="main">\n       Attention Is All You Need\n      </title>\n     </titlestmt>\n     <publicationstmt>\n      <publisher>\n      </publisher>\n      <availability status="unknown">\n       <licence>\n       </licence>\n      </availability>\n     </publicationstmt>\n     <sourcedesc>\n      <biblstruct>\n       <analytic>\n        <author>\n         <persname xmlns="http://www.tei-c.org/ns/1.0">\n          <forename type="first">\n           Ashish\n          </forename>\n          <surname>\n           Vaswani\n          </surname>\n         </persname

In [75]:
tei_doc = "../papers-xmls/Learning_to_Optimize_Combinatorial_Functions.tei.xml"
#tei_doc = "../papers-xmls/Learning_a_Lexicon_and_Translation_Model_from_Phoneme_Lattices.tei.xml"
with open(tei_doc, 'r') as tei:
    soup1 = BeautifulSoup(tei, 'xml')

In [76]:
soup1.find_all('head')

[<head>Introduction</head>,
 <head>Related Work</head>,
 <head>Problem Formulation and Approach</head>,
 <head>Two Stage Learning</head>,
 <head>Smart Predict then Optimize (SPO)</head>,
 <head>Combinatorial problems and scaling up</head>,
 <head>Experimental Evaluation</head>,
 <head>RQ1: exact versus weaker oracles</head>,
 <head>Instance</head>,
 <head>RQ2 benefits of warmstarting</head>,
 <head>RQ3: SPO versus QPTL</head>,
 <head>RQ4: Suitability on large, hard optimisation instances</head>,
 <head>Conclusions and future work</head>,
 <head>Figure 1 :</head>,
 <head>Figure 3 :</head>,
 <head>Figure 5 :</head>,
 <head>Table 1 :</head>,
 <head>Table 2 :</head>,
 <head>Acknowledgments</head>]

In [77]:
headings=[]
maxx=0
for div in soup1.body.find_all("head"):
    # div is neither an appendix nor references, just plain text.
    try:
        if not div.get("type"):
            div_text = div.get_text(separator=' ', strip=True)
            headings.append((div_text, div['n']))
            if(int(div['n'].split('.')[0])>maxx):
                maxx=int(div['n'])
        #print(divs_text)
    except KeyError:
        headings.append(div_text)
        pass
    
headings
#maxx

['Introduction',
 'Related Work',
 'Problem Formulation and Approach',
 'Two Stage Learning',
 'Smart Predict then Optimize (SPO)',
 'Combinatorial problems and scaling up',
 'Experimental Evaluation',
 'RQ1: exact versus weaker oracles',
 'Instance',
 'RQ2 benefits of warmstarting',
 'RQ3: SPO versus QPTL',
 'RQ4: Suitability on large, hard optimisation instances',
 'Conclusions and future work',
 'Figure 1 :',
 'Figure 3 :',
 'Figure 5 :',
 'Table 1 :',
 'Table 2 :']

In [78]:
len(divs_text)

14

In [79]:
d={}
j=0
max_t_now=0
for i in range(len(divs_text)):
    j=j+1
    if(i<len(headings) and type(headings[i])==tuple):
        if(max_t_now<int(headings[i][1].split('.')[0])):
                max_t_now=int(headings[i][1].split('.')[0])
                
        if(headings[i][1].split('.')[0] not in d.keys()):
            x=divs_text[i].split(' ')[len(headings[i][0].split(' ')):]
            y=' '.join(x)
            d[headings[i][1].split('.')[0]]=[(headings[i][0], y)]
        else:
            x=divs_text[i].split(' ')[len(headings[i][0].split(' ')):]
            y=' '.join(x)
            d[headings[i][1].split('.')[0]].append((headings[i][0], y))
    
    elif (max_t_now<maxx and i<len(headings)):
        x=divs_text[i].split(' ')[len(headings[i].split(' ')):]
        y=' '.join(x)
        d[str(max_t_now)].append((headings[i], y))
        
    else:
        #print(divs_text[i])
        d[str(j)]=[divs_text[i]]
        
d

{'1': ['Introduction Combinatorial optimization aims to optimize an objective function over a set of feasible solutions defined on a discrete space. Numerous real-life decision-making problems can be formulated as combinatorial optimization problems (Korte et al. 2012; Trevisan 2011) . In the last decade, development of time-efficient algorithms for combinatorial optimization problems paved the way for these algorithms to be widely utilized in industry, including, but not limited to, in resource allocation (Angalakudati et al. 2014 ), efficient energy scheduling (Ngueveu, Artigues, and Lopez 2016), price optimization (Ferreira, Lee, and Simchi-Levi 2015) , sales promotion planning (Cohen et al. 2017) , etc. The last decade has, in parallel, witnessed a tremendous growth in machine learning (ML) methods, which can produce very accurate predictions by leveraging historical and contextual data. In real-world applications, not all parameters of an optimization problem are known at the time

In [60]:
def make_files(d, dest):
    for key in sorted(d):
        #text_file = open(d[key][0][0]+".txt", "w")
        #print(d[key][0])
        put=""
        if(type(d[key][0])==tuple):
            x=d[key][0][0]
            y=key+'_'+('_'.join(x.split(' ')))
            for i in d[key]:
                put=put+" "+i[1]    # from subsection take subsection text
        else:
            x=str(key)
            y=x
            put=d[key][0]
        
        print(y)  #filename is y
        print(put)
        text_file = open(dest+y, "w")
        n = text_file.write(put)
        text_file.close()

##for abstract seperately

In [61]:
make_files(d,"OUT/Multiple_Instance_Learning_Networks_for_Fine-Grained_Sentiment_Analysis/" )

1_Introduction
 Sentiment analysis has become a fundamental area of research in Natural Language Processing thanks to the proliferation of user-generated content in the form of online reviews, blogs, internet forums, and social media. A plethora of methods have been proposed in the literature that attempt to distill sentiment information from text, allowing users and service providers to make opinion-driven decisions. The success of neural networks in a variety of applications (Bahdanau et al., 2015; Le and Mikolov, 2014; Socher et al., 2013) and the availability of large amounts of labeled data have led to an increased focus on sentiment classification. Supervised models are typically trained on documents (Johnson and Zhang, 2015a; Johnson and Zhang, 2015b; Tang et al., 2015; Yang et al., 2016) , sentences (Kim, 2014) , or phrases (Socher et al., 2011; [Rating: ] I had a very mixed experience at The Stand. The burger and fries were good. The chocolate shake was divine: rich and creamy

In [34]:
text_file = open("sample.txt", "w")
n = text_file.write(soup.abstract.getText(separator=' ', strip=True))
text_file.close()

In [33]:
soup.abstract.getText(separator=' ', strip=True)

'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.0 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. * Equal contribution. Li

In [25]:
sentence = ['this','is','a','sentence']
'-'.join(sentence)


'this-is-a-sentence'

In [23]:
for i in range(len(divs_text)):
    x=divs_text[i]
    y=headings[i][0]
    
    print(x.split(' ')[0:len(y.split(' '))])
    print(y)

['Introduction']
Introduction
['Background']
Background
['Model', 'Architecture']
Model Architecture
['Encoder', 'and', 'Decoder', 'Stacks']
Encoder and Decoder Stacks
['Attention']
Attention
['Scaled', 'Dot-Product', 'Attention']
Scaled Dot-Product Attention
['Multi-Head', 'Attention']
Multi-Head Attention
['Applications', 'of', 'Attention', 'in', 'our', 'Model']
Applications of Attention in our Model
['Position-wise', 'Feed-Forward', 'Networks']
Position-wise Feed-Forward Networks
['Embeddings', 'and', 'Softmax']
Embeddings and Softmax
['Positional', 'Encoding']
Positional Encoding
['Why', 'Self-Attention']
Why Self-Attention
['Training']
Training
['Training', 'Data', 'and', 'Batching']
Training Data and Batching
['Hardware', 'and', 'Schedule']
Hardware and Schedule
['Optimizer']
Optimizer
['Regularization']
Regularization
['Results']
Results
['Machine', 'Translation']
Machine Translation
['Model', 'Variations']
Model Variations
['Conclusion']
Conclusion
['Acknowledgements']
F


In [28]:
final=[]
for i in divs_text:

SyntaxError: unexpected EOF while parsing (<ipython-input-28-fa22775edf15>, line 2)

In [29]:
#d={}
if(2 not in d.keys()):
    d[2]=[1]
else:
    d[2].append(2)
d

{'1': ['Introduction Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5] . Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [31, 21, 13] . Recurrent models typically factor computation along the symbol positions of the input and output sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden states h t , as a function of the previous hidden state h t−1 and the input for position t. This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples. Recent work has achieved significant improvements in computational efficiency thr

In [None]:
#for div in soup1.body.find_all("head"):
#    print(type(div))
#    print(div.name)
#    print(div['n'])