In [8]:
from bs4 import BeautifulSoup

In [None]:
# Parse a single sample TEI file into a BeautifulSoup tree using the 'lxml' parser.
# `soup` is reused by the following cells.
tei_doc = 'sample.tei.xml'
with open(tei_doc, 'r') as tei:
    soup = BeautifulSoup(tei, 'lxml')

In [None]:
# Show the text of the document's title element.
soup.title.getText()

In [None]:
# Show the abstract text: fragments are stripped and joined with single spaces.
soup.abstract.getText(separator=' ', strip=True)

In [9]:
# helpers:
def read_tei(tei_file):
    """Parse a TEI XML file into a BeautifulSoup tree.

    Args:
        tei_file: Path of the TEI file to read.

    Returns:
        The BeautifulSoup object built from the file with the 'lxml' parser.

    Raises:
        OSError: Propagated from ``open`` when the file cannot be opened.
    """
    # The former trailing ``raise RuntimeError(...)`` could never execute: the
    # ``with`` body always returns or propagates its own exception first.
    with open(tei_file, 'r') as tei:
        return BeautifulSoup(tei, 'lxml')

def elem_to_text(elem, default=''):
    """Return ``elem.getText()`` for a truthy element, otherwise ``default``."""
    return elem.getText() if elem else default

from dataclasses import dataclass

@dataclass
class Person:
    """Name parts of one person, stored as plain strings.

    TEIFile.authors builds one instance per author, positionally as
    (firstname, middlename, surname).
    """
    firstname: str
    middlename: str
    surname: str

# Example: build a Person by keyword and interpolate two of its fields into a sentence.
turing_author = Person(firstname='Alan', middlename='M', surname='Turing')

f"{turing_author.firstname} {turing_author.surname} authored many influential publications in computer science."



'Alan Turing authored many influential publications in computer science.'

In [10]:
class TEIFile(object):
    """Read-only view over one parsed TEI document.

    The file is parsed once in ``__init__`` (through ``read_tei``). The
    properties pull individual pieces out of the tree and return an empty
    value (``''`` or ``[]``) when the corresponding element is absent, the
    same way ``doi`` does.
    """

    def __init__(self, filename):
        """Parse ``filename`` and initialise the per-property caches."""
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._title = ''
        self._abstract = ''

    @property
    def doi(self):
        """Text of the first ``idno`` element with ``type="DOI"``, or ``''``."""
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        return idno_elem.getText()

    @property
    def title(self):
        """Text of the document's title element, or ``''`` when there is none."""
        if not self._title:
            title_elem = self.soup.title
            # A missing element leaves the cache at '' instead of raising
            # AttributeError on None.
            if title_elem is not None:
                self._title = title_elem.getText()
        return self._title

    @property
    def abstract(self):
        """Abstract text joined with single spaces, or ``''`` when there is none."""
        if not self._abstract:
            abstract_elem = self.soup.abstract
            if abstract_elem is not None:
                self._abstract = abstract_elem.getText(separator=' ', strip=True)
        return self._abstract

    @property
    def authors(self):
        """List of ``Person`` objects for the authors under the ``analytic`` element.

        Authors without a ``persname`` child are skipped. Returns ``[]`` when
        the document has no ``analytic`` element.
        """
        analytic = self.soup.analytic
        if analytic is None:
            return []

        result = []
        for author in analytic.find_all('author'):
            persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename", type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            result.append(Person(firstname, middlename, surname))
        return result

    @property
    def text(self):
        """List with the text of each untyped ``div`` in the body.

        The first ``div`` of the body is always skipped, and any ``div`` that
        carries a ``type`` attribute (appendix, references, ...) is left out.
        Returns ``[]`` when the document has no body. The result is a list of
        strings, one per div, not a single joined string.
        """
        # ``None`` is the "not computed yet" marker, so an empty result is cached too.
        if self._text is None:
            divs_text = []
            body = self.soup.body
            if body is not None:
                for div in body.find_all("div")[1:]:
                    # div is neither an appendix nor references, just plain text.
                    if not div.get("type"):
                        divs_text.append(div.get_text(separator=' ', strip=True))
            self._text = divs_text
        return self._text

In [None]:
# Load a second sample through TEIFile and show its title and author records.
tei = TEIFile('sample2.tei.xml')
f"The authors of the paper entitled '{tei.title}' are {tei.authors}"

In [12]:
def make_divs(soup):
    """Return the text of each untyped ``div`` in the body, skipping the first div.

    Text fragments inside a div are stripped and joined with single spaces.
    """
    candidates = soup.body.find_all("div")[1:]
    # A div with a "type" attribute is an appendix or references, not plain text.
    return [
        section.get_text(separator=' ', strip=True)
        for section in candidates
        if not section.get("type")
    ]


In [None]:
# Parse sample2 with the 'xml' parser into a separate tree (soup1).
tei_doc = 'sample2.tei.xml'
with open(tei_doc, 'r') as tei:
    soup1 = BeautifulSoup(tei, 'xml')

In [35]:
def make_head(soup1, maxx):
    """Collect the body's headings and track the largest top-level section number.

    Args:
        soup1: Parsed document exposing ``body.find_all("head")``.
        maxx: Starting value of the running maximum section number.

    Returns:
        ``(headings, maxx)``. ``headings`` has one entry per ``head`` element
        that has no ``type`` attribute, in document order:

        * ``(text, n)`` when the element has an ``n`` attribute whose part
          before the first ``'.'`` parses as an integer;
        * the plain ``text`` string otherwise (no ``n`` attribute, or a
          non-numeric one such as ``"I."``).

        ``maxx`` is the larger of the input value and every integer section
        number seen.
    """
    headings = []
    for head in soup1.body.find_all("head"):
        if head.get("type"):
            continue
        head_text = head.get_text(separator=' ', strip=True)
        number = head.get("n")
        major = _major_section_number(number)
        if major is None:
            # Unnumbered (or non-numeric) heading: keep the text only, so that
            # consumers never call int() on something that is not a number.
            headings.append(head_text)
            continue
        headings.append((head_text, number))
        if major > maxx:
            maxx = major
    return headings, maxx


def _major_section_number(number):
    """Return the integer before the first '.' of ``number``, or None if there is none."""
    try:
        return int(number.split('.')[0])
    except (AttributeError, ValueError):
        return None

In [38]:
def make_d(divs_text, headings, maxx):
    """Group div texts into sections keyed by top-level section number.

    ``divs_text[i]`` is paired with ``headings[i]`` by position.

    Args:
        divs_text: List of div text strings.
        headings: List whose items are ``(title, number)`` tuples for numbered
            headings or plain title strings for unnumbered ones.
        maxx: Largest top-level section number in the document.

    Returns:
        A dict with string keys:

        * numbered heading -> key is the part of ``number`` before the first
          ``'.'``; the value list gets ``(title, text_without_title)``;
        * unnumbered heading while the current section number is below
          ``maxx`` -> appended to the current section as
          ``(title, text_without_title)``;
        * anything else (no heading at that position, or the last numbered
          section has been reached) -> key is the 1-based position and the
          value is ``[div_text]``, which replaces any existing entry under
          that key.
    """
    d = {}
    max_t_now = 0  # largest top-level section number seen so far
    for i, div_text in enumerate(divs_text):
        has_heading = i < len(headings)
        if has_heading and isinstance(headings[i], tuple):
            title = headings[i][0]
            major = headings[i][1].split('.')[0]
            max_t_now = max(max_t_now, int(major))
            d.setdefault(major, []).append((title, _drop_heading_words(div_text, title)))
        elif has_heading and max_t_now < maxx:
            title = headings[i]
            # setdefault: an unnumbered heading may come before any numbered
            # one, in which case section "0" does not exist yet.
            d.setdefault(str(max_t_now), []).append((title, _drop_heading_words(div_text, title)))
        else:
            d[str(i + 1)] = [div_text]
    return d


def _drop_heading_words(text, heading):
    """Remove from the front of ``text`` as many space-separated words as ``heading`` has."""
    # presumably the div text starts with its own heading; verify against make_divs output
    return ' '.join(text.split(' ')[len(heading.split(' ')):])

In [2]:
import os

# Rename every *.tei.xml file under ../papers-xmls: dots in the base name and
# spaces become underscores, and the ".tei.xml" suffix is kept.
# os.walk also descends into subdirectories, so both paths are built from
# `root`, the directory that actually contains the file.
for root, dirs, files in os.walk("../papers-xmls"):
    for file in files:
        if(file.endswith(".tei.xml")):
            x='_'.join(file.split('.')[:-2])
            y='_'.join(x.split(' '))
            print(y)
            os.rename(os.path.join(root, file), os.path.join(root, y+".tei.xml"))
            

DCFNet__Deep_Neural_Network_with_Decomposed_Convolutional_Filters
Modeling_Semantic_Expectation__Using_Script_Knowledge_for_Referent_Prediction
EQUATION_PARSING___Mapping_Sentences_to_Grounded_Equations
Multi-task_Learning_with_Labeled_and_Unlabeled_Tasks
CoSimRank__A_Flexible___Efficient_Graph-Theoretic_Similarity_Measure
Miscommunication_Recovery_in_Physically_Situated_Dialogue
An_Empirical_Study_of_Self-Disclosure_in_Spoken_Dialogue_Systems
Targeted_Syntactic_Evaluation_of_Language_Models
Supervised_Learning_of_Automatic_Pyramid_for_Optimization-Based_Multi-Document_Summarization
TextFlow__A_Text_Similarity_Measure_based_on_Continuous_Sequences
Colorless_green_recurrent_networks_dream_hierarchically
Stochastic_Gradient_Monomial_Gamma_Sampler
Comparing_Dynamics__Deep_Neural_Networks_versus_Glassy_Systems
RESIDE__Improving_Distantly-Supervised_Neural_Relation_Extraction_using_Side_Information
Trainable_Greedy_Decoding_for_Neural_Machine_Translation
Bayesian_Uncertainty_Estimation_for_

In [9]:
# List the directory to inspect the current file names.
os.listdir("../papers-xmls/")

['DCFNet_ Deep Neural Network with Decomposed Convolutional Filters.tei.xml',
 'Modeling Semantic Expectation_ Using Script Knowledge for Referent Prediction.tei.xml',
 'EQUATION PARSING _ Mapping Sentences to Grounded Equations.tei.xml',
 'Multi-task Learning with Labeled and Unlabeled Tasks.tei.xml',
 'CoSimRank_ A Flexible _ Efficient Graph-Theoretic Similarity Measure.tei.xml',
 'Miscommunication Recovery in Physically Situated Dialogue.tei.xml',
 'An Empirical Study of Self-Disclosure in Spoken Dialogue Systems.tei.xml',
 'Targeted Syntactic Evaluation of Language Models.tei.xml',
 'Supervised Learning of Automatic Pyramid for Optimization-Based Multi-Document Summarization.tei.xml',
 'TextFlow_ A Text Similarity Measure based on Continuous Sequences.tei.xml',
 'Colorless green recurrent networks dream hierarchically.tei.xml',
 'Stochastic Gradient Monomial Gamma Sampler.tei.xml',
 'Comparing Dynamics_ Deep Neural Networks versus Glassy Systems.tei.xml',
 'RESIDE_ Improving Distan

In [23]:
# Shell escape: create the directory OUT/a.
!mkdir OUT/"a"

In [4]:
import os

# Create OUT/<y>. makedirs with exist_ok=True also creates OUT when it is
# missing and does not fail when the cell is run again.
y="a"
os.makedirs(os.path.join("OUT", y), exist_ok=True)

In [15]:
def make_files(d, dest):
    """Write one text file per entry of ``d`` into the directory ``dest``.

    Args:
        d: Dict with string keys. Each value is a non-empty list, either of
            ``(heading, text)`` tuples or holding a single text string.
        dest: Directory prefix. It is concatenated with the file name as-is,
            so it must end with a path separator and must already exist.

    For a list of tuples the file is named ``<key>_<first heading with spaces
    replaced by underscores>`` and contains every tuple's text, each preceded
    by one space. Otherwise the file is named ``<key>`` and contains the first
    list element. The file name and the content are also printed.
    """
    import os

    for key in sorted(d):
        entries = d[key]
        if type(entries[0]) == tuple:
            first_heading = entries[0][0]
            y = key + '_' + '_'.join(first_heading.split(' '))
            # from subsection take subsection text
            put = "".join(" " + entry[1] for entry in entries)
        else:
            y = str(key)
            put = entries[0]

        # A heading such as "Training/Testing" would otherwise be read as a
        # path into a directory that does not exist.
        for sep in ('/', os.sep):
            y = y.replace(sep, '_')

        print(y)  # filename is y
        print(put)
        # `with` closes the handle even when the write fails.
        with open(dest + y, "w") as text_file:
            text_file.write(put)

## the abstract is handled separately (written by main())

In [33]:
def main():
    """Split every TEI file under ../papers-xmls into per-section text files.

    For each ``*.tei.xml`` file a directory ``OUT/<name>/`` is created, where
    ``<name>`` is the file's base name with dots and spaces replaced by
    underscores. The abstract goes to ``OUT/<name>/abstract`` (empty when the
    document has none) and the sections are written by ``make_files``.
    Progress (name, destination, source path) is printed for every file.
    """
    import os
    from bs4 import BeautifulSoup

    # os.walk also descends into subdirectories, so the source path is built
    # from `root`, the directory that actually contains the file.
    for root, _dirs, files in os.walk("../papers-xmls"):
        for file in files:
            if not file.endswith(".tei.xml"):
                continue

            x = '_'.join(file.split('.')[:-2])
            y = '_'.join(x.split(' '))
            print(y)
            dest = "OUT/" + y + "/"
            print(dest)
            # The output directory has to exist before any file is opened in it.
            os.makedirs(dest, exist_ok=True)

            tei_doc = os.path.join(root, file)
            print(tei_doc)
            with open(tei_doc, 'r') as tei:
                markup = tei.read()

            # The same markup is parsed twice: make_divs runs on the 'lxml'
            # tree and make_head on the 'xml' tree.
            soup = BeautifulSoup(markup, 'lxml')
            soup1 = BeautifulSoup(markup, 'xml')

            divs_text = make_divs(soup)
            headings, maxx = make_head(soup1, 0)
            d = make_d(divs_text, headings, maxx)

            abstract = soup.abstract
            abstract_text = ''
            if abstract is not None:
                abstract_text = abstract.getText(separator=' ', strip=True)
            with open(dest + "abstract", "w") as text_file:
                text_file.write(abstract_text)

            make_files(d, dest)

In [36]:
# Run the extraction over every TEI file under ../papers-xmls.
main()

Multiple_Instance_Learning_Networks_for_Fine-Grained_Sentiment_Analysis
OUT/Multiple_Instance_Learning_Networks_for_Fine-Grained_Sentiment_Analysis/
../papers-xmls/Multiple_Instance_Learning_Networks_for_Fine-Grained_Sentiment_Analysis.tei.xml
1_Introduction
 Sentiment analysis has become a fundamental area of research in Natural Language Processing thanks to the proliferation of user-generated content in the form of online reviews, blogs, internet forums, and social media. A plethora of methods have been proposed in the literature that attempt to distill sentiment information from text, allowing users and service providers to make opinion-driven decisions. The success of neural networks in a variety of applications (Bahdanau et al., 2015; Le and Mikolov, 2014; Socher et al., 2013) and the availability of large amounts of labeled data have led to an increased focus on sentiment classification. Supervised models are typically trained on documents (Johnson and Zhang, 2015a; Johnson and Z

1_Introduction
 Many natural language processing tasks require document creation time (DCT) information as a useful additional metadata. Tasks such as information retrieval (Li and Croft, 2003; Dakka et al., 2008) , temporal scoping of events and facts (Allan et al., 1998; Talukdar et al., 2012b) , document summarization (Wan, 2007) and analysis (de Jong et al., 2005a) require precise and validated creation time of the documents. Most of the documents obtained from the Web either contain DCT that cannot be trusted or contain no DCT information at all (Kanhabua and Nørvåg, 2008) . Thus, predicting the time of these documents based on their content is an important task, often referred to as Document Dating. A few generative approaches (de Jong et al., 2005b; Kanhabua and Nørvåg, 2008) as well as a discriminative model (Chambers, 2012) have been previously proposed for this task. Kotsakos et al. (2014) employs term-burstiness resulting in improved precision on this task. Recently proposed

IndexError: list index out of range