In [1]:
from bs4 import BeautifulSoup

In [None]:
# Parse one sample TEI document into a BeautifulSoup tree (lxml parser).
tei_doc = 'sample.tei.xml'
with open(tei_doc) as tei:  # text mode for reading is open()'s default
    soup = BeautifulSoup(tei, 'lxml')

In [None]:
# Display the text content of the first <title> element in the parsed document.
soup.title.getText()

In [None]:
# Display the abstract's text: each text fragment is stripped of surrounding
# whitespace and the fragments are joined with single spaces.
soup.abstract.getText(separator=' ', strip=True)

In [None]:
# helpers:
def read_tei(tei_file):
    """Parse a TEI XML file into a BeautifulSoup tree.

    Args:
        tei_file: Path of the TEI XML file to read.

    Returns:
        The ``BeautifulSoup`` object built from the file's contents with the
        ``lxml`` parser.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    # Read bytes rather than text so the parser detects the document's own
    # encoding instead of relying on the platform's default text encoding.
    # (The previous trailing ``raise RuntimeError`` could never execute: the
    # function either returned from inside the ``with`` or propagated the
    # exception raised by ``open``/``BeautifulSoup``.)
    with open(tei_file, 'rb') as tei:
        return BeautifulSoup(tei, 'lxml')

def elem_to_text(elem, default=''):
    """Return the text of ``elem``, falling back to ``default``.

    Args:
        elem: An object exposing ``getText()`` (such as a parsed element),
            or a falsy value such as ``None`` when nothing was found.
        default: Value returned when ``elem`` is falsy.

    Returns:
        ``elem.getText()`` when ``elem`` is truthy, otherwise ``default``.
    """
    return elem.getText() if elem else default

from dataclasses import dataclass

@dataclass
class Person:
    """The three name parts of one person, each stored as a string."""

    firstname: str  # first (given) name
    middlename: str  # middle name or initial
    surname: str  # family name

# Example: build a Person (fields in declaration order: first, middle, surname).
turing_author = Person('Alan', 'M', 'Turing')

# Read the dataclass fields back; as the cell's last expression this string is displayed.
f"{turing_author.firstname} {turing_author.surname} authored many influential publications in computer science."



In [None]:
class TEIFile(object):
    """Accessors for the metadata and body text of one parsed TEI XML file.

    The file is parsed once in ``__init__`` (via ``read_tei``). ``title``,
    ``abstract`` and ``text`` are computed on first access and then cached.
    When the element a property needs is missing from the document, the
    property yields an empty value (``''`` or ``[]``) rather than raising
    ``AttributeError`` -- the same convention ``doi`` follows.
    """

    def __init__(self, filename):
        """Parse ``filename`` and reset the per-property caches.

        Args:
            filename: Path of the TEI XML file to load.
        """
        self.filename = filename
        self.soup = read_tei(filename)
        # ``None`` means "not computed yet"; an empty string/list is a valid
        # cached result and must not trigger a re-computation.
        self._text = None
        self._title = None
        self._abstract = None

    @property
    def doi(self):
        """Text of the first ``idno`` element with ``type="DOI"``, or ``''``."""
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        return idno_elem.getText()

    @property
    def title(self):
        """Text of the document's first ``title`` element, or ``''`` (cached)."""
        if self._title is None:
            title_elem = self.soup.title
            self._title = '' if title_elem is None else title_elem.getText()
        return self._title

    @property
    def abstract(self):
        """Whitespace-normalised text of the ``abstract`` element, or ``''`` (cached).

        Text fragments are stripped and joined with single spaces.
        """
        if self._abstract is None:
            abstract_elem = self.soup.abstract
            if abstract_elem is None:
                self._abstract = ''
            else:
                self._abstract = abstract_elem.getText(separator=' ', strip=True)
        return self._abstract

    @property
    def authors(self):
        """Authors listed under the document's ``analytic`` element.

        Returns:
            A list of ``Person`` objects, one per ``author`` element that has
            a ``persname`` child. Name parts that are not present are ``''``.
            The list is empty when there is no ``analytic`` element.
        """
        analytic = self.soup.analytic
        if analytic is None:
            return []

        result = []
        for author in analytic.find_all('author'):
            persname = author.persname
            if not persname:
                # Author entry without a personal name: nothing to record.
                continue
            firstname = elem_to_text(persname.find("forename", type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            result.append(Person(firstname, middlename, surname))
        return result

    @property
    def text(self):
        """Plain text of the body's untyped ``div`` elements (cached).

        Returns:
            A list of strings, one per ``div`` under ``body`` that has no
            ``type`` attribute, in document order. The first ``div`` found is
            always skipped. The list is empty when there is no ``body``.
        """
        if self._text is None:
            body = self.soup.body
            divs = body.find_all("div")[1:] if body is not None else []
            # A div without a ``type`` is neither an appendix nor references,
            # just plain text.
            self._text = [
                div.get_text(separator=' ', strip=True)
                for div in divs
                if not div.get("type")
            ]
        return self._text

In [None]:
# Load a second sample document through the TEIFile wrapper.
tei = TEIFile('sample2.tei.xml')
# Display the title together with the repr of the list returned by ``authors``.
f"The authors of the paper entitled '{tei.title}' are {tei.authors}"

In [None]:
# Gather the text of every untyped <div> under the body of `soup`,
# skipping the first <div> that is found.
divs_text = []
for div in soup.body.find_all("div")[1:]:
    # A div carrying a "type" is an appendix, references, etc. -- not plain text.
    if div.get("type"):
        continue
    div_text = div.get_text(separator=' ', strip=True)
    divs_text.append(div_text)



In [None]:
# Parse the second sample document, this time with BeautifulSoup's 'xml' parser.
tei_doc = 'sample2.tei.xml'
with open(tei_doc) as tei:  # text mode for reading is open()'s default
    soup1 = BeautifulSoup(tei, 'xml')

In [None]:
# Collect the headings of every untyped <head> element under the body of `soup1`.
# A heading that carries an "n" attribute is stored as a (heading_text, n)
# tuple; a heading without one is stored as the bare heading text.
headings = []
for div in soup1.body.find_all("head"):  # NB: despite its name, `div` is a <head> element
    if div.get("type"):
        continue
    div_text = div.get_text(separator=' ', strip=True)
    if 'n' in div.attrs:
        headings.append((div_text, div['n']))
    else:
        headings.append(div_text)

headings

In [None]:
# Pair each entry of `divs_text` with the heading at the same position and
# group the results in `d`:
#   * heading is a (title, n) tuple -> key is the part of n before the first
#     '.', value is a list of (title, text) tuples;
#   * heading is a bare string      -> key is the 1-based position as a
#     string, value is a one-element list holding the text.
# NOTE(review): `divs_text` is built from `soup` while `headings` is built from
# `soup1` in the cells above -- confirm both refer to the same document.
d = {}
j = 0
for i, section_text in enumerate(divs_text):
    j = i + 1
    heading = headings[i]
    if isinstance(heading, tuple):
        title, number = heading
        top_level = number.split('.')[0]
        # Drop as many leading words as the title has -- presumably to strip
        # the heading that opens the div's text (TODO confirm).
        body_words = section_text.split(' ')[len(title.split(' ')):]
        d.setdefault(top_level, []).append((title, ' '.join(body_words)))
    else:
        d[str(j)] = [section_text]

d

In [5]:
import os

# Traverse ../papers-xmls recursively and create one output directory under
# OUT/ for every "*.tei.xml" file found.
for root, dirs, files in os.walk("../papers-xmls"):
    for file in files:
        if file.endswith(".tei.xml"):
            # Drop the trailing ".tei.xml"; any remaining dots and all spaces
            # in the file name become underscores.
            x = '_'.join(file.split('.')[:-2])
            y = x.replace(' ', '_')
            print(y)
            # makedirs(..., exist_ok=True) creates OUT/ itself when needed and
            # makes re-running the cell safe; os.mkdir raised FileNotFoundError
            # or FileExistsError in those cases.
            os.makedirs(os.path.join("OUT", y), exist_ok=True)
            

DCFNet__Deep_Neural_Network_with_Decomposed_Convolutional_Filters
Modeling_Semantic_Expectation__Using_Script_Knowledge_for_Referent_Prediction
EQUATION_PARSING___Mapping_Sentences_to_Grounded_Equations
Multi-task_Learning_with_Labeled_and_Unlabeled_Tasks
CoSimRank__A_Flexible___Efficient_Graph-Theoretic_Similarity_Measure
Miscommunication_Recovery_in_Physically_Situated_Dialogue
An_Empirical_Study_of_Self-Disclosure_in_Spoken_Dialogue_Systems
Targeted_Syntactic_Evaluation_of_Language_Models
Supervised_Learning_of_Automatic_Pyramid_for_Optimization-Based_Multi-Document_Summarization
TextFlow__A_Text_Similarity_Measure_based_on_Continuous_Sequences
Colorless_green_recurrent_networks_dream_hierarchically
Stochastic_Gradient_Monomial_Gamma_Sampler
Comparing_Dynamics__Deep_Neural_Networks_versus_Glassy_Systems
RESIDE__Improving_Distantly-Supervised_Neural_Relation_Extraction_using_Side_Information
Trainable_Greedy_Decoding_for_Neural_Machine_Translation
Bayesian_Uncertainty_Estimation_for_

Topological_Mixture_Estimation
meProp__Sparsified_Back_Propagation_for_Accelerated_Deep_Learning_with_Reduced_Overfitting
Estimating_the_unseen_from_multiple_populations
Nonparanormal_Information_Estimation
Key-Value_Memory_Networks_for_Directly_Reading_Documents
Citation_Resolution__A_method_for_evaluating_context-based_citation_recommendation_systems
Continual_Learning_Through_Synaptic_Intelligence
Unsupervised_Neural_Machine_Translation_with_Weight_Sharing
Learning_to_Coordinate_with_Coordination_Graphs_in_Repeated_Single-Stage_Multi-Agent_Decision_Problems
Fast_Bayesian_Intensity_Estimation_for_the_Permanental_Process
To_Understand_Deep_Learning_We_Need_to_Understand_Kernel_Learning
Uniform_Deviation_Bounds_for_k-Means_Clustering
Structured_Multi-Label_Biomedical_Text_Tagging_via_Attentive_Neural_Tree_Decoding
Learning_Sentence_Embeddings_with_Auxiliary_Tasks_for_Cross-Domain_Sentiment_Classification
Style_Transfer_Through_Back-Translation
Deep_Reinforcement_Learning_for_Dialogue_G

Semantic_Annotation_for_Microblog_Topics_Using_Wikipedia_Temporal_Information
Competitive_Caching_with_Machine_Learned_Advice
Error-repair_Dependency_Parsing_for_Ungrammatical_Texts
Noise2Noise__Learning_Image_Restoration_without_Clean_Data
To_Attend_or_not_to_Attend__A_Case_Study_on_Syntactic_Structures_for_Semantic_Relatedness
Consistent_k-Clustering
How_NOT_To_Evaluate_Your_Dialogue_System__An_Empirical_Study_of_Unsupervised_Evaluation_Metrics_for_Dialogue_Response_Generation
A_Primal-Dual_Analysis_of_Global_Optimality_in_Nonconvex_Low-Rank__Matrix_Recovery
Parsing_as_Language_Modeling
Going_out_on_a_limb__Joint_Extraction_of_Entity_Mentions_and_Relations_without_Dependency_Trees
The_University_of_Alicante_at_MultiLing_2015__approach,_results_and_further_insights
Identifying_Semantic_Edit_Intentions_from_Revisions_in_Wikipedia
Investigating_Capsule_Networks_with_Dynamic_Routing_for_Text_Classification
Gradient_Descent_for_Sparse_Rank-One_Matrix_Completion_for_Crowd-Sourced_Aggregati

In [9]:
# Display the names of the entries directly inside ../papers-xmls/ (not recursive).
os.listdir("../papers-xmls/")

['DCFNet_ Deep Neural Network with Decomposed Convolutional Filters.tei.xml',
 'Modeling Semantic Expectation_ Using Script Knowledge for Referent Prediction.tei.xml',
 'EQUATION PARSING _ Mapping Sentences to Grounded Equations.tei.xml',
 'Multi-task Learning with Labeled and Unlabeled Tasks.tei.xml',
 'CoSimRank_ A Flexible _ Efficient Graph-Theoretic Similarity Measure.tei.xml',
 'Miscommunication Recovery in Physically Situated Dialogue.tei.xml',
 'An Empirical Study of Self-Disclosure in Spoken Dialogue Systems.tei.xml',
 'Targeted Syntactic Evaluation of Language Models.tei.xml',
 'Supervised Learning of Automatic Pyramid for Optimization-Based Multi-Document Summarization.tei.xml',
 'TextFlow_ A Text Similarity Measure based on Continuous Sequences.tei.xml',
 'Colorless green recurrent networks dream hierarchically.tei.xml',
 'Stochastic Gradient Monomial Gamma Sampler.tei.xml',
 'Comparing Dynamics_ Deep Neural Networks versus Glassy Systems.tei.xml',
 'RESIDE_ Improving Distan

In [23]:
# Shell escape: create the directory OUT/a using the system's mkdir command.
!mkdir OUT/"a"

In [4]:
import os

# Create OUT/a from Python. makedirs(..., exist_ok=True) succeeds whether or
# not OUT/ or OUT/a already exist, so the cell can be re-run safely;
# os.mkdir raised FileExistsError / FileNotFoundError in those cases.
y = "a"
os.makedirs(os.path.join("OUT", y), exist_ok=True)