In [1]:
from bs4 import BeautifulSoup
import lxml
import pandas as pd

In [2]:
def read_tei(tei_file, encoding='utf-8'):
    """Parse a TEI XML file into a BeautifulSoup tree.

    Args:
        tei_file: Path (``str`` or path-like) of the TEI XML file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.
            NOTE(review): assumes the TEI files are UTF-8 encoded - confirm
            for inputs that come from other tools.

    Returns:
        The ``BeautifulSoup`` object built with the ``lxml`` parser.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    # The file handle is consumed while the soup is constructed, so returning
    # from inside the ``with`` block is safe; the file is closed on the way out.
    with open(tei_file, 'r', encoding=encoding) as tei:
        return BeautifulSoup(tei, 'lxml')

The TEI parsing code in this notebook is copied and customized from https://komax.github.io/blog/text/python/xml/parsing_tei_xml_python/

In [3]:
from dataclasses import dataclass

@dataclass
class Person:
    """Name parts of a single author.

    ``TEIFile.authors`` fills each field from the author's ``persname``
    element and uses an empty string for a part that is not present.
    """
    # Text of the forename element with type="first".
    firstname: str
    # Text of the forename element with type="middle".
    middlename: str
    # Text of the surname element.
    surname: str

In [6]:
def elem_to_text(elem, default=''):
    """Return the text content of ``elem``, or ``default`` when ``elem`` is falsy.

    Args:
        elem: An object exposing ``getText()`` (e.g. a parsed element), or a
            falsy value such as ``None`` when the element was not found.
        default: Value returned when ``elem`` is falsy.

    Returns:
        ``elem.getText()`` for a truthy ``elem``, otherwise ``default``.
    """
    return elem.getText() if elem else default

In [19]:
class TEIFile(object):
    """Accessor for the metadata and body text of one TEI XML document.

    The file is parsed once in the constructor. ``title``, ``abstract`` and
    ``text`` are extracted on first access and cached afterwards. Every
    property returns an empty value (``''`` or ``[]``) when the element it
    reads is missing, so a single incomplete document cannot abort a batch run.
    """

    def __init__(self, filename):
        """Parse ``filename`` with ``read_tei`` and reset the property caches."""
        self.filename = filename
        self.soup = read_tei(filename)
        # None means "not extracted yet". An empty string is a valid cached
        # result for a document that lacks the element, so it must not
        # trigger another extraction.
        self._text = None
        self._title = None
        self._abstract = None

    @property
    def doi(self):
        """Text of the first ``idno`` element with ``type="DOI"``, or ``''``."""
        idno_elem = self.soup.find('idno', type='DOI')
        if idno_elem is None:
            return ''
        return idno_elem.get_text()

    @property
    def title(self):
        """Text of the document's first ``title`` element, or ``''`` (cached)."""
        if self._title is None:
            title_elem = self.soup.title
            self._title = '' if title_elem is None else title_elem.get_text()
        return self._title

    @property
    def abstract(self):
        """Whitespace-normalised text of the ``abstract`` element, or ``''`` (cached)."""
        if self._abstract is None:
            abstract_elem = self.soup.abstract
            if abstract_elem is None:
                self._abstract = ''
            else:
                self._abstract = abstract_elem.get_text(separator=' ', strip=True)
        return self._abstract

    @property
    def authors(self):
        """List of ``Person`` objects for the authors in the ``analytic`` element.

        Authors without a ``persname`` child are skipped. Returns ``[]`` when
        the document has no ``analytic`` element.
        """
        analytic = self.soup.analytic
        if analytic is None:
            return []

        result = []
        for author in analytic.find_all('author'):
            persname = author.persname
            if persname is None:
                continue
            firstname = elem_to_text(persname.find("forename", type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            result.append(Person(firstname, middlename, surname))
        return result

    @property
    def text(self):
        """Comma-joined text of every untyped ``div`` below ``body``, or ``''`` (cached)."""
        if self._text is None:
            body = self.soup.body
            divs = [] if body is None else body.find_all("div")
            self._text = ",".join(
                div.get_text(separator=' ', strip=True)
                for div in divs
                # A div with a "type" attribute is an appendix or the
                # references; only untyped divs are plain text.
                if not div.get("type")
            )
        return self._text

In [21]:
from os.path import basename, splitext

def basename_without_ext(path):
    """Return the file name of ``path`` without directory, last extension and ``.tei``.

    The directory part and the final extension are removed first; if what is
    left ends with ``.tei`` that suffix is removed too, e.g.
    ``"dir/2047901333.tei.xml"`` becomes ``"2047901333"``.
    """
    name_stem = splitext(basename(path))[0]
    tei_suffix = '.tei'
    if not name_stem.endswith(tei_suffix):
        return name_stem
    # Drop the inner ".tei" that is left once the last extension is gone.
    return name_stem[:-len(tei_suffix)]

In [31]:
def tei_to_csv_entry(tei_file):
    """Extract one table row from a TEI file.

    Parses ``tei_file``, prints a progress line, and returns the tuple
    ``(base name, DOI, authors, title, abstract)``.
    """
    parsed = TEIFile(tei_file)
    print(f"Handled {tei_file}")
    entry_id = basename_without_ext(tei_file)
    row = (entry_id, parsed.doi, parsed.authors, parsed.title, parsed.abstract)
    return row

In [32]:
# Try the extraction on a single sample document before processing the whole folder.
tei_doc = "../data/unpaywall-grobid-sample/2047901333.tei.xml"
# Left as the cell's last expression so the notebook displays the returned tuple.
tei_to_csv_entry(tei_doc)

Handled ../data/unpaywall-grobid-sample/2047901333.tei.xml


('2047901333',
 '',
 [Person(firstname='Jaime', middlename='', surname='Lloret_Mauri')],
 'Network Protocols and Algorithms Introduction to Network Protocols and Algorithms',
 'Abstract Since the appearance of the data networks, many researchers have focused their efforts on designing and developing many ways to communicate the network devices. This paper will introduce the reader to the network protocols and algorithms that are used for data transfer between devices. The meaning of the terms, their description and their explanation are provided. Finally, the scope of the journal and its aim are presented.')

In [33]:
import glob
from pathlib import Path
import multiprocessing

# Collect every *.tei.xml file in the sample folder as Path objects, sorted by path.
papers = sorted(Path("../data/unpaywall-grobid-sample").glob('*.tei.xml'))


print(f"My machine has {multiprocessing.cpu_count()} cores.")
from multiprocessing.pool import Pool
# No process count is passed, so multiprocessing chooses its default number of workers.
# NOTE(review): no close()/join() of this pool is visible in the cells shown here -
# confirm the worker processes are released once parsing is finished.
pool = Pool()

My machine has 12 cores.
Handled ../data/unpaywall-grobid-sample/1502991590.tei.xmlHandled ../data/unpaywall-grobid-sample/2007152428.tei.xml
Handled ../data/unpaywall-grobid-sample/2072485337.tei.xmlHandled ../data/unpaywall-grobid-sample/2045177160.tei.xml

Handled ../data/unpaywall-grobid-sample/2047901333.tei.xml
Handled ../data/unpaywall-grobid-sample/2317770848.tei.xml

Handled ../data/unpaywall-grobid-sample/2240795849.tei.xml
Handled ../data/unpaywall-grobid-sample/2113447292.tei.xmlHandled ../data/unpaywall-grobid-sample/2167047606.tei.xmlHandled ../data/unpaywall-grobid-sample/2121353684.tei.xml

Handled ../data/unpaywall-grobid-sample/2158678211.tei.xml
Handled ../data/unpaywall-grobid-sample/2596919086.tei.xml

Handled ../data/unpaywall-grobid-sample/2766220079.tei.xml
Handled ../data/unpaywall-grobid-sample/2577697941.tei.xml
Handled ../data/unpaywall-grobid-sample/2743668674.tei.xmlHandled ../data/unpaywall-grobid-sample/2152524239.tei.xml

Handled ../data/unpaywall-grobi

In [34]:
# Run tei_to_csv_entry over every collected paper on the worker pool.
# Each result is the (base name, DOI, authors, title, abstract) tuple for one file.
csv_entries = pool.map(tei_to_csv_entry, papers)
# Last expression of the cell: display the list of result tuples.
csv_entries

[('1502991590', '10.1186/1747-5333-1-12', [], '', ''),
 ('2007152428', '', [], '', ''),
 ('2045177160', '', [], 'BARIUM STUDIES IN THE AGED', ''),
 ('2047901333',
  '',
  [Person(firstname='Jaime', middlename='', surname='Lloret_Mauri')],
  'Network Protocols and Algorithms Introduction to Network Protocols and Algorithms',
  'Abstract Since the appearance of the data networks, many researchers have focused their efforts on designing and developing many ways to communicate the network devices. This paper will introduce the reader to the network protocols and algorithms that are used for data transfer between devices. The meaning of the terms, their description and their explanation are provided. Finally, the scope of the journal and its aim are presented.'),
 ('2072485337',
  '10.1155/2007/36404',
  [Person(firstname='E', middlename='', surname='Izquierdo'),
   Person(firstname='Hyoung', middlename='Joong', surname='Kim'),
   Person(firstname='Thomas', middlename='', surname='Sikora')]

In [36]:
# Build a table with one row per paper; the column names follow the order of
# the tuple returned by tei_to_csv_entry. The Authors column holds lists of Person objects.
result_csv = pd.DataFrame(csv_entries, columns=['ID', 'DOI', 'Authors','Title', 'Abstract'])
# Last expression of the cell: display the resulting DataFrame.
result_csv

Unnamed: 0,ID,DOI,Authors,Title,Abstract
0,1502991590,10.1186/1747-5333-1-12,[],,
1,2007152428,,[],,
2,2045177160,,[],BARIUM STUDIES IN THE AGED,
3,2047901333,,"[Person(firstname='Jaime', middlename='', surn...",Network Protocols and Algorithms Introduction ...,Abstract Since the appearance of the data netw...
4,2072485337,10.1155/2007/36404,"[Person(firstname='E', middlename='', surname=...",Editorial Knowledge-Assisted Media Analysis fo...,Advances in technologies for new forms of inte...
5,2098272979,,[],,Abstract-Purpose: Planning and dosimetry of di...
6,2113447292,,"[Person(firstname='Anuradha', middlename='M P'...",Secure Aggregated Routing Protocol in WSN -A R...,ABSTRACT Wireless sensor Networks (WSN) requir...
7,2121353684,,"[Person(firstname='Dianfan', middlename='', su...",Error modeling and Sensitivity Analysis of 5-D...,Abstract. Aiming at 5-dof parallel tripod head...
8,2152524239,,"[Person(firstname='Kagan', middlename='', surn...",Ensembles of Radial Basis Function Networks fo...,Abstract The mortality related to cervical can...
9,2158678211,,"[Person(firstname='Rajesh', middlename='', sur...",Time Series Representation for Identification ...,ABSTRACT Extracting information from the huge ...
