In [1]:
from grobid.client import GrobidClient
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import xml.etree.ElementTree as ET
import os

In [5]:
paper = "/hola/quetal.pdf"
"/".join(paper.split("/")[:-1])

'/hola'

<h1>Modelo para tokenizar</h1>

In [2]:
model = SentenceTransformer("jamescalam/minilm-arxiv-encoder")

<h1>Funciones útiles para sacar el abstract y los reconocimientos de los árboles xml</h1>

In [106]:
def get_schema(tree):
    '''
    This function gets the schema of a given XML tree.
    :param tree: an ElementTree object
    :return: the schema string
    '''

    res = tree.getroot().tag.split("}")
    return res[0] + "}" if len(res) > 0 else ""

def get_authors(papers, elem, schema):
    '''
    This function gets the authors of a given paper.
    :param papers: a dictionary of papers
    :param elem: the name of the XML file containing the paper
    :param schema: the schema string
    :return: a list of dictionaries, where each dictionary represents an author
    '''
    authors = []
    for author in papers[elem].findall(f"{schema}teiHeader/{schema}fileDesc/{schema}titleStmt/{schema}author"):
        author_dict = {}
        if author.find(f"{schema}persName") is not None:
            author_dict["name"] = author.find(f"{schema}persName").text
        if author.find(f"{schema}email") is not None:
            author_dict["email"] = author.find(f"{schema}email").text
        if author.find(f"{schema}affiliation") is not None:
            author_dict["affiliation"] = author.find(f"{schema}affiliation").text
        authors.append(author_dict)
    return authors

def get_abstract(papers, elem, schema):
    '''
    This function gets the abstract of a given paper.
    :param papers: a dictionary of papers
    :param elem: the name of the XML file containing the paper
    :param schema: the schema string
    :return: the abstract string
    '''
    if papers[elem].find(f"{schema}teiHeader") is not None:
        if papers[elem].find(f"{schema}teiHeader").find(f"{schema}profileDesc") is not None:
            if papers[elem].find(f"{schema}teiHeader").find(f"{schema}profileDesc").find(
                    f"{schema}abstract") is not None:
                return ET.tostring(
                    papers[elem].find(f"{schema}teiHeader").find(f"{schema}profileDesc").find(f"{schema}abstract"),
                    encoding='utf-8', method='text').strip().decode("utf-8")
    return ""

def get_acknowledgements(papers, elem, schema):
    try:
        return list(map(lambda x: [y.text for y in x], map(lambda x: [y for y in x.iter()], filter(lambda elem: "acknowledgement" in list(elem.attrib.values()), [elem for elem in papers[elem].find(f"{schema}text").find(f"{schema}back").findall(rf"{schema}div")]))))[-1][-1]
    except:
        return ""

def get_references(papers, elem, schema):
    '''
    This function gets the bibliographic references of a given paper.
    :param papers: a dictionary of papers
    :param elem: the name of the XML file containing the paper
    :param schema: the schema string
    :return: a list of dictionaries, where each dictionary represents a reference
    '''
    def get_name_parts(authors, schema):
        ret_authors = []
        for author in authors:
            ret_author = []
            for name_part in author.iter():
                if name_part.text is not None:
                    ret_author.append(name_part.text)
            ret_authors.append(" ".join(ret_author[1:]).strip().replace("  ", " "))
        return ret_authors

    refs = []
    for ref in papers[elem].findall(f"{schema}text/{schema}back/{schema}div/{schema}listBibl/{schema}biblStruct"):
        ref_dict = {}
        if ref.find(f"{schema}analytic") is not None:
            if ref.find(f"{schema}analytic/{schema}title") is not None:
                ref_dict["title"] = ref.find(f"{schema}analytic/{schema}title").text
            if ref.find(f"{schema}analytic/{schema}author") is not None:
                aux_authors = [author for author in ref.findall(f"{schema}analytic/{schema}author")]
                authors = get_name_parts(aux_authors, schema)
                ref_dict["authors"] = authors
        if ref.find(f"{schema}monogr") is not None:
            if ref.find(f"{schema}monogr/{schema}title") is not None:
                ref_dict["journal"] = ref.find(f"{schema}monogr/{schema}title").text
            if ref.find(f"{schema}monogr/{schema}imprint") is not None:
                if ref.find(f"{schema}monogr/{schema}imprint/{schema}date") is not None:
                    ref_dict["date"] = ref.find(f"{schema}monogr/{schema}imprint/{schema}date").text
        refs.append(ref_dict)
    return refs

In [8]:

input_path = "../res/datasets/raw"
output_path = "../res/datasets/grobid"

Extraer el xml de cada paper con grobid

In [7]:
# Extract with Grobid
client = GrobidClient(host="localhost", port=8070)
for i, file in enumerate(os.listdir(input_path)):
    if file.endswith(".pdf"):
        print(f"\rProcessing file {i + 1} of {len(os.listdir(input_path))}...", end="")
        resp = client.serve("processFulltextDocument", input_path + "/" + file, consolidate_header=True, consolidate_citations=True)
        if resp[1] != 200:
            print(f"Error processing file {file}!")
            continue
        with open(output_path + "/" + file.replace(".pdf", ".xml"), "w", encoding="utf-8") as f:
            f.write(resp[0].text)

Processing file 30 of 30...

leemos los xml y sacamos el arbol

In [9]:
papers = {}
for file in os.listdir(output_path):
    if file.endswith(".xml"):
        papers[file] = ET.parse(output_path + "/" + file)


Sacamos los abstracts

In [12]:
abstracts = dict(zip(papers.keys(), [get_abstract(papers, elem, get_schema(papers[elem])) for elem in papers.keys()]))

In [63]:
acknowledgements = dict(zip(papers.keys(), [get_acknowledgements(papers, elem, get_schema(papers[elem])) for elem in papers.keys()]))

In [104]:
references = dict(zip(papers.keys(), [get_references(papers, elem, get_schema(papers[elem])) for elem in papers.keys()]))

In [107]:
authors = dict(zip(papers.keys(), [get_authors(papers, elem, get_schema(papers[elem])) for elem in papers.keys()]))

In [108]:
authors

{'1511.01844.xml': [],
 'acs.jcim.9b01120.xml': [],
 'A_Novel_Approach_for_Classification_of_Speech_Emotions_Based_on_Deep_and_Acoustic_Features.xml': [],
 '1704.05742.xml': [],
 'NIPS-2017-adagan-boosting-generative-models-Paper.xml': [],
 '1810.09136.xml': [],
 '5423.xml': [],
 '11_43.xml': [],
 '2001.10238.xml': [],
 '1-s2.0-S2090447914001567-main.xml': [],
 'NeurIPS-2020-gradient-surgery-for-multi-task-learning-Paper.xml': [],
 'maaloe16.xml': [],
 'brotli-2015-09-22.xml': [],
 'wordCompression.xml': [],
 'grumbach.xml': [],
 'A_Comparative_Study_Of_Text_Compression_Algorithms.xml': [],
 '1803.03324.xml': [],
 '179693.xml': [],
 'annurev-statistics-010814-020120.xml': [],
 '1206.3255.xml': [],
 '1706.02901.xml': [],
 'satt17_interspeech.xml': [],
 'symmetry-12-00021 (1).xml': [],
 's10052-015-3703-3.xml': [],
 '1812.05920v1.xml': [],
 'Speech_emotion_recognition_using_deep_1D (2).xml': [],
 'document.xml': [],
 '090902_1.xml': [],
 '2005.00341.xml': [],
 'badshah2017.xml': []}