In [8]:
from grobid.client import GrobidClient
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import xml.etree.ElementTree as ET
import os

<h1>Obtener las clases y propiedades con grobid</h1>

In [8]:
from processor import PaperProcessor

input_path = "../res/datasets/space/raw/"
output_path = "../res/datasets/space/grobid/"

processor = PaperProcessor(output_path=output_path)
papers = processor.process_folder(input_path)

In [9]:
from paper_space import PaperSet
paper_space = PaperSet(papers)

<h1> Validation set </h1>

In [1]:
input_path = "../res/datasets/validation/raw/"
output_path = "../res/datasets/validation/grobid/"

processor = PaperProcessor(output_path=output_path)
val_papers = processor.process_folder(input_path)

val_space = PaperSet(val_papers)

NameError: name 'PaperProcessor' is not defined

<h1>Obtener las clases y propiedades desde los xml ya procesados</h1>

In [1]:
from processor import PaperProcessor
from paper_space import PaperSet

input_path = "../res/datasets/space/raw/"
output_path = "../res/datasets/space/grobid/"

processor = PaperProcessor(output_path=output_path)
papers = processor.process_folder_from_xml(pdf_path=input_path)

[nltk_data] Downloading package stopwords to /home/dani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/dani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<h1> Obtener el Paper Space </h1>

El paper Space es una representación pitónica de clases del grafo de conocimiento, permite realizar todas las operaciones necesarias para enriquecer el grafo de manera cómoda.

In [None]:
paper_space = PaperSet(papers)

<h1> Obtener Grafo de conocimiento a partir del Paper Space </h1>

In [3]:
from rdfparser import RDFParser

kg = RDFParser(paper_space)

<h1> Guardar KG </h1>

In [4]:
import rdflib
import json
json_ld = kg.g.serialize(format='json-ld', indent=4)
with open('../res/datasets/json-ld/kg.jsonld', 'w') as outfile:
    json.dump(json.loads(json_ld), outfile)

<h1> Pruebas </h1>

In [11]:
from rdflib import Namespace, URIRef

schema = Namespace('http://schema.org/')
instances = Namespace('http://instances.com/')

# Define the query string
query = """
    PREFIX schema: <http://schema.org/>
    PREFIX instances: <http://instances.com/>

    SELECT ?paper WHERE {
        ?paper a schema:paper ;
               schema:title "spectral clustering on large datasets: when does it work? theory from continuous clustering and density cheeger-buser" .
    }
"""

for row in kg.g.query(query):
    paper_uri = row['paper']
    paper = kg.g.resource(paper_uri)
    print(paper.value(schema.title))
    author = paper.value(schema.author)

    for author_uri in paper.objects(schema.author):
        print(f"Author name: {author_uri.value(schema.forename)}, {author_uri.value(schema.surname)}")
        for affiliation_uri in author_uri.objects(schema.affiliation):
            print(affiliation_uri)
            print(f"Affiliation: {affiliation_uri.value(schema.name)}")

    for ack in paper.objects(schema.acknowledgement):
        print(f"Acknowledgement: {ack.value(schema.text)}")
        for ack_p in ack.objects(schema.acknowledges_people):
            print(f"Acknowledges people: {ack_p.value(schema.forename)}, {ack_p.value(schema.surname)}")

spectral clustering on large datasets: when does it work? theory from continuous clustering and density cheeger-buser
Author name: Timothy, Chu
Resource(http://instances.com/unknown)
Affiliation: unknown
Author name: Gary, Miller
Resource(http://instances.com/unknown)
Affiliation: unknown
Author name: Noel, Walkington
Resource(http://instances.com/unknown)
Affiliation: unknown
Acknowledgement: We would like to thank Emanuel Milman, Luca Trevisan, and Michel Ledoux for explaining the state of the art in Buser inequalities on probability densities, manifolds, and graphs. We would also like to thank Alex Wang for helpful discussions.
Acknowledges people: Emanuel, Milman
Acknowledges people: Luca, Trevisan
Acknowledges people: Michel, Ledoux
Acknowledges people: , Buser
Acknowledges people: Alex, Wang


<h1 Validation set</h1>

In [3]:
input_path = "../res/datasets/validation/raw/"
output_path = "../res/datasets/validation/grobid/"

processor = PaperProcessor(output_path=output_path)
val_papers = processor.process_folder_from_xml(input_path)

val_space = PaperSet(val_papers)

<h1> Trabajo previo: modelos y preproceso </h1>

<h1>Modelo para tokenizar</h1>

Evidentemente para esta tarea el conjunto de validación no es necesario pero aplicamos también clustering sobre él por comparar resultados

In [4]:
encoded_papers = paper_space.encode_papers()

In [5]:
val_encoded_papers = val_space.encode_papers()

In [22]:
encoded_papers

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
a note on the evaluation of generative models,-0.131301,0.002281,-0.001868,-0.052441,0.027265,-0.000154,-0.217419,-0.025462,-0.099417,0.021509,...,0.040486,0.075056,0.013146,0.056772,0.08708,0.043438,-0.153307,-0.143206,0.122879,-0.09117
spectral clustering on large datasets: when does it work? theory from continuous clustering and density cheeger-buser,-0.211543,-0.091019,0.051769,-0.009679,-0.067754,-0.038043,-0.118742,-0.059393,-0.112872,-0.146397,...,0.243307,-0.031499,0.180494,-0.277817,0.054701,-0.028978,0.178357,0.064155,-0.135752,0.165921
"attributing emotion to static body postures: recognition accuracy, confusions, and viewpoint dependence",0.02702,-0.150171,0.110937,-0.013334,-0.020474,0.097832,0.173472,0.070862,0.226507,0.093283,...,-0.092015,0.182329,0.17868,0.028688,0.075122,0.084355,0.174075,0.148864,-0.344563,-0.009461
adversarial multi-task learning for text classification,-0.011254,-0.117132,0.008287,0.008054,0.014083,0.216623,-0.02278,-0.199792,-0.074386,-0.184818,...,0.198303,0.247609,0.128383,-0.00397,0.345068,0.105736,-0.072052,0.126342,-0.054859,-0.06518
adam: a method for stochastic optimization,-0.164474,-0.218572,-0.055986,-0.11914,-0.233879,-0.00642,-0.040818,-0.019091,0.119539,0.201531,...,0.36822,-0.2179,0.016337,-0.219282,0.003607,0.007275,-0.161104,0.26655,0.026782,-0.005474
do deep generative models know what they don't know?,-0.116699,-0.067629,-0.212448,-0.066083,0.141361,0.124095,-0.117617,-0.059158,0.031601,-0.096388,...,0.230404,0.056851,-0.156073,-0.050162,0.03596,-0.079832,-0.320558,0.286835,-0.007298,-0.154617
act1 adaptor protein is an immediate and essential signaling component of interleukin-17 receptor *,-0.21702,0.010229,-0.009581,-0.049192,0.311222,0.107234,0.377248,0.079945,0.078471,-0.008878,...,-0.080643,0.113137,-0.013617,0.162523,0.188115,-0.271989,0.150667,0.073293,0.341859,-0.255941
gradient surgery for multi-task learning,-0.014807,-0.036753,-0.185821,-0.068827,-0.210362,0.165932,-0.120382,-0.179489,0.03912,0.149806,...,0.315716,0.177421,0.051643,-0.003328,0.054881,0.085309,-0.01898,0.094931,-0.078689,0.010817
auxiliary deep generative models,-0.100119,0.091307,0.079752,0.003888,0.000883,0.066475,-0.150963,0.120281,-0.074516,-0.294434,...,0.100793,-0.043954,-0.064581,-0.000687,-0.021658,-0.05318,-0.234047,0.091222,-0.107009,-0.08653
interleukin-17 contributes to cardiovascular diseases,-0.374179,0.089261,-0.00722,-0.044155,0.336266,0.05524,0.312621,0.204889,0.10089,-0.008588,...,-0.095176,0.117587,-0.119495,0.200736,0.109948,-0.217132,0.162494,0.038443,0.282405,-0.112375


In [89]:
val_encoded_papers

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
provable guarantees for nonlinear feature learning in three-layer neural networks,-0.082117,0.041908,-0.047545,-0.017843,-0.115768,0.108033,-0.127243,-0.103105,-0.120648,-0.257781,...,0.205132,0.191714,-0.08099,-0.087909,-0.029269,0.013877,-0.317632,0.085361,-0.089079,-0.11787
"an assessment of the human sortilin1 protein network, its expression and targetability using small molecules",-0.243293,-0.268495,-0.020671,-0.036869,0.106239,0.073969,-0.002579,0.138554,0.108629,-0.120708,...,-0.31897,-0.142825,-0.04298,0.09328,-0.036528,-0.098719,0.078982,0.179944,0.359369,0.204902
porcine and human aortic valve endothelial and interstitial cell isolation and characterization,-0.001412,-0.125823,-0.085345,0.006269,-0.102429,-0.11116,-0.05061,0.211069,0.348672,-0.047409,...,-0.047278,0.121263,-0.006314,0.013009,-0.065745,-0.127206,-0.239658,0.145553,0.214032,0.004931
multi-tier client selection for mobile federated learning networks,-0.031395,0.147354,-0.352972,-0.035094,0.425945,0.139995,-0.061197,0.126299,0.084645,0.087236,...,0.216316,-0.156348,0.105378,-0.079847,-0.062678,0.149147,-0.274124,0.191511,0.044713,0.082585
nubo: a transparent python package for bayesian optimisation,-0.405352,0.048752,-0.253816,-0.190668,0.10128,-0.095346,0.082369,0.150243,-0.208206,0.217534,...,0.07365,0.043185,-0.104434,-0.316946,0.129225,0.297025,-0.352027,0.158195,-0.114724,0.376048
"cardiovascular parameters in capitive blue-fronted amazon parrots (amazona aestiva, linnaeus, 1758) with varying body condition scores",0.33283,0.186398,-0.017902,-0.044864,-0.156249,-0.093512,0.055265,-0.023497,0.037588,0.161685,...,-0.370759,0.261011,0.157867,-0.195408,-0.074799,-0.086149,0.290939,0.32325,-0.117381,0.022721
deep multi-view subspace clustering with anchor graph,-0.015752,-0.025043,-0.045793,-0.132882,-0.137407,0.108172,-0.425583,-0.085501,-0.159145,-0.087801,...,0.056323,-0.026788,0.13146,0.019156,0.003505,0.150168,-0.061166,-0.110449,-0.26217,-0.003181
multimodal integration -a statistical view,-0.235947,0.105067,0.084915,-0.142468,-0.072994,0.070532,-0.023575,0.109995,0.070256,-0.052924,...,0.152015,0.189288,0.059861,-0.085412,0.146442,0.034173,-0.053101,0.300379,-0.110512,0.04654
breakthrough: a first-in-class virtual simulator for dose optimization of ace inhibitors in veterinary cardiology,-0.257704,-0.105994,-0.169664,0.188049,-0.013976,0.061377,-0.063839,0.356896,0.378765,-0.009596,...,-0.145519,0.076209,-0.055622,-0.070074,-0.158146,-0.162314,0.123842,0.048102,0.083019,0.146946
cascaded cross-attention networks for data-efficient whole-slide image classification using transformers,-0.154733,0.125601,0.005631,-0.242891,0.05925,-0.101585,-0.196746,-0.234586,-0.025566,-0.109451,...,0.163367,0.151588,0.249048,-0.240682,0.373892,0.25364,-0.069899,-0.01261,-0.083634,0.093548


In [79]:
from sklearn.cluster import DBSCAN

spcluster = DBSCAN(eps=0.15, min_samples=2, metric='cosine')
labels = spcluster.fit_predict(encoded_papers)
pd.Series(labels, index=encoded_papers.index)

a note on the evaluation of generative models                                                                           -1
spectral clustering on large datasets: when does it work? theory from continuous clustering and density cheeger-buser   -1
attributing emotion to static body postures: recognition accuracy, confusions, and viewpoint dependence                 -1
adversarial multi-task learning for text classification                                                                 -1
adam: a method for stochastic optimization                                                                              -1
do deep generative models know what they don't know?                                                                    -1
act1 adaptor protein is an immediate and essential signaling component of interleukin-17 receptor *                      0
gradient surgery for multi-task learning                                                                                -1
auxiliary deep g

In [80]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

sil_train = silhouette_score(encoded_papers, labels)
davies_train = davies_bouldin_score(encoded_papers, labels)
cal_train = calinski_harabasz_score(encoded_papers, labels)
sil_train, davies_train, cal_train

(0.17516641, 0.9483440986850478, 3.037260955602544)

In [93]:
from sklearn.cluster import DBSCAN

spcluster = DBSCAN(eps=0.15, min_samples=2, metric='cosine')
labels = spcluster.fit_predict(val_encoded_papers)
pd.Series(labels, index=val_encoded_papers.index)

provable guarantees for nonlinear feature learning in three-layer neural networks                                                        -1
an assessment of the human sortilin1 protein network, its expression and targetability using small molecules                             -1
porcine and human aortic valve endothelial and interstitial cell isolation and characterization                                          -1
multi-tier client selection for mobile federated learning networks                                                                       -1
nubo: a transparent python package for bayesian optimisation                                                                             -1
cardiovascular parameters in capitive blue-fronted amazon parrots (amazona aestiva, linnaeus, 1758) with varying body condition scores   -1
deep multi-view subspace clustering with anchor graph                                                                                    -1
multimodal integrati

In [94]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

sil_train = silhouette_score(val_encoded_papers, labels)
davies_train = davies_bouldin_score(val_encoded_papers, labels)
cal_train = calinski_harabasz_score(val_encoded_papers, labels)
sil_train, davies_train, cal_train

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

Esto lo dice todo :)

In [6]:
from sklearn.cluster import AgglomerativeClustering

clustering = AgglomerativeClustering(n_clusters=2, affinity='cosine', linkage='complete')
labels = clustering.fit_predict(encoded_papers)

In [9]:
assined_papers = pd.Series(labels, index=encoded_papers.index)

In [83]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

sil_train = silhouette_score(encoded_papers, labels)
davies_train = davies_bouldin_score(encoded_papers, labels)
cal_train = calinski_harabasz_score(encoded_papers, labels)
sil_train, davies_train, cal_train

(0.188751, 2.052638468670523, 5.44873122951165)

In [90]:
from sklearn.cluster import AgglomerativeClustering

clustering = AgglomerativeClustering(n_clusters=2, affinity='cosine', linkage='complete')
labels = clustering.fit_predict(val_encoded_papers)
pd.Series(labels, index=val_encoded_papers.index)

provable guarantees for nonlinear feature learning in three-layer neural networks                                                         1
an assessment of the human sortilin1 protein network, its expression and targetability using small molecules                              0
porcine and human aortic valve endothelial and interstitial cell isolation and characterization                                           0
multi-tier client selection for mobile federated learning networks                                                                        1
nubo: a transparent python package for bayesian optimisation                                                                              1
cardiovascular parameters in capitive blue-fronted amazon parrots (amazona aestiva, linnaeus, 1758) with varying body condition scores    0
deep multi-view subspace clustering with anchor graph                                                                                     1
multimodal integrati

In [92]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

sil_train = silhouette_score(val_encoded_papers, labels)
davies_train = davies_bouldin_score(val_encoded_papers, labels)
cal_train = calinski_harabasz_score(val_encoded_papers, labels)
sil_train, davies_train, cal_train

(0.11765595, 1.8036050420727296, 2.3230615272762227)

Mejores resultados obtenidos con AgglomerativeClustering y dos clusters. Tiene sentido dado que nuestro conjunto de papers se compone por papers de biología y papers de ingeniería informática. Aunque existen subtemas como el speech recognition, cardiovascular deseases o virus, resulta lógico que la mayor separabilidad se obtenga con estos dos clusters. Aun así, observamos un coeficiente de silueta relativamente bajo.

Si asignáramos etiquetas reales a los papers, habríamos obtenido un 100% de accuracy en ambos conjuntos de datos.

In [20]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize the text into individual words
    tokens = word_tokenize(text)
    # Remove stop words from the token list
    filtered_tokens = [word for word in tokens if not word.lower() in stop_words]
    # Join the remaining tokens back into a single string
    return ' '.join(filtered_tokens)

df = pd.DataFrame([{'Title': paper.title, "abstract": preprocess_text(paper.abstract), 'label': assined_papers.loc[paper.title] } for paper in paper_space.get_xml_papers().values()])

[nltk_data] Downloading package stopwords to /home/dani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/dani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
df.head()

Unnamed: 0,Title,abstract,label
0,a note on the evaluation of generative models,Probabilistic generative models used compressi...,1
1,spectral clustering on large datasets: when do...,Spectral clustering one popular clustering alg...,1
2,attributing emotion to static body postures: r...,total 176 computer-generated mannequin figures...,1
3,adversarial multi-task learning for text class...,Neural network models shown promising opportun...,1
4,adam: a method for stochastic optimization,"introduce Adam , algorithm first-order gradien...",1


In [43]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
tokens = vectorizer.fit_transform(df['abstract'])
tokens.index = df['Title']

In [42]:
df.tokenized

0       (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
1       (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
2       (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
3       (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
4       (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
5       (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
6       (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
7       (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
8       (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
9       (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
10      (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
11      (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
12      (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
13      (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
14      (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
15      (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
16      (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)\t6\...
17      (0, 1213)\t1\n  (0, 712)\t3\n  (0, 1015)

In [44]:
from sklearn.decomposition import LatentDirichletAllocation

num_topics = 10
lda_model = LatentDirichletAllocation(n_components=num_topics, max_iter=100, learning_method='online')
lda_model.fit(tokens)

# Print the top words for each topic
feature_names = vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda_model.components_):
    print(f"Topic {topic_idx}:")
    top_words_idx = topic.argsort()[:-11:-1]
    top_words = [feature_names[i] for i in top_words_idx]
    print(top_words)
    print()

Topic 0:
['spectral', 'data', 'clustering', 'density', 'inputs', 'drawn', 'models', 'distribution', 'graph', 'sets']

Topic 1:
['learning', 'task', 'multi', 'models', 'tasks', 'spo', 'approach', 'data', 'gradient', 'loss']

Topic 2:
['model', 'using', 'audio', 'singing', 'raw', 'make', 'modeling', 'combined', 'high', 'show']

Topic 3:
['il', '17', 'receptor', 'deep', 'signaling', 'act1', 'many', 'high', 'family', 'including']

Topic 4:
['cov', 'human', 'respiratory', 'virus', 'viruses', 'outbreaks', 'emotion', 'humans', 'dogs', 'animals']

Topic 5:
['speech', 'network', 'cells', 'cnn', 'neural', 'lstm', 'recognition', 'networks', 'deep', 'mast']

Topic 6:
['generative', 'postures', 'deep', 'emotions', 'variables', 'models', 'auxiliary', 'anger', 'state', 'performance']

Topic 7:
['based', 'sars', 'coronavirus', 'cov', 'church', 'temporal', 'methods', 'stochastic', 'non', 'adam']

Topic 8:
['communities', 'viral', 'catalog', 'largely', 'spatial', 'diversity', 'predicted', 'microbial', '



In [38]:
lda_model.transform(tokens[10])

array([[6.89700874e-04, 6.89655203e-04, 6.89701568e-04, 6.89688147e-04,
        6.89705689e-04, 6.89696944e-04, 6.89688351e-04, 9.93792779e-01,
        6.89721654e-04, 6.89662658e-04]])

In [45]:
import gensim
import numpy as np

n_topics = 10

gensim_model = gensim.models.ldamodel.LdaModel(
    corpus=None,
    id2word={i: word for i, word in enumerate(vectorizer.get_feature_names_out())},
    num_topics=n_topics,
    alpha='auto',
    eta=lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis],  # Use the topic-word matrix as input for the Gensim model
    iterations=100,
)

In [46]:
from gensim.models import LdaModel
from gensim.models import LdaMulticore
preprocessed_documents = []
for document in df.abstract:
    tokens = vectorizer.get_feature_names_out()
    preprocessed_documents.append(tokens)

dictionary = gensim.corpora.Dictionary(preprocessed_documents)
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

lda_model = gensim.models.LdaModel(corpus=corpus, num_topics=2, id2word=dictionary, passes=10)
coherence_model = gensim.models.CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_npmi')
coherence_score = coherence_model.get_coherence()
print(f"Coherence score: {coherence_score:.2f}")

Coherence score: -0.62


In [59]:
list(paper_space.get_xml_papers().values())[24].acknowledgements.text

"Part of this work was done when the first author worked in Advanced Analytics Institute (AAI), University of Technology, Sydney as a visiting scholar. Jianfeng Zhao, Xia Mao, and Lijiang Chen's work in this paper was supported in part by the National Natural Science Foundation of China under Grant No. 61603013. This article recently received funding from the Fundamental Research Funds for the Central Universities (Grant No. YWF-18-BJ-Y-181)."

For NER, we tried various models, but the best results were obtained with the following model:
Babelscape/wikineural-multilingual-ner

which was best able to recognize foreign names, for every other entity it performed similarly to the other models.

In [8]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [9]:
import json

text = list(paper_space.get_xml_papers().values())[24].acknowledgements.text
ner_results = nlp(text)


In [12]:
print(json.dumps(ner_results, indent=2, default=str))

[
  {
    "entity": "B-ORG",
    "score": "0.99798715",
    "index": 13,
    "word": "Advanced",
    "start": 59,
    "end": 67
  },
  {
    "entity": "I-ORG",
    "score": "0.9962837",
    "index": 14,
    "word": "Ana",
    "start": 68,
    "end": 71
  },
  {
    "entity": "I-ORG",
    "score": "0.9973973",
    "index": 15,
    "word": "##ly",
    "start": 71,
    "end": 73
  },
  {
    "entity": "I-ORG",
    "score": "0.99747133",
    "index": 16,
    "word": "##tics",
    "start": 73,
    "end": 77
  },
  {
    "entity": "I-ORG",
    "score": "0.9977435",
    "index": 17,
    "word": "Institute",
    "start": 78,
    "end": 87
  },
  {
    "entity": "B-ORG",
    "score": "0.90609896",
    "index": 19,
    "word": "AA",
    "start": 89,
    "end": 91
  },
  {
    "entity": "I-ORG",
    "score": "0.8466319",
    "index": 20,
    "word": "##I",
    "start": 91,
    "end": 92
  },
  {
    "entity": "B-ORG",
    "score": "0.9973616",
    "index": 23,
    "word": "University",
    "start

In [23]:
def process_entities(entities, text):
    # initialize variables
    org_start = None
    org_end = None
    people_start = None
    people_end = None
    new_entities = []

    for i, entity in enumerate(entities):
        # check if entity is an organization
        if entity['entity'] == 'B-ORG':
            org_start = entity['start']
            org_end = entity['end']
        elif entity['entity'] == 'I-ORG':
            org_end = entity['end']
        # check if entity is a person
        elif entity['entity'] == 'B-PER':
            people_start = entity['start']
            people_end = entity['end']
        elif entity['entity'] == 'I-PER':
            people_end = entity['end']


        if org_start is not None and org_end is not None:
            if i == len(entities) - 1 or entities[i + 1]['entity'] != 'I-ORG':
                new_entities.append({'entity': 'ORG', "text": text[org_start:org_end]})
                org_start = None
        if people_start is not None and people_end is not None:
            if i == len(entities) - 1 or entities[i + 1]['entity'] != 'I-PER':
                new_entities.append({'entity': 'PER', "text": text[people_start:people_end]})
                people_start = None

    return new_entities
procesed = process_entities(ner_results, text)

[{'entity': 'ORG', 'text': 'Advanced Analytics Institute'},
 {'entity': 'ORG', 'text': 'AAI'},
 {'entity': 'ORG', 'text': 'University of Technology'},
 {'entity': 'PER', 'text': 'Jianfeng Zhao'},
 {'entity': 'PER', 'text': 'Xia Mao'},
 {'entity': 'PER', 'text': 'Lijiang Chen'},
 {'entity': 'ORG', 'text': 'National Natural Science Foundation of China'},
 {'entity': 'ORG',
  'text': 'Fundamental Research Funds for the Central Universities'}]

In [3]:
paper_list = list(paper_space.get_xml_papers().values())
paper_list[1].topic, paper_list[1].title, paper_list[1].abstract

('spectral, clustering',
 'spectral clustering on large datasets: when does it work? theory from continuous clustering and density cheeger-buser',
 'Spectral clustering is one of the most popular clustering algorithms that has stood the test of time. It is simple to describe, can be implemented using standard linear algebra, and often finds better clusters than traditional clustering algorithms like k-means and k-centers. The foundational algorithm for two-way spectral clustering, by Shi and Malik, creates a geometric graph from data and finds a spectral cut of the graph.In modern machine learning, many data sets are modeled as a large number of points drawn from a probability density function. Little is known about when spectral clustering works in this setting -and when it doesn\'t. Past researchers justified spectral clustering by appealing to the graph Cheeger inequality (which states that the spectral cut of a graph approximates the "Normalized Cut"), but this justification is kno

In [7]:
[f'{author.forename} {author.surname}' for author in paper_list[0].acknowledgements.acknowledges_people]

['Jascha Sohl-Dickstein', 'Ivo Danihelka', 'Andriy Mnih', 'Leon Gatys']