# Bibtex Parsing

In [1]:
import bibtexparser
from bibtexparser.library import Library as BibTexLibrary
import networkx as nx
from pybliometrics.scopus import AbstractRetrieval
from pybliometrics.scopus.exception import Scopus404Error
from collections import namedtuple as NamedTuple
import pandas as pd
import logging
import sys


In [2]:
from src.scopus_citgraph.requests import get_from_scopus

In [12]:
ret = get_from_scopus(
    identifier='10.1016/j.compchemeng.2024.108700',
    id_type='doi'
)
print(ret)

PaperInfo(internal_id=UUID('2094db8b-6565-48dd-ad6e-8ae4bc7ce95c'), title='Production rescheduling via explorative reinforcement learning while considering nervousness', authors='Hwangbo, S.; Liu, J.J.; Ryu, J.-H.; Lee, H.J.; Na, J.', year=2024, scopus_id=85190752229, doi='10.1016/j.compchemeng.2024.108700', eid='2-s2.0-85190752229', scopus_url='https://www.scopus.com/inward/record.uri?partnerID=HzOxMe3b&scp=85190752229&origin=inward')


In [13]:
ret.authors

'Hwangbo, S.; Liu, J.J.; Ryu, J.-H.; Lee, H.J.; Na, J.'

In [14]:
hash(ret)

7895906317458454975

In [11]:
hash(ret)

2320620625928939234

In [14]:
ret = AbstractRetrieval(identifier='016/j.compchemeng.2024.108700', id_type='doi')

Scopus404Error: The resource specified cannot be found.

In [12]:
ret.authors

[Author(auid=58030902700, indexed_name='Hwangbo S.', surname='Hwangbo', given_name='Sumin', affiliation='60001018;60001018'),
 Author(auid=58992374300, indexed_name='Liu J.J.', surname='Liu', given_name='J. Jay', affiliation='60011883'),
 Author(auid=7401868467, indexed_name='Ryu J.-H.', surname='Ryu', given_name='Jun Hyung', affiliation='60103637'),
 Author(auid=57191094924, indexed_name='Lee H.J.', surname='Lee', given_name='Ho Jae', affiliation='60017776'),
 Author(auid=58891412500, indexed_name='Na J.', surname='Na', given_name='Jonggeol', affiliation='60001018;60001018')]

In [13]:
name = 'Lee H.J.'
', '.join(name.split(' '))

'Lee, H.J.'

In [15]:
int('str')

ValueError: invalid literal for int() with base 10: 'str'

In [33]:
from functools import wraps

In [117]:
def retry_scopus(num_retries: int = 3):
    """Create a decorator that re-invokes a callable after a Scopus 404.

    The wrapped callable is attempted at most ``num_retries`` times. Only
    ``Scopus404Error`` is intercepted; every other exception propagates
    unchanged. If the last permitted attempt fails as well, the attempt
    number is printed and ``None`` is returned. With ``num_retries`` below 1
    the callable is never invoked and ``None`` is returned.
    """
    def decorator(target):
        @wraps(target)
        def guarded(*args, **kwargs):
            for trial in range(1, num_retries + 1):
                try:
                    outcome = target(*args, **kwargs)
                except Scopus404Error:
                    # earlier failures are silent; only the final one is reported
                    if trial != num_retries:
                        continue
                    print(f'trial {trial}')
                    return None
                else:
                    return outcome
        return guarded
    return decorator
"""
def retry_scopus(func, num_retries: int = 3):
    @wraps(func)
    def wrapper(*args, **kwargs):
        for attempt in range(1, (num_retries+1)):
            try:
                return func(*args, **kwargs)
            except (Scopus404Error, ValueError):
                if attempt == num_retries:
                    print(f'trial {attempt}')
                    return None
    return wrapper
"""

"\ndef retry_scopus(func, num_retries: int = 3):\n    @wraps(func)\n    def wrapper(*args, **kwargs):\n        for attempt in range(1, (num_retries+1)):\n            try:\n                return func(*args, **kwargs)\n            except (Scopus404Error, ValueError):\n                if attempt == num_retries:\n                    print(f'trial {attempt}')\n                    return None\n    return wrapper\n"

In [118]:
@retry_scopus()
def func(string):
    return int(string)

In [119]:
func('123')

123

In [120]:
func.__name__

'func'

In [162]:
# Logging setup: records are written to stdout so they show up in the cell
# output; the notebook logger is filtered at LOGGING_LEVEL.
LOGGING_LEVEL = 'INFO'
logging.basicConfig(stream=sys.stdout)
logger = logging.getLogger('scopus_reader')
logger.setLevel(LOGGING_LEVEL)

In [163]:
#library = bibtexparser.parse_file('./data/scheduling.bib')
#library = bibtexparser.parse_file('./data/scheduling_clean.bib')
#len(library.entries)

In [1]:
import uuid

In [7]:
uuid.uuid4()

UUID('7eed1060-d549-40dc-a4bc-12ad0d921bf4')

In [164]:
NUM_PAPER_BATCH = None

In [165]:
def read_identifier_list(
    path_to_csv: str,
    use_doi: bool = False,
) -> list['Identifier']:
    """Load one identifier column from a CSV file and drop its empty cells.

    Args:
        path_to_csv: Location of the CSV file; it is read as UTF-8.
        use_doi: When true the ``'DOI'`` column is read, otherwise ``'EID'``.

    Returns:
        The non-missing values of the selected column as a list.

    The total number of rows and the number of dropped (missing) entries are
    logged at INFO level.
    """
    table = pd.read_csv(path_to_csv, encoding='UTF-8')
    column = 'DOI' if use_doi else 'EID'
    identifiers = table[column].dropna(ignore_index=True)

    n_rows = len(table)
    n_missing = n_rows - len(identifiers)
    logger.info(f"Entries in dataset: {n_rows}, "
                f"empty: {n_missing}")

    return identifiers.to_list()

def filter_iter_depth(
    id_tuples: set[tuple[int, int]],
    target_iter_depth: int,
) -> set[int]:
    """Select the identifiers recorded at one iteration depth.

    Args:
        id_tuples: Collection of ``(iter_depth, identifier)`` pairs.
        target_iter_depth: Depth whose identifiers should be returned.

    Returns:
        A new set holding the second element of every pair whose first
        element equals ``target_iter_depth``. The input is not modified.
    """
    # a set comprehension already de-duplicates and yields a fresh object,
    # so no membership test or defensive copy is needed
    return {entry[1] for entry in id_tuples if entry[0] == target_iter_depth}

def transform_entry_tuple_to_dict(
    entry_tuple,
) -> dict:
    """Turn an entry tuple into the attribute dictionary stored on a node.

    ``entry_tuple`` is read by position as
    ``(iter_depth, title, authors, year, scopus_id, doi)``. The iteration
    depth is copied as is; each of the remaining five values is replaced by
    an empty string when it is ``None``.
    """
    # positions 1..5 of the entry tuple, in the order the keys are inserted
    optional_fields = ('title', 'authors', 'year', 'scopus_id', 'doi')

    attributes = {'iter_depth': entry_tuple[0]}
    for position, field in enumerate(optional_fields, start=1):
        value = entry_tuple[position]
        attributes[field] = '' if value is None else value

    return attributes

def build_author_str(author_list: list[NamedTuple]) -> str:
    """Format author records as ``'Surname, Initials; Surname, Initials'``.

    Args:
        author_list: Records exposing an ``indexed_name`` attribute of the
            form ``'Surname Initials'`` (e.g. ``'Lee H.J.'``).

    Returns:
        The names joined by ``'; '``, each with a comma inserted between
        surname and initials. An empty list yields an empty string.
    """
    # split only at the last space: surnames may consist of several words
    # ('De Silva A.'), and splitting on every space would put a comma after
    # each of them
    formatted = [
        ', '.join(author.indexed_name.rsplit(' ', 1))
        for author in author_list
    ]
    return '; '.join(formatted)

"""
def build_author_collection_for_ref(author_list: str) -> tuple[str, ...]:
    # ref authors are separated by ';'
    authors: list[str] = author_list.split('; ')
        
    return tuple(authors)
"""

"\ndef build_author_collection_for_ref(author_list: str) -> tuple[str, ...]:\n    # ref authors are separated by ';'\n    authors: list[str] = author_list.split('; ')\n        \n    return tuple(authors)\n"

In [166]:
node_id_counter = 1000

In [167]:
ids = read_identifier_list(path_to_csv='./data/scopus.csv')

INFO:scopus_reader:Entries in dataset: 200, empty: 0


---

In [168]:
# if only ScopusID is used as identifier of documents, 
# corpus could be represented only by these IDs

# id tuples should only be used to get papers on higher iteration depths
# should be connected to corpus, if already known skip

# REWORK: use Scopus API also for init entries
# only list of DOIs necessary

def generate_init_graph(
    ids: list['Identifier'],
) -> tuple[nx.DiGraph, set, set, dict, dict]:
    """Build the depth-0 citation graph from a list of identifiers.

    Every identifier is looked up via ``AbstractRetrieval``; each paper that
    is found, has a title and a year, and is not yet known becomes one node
    whose ID is taken from the global ``node_id_counter`` (incremented per
    added node).

    Args:
        ids: EIDs or DOIs. The type is detected from the first entry only:
            it is treated as an EID when it contains ``'2-s2.0-'`` and no
            ``'/'``, otherwise all entries are treated as DOIs.

    Returns:
        ``(cit_graph, corpus, id_tuples, map_scopus_to_node_id,
        map_node_id_to_scopus)`` where ``corpus`` is the set of added
        Scopus IDs and ``id_tuples`` the set of ``(0, scopus_id)`` pairs.
        An empty ``ids`` list yields an empty graph and empty containers.

    Processing stops once ``NUM_PAPER_BATCH`` identifiers have been visited
    (when that constant is not ``None``).
    """
    # global identifier, simple integer
    global node_id_counter
    # build known corpus
    corpus: set[int] = set()
    id_tuples: set[tuple[int, int]] = set()
    iter_depth = 0
    # ScopusID -> node ID
    map_scopus_to_node_id = dict()
    # node ID -> ScopusID
    map_node_id_to_scopus = dict()
    # graph
    cit_graph = nx.DiGraph()

    # nothing to look up: avoid indexing into an empty list below
    if len(ids) == 0:
        logger.warning("No identifiers provided. Returning empty graph.")
        return (cit_graph, corpus, id_tuples,
                map_scopus_to_node_id, map_node_id_to_scopus)

    # check ID type
    test_id = ids[0]
    if '2-s2.0-' in test_id and '/' not in test_id:
        # EID
        id_type: str = 'eid'
    else:
        # DOI
        id_type: str = 'doi'

    for idx, ident in enumerate(ids):

        if (NUM_PAPER_BATCH is not None and
            idx == NUM_PAPER_BATCH):
            break

        # obtain information from Scopus; an unknown identifier must not
        # abort the whole batch (same policy as in add_refs_by_depth)
        try:
            paper_info = AbstractRetrieval(identifier=ident, view='FULL', id_type=id_type)
        except Scopus404Error:
            logger.warning(f'Could not obtain information for identifier: {ident}')
            continue

        title = paper_info.title
        # guard against records without author list or cover date
        authors = build_author_str(paper_info.authors or [])
        cover_date = paper_info.coverDate
        year = cover_date.split('-')[0] if cover_date else None
        scopus_id = paper_info.identifier
        doi = paper_info.doi

        if not all((title, year)):
            logger.warning(f"{ident=} not containing title or year. Skipped.")
            continue

        if scopus_id in corpus:
            logger.info(f"{scopus_id=} already in known corpus set. Skipped")
            continue
        corpus.add(scopus_id)

        # remember at which iteration depth the paper was found
        id_tuples.add((iter_depth, scopus_id))

        map_scopus_to_node_id[scopus_id] = node_id_counter
        map_node_id_to_scopus[node_id_counter] = scopus_id

        # NetworkX: (node ID, node_attribute_dict)
        entry_tuple = (iter_depth, title, authors, year, scopus_id, doi)
        node_props = transform_entry_tuple_to_dict(entry_tuple=entry_tuple)
        cit_graph.add_nodes_from([(node_id_counter, node_props)])

        node_id_counter += 1

    return (cit_graph, corpus, id_tuples,
            map_scopus_to_node_id, map_node_id_to_scopus)

In [169]:
# Guard for the stand-alone depth-0 build: set SKIP to False to run it.
SKIP = True

if not SKIP:
    (cit_graph, corpus, id_tuples,
     map_scopus_to_node_id, map_node_id_to_scopus) = generate_init_graph(ids=ids)

In [170]:
#cit_graph.nodes[1000]

In [171]:
#len(cit_graph.nodes)

In [172]:
def add_refs_by_depth(
    cit_graph: nx.DiGraph,
    corpus: set,
    id_tuples: set,
    map_scopus_to_node_id: dict,
    map_node_id_to_scopus: dict,
    target_iter_depth: int,
) -> tuple[nx.DiGraph, set, set, dict, dict]:
    """Add the references of all papers of the previous depth to the graph.

    For every Scopus ID recorded at depth ``target_iter_depth - 1`` (or at
    depth 0 when the target depth is 0) the references are requested via
    ``AbstractRetrieval``. Each reference with a Scopus ID becomes a node
    (if not yet known) and an edge parent -> reference is added.

    Args:
        cit_graph: Graph to extend; it is modified in place.
        corpus: Known Scopus IDs; copied, the copy is extended and returned.
        id_tuples: Known ``(iter_depth, scopus_id)`` pairs; copied, the copy
            is extended and returned.
        map_scopus_to_node_id: ScopusID -> node ID; extended in place.
        map_node_id_to_scopus: node ID -> ScopusID; extended in place.
        target_iter_depth: Depth assigned to newly found references.

    Returns:
        ``(cit_graph, corpus, id_tuples, map_scopus_to_node_id,
        map_node_id_to_scopus)``; the two mappings are returned as copies.

    Raises:
        ValueError: If ``target_iter_depth`` is negative.

    New node IDs are drawn from the global ``node_id_counter``, which is
    advanced once per newly created node.
    """
    # global identifier, simple integer
    global node_id_counter

    target_corpus = corpus.copy()
    target_id_tuples = id_tuples.copy()
    if target_iter_depth == 0:
        filter_depth = 0
    elif target_iter_depth > 0:
        filter_depth = target_iter_depth - 1
    else:
        raise ValueError("Target depth must be non-negative!")

    iter_corpus = filter_iter_depth(id_tuples=target_id_tuples,
                                    target_iter_depth=filter_depth)

    # parents are always addressed by their ScopusID
    id_type = 'scopus_id'

    for scopus_id in iter_corpus:
        # REWORK: should not be necessary anymore
        # skip if identifier is not provided
        if scopus_id is None or scopus_id == '':
            logger.info("Skipped paper because of missing identifier")
            continue

        # node ID
        node_id_parent = map_scopus_to_node_id[scopus_id]
        logger.debug(f"-------------- \n {scopus_id=}")
        logger.debug(f"{node_id_parent=}")

        # obtain references
        try:
            refs = AbstractRetrieval(identifier=scopus_id, view='FULL', id_type=id_type).references
        except Scopus404Error:
            # information could not be obtained from Scopus
            # continue with next entry
            logger.warning(f'Could not obtain reference information for ScopusID: {scopus_id}')
            continue

        # skip empty references
        if refs is None:
            logger.info(f"No references for ID type: {id_type}, ID: {scopus_id}")
            continue

        for ref in refs:
            title = ref.title
            authors = ref.authors
            year = ref.publicationyear
            doi = ref.doi
            # ignore references without ScopusID; this has to be checked
            # before the conversion, because int(None) raises TypeError
            if ref.id is None:
                logger.warning(f"Reference with title: {title}, year: {year} does not contain ScopusID.")
                continue
            # separate name so the parent's ID is not shadowed
            ref_scopus_id = int(ref.id)
            logger.debug(f"ScopusID of ref: {ref_scopus_id}, DOI of ref: {doi}")

            if ref_scopus_id not in target_corpus:
                # not known paper: add to corpus and create its node
                target_corpus.add(ref_scopus_id)
                node_id_child = node_id_counter
                # the counter only advances when an ID was actually used
                node_id_counter += 1
                map_scopus_to_node_id[ref_scopus_id] = node_id_child
                map_node_id_to_scopus[node_id_child] = ref_scopus_id

                # NetworkX: (node ID, node_attribute_dict)
                entry_tuple = (target_iter_depth, title, authors,
                               year, ref_scopus_id, doi)
                node_props = transform_entry_tuple_to_dict(entry_tuple=entry_tuple)
                cit_graph.add_nodes_from([(node_id_child, node_props)])
            else:
                # already known: reuse its node and leave its attributes
                # (in particular iter_depth) untouched; re-adding the node
                # with new attributes would overwrite them
                node_id_child = map_scopus_to_node_id[ref_scopus_id]

            # add id tuple if not known with this iteration depth
            # other depths not relevant
            target_id_tuples.add((target_iter_depth, ref_scopus_id))

            # add edge
            cit_graph.add_edge(node_id_parent, node_id_child)

    return (cit_graph, target_corpus.copy(), target_id_tuples.copy(),
            map_scopus_to_node_id.copy(), map_node_id_to_scopus.copy())

# function to build graphs with customizable iteration depth
def generate_iter_graph(
    ids: list['Identifier'],
    target_iter_depth: int,
) -> tuple[nx.DiGraph, set, set, dict, dict]:
    """Build a citation graph down to a given reference depth.

    The depth-0 graph is created with ``generate_init_graph``; afterwards
    ``add_refs_by_depth`` is called once per depth from 1 up to and
    including ``target_iter_depth``.

    Args:
        ids: Identifiers of the initial papers.
        target_iter_depth: Number of reference levels to add; 0 returns the
            initial graph only.

    Returns:
        ``(cit_graph, corpus, id_tuples, map_scopus_to_node_id,
        map_node_id_to_scopus)``.

    Raises:
        ValueError: If ``target_iter_depth`` is negative.
    """
    # validate first, so an invalid depth does not trigger any Scopus lookups
    if target_iter_depth < 0:
        raise ValueError("Target depth must be non-negative!")

    # generate init graph with library
    (cit_graph, corpus, id_tuples,
     map_scopus_to_node_id, map_node_id_to_scopus) = generate_init_graph(ids=ids)

    # sequentially build graph; the range is empty for depth 0
    for iter_depth in range(1, target_iter_depth + 1):
        (cit_graph, corpus, id_tuples,
         map_scopus_to_node_id, map_node_id_to_scopus) = add_refs_by_depth(
            cit_graph=cit_graph,
            corpus=corpus,
            id_tuples=id_tuples,
            map_scopus_to_node_id=map_scopus_to_node_id,
            map_node_id_to_scopus=map_node_id_to_scopus,
            target_iter_depth=iter_depth,
        )

    return (cit_graph, corpus.copy(), id_tuples.copy(),
            map_scopus_to_node_id.copy(), map_node_id_to_scopus.copy())

In [173]:
# Guard for the stand-alone depth-1 expansion: set SKIP to False to run it.
SKIP = True

if not SKIP:
    (cit_graph_iter, target_corpus, target_id_tuples,
    map_tuple_to_id, map_id_to_tuple) = add_refs_by_depth(
        cit_graph=cit_graph,
        corpus=corpus,
        id_tuples=id_tuples,
        map_scopus_to_node_id=map_scopus_to_node_id,
        # each mapping goes to its own parameter; the function writes
        # opposite key/value pairs into the two dictionaries
        map_node_id_to_scopus=map_node_id_to_scopus,
        target_iter_depth=1,
    )

In [174]:
#cit_graph_iter.nodes[1002]

In [175]:
#len(cit_graph_iter.nodes)

In [176]:
#len(target_corpus)

In [177]:
#len(cit_graph_iter.edges)

---

In [178]:
node_id_counter = 1000
ITER_DEPTH = 1

(cit_graph, known_corpus, known_id_tuples,
 map_tuple_to_id, map_id_to_tuple) = generate_iter_graph(ids=ids, target_iter_depth=ITER_DEPTH)

INFO:scopus_reader:No references for ID type: scopus_id, ID: 85183656095
INFO:scopus_reader:No references for ID type: scopus_id, ID: 85183939814
INFO:scopus_reader:No references for ID type: scopus_id, ID: 85175279593


In [179]:
len(cit_graph.nodes)

7683

In [180]:
cit_graph.nodes[1001]

{'iter_depth': 0,
 'title': 'An actor-critic framework based on deep reinforcement learning for addressing flexible job shop scheduling problems',
 'authors': 'Zhao, C.; Deng, N.',
 'year': '2024',
 'scopus_id': 85181525982,
 'doi': '10.3934/mbe.2024062'}

# Print the type of every node attribute and report nodes that hold a class
# object as attribute value.
for node in cit_graph.nodes:
    for attrib, value in cit_graph.nodes[node].items():
        print(f"{attrib=}, {type(value)}")
        # compare against the type object itself; a comparison with the
        # string "<class 'type'>" is always False
        if type(value) is type:
            print(node)

In [181]:
len(known_corpus)

7683

In [182]:
len(cit_graph.edges)

9228

In [183]:
# Export the citation graph as GraphML; the file name records the batch size
# ('All' when no batch limit is set) and the iteration depth.
file_batch_info = 'All' if NUM_PAPER_BATCH is None else NUM_PAPER_BATCH

GRAPH_FILE_PATH = f'citations_batch{file_batch_info}_iterDepth{ITER_DEPTH}.graphml'
nx.write_graphml(cit_graph, GRAPH_FILE_PATH)

In [125]:
doi = '10.1109/ICCSCE50387.2020.9204921'
#doi = '10.1109/ACCESS.2020.2997663'
#doi = '10.1016/j.swevo.2024.101497'
#doi = '10.48550/arXiv.1509.02971'
doi = dois[0]

In [126]:
try:
    refs = AbstractRetrieval(identifier=doi, view='FULL', id_type='doi')
except Scopus404Error:
    print('ERROR')

In [127]:
print(refs)

Cong Luo, Wenyin Gong, Fei Ming and Chao Lu: "A Q-learning memetic algorithm for energy-efficient heterogeneous distributed assembly permutation flowshop scheduling considering priorities", Swarm and Evolutionary Computation, 85, (no pages found)(2024). https://doi.org/10.1016/j.swevo.2024.101497.
0 citation(s) as of 2024-03-11
  Affiliation(s):
   China University of Geosciences


In [128]:
print(refs)

Cong Luo, Wenyin Gong, Fei Ming and Chao Lu: "A Q-learning memetic algorithm for energy-efficient heterogeneous distributed assembly permutation flowshop scheduling considering priorities", Swarm and Evolutionary Computation, 85, (no pages found)(2024). https://doi.org/10.1016/j.swevo.2024.101497.
0 citation(s) as of 2024-03-11
  Affiliation(s):
   China University of Geosciences


In [78]:
len('85142134801')

11

In [79]:
ident = 18306059
ident = 85142134801

In [82]:
eid = '2-s2.0-85142134801'
ret = AbstractRetrieval(identifier=eid, view='FULL', id_type='eid')

Scopus404Error: The resource specified cannot be found.

In [80]:
ret = AbstractRetrieval(identifier=ident, view='FULL', id_type='scopus_id')

Scopus404Error: The resource specified cannot be found.

In [136]:
# 2-s2.0-85184079846
ret = AbstractRetrieval(identifier='2-s2.0-85184079846', view='FULL', id_type='eid')

In [138]:
ret = AbstractRetrieval(identifier='85184079846', view='FULL', id_type='scopus_id')

In [139]:
print(ret)

Cong Luo, Wenyin Gong, Fei Ming and Chao Lu: "A Q-learning memetic algorithm for energy-efficient heterogeneous distributed assembly permutation flowshop scheduling considering priorities", Swarm and Evolutionary Computation, 85, (no pages found)(2024). https://doi.org/10.1016/j.swevo.2024.101497.
0 citation(s) as of 2024-03-11
  Affiliation(s):
   China University of Geosciences


- Export Scopus CSV: EID, DOI
- references: ScopusID, DOI

In [135]:
ret.references

[Reference(position='1', id='0003438602', doi=None, title=None, authors='Pinedo, M.L.', authors_auid=None, authors_affiliationid=None, sourcetitle='Scheduling: Theory, Algorithms, and Systems', publicationyear='2000', coverDate=None, volume=None, issue=None, first=None, last=None, citedbycount=None, type=None, text='Englewood Cliffs, NJ, USA: Prentice-Hall', fulltext='M. L. Pinedo, Scheduling: Theory, Algorithms, and Systems. Englewood Cliffs, NJ, USA: Prentice-Hall, 2000.'),
 Reference(position='2', id='84962109552', doi=None, title='Automated design of production scheduling heuristics: A review', authors='Branke, J.; Nguyen, S.; Pickardt, C.W.; Zhang, M.J.', authors_auid=None, authors_affiliationid=None, sourcetitle='Ieee Trans. Evol. Comput.', publicationyear='2016', coverDate=None, volume=None, issue=None, first=None, last=None, citedbycount=None, type=None, text='Feb.', fulltext='J. Branke, S. Nguyen, C. W. Pickardt, and M. J. Zhang, "Automated design of production scheduling heur

In [163]:
len('85096785100')

11

In [129]:
refs.references

[Reference(position='1', id='85096785100', doi=None, title='Scheduling dual-objective stochastic hybrid flow shop with deteriorating jobs via bi-population evolutionary algorithm', authors='Fu, Y.; Zhou, M.; Guo, X.; Qi, L.', authors_auid=None, authors_affiliationid=None, sourcetitle='IEEE Trans. Syst. Man Cybern.: Syst.', publicationyear='2019', coverDate=None, volume=None, issue=None, first=None, last=None, citedbycount=None, type=None, text=None, fulltext='Fu, Y., Zhou, M., Guo, X., Qi, L., Scheduling dual-objective stochastic hybrid flow shop with deteriorating jobs via bi-population evolutionary algorithm. IEEE Trans. Syst. Man Cybern.: Syst. 50:12 (2019), 5037–5048.'),
 Reference(position='2', id='85184299618', doi=None, title='Multi-objective home health care routing and scheduling with sharing service via a problem-specific knowledge-based artificial bee colony algorithm', authors='Fu, Y.; Ma, X.; Gao, K.; Li, Z.; Dong, H.', authors_auid=None, authors_affiliationid=None, source

In [372]:
refs.coverDate.split('-')[0]

'2020'

In [390]:
ref = refs.references[0]

In [391]:
ref.authors

'Wang, J.; Ma, Y.; Zhang, L.; Gao, R.X.; Wu, D.'

In [375]:
ref.authors.split('; ')

['Wang, J.', 'Ma, Y.', 'Zhang, L.', 'Gao, R.X.', 'Wu, D.']

In [381]:
refs.authors[0]

Author(auid=55582207400, indexed_name='Abidi M.H.', surname='Abidi', given_name='Mustufa Haider', affiliation='60013183')

In [382]:
refs.coverDate

'2020-01-01'

In [387]:
', '.join(refs.authors[0].indexed_name.split(' '))

'Abidi, M.H.'

In [389]:
build_author_collection(refs.authors)

'Abidi, M.H.; Alkhalefah, H.; Mohammed, M.K.; Umer, U.; Qudeiri, J.E.A.'

In [379]:
'; '.join(tup)

'Abidi M.H.; Alkhalefah H.; Mohammed M.K.; Umer U.; Qudeiri J.E.A.'

In [115]:
type(refs.authors[0])

pybliometrics.scopus.abstract_retrieval.Author

In [120]:
build_author_tuple(refs.authors)

('Abidi M.H.', 'Alkhalefah H.', 'Mohammed M.K.', 'Umer U.', 'Qudeiri J.E.A.')

In [31]:
ref = refs.references[0]

In [32]:
ref.authors

'Wang, J.; Ma, Y.; Zhang, L.; Gao, R.X.; Wu, D.'

In [58]:
len(cit_graph.nodes)

397

In [44]:
len(known_corpus)

423

In [40]:
for i in range(1,2+1):
    print(i)

1
2


In [30]:
cit_graph_iter, target_corpus, map_tuple_to_id, map_id_to_tuple = add_refs_by_depth(
    cit_graph=cit_graph,
    corpus=known_corpus,
    map_tuple_to_id=map_tuple_to_id,
    map_id_to_tuple=map_id_to_tuple,
    target_iter_depth=1,
)

In [31]:
len(cit_graph_iter.nodes)

397

In [32]:
len(target_corpus)

397

In [33]:
cit_graph_iter.nodes[1000]

{'iter_depth': 0,
 'title': 'Optimal Scheduling of Flexible Manufacturing System Using Improved Lion-Based Hybrid Machine Learning Approach',
 'year': '2020',
 'doi': '10.1109/ACCESS.2020.2997663'}

In [39]:
len(cit_graph_iter.edges)

389

---

In [9]:
# NOTE(review): scratch version of the reference expansion. It works on
# 4-element entry tuples (iter_depth, title, year, doi) and looks papers up
# by DOI, while the transform_entry_tuple_to_dict defined earlier in this
# notebook reads positions 0-5 of its argument — presumably this cell
# predates the current helpers and no longer runs against them; confirm
# before re-executing.
iter_depth = 1
iter_corpus = filter_iter_depth(known_corpus, 0)

for paper in iter_corpus:
    # doi is last entry of tuple
    doi = paper[-1]
    print(f'{doi=}')
    
    # skip if doi is not provided
    if doi is None:
        continue
    
    # node ID
    node_id_parent = map_tuple_to_id[paper]
    #print(f'{node_id_parent=}')
    
    
    # obtain references
    refs = AbstractRetrieval(identifier=doi, view='FULL', id_type='doi').references
    # skip empty references
    if refs is None:
        continue
    
    #print(f'{type(refs)=}')
    
    for ref in refs:
        title = ref.title
        year = ref.publicationyear
        # note: rebinds the name that held the parent's DOI above
        doi = ref.doi
        
        if not all((title, year)):
            #raise UserWarning(f"{ref=} not containing title or year")
            # ignore references which do not contain title or year
            continue
        
        entry_tuple = (iter_depth, title, year, doi)
        
        # check if tuple is already in corpus
        if entry_tuple not in known_corpus:
            # not known paper, add to corpus
            known_corpus.add(entry_tuple)
            map_tuple_to_id[entry_tuple] = node_id_counter
            map_id_to_tuple[node_id_counter] = entry_tuple
            node_id_child = node_id_counter
        else:
            # already known: get node ID for this tuple
            node_id_child = map_tuple_to_id[entry_tuple]
            #raise UserWarning(f"{entry_tuple=} already in known corpus")
        
        #map_tuple_to_id[entry_tuple] = custom_id
        #map_id_to_tuple[custom_id] = entry_tuple
        
        # add child as node
        # NetworkX: (node ID, node_attribute_dict)
        node_props = transform_entry_tuple_to_dict(entry_tuple=entry_tuple)
        node = (node_id_child, node_props)
        cit_graph.add_nodes_from([node])
        # add edge
        cit_graph.add_edge(node_id_parent, node_id_child)
        
        
        # set up ID counter (incremented for every reference, known or new)
        node_id_counter += 1

doi='10.1016/j.agwat.2022.107720'
node_id_parent=1008
type(refs)=<class 'list'>
doi='10.1109/ACCESS.2020.2975738'
node_id_parent=1021
type(refs)=<class 'list'>
doi='10.1109/ACCESS.2020.2997663'
node_id_parent=1000
type(refs)=<class 'list'>
doi='10.1016/j.jmsy.2022.03.011'
node_id_parent=1018
type(refs)=<class 'list'>
doi='10.3390/app13020806'
node_id_parent=1038
type(refs)=<class 'list'>
doi='10.3311/PPme.20145'
node_id_parent=1004
type(refs)=<class 'list'>
doi='10.3837/tiis.2021.08.016'
node_id_parent=1006
type(refs)=<class 'list'>
doi='10.1109/ACCESS.2020.3007257'
node_id_parent=1010
type(refs)=<class 'list'>
doi='10.1108/K-06-2013-0101'
node_id_parent=1040
type(refs)=<class 'list'>
doi='10.1016/j.ijepes.2022.108225'
node_id_parent=1007
type(refs)=<class 'list'>
doi='10.1016/j.eswa.2020.113405'
node_id_parent=1039
type(refs)=<class 'list'>
doi='10.18280/jesa.520202'
node_id_parent=1032
type(refs)=<class 'list'>
doi='10.4467/20838476SI.16.013.6194'
node_id_parent=1027
type(refs)=<clas

2205

In [90]:
len(known_corpus)

50

In [10]:
import uuid

In [16]:
uuid.uuid4()

UUID('042b417a-211a-4ca5-8b3e-0a29e8363cf4')

In [10]:
len(cit_graph.edges)

2187

In [12]:
cit_graph.degree

DiDegreeView({1000: 51, 1001: 39, 1002: 13, 1003: 57, 1004: 25, 1005: 47, 1006: 30, 1007: 39, 1008: 43, 1009: 45, 1010: 29, 1011: 15, 1012: 39, 1013: 69, 1014: 46, 1015: 34, 1016: 47, 1017: 4, 1018: 52, 1019: 32, 1020: 43, 1021: 35, 1022: 45, 1023: 85, 1024: 40, 1025: 34, 1026: 100, 1027: 10, 1028: 95, 1029: 37, 1030: 35, 1031: 62, 1032: 16, 1033: 30, 1034: 41, 1035: 47, 1036: 54, 1037: 47, 1038: 57, 1039: 100, 1040: 42, 1041: 39, 1042: 5, 1043: 26, 1044: 73, 1045: 112, 1046: 35, 1047: 20, 1048: 29, 1049: 37, 1050: 1, 1051: 1, 1052: 1, 1053: 1, 1054: 1, 1055: 1, 1056: 1, 1057: 1, 1058: 1, 1059: 1, 1060: 1, 1061: 1, 1062: 1, 1063: 1, 1064: 1, 1065: 3, 1066: 1, 1067: 1, 1068: 1, 1069: 1, 1070: 1, 1071: 1, 1072: 1, 1073: 1, 1074: 1, 1075: 1, 1076: 1, 1077: 1, 1078: 1, 1079: 1, 1080: 1, 1081: 1, 1082: 1, 1083: 1, 1084: 1, 1085: 1, 1086: 1, 1087: 1, 1088: 1, 1089: 1, 1090: 1, 1091: 1, 1092: 1, 1093: 1, 1094: 1, 1095: 1, 1096: 1, 1097: 1, 1098: 3, 1099: 1, 1100: 1, 1101: 1, 1102: 1, 1103: 1,

In [13]:
nx.degree_histogram(cit_graph)

[0,
 2132,
 23,
 3,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 2,
 2,
 0,
 1,
 0,
 2,
 3,
 0,
 2,
 0,
 4,
 1,
 1,
 1,
 2,
 0,
 2,
 1,
 4,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1]

In [91]:
node_id_counter

5369

In [92]:
len(known_corpus)

4248

In [183]:
filter_corpus = filter_iter_depth(known_corpus, 1)

In [184]:
filter_corpus

{(1,
  'A Petri net based deadlock avoidance policy for-exible manufacturing systems with assembly operations and multiple resource acquisition',
  '2019',
  None),
 (1,
  'A comparative study of feature extraction using PCA and LDA for face recognition',
  '2011',
  None),
 (1,
  'A deadlock prevention policy for-exible manufacturing systems modeled with Petri nets using structural analysis',
  '2019',
  None),
 (1,
  'A feature selection approach for hyperspectral image based on modi-ed ant lion optimizer',
  '2019',
  None),
 (1,
  'A hybrid metaheuristic solution approach for the cobot assignment and job shop scheduling problem',
  '2022',
  '10.1016/j.jii.2022.100350'),
 (1,
  'A minimal supervisory structure to optimally enforce liveness on Petri net models for-exible manufacturing systems',
  '2017',
  None),
 (1,
  'A next-generation hyperparameter optimization framework',
  '2019',
  '10.1145/3292500.3330701'),
 (1,
  'A novel PCA-re-y based XGBoost classi-cation model for int

In [103]:
ab = AbstractRetrieval("10.1016/j.jmsy.2021.09.011", view='FULL', id_type='doi')

In [115]:
ref = ab.references[1]

In [118]:
ref

Reference(position='2', id='85115414953', doi=None, title='A metamodel for digital planning in the supply chain 4.0', authors='Serrano-Ruiz, J.C.; Mula, J.; Poler, R.', authors_auid=None, authors_affiliationid=None, sourcetitle='J Ind Inf Integr', publicationyear='2021', coverDate=None, volume=None, issue=None, first=None, last=None, citedbycount=None, type=None, text='Elsevier. Submitted for publication', fulltext='Serrano-Ruiz, J.C., Mula, J., Poler, R., A metamodel for digital planning in the supply chain 4.0. J Ind Inf Integr, 2021 Elsevier. Submitted for publication.')

In [117]:
ref.publicationyear

'2021'

- interim result: built corpus, no connections yet

- add or make node out of each title
    - NetworkX: each node must be hashable, so tuple can be used
        - use tuple (ID, (properties)) generated by map_id_to_tuple.items()

- iterate over corpus:
    - check DOI
        - if no DOI: continue with next entry
    - lookup DOI in scopus
    - obtain references
    - build tuple out of title, year, doi
    - check if tuple already exists

- build set for current corpus (title, year)

using API request to build citation tree:
- obtain information of entry:
    - DOI
        - if no DOI: abort
    - Hash out of title, publication year (tuple)
        - hash used to identify cross links in the current corpus
        - add hash to a set of known hashes
- create mapping: DOI, etc. and custom identifier
    - remember iteration depth
    - add custom identifier to graph
    - create mapping identifier to iteration depth
- lookup DOI in Scopus by API call
- get references
    - create mapping: DOI, etc. and custom identifier
    - remember iteration depth of children
    - add custom identifier for children to graph
    - create mapping identifier to iteration depth
- create edge for parent and children
- start again for higher iteration depth if needed

In [43]:
s = set()

s.add(('Test text', 2014, None))

In [44]:
s

{('Test text', 2014, None)}

In [49]:
t = ('Test text', 2014, 'DOI')

In [50]:
t in s

False

In [3]:
library = bibtexparser.parse_string(bibtex_str)

AttributeError: module 'bibtexparser' has no attribute 'parse_string'

In [87]:
bibtexparser.parse_string()

AttributeError: module 'bibtexparser' has no attribute 'parse_string'

# Scopus API

- obtain references

In [74]:
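# Elsevier API key: <redacted> — keep the key out of the notebook (e.g. in the pybliometrics configuration file); a key that was committed here should be revoked and replaced.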

from pybliometrics.scopus import AbstractRetrieval

In [75]:
ab = AbstractRetrieval("10.1016/j.jmsy.2021.09.011", view='FULL')

In [76]:
ab.title

'Smart manufacturing scheduling: A literature review'

In [78]:
ab.references

[Reference(position='1', id='84992957342', doi='10.1108/09600039810247524', title='The supply chain complexity triangle: uncertainty generation in the supply chain', authors='Wilding, R.', authors_auid=None, authors_affiliationid=None, sourcetitle='Int J Phys Distrib Logist Manag', publicationyear='1998', coverDate=None, volume=None, issue=None, first=None, last=None, citedbycount=None, type=None, text=None, fulltext='Wilding, R., The supply chain complexity triangle: uncertainty generation in the supply chain. Int J Phys Distrib Logist Manag 28:January 8 (1998), 599–616, 10.1108/09600039810247524.'),
 Reference(position='2', id='85115414953', doi=None, title='A metamodel for digital planning in the supply chain 4.0', authors='Serrano-Ruiz, J.C.; Mula, J.; Poler, R.', authors_auid=None, authors_affiliationid=None, sourcetitle='J Ind Inf Integr', publicationyear='2021', coverDate=None, volume=None, issue=None, first=None, last=None, citedbycount=None, type=None, text='Elsevier. Submitte

In [82]:
ref = ab.references[1]
ref

Reference(position='2', id='85115414953', doi=None, title='A metamodel for digital planning in the supply chain 4.0', authors='Serrano-Ruiz, J.C.; Mula, J.; Poler, R.', authors_auid=None, authors_affiliationid=None, sourcetitle='J Ind Inf Integr', publicationyear='2021', coverDate=None, volume=None, issue=None, first=None, last=None, citedbycount=None, type=None, text='Elsevier. Submitted for publication', fulltext='Serrano-Ruiz, J.C., Mula, J., Poler, R., A metamodel for digital planning in the supply chain 4.0. J Ind Inf Integr, 2021 Elsevier. Submitted for publication.')

In [80]:
ref.id

'85115414953'

In [81]:
ref.title

'A metamodel for digital planning in the supply chain 4.0'