In [9]:
from scholarly import scholarly
import jsonpickle
import json
from datetime import datetime
import os


In [11]:
author: dict = scholarly.search_author_id('4DbuyrgAAAAJ')
scholarly.fill(author, sections=['basics', 'indices', 'counts', 'publications','coauthors'])
name = author['name']
author['updated'] = str(datetime.now())
author['publications'] = {v['author_pub_id']:v for v in author['publications']}
print(json.dumps(author, indent=2))
os.makedirs('results', exist_ok=True)
with open(f'results/gs_data.json', 'w') as outfile:
    json.dump(author, outfile, ensure_ascii=False)


{
  "container_type": "Author",
  "filled": [
    "basics",
    "publications",
    "indices",
    "counts",
    "coauthors"
  ],
  "scholar_id": "4DbuyrgAAAAJ",
  "source": "AUTHOR_PROFILE_PAGE",
  "name": "Daniel Rodriguez-Cardenas",
  "affiliation": "William & Mary",
  "organization": 12917003543313146137,
  "interests": [
    "Computer Science"
  ],
  "email_domain": "@wm.edu",
  "homepage": "https://danielrcardenas.github.io/",
  "citedby": 65,
  "publications": {
    "4DbuyrgAAAAJ:8k81kl-MbHgC": {
      "container_type": "Publication",
      "source": "AUTHOR_PUBLICATION_ENTRY",
      "bib": {
        "title": "Benchmarking causal study to interpret large language models for source code",
        "pub_year": "2023"
      },
      "filled": false,
      "author_pub_id": "4DbuyrgAAAAJ:8k81kl-MbHgC",
      "num_citations": 21,
      "citedby_url": "https://scholar.google.com/scholar?oi=bibs&hl=en&cites=14523889433872554206",
      "cites_id": [
        "14523889433872554206"
      ]

In [3]:
shieldio_data = {
  "schemaVersion": 1,
  "label": "citations",
  "message": f"{author['citedby']}",
}
with open(f'results/gs_data_shieldsio.json', 'w') as outfile:
    json.dump(shieldio_data, outfile, ensure_ascii=False)


In [4]:
shieldio_data_mtl = {
  "schemaVersion": 1,
  "label": "citations",
  "message": f"{author['publications']['Uf9GqRsAAAAJ:bEWYMUwI8FkC']['num_citations']}",
}
with open(f'results/gs_data_shieldsio_mtl.json', 'w') as outfile:
    json.dump(shieldio_data_mtl, outfile, ensure_ascii=False)
    

KeyError: 'Uf9GqRsAAAAJ:bEWYMUwI8FkC'

In [5]:
shieldio_data_mnemonics = {
  "schemaVersion": 1,
  "label": "citations",
  "message": f"{author['publications']['Uf9GqRsAAAAJ:k_IJM867U9cC']['num_citations']}",
}
with open(f'results/gs_data_shieldsio_mnemonics.json', 'w') as outfile:
    json.dump(shieldio_data_mnemonics, outfile, ensure_ascii=False)

KeyError: 'Uf9GqRsAAAAJ:k_IJM867U9cC'

In [None]:
shieldio_data_aanets = {
  "schemaVersion": 1,
  "label": "citations",
  "message": f"{author['publications']['Uf9GqRsAAAAJ:u_35RYKgDlwC']['num_citations']}",
}
with open(f'results/gs_data_shieldsio_aanets.json', 'w') as outfile:
    json.dump(shieldio_data_aanets, outfile, ensure_ascii=False)
    

In [None]:
shieldio_data_e3bm = {
  "schemaVersion": 1,
  "label": "citations",
  "message": f"{author['publications']['Uf9GqRsAAAAJ:vV6vV6tmYwMC']['num_citations']}",
}
with open(f'results/gs_data_shieldsio_e3bm.json', 'w') as outfile:
    json.dump(shieldio_data_e3bm, outfile, ensure_ascii=False)
   

In [None]:
shieldio_data_lst = {
  "schemaVersion": 1,
  "label": "citations",
  "message": f"{author['publications']['Uf9GqRsAAAAJ:TFP_iSt0sucC']['num_citations']}",
}
with open(f'results/gs_data_shieldsio_lst.json', 'w') as outfile:
    json.dump(shieldio_data_lst, outfile, ensure_ascii=False)


In [12]:
import feedparser

rss_url = 'https://dblp.org/pid/222/6222.rss'  # Replace with your RSS URL
feed = feedparser.parse(rss_url)



In [83]:
import yaml
def generate_yaml(entries, output_file='publications.yml'):
    with open(output_file, 'w') as file:
        yaml.dump(entries, file, allow_unicode=True, default_flow_style=False, sort_keys=False)



In [None]:
# Example usage
dblp_urls = [
    'https://dblp.org/pid/222/6222.html'
]



In [66]:
import requests
from lxml import etree
from collections import namedtuple

DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
DBLP_AUTHOR_SEARCH_URL = DBLP_BASE_URL + 'search/author'

DBLP_PERSON_URL = DBLP_BASE_URL + 'pers/xk/{urlpt}'
DBLP_PUBLICATION_URL = DBLP_BASE_URL + 'rec/bibtex/{key}.xml'

class LazyAPIData(object):
    def __init__(self, lazy_attrs):
        self.lazy_attrs = set(lazy_attrs)
        self.data = None

    def __getattr__(self, key):
        if key in self.lazy_attrs:
            if self.data is None:
                self.load_data()
            return self.data[key]

    def load_data(self):
        pass

class Author(LazyAPIData):
    """
    Represents a DBLP author. All data but the author's key is lazily loaded.
    Fields that aren't provided by the underlying XML are None.

    Attributes:
    name - the author's primary name record
    publications - a list of lazy-loaded Publications results by this author
    homepages - a list of author homepage URLs
    homonyms - a list of author aliases
    """
    def __init__(self, urlpt):
        self.urlpt = urlpt
        self.xml = None
        super(Author, self).__init__(['name','publications','homepages',
                                      'homonyms'])

    def load_data(self):
        resp = requests.get(DBLP_PERSON_URL.format(urlpt=self.urlpt))
        # TODO error handling
        xml = resp.content
        self.xml = xml
        root = etree.fromstring(xml)
        data = {
            'name':root.attrib['name'],
            'publications':[Publication(k) for k in 
                            root.xpath('/dblpperson/dblpkey[not(@type)]/text()')],
            'homepages':root.xpath(
                '/dblpperson/dblpkey[@type="person record"]/text()'),
            'homonyms':root.xpath('/dblpperson/homonym/text()')
        }

        self.data = data

def first_or_none(seq):
    try:
        return next(iter(seq))
    except StopIteration:
        pass

Publisher = namedtuple('Publisher', ['name', 'href'])
Series = namedtuple('Series', ['text','href'])
Citation = namedtuple('Citation', ['reference','label'])

class Publication(LazyAPIData):
    """
    Represents a DBLP publication- eg, article, inproceedings, etc. All data but
    the key is lazily loaded. Fields that aren't provided by the underlying XML
    are None.

    Attributes:
    type - the publication type, eg "article", "inproceedings", "proceedings",
    "incollection", "book", "phdthesis", "mastersthessis"
    sub_type - further type information, if provided- eg, "encyclopedia entry",
    "informal publication", "survey"
    title - the title of the work
    authors - a list of author names
    journal - the journal the work was published in, if applicable
    volume - the volume, if applicable
    number - the number, if applicable
    chapter - the chapter, if this work is part of a book or otherwise
    applicable
    pages - the page numbers of the work, if applicable
    isbn - the ISBN for works that have them
    ee - an ee URL
    crossref - a crossrel relative URL
    publisher - the publisher, returned as a (name, href) named tuple
    citations - a list of (text, label) named tuples representing cited works
    series - a (text, href) named tuple describing the containing series, if
    applicable
    """
    def __init__(self, key):
        self.key = key
        self.xml = None
        super(Publication, self).__init__( ['type', 'sub_type', 'mdate',
                'authors', 'editors', 'title', 'year', 'month', 'journal',
                'volume', 'number', 'chapter', 'pages', 'ee', 'isbn', 'url',
                'booktitle', 'crossref', 'publisher', 'school', 'citations',
                'series'])

    def load_data(self):
        resp = requests.get(DBLP_PUBLICATION_URL.format(key=self.key))
        xml = resp.content
        self.xml = xml
        root = etree.fromstring(xml)
        publication = first_or_none(root.xpath('/dblp/*[1]'))
        if publication is None:
            raise ValueError
        data = {
            'type':publication.tag,
            'sub_type':publication.attrib.get('publtype', None),
            'mdate':publication.attrib.get('mdate', None),
            'authors':publication.xpath('author/text()'),
            'editors':publication.xpath('editor/text()'),
            'title':first_or_none(publication.xpath('title/text()')),
            'year':int(first_or_none(publication.xpath('year/text()'))),
            'month':first_or_none(publication.xpath('month/text()')),
            'journal':first_or_none(publication.xpath('journal/text()')),
            'volume':first_or_none(publication.xpath('volume/text()')),
            'number':first_or_none(publication.xpath('number/text()')),
            'chapter':first_or_none(publication.xpath('chapter/text()')),
            'pages':first_or_none(publication.xpath('pages/text()')),
            'ee':first_or_none(publication.xpath('ee/text()')),
            'isbn':first_or_none(publication.xpath('isbn/text()')),
            'url':first_or_none(publication.xpath('url/text()')),
            'booktitle':first_or_none(publication.xpath('booktitle/text()')),
            'crossref':first_or_none(publication.xpath('crossref/text()')),
            'publisher':first_or_none(publication.xpath('publisher/text()')),
            'school':first_or_none(publication.xpath('school/text()')),
            'citations':[Citation(c.text, c.attrib.get('label',None))
                         for c in publication.xpath('cite') if c.text != '...'],
            'series':first_or_none(Series(s.text, s.attrib.get('href', None))
                      for s in publication.xpath('series'))
        }

        self.data = data

def search(author_str):
    resp = requests.get(DBLP_AUTHOR_SEARCH_URL, params={'xauthor':author_str})
    #TODO error handling
    root = etree.fromstring(resp.content)
    return [Author(urlpt) for urlpt in root.xpath('/authors/author/@urlpt')]

In [67]:
authors = search('Daniel Rodriguez-Cardenas')

In [68]:
daniel = authors[0]

In [69]:
print(len(daniel.publications))

15


In [80]:
from collections import OrderedDict

entries = []


for entry in daniel.publications:
        out = dict()
        out['title']= str(entry.title)
        out['authors']= ', '.join(entry.authors)
        out['year']= entry.year
        out['journal']= str(entry.journal)
        out['conference_short']= str(entry.booktitle)
        out['pdf']= str(entry.ee) if entry.ee else ''
        
        entries.append(out)


In [81]:
entries

[{'title': 'SnipGen: A Mining Repository Framework for Evaluating LLMs for Code.',
  'authors': 'Daniel Rodríguez-Cárdenas, Alejandro Velasco, Denys Poshyvanyk',
  'year': 2025,
  'journal': 'CoRR',
  'conference_short': 'None',
  'pdf': 'https://doi.org/10.48550/arXiv.2502.07046'},
 {'title': 'Testing Practices, Challenges, and Developer Perspectives in Open-Source IoT Platforms.',
  'authors': 'Daniel Rodríguez-Cárdenas, Safwat Ali Khan, Prianka Mandal, Adwait Nadkarni, Kevin Moran, Denys Poshyvanyk',
  'year': 2025,
  'journal': 'CoRR',
  'conference_short': 'None',
  'pdf': 'https://doi.org/10.48550/arXiv.2502.07257'},
 {'title': 'On Explaining (Large) Language Models For Code Using Global Code-Based Explanations.',
  'authors': 'David N. Palacio, Dipin Khati, Daniel Rodríguez-Cárdenas, Alejandro Velasco, Denys Poshyvanyk',
  'year': 2025,
  'journal': 'CoRR',
  'conference_short': 'None',
  'pdf': 'https://doi.org/10.48550/arXiv.2503.16771'},
 {'title': 'Which Syntactic Capabiliti

In [84]:
generate_yaml(entries)