In [None]:
import urllib.request
import re
import time
import os
import shutil
import pprint

import titlecase
import bibtexparser

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

In [None]:
# If you haven't installed chromedriver already
# Optional setup cell: only needed when no chromedriver is available for Selenium.
import chromedriver_autoinstaller
# The result is bound to `_p` so the notebook does not echo it as cell output
# (presumably the path of the installed driver -- confirm against the package docs).
_p = chromedriver_autoinstaller.install()

Steps:
* Download the current list of bibliography entries from DBLP.
* Figure out the new entries and deleted entries since the last update.
* Add new entries to the website and remove deleted entries automatically using Selenium.

Notes:
* Necessary: A correct map from the DBLP IDs to the DSG website Node IDs. This is maintained in the `dblp_to_web_mappings.txt` file.
* When needed: Selenium can also be used to mass update existing entries. Adapt from the code for adding new entries.

In [None]:
# List of DSG faculty for whom publications should be downloaded.
# Each entry is a 4-tuple that the code below reads by position:
#   [0] filename stem -- the author's bibliography lives at `<dir>/<filename>.bib`
#   [1] DBLP pid      -- substituted into `https://dblp.uni-trier.de/pid/<pid>.bib`
#   [2] expected author names (presumably the spellings that appear on DBLP -- confirm)
#   [3] normalized author name (presumably the form shown on the website -- confirm)
authors_list = [
     # filename, dblp url id, expected author names, normalized author name
     ('CharlieClarke', '96/3666', ['Charles L. A. Clarke'], 'Charles Clarke'),
     ('GordonVCormack', 'c/GVCormack', ['Gordon V. Cormack'], 'Gordon Cormack'),
     ('KhuzaimaDaudjee', 'd/KhuzaimaDaudjee', ['Khuzaima Daudjee', 'Khuzaima S. Daudjee'], 'Khuzaima Daudjee'),
     ('LukaszGolab', '42/3296', ['Lukasz Golab'], 'Lukasz Golab'),
     ('MauraGrossman', '122/5875', ['Maura R. Grossman'], 'Maura Grossman'),
     ('XiHe', '28/949-1', ['Xi He'], 'Xi He'),
     ('IhabFIlyas', 'i/IhabFIlyas', ['Ihab F. Ilyas', 'Ihab Francis Ilyas'], 'Ihab Ilyas'),
     ('JimmyLin', '00/7739', ['Jimmy J. Lin', 'Jimmy Lin'], 'Jimmy Lin'),
     ('TamerOzsu', 'o/MTamerOzsu', ['M. Tamer Özsu'], 'Tamer Özsu'),
     ('KennethSalem', 's/KennethSalem', ['Kenneth Salem', 'Ken Salem'], 'Ken Salem'),
     ('SemihSalihoglu', '55/6560', ['Semih Salihoglu'], 'Semih Salihoglu'),
     ('MarkDSmucker', '07/801', ['Mark D. Smucker'], 'Mark Smucker'),
     ('DavidToman', 't/DavidToman', ['David Toman'], 'David Toman'),
     ('FrankTompa', 't/FrankWmTompa', ['Frank Wm. Tompa'], 'Frank Tompa'),
     ('GrantWeddell', '81/5447', ['Grant E. Weddell'], 'Grant Weddell')
]
# Individual entries to avoid.
# `combine()` drops every entry whose BibTeX ID appears in this list.
# The reason each ID is excluded is not recorded here.
bibid_blacklist = [
    'DBLP:journals/sigmod/Ozsu04',
    'DBLP:journals/sigmod/Ozsu04a',
    'DBLP:journals/sigmod/Ozsu04b',
    'DBLP:journals/sigmod/Ozsu03',
    'DBLP:journals/sigmod/Ozsu03a',
    'DBLP:journals/sigmod/Ozsu02',
    'DBLP:journals/sigmod/Ozsu02a',
    'DBLP:journals/sigmod/Ozsu02b',
    'DBLP:journals/sigmod/Ozsu01',
    'DBLP:journals/sigmod/Ozsu01a',
    'DBLP:books/mc/19/Ozsu19',
    'DBLP:journals/anthology/SnodgrassO01',
    'DBLP:conf/ntcir/YilmazC16',
    'DBLP:journals/vldb/Ozsu05',
    'DBLP:journals/dpd/BertinoO94',
    'DBLP:journals/pvldb/BonczS16',
]
# Bibliography types to avoid.
# `combine()` drops every entry whose BibTeX entry type (`ENTRYTYPE`) appears in this list.
btype_blacklist = [
    'proceedings'
]
# Fields we are interested in. `common_fields` is just for reference but not used directly. Those fields are parsed individually below.
common_fields = ['title', 'author', 'year', 'doi', 'url']
# Additional fields of interest, keyed by BibTeX entry type.
# Each entry type must appear exactly once: Python silently keeps only the last
# value of a repeated key in a dict literal, so a duplicate would hide edits.
extra_fields = {
    'article': ['journal', 'volume', 'number', 'pages'],
    'inproceedings': ['booktitle'],
    'incollection': ['booktitle', 'publisher'],
    'book': ['publisher'],
    'phdthesis': ['school'],
}

def get_remote_id(bibid):
    """Derive the remote (venue-level) id from a DBLP bibliography id.

    The id is split on "/", every "DBLP:" occurrence is removed from the first
    segment, and the last segment is dropped, e.g.
    ``"DBLP:journals/sigmod/Ozsu04"`` -> ``"journals/sigmod"``.
    An id without any "/" yields the empty string.
    """
    head, *rest = bibid.split("/")
    segments = [head.replace("DBLP:", ""), *rest]
    # Everything except the trailing, entry-specific segment.
    return "/".join(segments[:-1])

# Working data locations; all paths are relative to the current working directory.
data_dir = "data"
# Per-author .bib files from the previous update; the download cell asserts this directory exists.
authors_dirs_previous = f"{data_dir}/authors"
# Per-author .bib files fetched in the current run; created by the download cell if missing.
authors_dirs_new = f"{data_dir}/authors_new"
# File holding the DBLP ID -> website node ID mapping described in the notes above.
mappings_file = f"{data_dir}/dblp_to_web_mappings.txt"

def get_parsed_files(dir):
    """Parse the BibTeX file of every author in ``authors_list``.

    Args:
        dir: Directory containing one ``<filename>.bib`` per ``authors_list``
            entry. (The name shadows the builtin; it is kept so existing
            callers keep working.)

    Returns:
        A list of ``(filename, bibliography)`` tuples in ``authors_list`` order,
        where ``bibliography`` is the object returned by ``bibtexparser.loads``.

    Raises:
        OSError: if an author's file is missing or cannot be read.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    print(f"Working version = {dir}")
    parsed = []
    for author in authors_list:
        # The download cell stores the raw response bytes, so decode explicitly
        # instead of relying on the platform's locale-dependent default encoding.
        with open(f"{dir}/{author[0]}.bib", encoding="utf-8") as f:
            content = f.read()
        # The file is already closed here; parsing only needs the text.
        # One parser instance per file, as in the original code.
        parser = bibtexparser.bparser.BibTexParser()
        parser.customization = bibtexparser.customization.convert_to_unicode # Convert latex code to unicode. Not perfect.
        bibliography = bibtexparser.loads(content, parser=parser)
        parsed.append((author[0], bibliography))
    return parsed

def combine(dir):
    """Merge all authors' bibliographies under ``dir`` into one mapping.

    Entries are keyed by their BibTeX ``ID``; the first occurrence of an ID
    wins. Entries whose ID is in ``bibid_blacklist`` or whose ``ENTRYTYPE`` is
    in ``btype_blacklist`` are left out. Per-author counts, the overall totals
    and a per-type breakdown are printed along the way.

    Returns:
        dict mapping BibTeX ID -> entry dict.
    """
    total_seen = 0
    all_entries = {}
    for author, collection in get_parsed_files(dir):
        entries = collection.entries
        print("len(%s) = %s" % (author, len(entries)))
        total_seen += len(entries)
        for entry in entries:
            bibid = entry['ID']
            # Skip duplicates and individually excluded ids first ...
            if bibid in all_entries or bibid in bibid_blacklist:
                continue
            # ... then whole entry types we are not interested in.
            if entry['ENTRYTYPE'] in btype_blacklist:
                continue
            all_entries[bibid] = entry
    print("total entries = %s, deduplicated and filtered entries = %s" % (total_seen, len(all_entries)))

    # Group the kept entries by bibliography type to report a breakdown.
    grouped = {}
    for entry in all_entries.values():
        btype = entry['ENTRYTYPE']
        if btype not in grouped:
            grouped[btype] = []
        grouped[btype].append(entry)
    for btype, members in grouped.items():
        print(btype, len(members))
    print("total =", sum(len(members) for members in grouped.values()))

    return all_entries

In [None]:
# Download bib files for individual authors and save to disk.
# The previous snapshot must exist: it is the baseline the new download is compared against.
assert os.path.exists(authors_dirs_previous), f"'{authors_dirs_previous}' missing"
os.makedirs(authors_dirs_new, exist_ok=True)
print(f"Downloading to '{authors_dirs_new}':")
for author in authors_list:
    print(">",author)
    req = urllib.request.Request("https://dblp.uni-trier.de/pid/%s.bib?param=1" % author[1])
    # Use the response as a context manager so the connection is closed
    # even if reading fails part-way through.
    with urllib.request.urlopen(req) as response:
        content = response.read()
    # Store the bytes exactly as received; decoding happens when the file is parsed.
    with open("%s/%s.bib" % (authors_dirs_new,author[0]),"wb") as f:
        f.write(content)

In [None]:
# Combine authors and deduplicate entries for latest and previous set of downloads.
# `combine` prints its own per-author and per-type counts; the bare `print()` only
# separates the report for the new snapshot from the one for the previous snapshot.
all_entries = combine(authors_dirs_new)
print()
previous_all_entries = combine(authors_dirs_previous)

In [None]:
# Diff the two snapshots by BibTeX id:
#   new_ids     -- entries present now but absent from the previous snapshot
#   deleted_ids -- entries present previously but absent now
new_ids = {
    bid: entry
    for bid, entry in all_entries.items()
    if bid not in previous_all_entries
}
deleted_ids = {
    bid: entry
    for bid, entry in previous_all_entries.items()
    if bid not in all_entries
}
print("New =", len(new_ids),"Deleted = ", len(deleted_ids))
# Sanity check: the net change must equal the difference in snapshot sizes.
assert len(new_ids) - len(deleted_ids) == len(all_entries) - len(previous_all_entries)

# Break the new entries down by bibliography type.
by_type = {}
for entry in new_ids.values():
    by_type.setdefault(entry['ENTRYTYPE'], []).append(entry)
for btype, members in by_type.items():
    print(btype, len(members))

In [None]:
# Full names for conferences and journals. Edits should be done in the `edited_strings` variable below.
# Keys look like DBLP key prefixes ("conf/sigir", "journals/tois", ...) -- presumably the
# value returned by `get_remote_id()`; confirm against the code that performs the lookup.
# Values are deliberately left untouched here, including HTML entities ("&amp;", "&uuml;"),
# stray whitespace, typos and the "************ failed" placeholder (presumably marking
# venues whose name could not be retrieved -- confirm). Corrections belong in `edited_strings`.
raw_strings = {
    "journals/tois": "ACM Transactions on Information Systems (TOIS)",
    "conf/chiir": "Conference on Human Information Interaction and Retrieval (CHIIR)",
    "conf/sigir": "Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR)",
    "conf/cikm": "International Conference on Information and Knowledge Management (CIKM)",
    "conf/ictir": "International Conference on the Theory of Information Retrieval (ICTIR)",
    "conf/trec": "Text Retrieval Conference (TREC)",
    "journals/corr": "Computing Research Repository (CoRR)",
    "conf/ntcir": "NTCIR Conference on Evaluation of Information Access Technologies (NTCIR)",
    "journals/csur": "ACM Computing Surveys",
    "journals/sigir": "SIGIR Forum",
    "journals/tkde": "IEEE Transactions on Knowledge and Data Engineering",
    "conf/wsdm": "Web Search and Data Mining (WSDM)",
    "reference/db": "Encyclopedia of Database Systems",
    "conf/chi": "ACM Conference on Human Factors in Computing Systems (CHI)",
    "conf/icccrea": "International Conference on Computational Creativity (ICCC)",
    "conf/www": "The Web Conference (WWW)",
    "journals/internet": "IEEE Internet Computing",
    "journals/ir": "Information Retrieval Journal",
    "conf/adcs": "Australasian Document Computing Symposium (ADCS)",
    "conf/acl": "Annual Meeting of the Association for Computational Linguistics (ACL)",
    "conf/ecir": "European Conference on Information Retrieval (ECIR)",
    "conf/russir": "Russian Summer School on Information Retrieval (RuSSIR)",
    "journals/tist": "ACM Transactions on Intelligent Systems and Technology (TIST)",
    "conf/iiix": "International Conference on Information Interaction in Context (IIiX)",
    "journals/kais": "Knowledge and Information Systems (KAIS)",
    "journals/spe": "Software - Practice and Experience (SPE)",
    "conf/icdm": "IEEE International Conference on Data Mining (ICDM)",
    "conf/eurohcir": "European Workshop on Human-Computer Interaction and Information Retrieval (EuroHCIR)",
    "conf/hcir": "Symposium on Human-Computer Interaction and Information Retrieval (HCIR)",
    "conf/ceas": "International Conference on Email and Anti-Spam (CEAS)",
    "books/daglib": "Books from the Dagstuhl Library",
    "conf/graphicsinterface": "Graphics Interface",
    "conf/riao": "Open research Areas in Information Retrieval (OAIR)",
    "conf/inex": "INitiative for the Evaluation of XML Retrieval (INEX)",
    "conf/webi": "IEEE/WIC/ACM International Conference on Web Intelligence (WI)",
    "journals/usenix-login": "login - The Usenix Magazine",
    "conf/clef": "Conference and Labs of the Evaluation Forum (CLEF)",
    "conf/fast": "USENIX Conference on File and Storage Technologies (FAST)",
    "conf/uist": "ACM Symposium on User Interface Software and Technology (UIST)",
    "journals/entcs": "Electronic Notes in Theoretical Computer Science (ENTCS)",
    "conf/coling": "International Conference on Computational Linguistics (COLING)",
    "conf/iwpc": "IEEE International Conference on Program Comprehension (ICPC)",
    "conf/naacl": "North American Chapter of the Association for Computational Linguistics (NAACL)",
    "conf/icsm": "IEEE International Conference on Software Maintenance and Evolution (ICSME)",
    "journals/ipm": "Information Processing and Management",
    "conf/apsec": "Asia-Pacific Software Engineering Conference (APSEC)",
    "conf/wflp": "International Workshop on Functional and (Constraint) Logic Programming (WFLP)",
    "conf/cascon": "Conference of the Centre for Advanced Studies on Collaborative Research (CASCON)",
    "journals/toplas": "ACM Transactions on Programming Languages and Systems (TOPLAS)",
    "journals/cj": "The Computer Journal",
    "journals/jdiq": "Journal of Data and Information Quality",
    "conf/doceng": "ACM Symposium on Document Engineering (DocEng)",
    "conf/kdd": "ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD)",
    "journals/cacm": "Communications of the ACM",
    "conf/ppdp": "ACM-SIGPLAN International Conference on Principles and Practice of Declarative Programming (PPDP)",
    "journals/ftir": "Foundations and Trends in Information Retrieval",
    "journals/iie": "Informatics in Education",
    "journals/jmlr": "Journal of Machine Learning Research (JMLR)",
    "conf/issep": "International Conference on Informatics in Secondary Schools (ISSEP)",
    "journals/ipl": "Information Processing Letters",
    "conf/cscw": "Conference on Computer Supported Cooperative Work (CSCW)",
    "journals/acta": "Acta Informatica",
    "conf/podc": "ACM SIGACT-SIGOPS Symposium on Principles of Distributed Computing (PODC)",
    "journals/cl": "Computer Languages, Systems &amp; Structures",
    "conf/coodbse": "Colloquium on Object Orientation in Databases and Software Engineering (COODBSE)",
    "conf/dcc": "Data Compression Conference (DCC)",
    "conf/icpp": "International Conference on Parallel Processing (ICPP)",
    "conf/pldi": "ACM-SIGPLAN Symposium on Programming Language Design and Implementation (PLDI)",
    "journals/sigplan": "ACM SIGPLAN Notices",
    "conf/asplos": "International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)",
    "conf/sigmod": "ACM SIGMOD Conference (SIGMOD)",
    "conf/ssdbm": "International Conference on Statistical and Scientific Database Management (SSDBM)",
    "journals/dpd": "Distributed and Parallel Databases",
    "journals/pvldb": "Proceedings of the VLDB Endowment",
    "conf/debs": "Distributed Event-Based Systems (DEBS)",
    "conf/icde": "IEEE International Conference on Data Engineering (ICDE)",
    "journals/vldb": "The VLDB Journal",
    "conf/conext": "Conference on Emerging Network Experiment and Technology (CoNEXT)",
    "conf/sigcse": "Technical Symposium on Computer Science Education (SIGCSE)",
    "conf/edbt": "International Conference on Extending Database Technology (EDBT)",
    "conf/icdcs": "IEEE International Conference on Distributed Computing Systems (ICDCS)",
    "conf/middleware": "International Middleware Conference (Middleware)",
    "conf/edcc": "European Dependable Computing Conference (EDCC)",
    "conf/nca": "IEEE International Symposium on Network Computing and Applications (NCA)",
    "journals/jpdc": "Journal of Parallel and Distributed Computing",
    "journals/ppna": "Peer-to-Peer Networking and Applications",
    "conf/cloud": "ACM Symposium on Cloud Computing (SoCC)",
    "conf/vldb": "Very Large Data Bases Conference (VLDB)",
    "conf/semweb": "International Semantic Web Conference (ISWC)",
    "conf/spaa": "ACM Symposium on Parallelism in Algorithms and Architectures (SPAA)",
    "conf/icdcsw": "International Conference on Distributed Computing\nSystems (ICDCS) - Workshops",
    "conf/ieeehpcs": "International Conference on High Performance Computing &amp; Simulation (HPCS)",
    "conf/ht": "ACM Conference on Hypertext and Social Media (HT)",
    "conf/ipccc": "IEEE International Performance, Computing, and Communications Conference  (IPCCC)",
    "conf/otm": "OnTheMove Federated Conferences &amp; Workshops (OTM)",
    "conf/p2p": "IEEE International Conference on Peer-to-Peer Computing",
    "conf/icpads": "International Conference on Parallel and Distributed Systems (ICPADS)",
    "journals/re": "Requirements Engineering",
    "conf/dolap": "International Workshop on Data Warehousing and OLAP (DOLAP)",
    "journals/computing": "Computing",
    "journals/ijnm": "International Journal of Network Management",
    "conf/ai": "Canadian Conference on Artificial Intelligence (AI)",
    "conf/edm": "Educational Data Mining (EDM)",
    "conf/europar": "European Conference on Parallel Processing (Euro-Par)",
    "conf/icbc2": "IEEE International Conference on Blockchain and Cryptocurrency (ICBC)",
    "conf/amw": "Alberto Mendelzon International Workshop on Foundations of Data Management (AMW)",
    "conf/eenergy": "Energy-Efficient Computing and Networking (e-Energy)",
    "conf/wassa": "Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis (WASSA)",
    "reference/bdt": "Encyclopedia of Big Data Technologies",
    "series/synthesis": "Synthesis",
    "journals/debu": "IEEE Data Engineering Bulletin",
    "journals/tits": "IEEE Transactions on Intelligent Transportation Systems",
    "journals/is": "Information Systems",
    "journals/tods": "ACM Transactions on Database Systems (TODS)",
    "conf/bigdataconf": "IEEE International Conference on Big Data (IEEE BigData)",
    "journals/cn": "Computer Networks",
    "conf/iwcmc": "International Conference on Wireless Communications and Mobile Computing (IWCMC)",
    "books/sp/13": "************ failed",
    "conf/pam": "Passive and Active Network Measurement Conference (PAM)",
    "journals/mst": "Theory of Computing Systems",
    "conf/cidr": "Conference on Innovative Data Systems Research (CIDR)",
    "phd/basesearch": "************ failed",
    "conf/edbtw": "International Conference on Extending Database Technology (EDBT) - Workshops",
    "journals/sigmod": "SIGMOD Record",
    "conf/imc": "ACM/SIGCOMM Internet Measurement Conference (IMC)",
    "conf/mie": "Medical Informatics Europe (MIE)",
    "conf/isit": "International Symposium on Information Theory (ISIT)",
    "journals/popets": "Proceedings on Privacy Enhancing Technologies (PoPETs)",
    "reference/sp": "Springer Handbooks",
    "conf/ccs": "Conference on Computer and Communications Security (CCS)",
    "conf/mlsys": "Conference on Machine Learning and Systems (MLSys)",
    "books/acm": "************ failed",
    "conf/aistats": "International Conference on Artificial Intelligence and Statistics (AISTATS)",
    "conf/btw": "Datenbanksysteme f&uuml;r Business, Technologie und Web (BTW)",
    "conf/icdt": "International Conference on Database Theory (ICDT)",
    "conf/uai": "Conference on Uncertainty in Artificial Intelligence (UAI)",
    "books/mc/19": "************ failed",
    "journals/ml": "Machine Learning",
    "journals/ftdb": "Foundations and Trends in Databases",
    "journals/www": "World Wide Web (WWW)",
    "conf/seco": "SeCO Workshops (SeCO)",
    "journals/sadm": "Statistical Analysis and Data Mining",
    "conf/dagstuhl": "Dagstuhl Publications",
    "journals/mms": "Multimedia Systems",
    "conf/icac": "IEEE International Conference on Autonomic Computing (ICAC)",
    "conf/mmdb": "ACM International Workshop on Multimedia Databases (MMDB)",
    "conf/mis": "Workshop on Multimedia Information Systems (MIS)",
    "journals/jiis": "Journal of Intelligent Information Systems (JIIS)",
    "journals/dhq": "Digital Humanities Quarterly",
    "journals/envsoft": "Environmental Modelling and Software",
    "conf/aaai": "AAAI Conference on Artificial Intelligence (AAAI)",
    "conf/eacl": "Conference of the European Chapter of the Association for Computational Linguistics (EACL)",
    "journals/oclc": "Digital Library Perspectives",
    "journals/scientometrics": "Scientometrics",
    "conf/birws": "International Workshop on Bibliometric-enhanced Information Retrieval (BIR)",
    "conf/emnlp": "Conference on Empirical Methods in Natural Language Processing (EMNLP)",
    "conf/icml": "International Conference on Machine Learning (ICML)",
    "conf/jcdl": "ACM/IEEE Joint Conference on Digital Libraries (JCDL)",
    "conf/rep4nlp": "Workshop on Representation Learning for NLP (RepL4NLP)",
    "conf/acl-deeplo": "Workshop on Deep Learning Approaches for Low-Resource Natural Language Processing (DeepLo)",
    "conf/amia": "American Medical Informatics Association Annual Symposium (AMIA)",
    "conf/iui": "International Conference on Intelligent User Interfaces (IUI)",
    "conf/IEEEcloud": "IEEE International Conference on Cloud Computing (CLOUD)",
    "conf/desires": "Biennial Conference on Design of Experimental Search &amp; Information Retrieval Systems (DESIRES)",
    "conf/hotcloud": "USENIX Workshop on Hot Topics in Cloud Computing (HotCloud)",
    "conf/icassp": "IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)",
    "journals/jocch": "ACM Journal on Computing and Cultural Heritage",
    "conf/grades": "International Workshop on Graph Data Management Experiences and Systems (GRADES)",
    "conf/isbi": "IEEE International Symposium on Biomedical Imaging (ISBI)",
    "books/sp/17": "************ failed",
    "conf/ccnc": "Consumer Communications and Networking Conference (CCNC)",
    "conf/dihu": "Digital Humanities Conference (DH)",
    "conf/mm": "ACM International Conference on Multimedia (MM)",
    "conf/sac": "ACM Symposium on Applied Computing (SAC)",
    "conf/semeval": "International Workshop on Semantic Evaluation (SemEval )",
    "journals/tacl": "Transactions of the Association for Computational Linguistics",
    "conf/medinfo": "World Congress on Medical and Health (Medical) Informatics (MedInfo)",
    "conf/dexaw": "DEXA Workshops",
    "conf/gis": "ACM SIGSPATIAL International Workshop on Advances in Geographic Information Systems (GIS)",
    "conf/ieeevast": "IEEE Conference on Visual Analytics Science and Technology (VAST)",
    "conf/lats": "ACM Conference on Learning @ Scale (L@S)",
    "journals/bigdata": "Big Data",
    "conf/icwsm": "International Conference on Web and Social Media (ICWSM)",
    "conf/wmt": "Conference on Machine Translation (WMT)",
    "journals/sigkdd": "SIGKDD Explorations",
    "conf/iconference": "iConference",
    "conf/cloudcom": "International Conference on Cloud Computing (CloudCom)",
    "conf/mlg": "Mining and Learning with Graphs (MLG)",
    "journals/bmcbi": "BMC Bioinformatics",
    "journals/firstmonday": "First Monday",
    "journals/jasis": "Journal of the Association for Information Science and Technology (JASIST)",
    "journals/lre": "Language Resources and Evaluation (LRE)",
    "journals/mta": "Multimedia Tools and Applications",
    "conf/tac": "Text Analysis Conference (TAC)",
    "journals/coling": "Computational Linguistics",
    "conf/latech": "Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH)",
    "conf/asist": "ASIS&amp;T Annual Meeting (ASIST)",
    "conf/bionlp": "Workshop on Biomedical Natural Language Processing (BioNLP)",
    "conf/eamt": "European Association for Machine Translation Conferences/Workshops (EAMT)",
    "phd/ndltd": "************ failed",
    "conf/ndqa": "New Directions in Question Answering",
    "conf/acl-iwp": "International Workshop on Paraphrasing (IWP)",
    "conf/dgo": "(Inter)National Conference on Digital Government Research (DG.O)",
    "conf/interact": "IFIP TC13 International Conference on Human-Computer Interaction (INTERACT)",
    "conf/acl-xml": "Workshop on NLP and XML",
    "conf/coopis": "International Conference on Cooperative Information Systems (CoopIS)",
    "conf/lrec": "International Conference on Language Resources and Evaluation (LREC)",
    "conf/nldb": "International Conference on Applications of Natural Language to Data Bases (NLDB)",
    "journals/ijst": "International Journal of Speech Technology",
    "conf/icmcs": "IEEE International Conference on Multimedia and Expo (ICME)",
    "journals/tlsdkcs": "Transactions on Large-Scale Data- and Knowledge-Centered Systems",
    "books/sp": "************ failed",
    "journals/tpds": "IEEE Transactions on Parallel and Distributed Systems (TPDS)",
    "conf/dsc": "International Conference on Data Science in Cyberspace (DSC)",
    "conf/ppopp": "ACM SIGPLAN Symposium on Principles &amp; Practice of Parallel Programming (PPoPP)",
    "conf/dasfaa": "International Conference on Database Systems for Advanced Applications (DASFAA)",
    "journals/dase": "Data Science and Engineering",
    "journals/fcsc": "Frontiers of Computer Science",
    "conf/ideas": "International Database Engineering and Applications Symposium (IDEAS)",
    "books/crc/chb": "************ failed",
    "conf/wbdb": "Workshop on Big Data Benchmarking (WBDB)",
    "conf/birthday": "Festschrifts: Birthday, In Memory of ...",
    "conf/xsym": "International XML Database Symposium (XSym)",
    "conf/waim": "Interational Conference on Web-Age Information Management (WAIM)",
    "conf/recsys": "ACM Conference on Recommender Systems (RecSys)",
    "conf/adbis": "Symposium on Advances in Databases and Information Systems (ADBIS)",
    "journals/ijdsn": "International Journal of Distributed Sensor Networks",
    "conf/mmm": "Conference on Multimedia Modeling (MMM)",
    "conf/wise": "International Conference on Web Information Systems Engineering (WISE)",
    "conf/cvdb": "Computer Vision meets Databases (CVDB)",
    "conf/mir": "International Conference on Multimedia Retrieval (ICMR)",
    "journals/ijig": "International Journal of Image and Graphics",
    "conf/bda": "Journ&eacute;es Bases de Donn&eacute;es Avanc&eacute;es  (BDA)",
    "conf/icann": "International Conference on Artificial Neural Networks and Machine Learning (ICANN)",
    "conf/iscis": "International Symposium on Computer and Information Sciences (ISCIS)",
    "conf/spieSR": "Storage and Retrieval Methods and Applications for Multimedia",
    "conf/diweb": "International Workshop on Data Integration over the Web (DIWeb)",
    "conf/eextt": "Efficiency and Effectiveness of XML Tools and Techniques (EEXTT)",
    "conf/icip": "IEEE International Conference on Image Processing (ICIP)",
    "reference/ap": "Academic Press Reference",
    "conf/caise": "International Conference on Advanced Information Systems Engineering (CAiSE)",
    "journals/anthology": "************ failed",
    "conf/ride": "International Workshop on Research Issues in Data Engineering (RIDE)",
    "conf/wiiw": "Workshop on Information Integration on the Web (WIIW)",
    "books/mk/dittrichG01": "************ failed",
    "journals/dr": "ACM SIGMOD Digital Review",
    "books/ph": "************ failed",
    "conf/ds": "IFIP Working Conference on Database Semantics (DS)",
    "journals/dke": "Data &amp; Knowledge Engineering",
    "conf/adl": "Advances in Digital Libraries (ADL)",
    "conf/tools": "International Conference on Software Technology: Methods and Tools (TOOLS)",
    "conf/vdb": "Visual Database Systems (VDB)",
    "conf/iadt": "Issues and Applications of Database Technology (IADT)",
    "journals/ibmsj": "IBM Systems Journal",
    "conf/er": "International Conference on Conceptual Modeling (ER)",
    "books/crc/tucker97": "************ failed",
    "conf/iwmmdbms": "International Workshop on Multi-Media Database Management Systems (IW-MMDBMS)",
    "conf/icccn": "International Conference on Computer Communications and Networks (ICCCN)",
    "books/acm/kim95": "************ failed",
    "conf/nato": "NATO Advanced Study Institute (NATO ASI)",
    "conf/oodbs": "International Symposium on Objects and Databases (SODB)",
    "conf/icci": "International Conference on Computing and Information (ICCI)",
    "books/mk/elmagarmid92": "************ failed",
    "journals/computer": "IEEE Computer",
    "conf/dood": "International Conference on Deductive and Object-Oriented Databases (DOOD)",
    "tr/gte": "************ failed",
    "conf/compsac": "Annual International Computer Software and Applications Conference (COMPSAC)",
    "conf/oopsla": "ACM SIGPLAN International Conference on Systems, Programming, Languages and Applications: Software for Humanity (SPLASH)",
    "journals/tse": "IEEE Transactions on Software Engineering (TSE)",
    "conf/damon": "International Workshop on Data Management on New Hardware (DaMoN)",
    "conf/hotstorage": "USENIX Workshop on Hot Topics in Storage and File Systems (HotStorage)",
    "conf/srds": "IEEE International Symposium on Reliable Distributed Systems (SRDS)",
    "conf/dnis": "Databases in Networked Information Systems (DNIS)",
    "conf/eurosys": "European Conference on Computer Systems (EuroSys)",
    "journals/jcss": "Journal of Computer and System Sciences (JCSS)",
    "journals/tc": "IEEE Transactions on Computers",
    "journals/tocs": "ACM Transactions on Computer Systems (TOCS)",
    "conf/usenix": "USENIX Annual Technical Conference (USENIX ATC)",
    "conf/compcon": "IEEE Computer Society International Conference (COMPCON)",
    "conf/pods": "ACM SIGACT-SIGMOD-SIGART Symposium on Principles of Database Systems (PODS)",
    "conf/db-workshops": "Data Base Workshops",
    "conf/hpts": "High Performance Transaction Systems Workshop (HPTS)",
    "conf/complexnetworks": "International Workshop on Complex Networks &amp; Their Applications (COMPLEX NETWORKS)",
    "conf/cgo": "IEEE/ACM International Symposium on Code Generation and Optimization (CGO)",
    "books/others/11": "************ failed",
    "conf/eScience": "IEEE International Conference on e-Science (E-Science)",
    "conf/ecal": "European Conference on Artificial Life (ECAL)",
    "journals/ki": "K&uuml;nstliche Intelligenz (KI)",
    "conf/dlog": "International Workshop on Description Logics (DL)",
    "conf/ausai": "Australian Joint Conference on Artificial Intelligence (AUS-AI)",
    "conf/pricai": "Pacific Rim International Conference on Artificial Intelligence (PRICAI)",
    "conf/ekaw": "International Conference Knowledge Engineering and Knowledge Management (EKAW)",
    "conf/kr": "International Conference on Principles of Knowledge Representation and Reasoning (KR)",
    "journals/jlp": "Journal of Logic Programming",
    "conf/ijcai": "International Joint Conference on Artificial Intelligence (IJCAI)",
    "conf/lpar": "International Conference on Logic Programming and Automated Reasoning (LPAR)",
    "journals/ws": "Journal of Web Semantics",
    "conf/tableaux": "International Conference on Theorem Proving with Analytic Tableaux and Related Methods (TABLEAUX)",
    "journals/jar": "Journal of Automated Reasoning",
    "conf/sebd": "Sistemi Evoluti per Basi di Dati (SEBD)",
    "conf/dsd": "Euromicro Symposium on Digital Systems Design (DSD)",
    "conf/owled": "W3C Web Ontology Language (OWL) Experiences and Directions Workshop (OWLED)",
    "journals/iandc": "Information and Computation",
    "conf/cade": "Conference on Automated Deduction (CADE)",
    "conf/iclp": "International Conference on Logic Programming (ICLP)",
    "conf/time": "International Symposium/Workshop on Temporal Representation and Reasoning (TIME)",
    "conf/stdbm": "International Workshop on Spatio-Temporal Database Management (STDBM)",
    "journals/tcs": "Theoretical Computer Science",
    "reference/fai": "Foundations of Artificial Intelligence",
    "conf/ssd": "International Symposium on Spatial and Temporal Databases (SSTD)",
    "conf/dbtel": "Databases in Telecommunications (DBTel)",
    "conf/cl": "International Conference on Computational Logic (CL)",
    "conf/cdb": "International Symposium on the Applications of Constraint Databases (CDB)",
    "journals/constraints": "Constraints - An International Journal",
    "conf/slp": "Joint International Conference and Symposium on Logic Programming (JICSLP)",
    "journals/jis": "Journal of Information Science",
    "conf/sacmat": "ACM Symposium on Access Control Models and Technologies (SACMAT)",
    "conf/mkm": "International Conference on Intelligent Computer Mathematics (CICM)",
    "conf/enlg": "European Workshop on Natural Language Generation (ENLG)",
    "journals/lalc": "Digital Scholarship in the Humanities (DSH)",
    "conf/policy": "IEEE International Symposium on Policies for Distributed Systems and Networks (POLICY)",
    "conf/webdb": "International Workshop on the Web and Databases (WebDB)",
    "conf/re": "IEEE International Requirements Engineering Conference (RE)",
    "conf/sdmw": "Secure Data Management (VLDB Workshop) (SDM)",
    "journals/csse": "Computer Systems: Science &amp; Engineering",
    "journals/tapos": "TAPOS - Theory and Practice of Object Systems",
    "conf/w3c": "W3C Workshops (W3C)",
    "journals/csi": "Computer Standards &amp; Interfaces",
    "conf/adb": "Applications of Databases (ADB)",
    "journals/isci": "Information Sciences",
    "conf/iccal": "International Conference on Computers and Learning (ICCAL)",
    "journals/ijmms": "International Journal of Human-Computer Studies",
    "journals/algorithmica": "Algorithmica",
    "conf/siggraph": "International Conference on Computer Graphics and Interactive Techniques (SIGGRAPH)",
    "journals/ac": "Advances in Computers",
    "journals/logcom": "Journal of Logic and Computation",
    "conf/fois": "Formal Ontology in Information Systems (FOIS)",
    "journals/tkdd": "ACM Transactions on Knowledge Discovery from Data",
    "conf/fab": "International Symposium on Foundations and Applications of Blockchain (FAB) ",
}
# Edits and cleanups of above entries where required.
# Values are plain text (not HTML): write `&`, never the entity `&amp;`.
# A value of None marks a key that must be dropped when the maps are combined.
edited_strings = {
    # These keys should not exist.
    "phd/basesearch": None,
    "books/acm": None,
    "books/sp": None,
    "books/ph": None,
    "phd/us": None,
    "phd/ca": None,
    # Override entries, e.g., for missing keys or to make string shorter.
    "conf/sigir": "International Conference on Research and Development in Information Retrieval (SIGIR)",
    "journals/corr": "ArXiv",
    "conf/ntcir": "Conference on Evaluation of Information Access Technologies (NTCIR)",
    "journals/tkde": "IEEE Transactions on Knowledge and Data Engineering (TKDE)",
    "conf/acl": "Association for Computational Linguistics (ACL)",
    "journals/cl": "Computer Languages, Systems & Structures",
    "conf/ieeehpcs": "International Conference on High Performance Computing & Simulation (HPCS)",
    "conf/otm": "OnTheMove Federated Conferences & Workshops (OTM)",
    "conf/btw": "Datenbanksysteme für Business, Technologie und Web(BTW)",
    "conf/icdcsw": "International Conference on Distributed Computing Systems (ICDCS) - Workshops",
    "conf/desires": "Conference on Design of Experimental Search & Information Retrieval Systems (DESIRES)",
    "conf/asist": "ASIS&T Annual Meeting (ASIST)",
    "conf/ppopp": "ACM Symposium on Principles & Practice of Parallel Programming (PPoPP)",
    "conf/bda": "Journées Bases de Données Avancées (BDA)",
    "journals/dke": "Data & Knowledge Engineering (DKE)",
    "conf/complexnetworks": "International Workshop on Complex Networks & Their Applications",
    "journals/ki": "German Journal of Artificial Intelligence (KI)",
    "journals/csse": "Computer Systems: Science & Engineering",
    "journals/csi": "Computer Standards & Interfaces",
    "conf/sigmod": "ACM International Conference on Management of Data (SIGMOD)",
    "journals/pvldb": "Proceedings of the VLDB Endowment (PVLDB)",
    "conf/ipccc": "IEEE International Performance, Computing, and Communications Conference (IPCCC)",
    "conf/p2p": "IEEE International Conference on Peer-to-Peer Computing (P2P)",
    "books/sp/13": "Handbook of Data Quality",
    "books/mc/19": "Making Databases Work: the Pragmatic Wisdom of Michael Stonebraker",
    "books/sp/17": "Algorithms for Next-Generation Sequencing Data, Techniques, Approaches, and Applications",
    "conf/semeval": "International Workshop on Semantic Evaluation (SemEval)",
    "conf/dexaw": "International Conference on Database and Expert Systems Applications (DEXA) - Workshops",
    "phd/ndltd": "Massachusetts Institute of Technology, USA",
    "books/crc/chb": "Computing Handbook: Information Systems and Information Technology",
    "conf/birthday": "Description Logic, Theory Combination, and All That - Essays Dedicated to Franz Baader",
    "journals/anthology": "ACM SIGMOD Anthology",
    "books/mk/dittrichG01": "Component Database Systems",
    "books/crc/tucker97": "The Computer Science and Engineering Handbook",
    "books/acm/kim95": "Modern Database Systems: The Object Model, Interoperability, and Beyond",
    "books/mk/elmagarmid92": "Database Transaction Models for Advanced Applications",
    "tr/gte": "GTE Laboratories Incorporated",
    "conf/usenix": "USENIX Annual Technical Conference (ATC)",
    "conf/pods": "ACM Symposium on Principles of Database Systems (PODS)",
    "books/others/11": "Interactive Information Seeking, Behaviour and Retrieval",
    "journals/dpd": "Distributed and Parallel Databases",
    "conf/sigada": "SIGAda Conference",
    "conf/blackboxnlp": "Workshop on Analyzing and Interpreting Neural Networks for NLP (BlackboxNLP)",
    "journals/health": "ACM Transactions on Computing for Healthcare (HEALTH)",
    "conf/um": "User Modeling, Adaptation, and Personalization (UMAP)",
    "conf/cec": "IEEE Congress on Evolutionary Computation (CEC)",
    "conf/wadl": "Web Archiving and Digital Libraries Workshop (WADL)",
    "journals/toit": "ACM Transactions on Internet Technology (TOIT)",
    "conf/ruleml": "International Web Rule Symposium (RuleML)",
    "journals/cbm": "Computers in Biology and Medicine",
    "conf/IEEEpact": "International Conference on Parallel Architectures and Compilation Techniques (PACT)",
    "conf/acl-louhi": "International Workshop on Health Text Mining and Information Analysis (Louhi)",
    "conf/evia": "International Workshop on Evaluating Information Access (EVIA)",
    "conf/atal": "International Joint Conference on Autonomous Agents & Multiagent Systems (AAMAS)",
    "conf/nips": "Conference on Neural Information Processing Systems (NeurIPS)",
    "journals/pacmmod": "Proceedings of the ACM on Management of Data",
    "journals/tweb": "ACM Transactions on the Web",
    "conf/sigir-ap": "ACM SIGIR Conference on Research and Development in Information Retrieval in the Asia Pacific Region (SIGIR-AP)",
}
# Combine the two above.
# Overrides are inserted first so they take precedence over the raw names;
# keys whose override is None are left out of the result entirely.
final_strings = {key: name for key, name in edited_strings.items() if name is not None}
for key, raw_name in raw_strings.items():
    suppressed = key in edited_strings and edited_strings[key] is None
    if not suppressed:
        final_strings.setdefault(key, raw_name)  # Keep an existing override if there is one.

# Replace publisher names.
# Maps a publisher name as it appears in an entry to the shorter name to use instead.
# Names that are not listed here are kept unchanged.
publisher_strings = {
    'Springer US': 'Springer',
    'Morgan & Claypool Publishers': 'Morgan & Claypool',
}

# Override titles, e.g., when titles contain LATEX.
# Keyed by the full BibTeX ID of the entry; the value replaces the entry's title verbatim
# (no title-casing or escape cleanup is applied to it).
title_strings = {
    'DBLP:conf/ausai/TomanW15': 'On the Krom Extension of CFDI^{∀−}_{nc}',
    'DBLP:conf/ijcai/JacquesTW16': 'Object-Relational Queries over CFDI^{∀−}_{nc} Knowledge Bases: OBDA for the SQL-Literate',
    'DBLP:conf/pricai/TomanW14': 'On Adding Inverse Features to the Description Logic CFD^{∀}_{nc}',
    'DBLP:conf/ausai/TomanW13': 'Conjunctive Query Answering in CFD_{nc}: A PTIME Description Logic with Functional Constraints and Disjointness',
    'DBLP:journals/dpd/HerodotouCCHDWC22': 'Introduction to the special issue on self‑managing and hardware‑optimized database systems 2020',
}

In [None]:
# Retrieve normalized names of journals, conferences, and books.
def get_names(bibids, found=None):
    """Look up and print the DBLP display name of every venue that is not known yet.

    For each id in `bibids` the venue key is derived with `get_remote_id`. Keys that are
    explicitly disabled (mapped to None in `edited_strings`) or already in `found` are
    skipped. For every other key the DBLP index page is downloaded and the content of its
    first `<h1>` element is printed as a `"key": "name",` line, ready to be pasted into
    one of the name maps.

    Args:
        bibids: iterable of bibliography ids.
        found: set of venue keys that are already known. It is updated in place with the
            keys handled by this call. Defaults to a fresh empty set on every call.

    Returns:
        True if at least one unknown venue key was encountered, False otherwise.
    """
    # A `set()` default would be shared between calls and silently remember keys from
    # earlier invocations, so the default is created here instead.
    if found is None:
        found = set()
    new = False
    for bibid in bibids:
        remote_id = get_remote_id(bibid)
        if edited_strings.get(remote_id, "") is None:
            continue
        if remote_id in found:
            continue
        try:
            req = urllib.request.Request("https://dblp.org/db/%s/index.html" % remote_id)
            with urllib.request.urlopen(req) as response:
                # Decode the bytes; `str(bytes)` would yield the escaped `b'...'` repr instead.
                content = response.read().decode("utf-8", errors="replace")
            m = re.search(r"<h1>(.*?)</h1>", content, re.DOTALL)
            full_name = " ".join(m.group(1).split())  # Collapse line breaks inside the heading.
        except Exception:
            # Best effort: report the failure in the output and keep going with the next id.
            full_name = "************ failed: %s" % bibid
        print('"%s": "%s",' % (remote_id, full_name))
        found.add(remote_id)
        new = True
    return new

# get_names(all_entries.keys()) # Uncomment to get names for all entries. Replace `raw_strings` with the output. Normally not required.

# Stop here if any new entry refers to a venue without a known name.
known_remote_ids = set(final_strings.keys())
missing_names_found = get_names(new_ids.keys(), known_remote_ids)
if missing_names_found:
    raise Exception("Missing entries found. Add them to the `edited_strings` map.")

In [None]:
# Finally create the combined and cleaned-up list of bibliographies ready to be added to the website.
# Result: `cleaned_entries` maps each entry ID to a dict with the keys 'id', 'btype', 'title',
# 'author' (a list of names), 'year', 'doi', 'url', plus one key per field in `extra_fields[btype]`.
# Any problem with an entry prints that entry and aborts the whole cell.
cleaned_entries = {}
for entry in new_ids.values():
    btype = entry['ENTRYTYPE']
    # `extra_fields` is defined outside this cell; presumably it lists the type-specific fields per entry type.
    efields = extra_fields[btype]
    try:
        cleaned_entry = {}
        bibid = entry['ID']

        cleaned_entry['id'] = bibid
        cleaned_entry['btype'] = btype

        if bibid in title_strings:
            cleaned_entry['title'] = title_strings[bibid] # Use manually set titles
        else:
            cleaned_entry['title'] = titlecase.titlecase(entry['title']).replace("\n",' ').replace("\\emph",'').replace("\\Emph",'') # Cleanup stray escape characters.
        # Any backslash that is still left means the title needs a manual override.
        if '\\' in cleaned_entry['title']:
            raise Exception("Error: Make sure no escape character is present in the title by overriding name in `title_strings`")

        # Books may have editors instead of authors; every other type must have an 'author' field.
        authors = (entry['author'] if 'author' in entry else entry['editor']) if btype == 'book' else entry['author']
        found = False
        # Each `authors_list` item is (filename, dblp url id, expected author names, normalized author name).
        for author in authors_list:
            for variant in author[2]:
                # NOTE(review): this is a substring test on the whole author string, so a variant also
                # matches when it is only a prefix of a longer, unrelated name — confirm this is acceptable.
                if variant in authors:
                    authors = authors.replace(variant, author[3]) # Normalize author names.
                    found = True
        if not found:
            raise Exception("Error: Make sure atleast one of the authors belongs to the DSG group by normalizing in `authors_list`.")
        # The author string is split on " and" followed by a line break into one name per list item.
        cleaned_entry['author'] = authors.split(" and\n")

        cleaned_entry['year'] = entry['year']
        cleaned_entry['doi'] = entry.get('doi','')
        cleaned_entry['url'] = entry.get('url','')

        # Process fields specific to each bibliography type.
        for efield in efields:
            if efield in ['number','pages']:
                data = entry.get(efield,'') # Optional.
            else:
                data = entry[efield] # Required: a missing field raises KeyError and aborts.
            data = data.replace("\n",' ')

            # Normalize conference, journal, and publisher names.
            if efield in ['journal','booktitle']:
                # The venue name from the entry is discarded and replaced by the curated name for its key.
                remote_id = get_remote_id(bibid)
                if remote_id not in final_strings:
                    raise Exception("Key missing. See previous cell and add missing entries to `final_strings`.")
                data = final_strings[remote_id]
            elif efield == 'publisher':
                if data in publisher_strings:
                    data = publisher_strings[data]

            cleaned_entry[efield] = data
        cleaned_entries[bibid] = cleaned_entry
    except Exception as e:
        # Show the offending raw entry before aborting.
        print(entry)
        raise Exception("Error!")
print(len(cleaned_entries))

In [None]:
# Persist to disk in csv format.
# One header line followed by one line per cleaned entry; columns are tab-separated
# and the author list is flattened into a single " and "-joined column.
fields = ['id','btype','title','author','year','journal','volume','number','pages','booktitle','publisher','school','doi','url']
separator = '\t'
with open(f"{data_dir}/combined.csv","w") as f:
    f.write(separator.join(fields) + '\n')
    for entry in cleaned_entries.values():
        row = [
            " and ".join(entry[column]) if column == 'author' else entry.get(column,'')
            for column in fields
        ]
        f.write(separator.join(row) + '\n')

In [None]:
# Load info about entries already added to the website. Contains mapping from DBLP ID to Node ID on the website.
# This is needed to automate modifying and deleting existing entries.
# IMPORTANT: Make sure this file is correctly maintained.
def load_mappings():
    """Read `mappings_file` and return its content as a dict.

    Every non-blank line has the form `<DBLP ID>,<Node ID>`. Blank and whitespace-only
    lines (e.g. left behind by manual edits) are ignored.

    Returns:
        dict mapping the DBLP ID (str) to the Node ID (int). If an ID occurs more than
        once, the last occurrence wins.

    Raises:
        ValueError: if a Node ID is not an integer.
        IndexError: if a non-blank line contains no comma.
    """
    mappings = {}
    with open(mappings_file) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                continue  # Skip blank lines instead of failing on the missing second column.
            parts = line.split(",")
            mappings[parts[0]] = int(parts[1])
    return mappings

mappings = load_mappings()

In [None]:
# Login to 'My Workbench' and copy the cookie value of key 'SSESSc2214f6938283aa908e28c4bfa176f5b'
# DANGER: since the repo is public, make sure not to commit the cookie value, otherwise anyone on the internet would be able to use it to login to Workbench.
cookie_value = ""

In [None]:
# Open a test page using the provided cookies. If you get an "Access denied" error, the cookie value was incorrect.
assert len(cookie_value) != 0

opts = webdriver.ChromeOptions()
opts.binary_location = "/run/current-system/sw/bin/chromium"
browser = webdriver.Chrome(options=opts)

# Load the site once, attach the cookies, then open the form page with them.
browser.get('https://uwaterloo.ca')
session_cookies = (
    ('SSESSc2214f6938283aa908e28c4bfa176f5b', cookie_value),
    ('Drupal.tableDrag.showWeight', '0'),
    ('cookie-agreed-version', '1.0.0'),
    ('has_js', '1'),
)
for cookie_name, cookie_data in session_cookies:
    browser.add_cookie({'name' : cookie_name, 'value' : cookie_data})
browser.get('https://uwaterloo.ca/data-systems-group/node/add/biblio')

In [None]:
# Helper functions to set bibliography details, specific to each type.
# Note: Might have to fiddle with sleep() and scrollBy() values if selenium throws any error.

def post_author(entry):
    """Type every name in `entry['author']` into the contributor fields of the open form.

    Fields 0-3 are filled directly. Before filling field 4, and then before every second
    field after it, the "add more" button is clicked, the page is scrolled down and the
    code waits two seconds. Finally the page is scrolled back to the top.
    """
    add_more_at = 4
    for position, name in enumerate(entry['author']):
        if position == add_more_at:
            add_more_button = browser.find_element("xpath",'//*[@id="edit-add-more"]')
            add_more_button.click()
            add_more_at = add_more_at + 2
            browser.execute_script("window.scrollBy(0,100)")
            time.sleep(2)
        name_xpath = '//*[@id="edit-biblio-contributors-%d-name"]' % position
        browser.find_element("xpath", name_xpath).send_keys(name)
    browser.execute_script("window.scrollTo(0,10)")

def post_doi_url(entry):
    """Fill in the DOI and URL fields of the open form.

    The DOI field is filled only when the entry has a non-empty DOI. The URL field is
    filled unless the entry has a DOI and its URL is itself a doi.org link.
    """
    doi = entry.get('doi','')
    url = entry.get('url','')
    has_doi = doi != ''
    if has_doi:
        browser.find_element("xpath",'//*[@id="edit-biblio-doi"]').send_keys(doi)
    if not (has_doi and 'doi.org' in url):
        browser.find_element("xpath",'//*[@id="edit-biblio-url"]').send_keys(url)
    
def save_page():
    """Submit the open form, set its state to "Published", submit again and return the new node id.

    The node id is taken from the second-to-last path segment of the `href` of a link on
    the resulting page.

    Returns:
        int: the node id parsed from that link.

    Raises:
        Exception: if the link cannot be found or its `href` does not contain an integer id.
            The underlying error is attached as the cause.
    """
    browser.find_element("xpath",'//*[@id="edit-submit"]').click()
    browser.find_element("xpath",'//*[@id="edit-state"]/option[text()="Published"]').click()
    browser.find_element("xpath",'//*[@id="edit-submit"]').click()
    # Pre-bind `href` so the error path below can always print it, even when the lookup itself fails.
    href = None
    try:
        href = browser.find_element("xpath",'//*[@id="main"]/div/div[2]/ul/li[2]/a').get_attribute('href')
        pid = int(href.split("/")[-2])
    except Exception as e:
        print(href)
        raise Exception("Error: could not determine the node id from link %r" % (href,)) from e
    return pid

# Page with the empty "add bibliography" form.
_BIBLIO_ADD_URL = 'https://uwaterloo.ca/data-systems-group/node/add/biblio'
# Link of the form tab at a given (1-based) list position.
_BIBLIO_TAB_XPATH = '//*[@id="biblio-node-form"]/div/div[3]/ul/li[%d]/a'


def _post_biblio(entry, type_label, details=(), publisher=None, links_tab=7):
    """Fill in and submit one bibliography form; shared by all `post_*` helpers.

    Steps, in order: open the empty form, select `type_label` as publication type, enter
    the title and the authors, open tab 4 and enter the year and the `details`, optionally
    open tab 5 and enter the publisher, open tab `links_tab` and enter DOI/URL, then save.

    Args:
        entry: cleaned entry dict; 'title' and 'year' are read here, the rest by
            `post_author` and `post_doi_url`.
        type_label: visible text of the option to select in the publication type list.
        details: sequence of `(field id suffix, value)` pairs typed into the elements
            `edit-biblio-<suffix>`, in the given order.
        publisher: text for the publisher field, or None to leave tab 5 untouched.
        links_tab: list position of the tab that holds the DOI and URL fields.

    Returns:
        The node id returned by `save_page`.
    """
    browser.get(_BIBLIO_ADD_URL)

    browser.find_element("xpath",'//*[@id="edit-biblio-type"]/option[text()="%s"]' % type_label).click()

    browser.find_element("xpath",'//*[@id="edit-title"]').send_keys(entry['title'])
    post_author(entry)

    browser.find_element("xpath",_BIBLIO_TAB_XPATH % 4).click()
    browser.find_element("xpath",'//*[@id="edit-biblio-year"]').send_keys(entry['year'])
    for field_id, value in details:
        browser.find_element("xpath",'//*[@id="edit-biblio-%s"]' % field_id).send_keys(value)

    if publisher is not None:
        browser.find_element("xpath",_BIBLIO_TAB_XPATH % 5).click()
        browser.find_element("xpath",'//*[@id="edit-biblio-publisher"]').send_keys(publisher)

    browser.find_element("xpath",_BIBLIO_TAB_XPATH % links_tab).click()
    post_doi_url(entry)

    return save_page()


def post_conference_paper(entry):
    """Post `entry` as a "Conference Paper" (venue taken from 'booktitle'); returns the node id."""
    return _post_biblio(entry, "Conference Paper", [("secondary-title", entry['booktitle'])])


def post_article(entry):
    """Post `entry` as a "Journal Article" with journal, volume, issue and pages; returns the node id."""
    details = [
        ("secondary-title", entry['journal']),
        ("volume", entry['volume']),
        ("issue", entry['number']),
        ("pages", entry['pages']),
    ]
    # For this publication type the DOI/URL fields are reached through tab 6 instead of tab 7.
    return _post_biblio(entry, "Journal Article", details, links_tab=6)


def post_book(entry):
    """Post `entry` as a "Book"; a missing 'publisher' is sent as an empty string. Returns the node id."""
    return _post_biblio(entry, "Book", publisher=entry.get('publisher',''))


def post_book_chapter(entry):
    """Post `entry` as a "Book Chapter" (book taken from 'booktitle'); returns the node id."""
    return _post_biblio(entry, "Book Chapter", [("secondary-title", entry['booktitle'])],
                        publisher=entry.get('publisher',''))


def post_thesis(entry):
    """Post `entry` as a "Thesis"; the 'school' goes into the publisher field. Returns the node id."""
    return _post_biblio(entry, "Thesis", publisher=entry['school'])
    
def delete_entry(nodeid):
    """Open the delete page of website node `nodeid` and click its submit button."""
    delete_url = 'https://uwaterloo.ca/data-systems-group/node/%s/delete' % nodeid
    browser.get(delete_url)
    confirm_button = browser.find_element("xpath",'//*[@id="edit-submit"]')
    confirm_button.click()

In [None]:
# Create a deterministic list of keys from the dict to make continuing from errors easier.
# Iterating a dict yields its keys in insertion order, so positions in this list stay stable.
keys = list(cleaned_entries)
print(len(keys))

In [None]:
# Set to the index in `keys` from which to start uploading.
# Should be set to `0` on a fresh run or to a particular index after recovering from an error.
index = 0

In [None]:
# Publish new entries to the website.
# Note: Do not try to scroll or interact with the running browser as it may cause an error in Selenium.
# `index` is advanced only after an entry has been posted and recorded, so after a failure
# this cell can simply be re-run to continue with the entry that failed.

# Maps each supported bibliography type to the helper that fills in and submits its form.
post_functions = {
    "article": post_article,
    "inproceedings": post_conference_paper,
    "incollection": post_book_chapter,
    "book": post_book,
    "phdthesis": post_thesis,
}

try:
    # Simultaneously write successful ids to disk to prevent posting duplicate entries on the next run.
    with open(mappings_file,"a") as f:
        while index < len(keys):
            bibid = keys[index]

            if bibid in mappings:
                raise Exception(f"'{bibid}' present in mappings. Was it already posted?")

            entry = cleaned_entries[bibid]
            btype = entry['btype']
            if btype not in post_functions:
                raise Exception(f"Unsupported bibliography type '{btype}' for '{bibid}'")
            post_id = post_functions[btype](entry)

            mappings[bibid] = post_id
            f.write("%s,%s\n" % (bibid, post_id))
            f.flush()  # Make sure the mapping reaches the file even if a later entry fails.
            print("Posted:",index, entry['title'],bibid,post_id)
            index += 1
            time.sleep(1)
        print("Done!")
except Exception as e:
    # Only real errors are wrapped; interrupting the kernel propagates unchanged.
    # Look the failing entry up by `index` rather than printing `entry`, which may still
    # hold an earlier entry when the failure happened before it was reassigned.
    if index < len(keys):
        pp = pprint.PrettyPrinter()
        pp.pprint(cleaned_entries.get(keys[index]))
    raise Exception("Error in index", index) from e

In [None]:
# Delete old entries no longer on DBLP
# Each deletion is logged on one line (line breaks in title and authors are flattened) before it is carried out.
for bibid,entry in deleted_ids.items():
    nodeid = mappings[bibid]
    title_line = entry['title'].replace('\n',' ')
    author_line = entry['author'].replace('\n',' ')
    print("Deleting: %s | %s | %s | %s | %s" % (nodeid, bibid, title_line, author_line, entry['biburl']))
    delete_entry(nodeid)

In [None]:
# Re-read the mappings file so that `mappings` reflects what is stored on disk.
mappings = load_mappings()

In [None]:
# Make sure the mappings for all the `bibid`s are present on disk and correctly reflects the ids posted online.
mappings_check = load_mappings()
for bibid in keys:
    # Check before printing so that a missing id fails with a message that names it.
    assert bibid in mappings, f"'{bibid}' not present in the in-memory mappings"
    assert bibid in mappings_check, f"'{bibid}' not present in mappings file"
    assert mappings[bibid] == mappings_check[bibid], f"'{mappings[bibid]}' != '{mappings_check[bibid]}'"
    print(mappings[bibid], bibid, cleaned_entries[bibid]['title'])

In [None]:
# Replace old data with new
# The previous directory is deleted with everything in it, then the new directory is renamed
# to take its place. `shutil.rmtree` raises if the previous directory does not exist.
shutil.rmtree(authors_dirs_previous)
os.rename(authors_dirs_new, authors_dirs_previous) 

In [None]:
# End-of-run checklist. The two asserts below fail until steps 1 and 2 have been done by hand.
# 1. IMPORTANT: Delete the cookie value above.
assert len(cookie_value) == 0
# 2. Reset index to 0 to avoid missing entries in the next run.
assert index == 0
# 3. Clear all cell outputs.
# 4. Save the notebook.
# 5. Commit to git and push.

In [None]:
# Sample code to bulk delete entries

# browser.get('https://uwaterloo.ca/data-systems-group/admin/workbench/archived')
# while True:
#     browser.find_element("xpath",'//*[@id="block-system-main"]/div/div/div[2]/table/tbody/tr[1]/td[5]/ul/li[4]/a').click()
#     browser.find_element("xpath",'//*[@id="edit-submit"]').click()