# Using data from the citedtags.bib and fordiva.json files, look up the DiVA IDs for these publications

## Some installs of packages

In [65]:
!pip install bibtexparser
!pip install thefuzz python-Levenshtein

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


## Import some useful libraries

In [83]:
import sys
import os	# to make OS calls, here to get time zone info

import json
import pandas as pd
import numpy as np

import urllib.request
import requests

import time
from datetime import datetime, date, timedelta
from io import StringIO

import re
from pprint import pprint

# import logging

import pymods

import bibtexparser
from bibtexparser.bparser import BibTexParser



from thefuzz import fuzz

#from bs4 import BeautifulSoup
from IPython.display import Markdown, display


## Configuration variables

In [84]:
# Debugging switch: set to True to enable the output of verbosePrint()
Verbose_Flag = False

# Directory (with trailing slash) to use for output of the collected words
directory_prefix = '/tmp/'


## Define some helper functions and data

In [85]:
def is_file_older_than_x_days(file, days=1):
    """Return True if `file` was last modified more than `days` days ago.

    The age is computed from the file's modification time (os.path.getmtime)
    and compared, in hours, against 24 hours per day.
    """
    modified_at = os.path.getmtime(file)
    age_in_hours = (time.time() - modified_at) / 3600
    return age_in_hours > 24 * days

# define a function to compute lower case Roman numerals from an integer
def int_to_roman(num):
    """Return the lower case Roman numeral string for the integer `num`.

    Works greedily from the largest value downwards, including the
    subtractive pairs (cm, cd, xc, xl, ix, iv). Zero yields the empty string.
    """
    symbols = (
        (1000, "m"), (900, "cm"), (500, "d"), (400, "cd"),
        (100, "c"), (90, "xc"), (50, "l"), (40, "xl"),
        (10, "x"), (9, "ix"), (5, "v"), (4, "iv"),
        (1, "i"),
    )
    pieces = []
    remaining = num
    for value, symbol in symbols:
        # how many times this symbol fits, and what is left for the smaller ones
        count, remaining = divmod(remaining, value)
        pieces.append(symbol * count)
    return "".join(pieces)

# Precompute a lookup table from lower case Roman numeral string to integer for 0 to 499,
# so that later a numeral can be converted by a simple dict lookup of the string.
# (Zero is stored under the empty string, as that is what int_to_roman(0) returns.)
roman_table = {int_to_roman(value): value for value in range(500)}

def collect_originInfo(mod_elem):
    """Collect the children of a MODS <originInfo> element into a flat dict.

    Parameters
    ----------
    mod_elem : iterable of XML elements
        The <originInfo> element. Each child must provide ``tag``, ``text``
        and ``attrib``.

    Returns
    -------
    dict
        May contain the keys "LanguageTerm", "CreatedDate" (from <dateIssued>),
        "DefenceDate", "academicTerm", "PublicationDate", "dateOther", "place",
        "publisher", "edition" and "genre". A key is only set when a value was
        found; a later child of the same kind overwrites an earlier one.

    Children with an unrecognized tag are reported with print().
    """
    originInfo = dict()
    for elem in mod_elem:
        # Tags are namespace qualified ("{namespace}local"), hence the match on "}local"
        if elem.tag.count("}languageTerm") == 1:
            if elem.text is not None:
                originInfo["LanguageTerm"] = elem.text
        elif elem.tag.count("}dateIssued") == 1:
            if elem.text is not None:
                originInfo["CreatedDate"] = elem.text
        elif elem.tag.count("}dateOther") == 1:
            # <dateOther type="defence">2018-07-26T13:00:00</dateOther>
            # <dateOther type="academicTerm">VT 2018</dateOther>
            # <dateOther type="availableFrom">2018-11-19T09:16:45</dateOther>
            if elem.text is not None:
                # named date_type to avoid shadowing the builtin type()
                date_type = elem.attrib.get('type')
                if date_type == 'defence':
                    originInfo['DefenceDate'] = elem.text
                elif date_type == 'academicTerm':
                    originInfo['academicTerm'] = elem.text
                elif date_type == 'availableFrom':
                    originInfo['PublicationDate'] = elem.text
                else:
                    originInfo["dateOther"] = elem.text
        elif elem.tag.count("}place") == 1:
            # <place><placeTerm>Stockholm</placeTerm></place>
            # In MODS the name of the place is carried by a <placeTerm> child, so
            # <place> itself usually has no (or only whitespace) text. Fall back to
            # the first <placeTerm> with text; otherwise keep the text of <place>.
            place_text = elem.text
            if place_text is None or not place_text.strip():
                for term in elem:
                    if term.tag.count("}placeTerm") == 1 and term.text is not None:
                        place_text = term.text
                        break
            if place_text is not None:
                originInfo["place"] = place_text
        elif elem.tag.count("}publisher") == 1:
            if elem.text is not None:
                originInfo["publisher"] = elem.text
        elif elem.tag.count("}edition") == 1:
            if elem.text is not None:
                originInfo["edition"] = elem.text
        elif elem.tag.count("}genre") == 1:
            if elem.text is not None:
                originInfo["genre"] = elem.text
        else:
            print("Unhandled case in collect_originInfo: {}".format(elem))
    return originInfo

def update_key_list(diva_entry, key, name_struct):
    """Return the list stored under `key` in `diva_entry` with `name_struct` appended.

    When the key is already present, its list is extended in place and returned.
    When it is absent, a new one-element list is returned; `diva_entry` itself is
    not modified, so the caller is expected to store the result.
    """
    entries = diva_entry[key] if key in diva_entry else []
    entries.append(name_struct)
    return entries

# The following have been adapted from https://www.loc.gov/marc/relators/relaterm.html
# for use in the context of KTH, where a 'mon' is an 'examiner' and a 'ths' is a 'supervisor'
#
# The dict maps a three letter MARC relator code to the (lower case) name of the role.

MARC_Code_for_Relators = {
    'abr': 'abridger',
    'acp': 'art copyist',
    'act': 'actor',
    'adi': 'art director',
    'adp': 'adapter',
    'anl': 'analyst',
    'anm': 'animator',
    'ann': 'annotator',
    'anc': 'announcer',
    'ant': 'bibliographic antecedent',
    'ape': 'appellee',
    'apl': 'appellant',
    'app': 'applicant',
    'aqt': 'author in quotations or text abstracts',
    'arc': 'architect',
    'ard': 'artistic director',
    'arr': 'arranger',
    'art': 'artist',
    'asg': 'assignee',
    'asn': 'associated name',
    'ato': 'autographer',
    'att': 'attributed name',
    'auc': 'auctioneer',
    'aud': 'author of dialog',
    'aue': 'audio engineer',
    'aui': 'author of introduction',
    'aup': 'audio producer',
    'aus': 'screenwriter',
    'aut': 'author',
    'bdd': 'binding designer',
    'bjd': 'bookjacket designer',
    'bka': 'book artist',
    'bkd': 'book designer',
    'bkp': 'book producer',
    'blw': 'blurb writer',
    'bnd': 'binder',
    'bpd': 'bookplate designer',
    'brd': 'broadcaster',
    'brl': 'braille embosser',
    'bsl': 'bookseller',
    'cad': 'casting director',
    'cas': 'caster',
    'ccp': 'conceptor',
    'chr': 'choreographer',
    'cli': 'client',
    'cll': 'calligrapher',
    'clr': 'colorist',
    'clt': 'collotyper',
    'cmm': 'commentator',
    'cmp': 'composer',
    'cmt': 'compositor',
    'cnd': 'conductor',
    'cng': 'cinematographer',
    'cns': 'censor',
    'coe': 'contestant-appellee',
    'col': 'collector',
    'com': 'compiler',
    'con': 'conservator',
    'cop': 'camera operator',
    'cor': 'collection registrar',
    'cos': 'contestant',
    'cot': 'contestant-appellant',
    'cou': 'court governed',
    'cov': 'cover designer',
    'cpc': 'copyright claimant',
    'cpe': 'complainant-appellee',
    'cph': 'copyright holder',
    'cpl': 'complainant',
    'cpt': 'complainant-appellant',
    'cre': 'creator',
    'crp': 'correspondent',
    'crr': 'corrector',
    'crt': 'court reporter',
    'csl': 'consultant',
    'csp': 'consultant to a project',
    'cst': 'costume designer',
    'ctb': 'contributor',
    'cte': 'contestee-appellee',
    'ctg': 'cartographer',
    'ctr': 'contractor',
    'cts': 'contestee',
    'ctt': 'contestee-appellant',
    'cur': 'curator',
    'cwt': 'commentator for written text',
    'dbd': 'dubbing director',
    'dbp': 'distribution place',
    'dfd': 'defendant',
    'dfe': 'defendant-appellee',
    'dft': 'defendant-appellant',
    'dgc': 'degree committee member',
    'dgg': 'degree granting institution',
    'dgs': 'degree supervisor',
    'dis': 'dissertant',
    'djo': 'dj',
    'dln': 'delineator',
    'dnc': 'dancer',
    'dnr': 'donor',
    'dpc': 'depicted',
    'dpt': 'depositor',
    'drm': 'draftsman',
    'drt': 'director',
    'dsr': 'designer',
    'dst': 'distributor',
    'dtc': 'data contributor',
    'dte': 'dedicatee',
    'dtm': 'data manager',
    'dto': 'dedicator',
    'dub': 'dubious author',
    'edc': 'editor of compilation',
    'edd': 'editorial director',
    'edm': 'editor of moving image work',
    'edt': 'editor',
    'egr': 'engraver',
    'elg': 'electrician',
    'elt': 'electrotyper',
    'enj': 'enacting jurisdiction',
    'eng': 'engineer',
    'etr': 'etcher',
    'evp': 'event place',
    'exp': 'expert',
    'fac': 'facsimilist',
    'fds': 'film distributor',
    'fld': 'field director',
    'flm': 'film editor',
    'fmd': 'film director',
    'fmk': 'filmmaker',
    'fmo': 'former owner',
    'fmp': 'film producer',
    'fnd': 'funder',
    'fon': 'founder',
    'fpy': 'first party',
    'frg': 'forger',
    'gdv': 'game developer',
    'gis': 'geographic information specialist',
    'his': 'host institution',
    'hnr': 'honoree',
    'hst': 'host',
    'ill': 'illustrator',
    'ilu': 'illuminator',
    'ink': 'inker',
    'ins': 'inscriber',
    'inv': 'inventor',
    'isb': 'issuing body',
    'itr': 'instrumentalist',
    'ive': 'interviewee',
    'ivr': 'interviewer',
    'jud': 'judge',
    'jug': 'jurisdiction governed',
    'lbr': 'laboratory',
    'lbt': 'librettist',
    'ldr': 'laboratory director',
    'led': 'lead',
    'lee': 'libelee-appellee',
    'lel': 'libelee',
    'len': 'lender',
    'let': 'libelee-appellant',
    'lgd': 'lighting designer',
    'lie': 'libelant-appellee',
    'lil': 'libelant',
    'lit': 'libelant-appellant',
    'lsa': 'landscape architect',
    'lse': 'licensee',
    'lso': 'licensor',
    'ltg': 'lithographer',
    'ltr': 'letterer',
    'lyr': 'lyricist',
    'mcp': 'music copyist',
    'mdc': 'metadata contact',
    'med': 'medium',
    'mfp': 'manufacture place',
    'mfr': 'manufacturer',
    'mka': 'makeup artist',
    'mod': 'moderator',
    #'mon': 'monitor',
    'mon': 'examiner',
    'mrb': 'marbler',
    'mrk': 'markup editor',
    'msd': 'musical director',
    'mte': 'metal-engraver',
    'mtk': 'minute taker',
    'mup': 'music programmer',
    'mus': 'musician',
    'mxe': 'mixing engineer',
    'nan': 'news anchor',
    'nrt': 'narrator',
    'onp': 'onscreen participant',
    'opn': 'opponent',
    'osp': 'onscreen presenter',
    # 'org' is the code for 'originator'; 'organizer' has its own code: 'orm'
    'org': 'originator',
    'orm': 'organizer',
    'oth': 'other',
    'own': 'owner',
    'pad': 'place of address',
    'pan': 'panelist',
    'pat': 'patron',
    'pbd': 'publishing director',
    'pbl': 'publisher',
    'pdr': 'project director',
    'pfr': 'proofreader',
    'pgr': 'programmer',
    'pht': 'photographer',
    'plt': 'platemaker',
    'pma': 'permitting agency',
    'pmn': 'production manager',
    'pnc': 'penciller',
    'pop': 'printer of plates',
    'ppm': 'papermaker',
    'ppt': 'puppeteer',
    'pra': 'praeses',
    'prc': 'process contact',
    'prd': 'production personnel',
    'pre': 'presenter',
    'prf': 'performer',
    'prg': 'programmer',
    'prm': 'printmaker',
    'prn': 'production company',
    'pro': 'producer',
    'prp': 'production place',
    'prs': 'production designer',
    'prt': 'printer',
    'prv': 'provider',
    'pta': 'patent applicant',
    'pte': 'plaintiff-appellee',
    'pth': 'patent holder',
    'ptf': 'plaintiff',
    'ptt': 'plaintiff-appellant',
    'pup': 'publication place',
    'rbr': 'rubricator',
    'rcd': 'recordist',
    'rce': 'recording engineer',
    'rcp': 'addressee',
    'rdd': 'radio director',
    'red': 'redaktor',
    'ren': 'renderer',
    'res': 'researcher',
    'rev': 'reviewer',
    'rpc': 'radio producer',
    'rps': 'repository',
    'rpt': 'reporter',
    'rpy': 'responsible party',
    'rsd': 'stage director',
    'rsg': 'restager',
    'rsr': 'restorationist',
    'rsp': 'respondent',
    'rst': 'respondent-appellant',
    'rse': 'respondent-appellee',
    'rth': 'research team head',
    'rtm': 'research team member',
    'rxa': 'remix artist',
    'sad': 'scientific advisor',
    'sce': 'scenarist',
    'scl': 'sculptor',
    'scr': 'scribe',
    'sds': 'sound designer',
    'sde': 'sound engineer',
    'sec': 'secretary',
    'sfx': 'special effects provider',
    'sgd': 'stage director',
    'sgn': 'signer',
    'sht': 'supporting host',
    'sll': 'seller',
    'sng': 'singer',
    'spk': 'speaker',
    'spn': 'sponsor',
    'spy': 'second party',
    'srv': 'surveyor',
    'std': 'set designer',
    'stg': 'setting',
    'stm': 'stage manager',
    'stn': 'standards body',
    'str': 'stereotyper',
    'stl': 'storyteller',
    'swd': 'software developer',
    'tad': 'technical advisor',
    'tau': 'television writer',
    'tcd': 'technical director',
    'tch': 'teacher',
    #'ths': 'thesis advisor',
    'ths': 'supervisor',
    'tld': 'television director',
    'tlg': 'television guest',
    'tlh': 'television host',
    'tlp': 'television producer',
    'trc': 'transcriber',
    'trl': 'translator',
    'tyd': 'type designer',
    'tyg': 'typographer',
    'uvp': 'university place',
    'vac': 'voice actor',
    'vdg': 'videographer',
    'vfx': 'visual effects provider',
    'wac': 'writer of added commentary',
    'wal': 'writer of added lyrics',
    'wam': 'writer of accompanying material',
    'wat': 'writer of added text',
    'waw': 'writer of afterword',
    'wdc': 'woodcutter',
    'wde': 'wood engraver',
    'wfs': 'writer of film story',
    'wfw': 'writer of foreword',
    'wft': 'writer of intertitles',
    'win': 'writer of introduction',
    'wit': 'witness',
    'wpr': 'writer of preface',
    'wst': 'writer of supplementary textual content',
}

# Define a function to parse the MODS record

In [86]:
def verbosePrint(msg):
    """Print `msg`, but only when the module-level Verbose_Flag is truthy."""
    # Verbose_Flag is only read here, so no global declaration is required
    if not Verbose_Flag:
        return
    print(msg)


def mods_records_to_dataframe(mods_records): # return a dataframe
    n_index=0                                # index over entries
    df=pd.DataFrame()           # define a dataframe to collect the results
    df2=pd.DataFrame()          # define a dataframe for each entry
    for record in mods_records:
        n_index=n_index+1
        diva_entry=dict()
        diva_entry['node']=n_index
        verbosePrint(f"{n_index=} {record=}")
        verbosePrint(record.tag)
        if record.tag.count("}mods") == 1:
            #print("Attribute: {}".format(record.attrib))
            for mod_element in record:
                verbosePrint(mod_element)
                if mod_element.tag.count("}genre") >= 1:
                    if mod_element.attrib.get('authority') == "diva" and mod_element.attrib.get('type') == "publicationTypeCode":
                        diva_entry['publicationTypeCode']=mod_element.text
                        verbosePrint(F"publicationTypeCode= {mod_element.text}")
                        if mod_element.attrib.get('type') == "publicationType":
                            authority=mod_element.attrib.get('authority')
                            current_pubtype=diva_entry.get('publicationType', dict())
                            if authority == 'diva':
                                lang=mod_element.attrib.get('lang')
                                if lang:
                                    current_pubtype.update({lang: mod_element.text})
                                diva_entry['diva_publicationType']=current_pubtype
                            elif authority == 'svep':
                                diva_entry['svep_publicationType']=mod_element.text
                            elif authority == 'kev':
                                lang=mod_element.attrib.get('lang')
                                current_pubtype=diva_entry.get('kev_publicationType', dict())
                                if lang:
                                    current_pubtype.update({lang: mod_element.text})
                                diva_entry['kev_publicationType']=current_pubtype
                            else:
                                print(f"Unhandled case in publicationType: {mod_element.attrib=} {mod_element.text=}")
                elif mod_element.tag.count("}name") == 1:
                    # <name type="personal" authority="kth" xlink:href="u19gy7zg">
                    # <name type="corporate" authority="kth" xlink:href="5956"><namePart>KTH</namePart><namePart>Skolan för datavetenskap och kommunikation (CSC)</namePart>
                    if mod_element.attrib.get('type') == "personal":
                        name_type='personal'
                    elif mod_element.attrib.get('type') == "corporate":
                        name_type='corporate'
                    elif mod_element.attrib.get('type') == "conference":
                        name_type='conference'    
                    else:
                        name_type='unknown'
                    name_struct={'type': name_type}
                    name_authority=mod_element.attrib.get('authority')
                    if name_authority is not None:
                        name_struct['authority']=name_authority
                    xlink=mod_element.attrib.get('{http://www.w3.org/1999/xlink}href', None)
                    if xlink is not None:
                        name_struct['xlink']=xlink
                    for elem in mod_element:
                        if elem.tag.count("}namePart") == 1:
                            # personal name
                            namePart_type=elem.attrib.get('type')
                            if namePart_type == 'family':
                                name_struct['family']=elem.text
                            elif namePart_type == 'given':
                                name_struct['given']=elem.text
                            elif namePart_type == 'termsOfAddress':
                                # <namePart type="termsOfAddress">professor</namePart>
                                name_struct['termsOfAddress']=elem.text
                            elif name_type =='conference':
                                name_struct['conference']=elem.text
                            else:
                                orglevel=0
                                orglevels=dict()
                                for org in elem:
                                    orglevelname=f"L{orglevel}"
                                    orglevels[orglevelname]=elem.text
                                    orglevel=orglevel+1
                                if orglevels:
                                    name_struct['orglevels']=orglevels
                        elif elem.tag.count("}role") == 1:
                            # <role><roleTerm type="code" authority="marcrelator">pbl</roleTerm>
                            for role in elem:
                                role_type=role.attrib.get('type')
                                role_authority=role.attrib.get('authority')
                                # the codes come from https://www.loc.gov/marc/relators/relaterm.html
                                if role_type=='code'and role_authority=='marcrelator':
                                    if role.text in MARC_Code_for_Relators:
                                        name_struct['role']=MARC_Code_for_Relators[role.text]
                                    else:
                                        print(f'Unhandled role: {role.text}')
                        elif elem.tag.count("}affiliation") == 1:
                            # <affiliation>KTH, Kommunikationssystem, CoS</affiliation>
                            name_struct['affiliation']=elem.text
                        elif elem.tag.count("}description") == 1:
                            # <description>orcid.org=0000-0002-6066-746X</description>
                            name_struct['description']=elem.text
                        else:
                            print('Unhandled case of role')
                    name_role=name_struct.get('role', None)
                    if name_role is not None:
                        # There should only be one examiner, but we support several
                        # There can be multiple supervisors and authors
                        if name_role in ['examiner',  'supervisor', 'opponent',  'applicant', 'architect', 'author', 'author of dialog',
                                         'author of introduction', 'author in quotations or text abstracts', 'editor', 'artist',
                                         'cinematographer', 'commentator',
                                         'commentator for written text', 'contributor', 'cover designer',
                                         'creator', 'curator',  'designer', 'director', 'dissertant',
                                         'filmmaker',  'illustrator', 'inventor',
                                         'narrator', 'photographer', 'producer', 'project director', 'programmer',
                                         'publisher', 'researcher', 'screenwriter', 'translator', 'writer of accompanying material'
                                         ]:
                            diva_entry[name_role] = update_key_list(diva_entry, name_role, name_struct)
                        elif name_role == 'other':
                            if name_struct.get('type', None) == 'corporate':
                                if name_struct.get('description', None) == 'Research Group':
                                    diva_entry['research group']= name_struct
                                else:
                                    print(f'Unhandled name - other: {name_struct} with role: {name_role}')
                            else:
                                print(f'Unhandled name - unknown: {name_struct} with role: {name_role}')
                    else:
                        if name_type == 'conference':
                            diva_entry['conference']=name_struct.get('conference')
                        elif name_type == 'personal': # as it an author but role not explicit
                            diva_entry['author'] = update_key_list(diva_entry, 'author', name_struct)
                        else:
                            print(f'Unhandled name - with name_type: {name_type=} {name_struct=} {diva_entry=}')
                elif mod_element.tag.count("}titleInfo") == 1:
                    #current_titleInfo=diva_entry.get('titleInfo', dict())
                    #diva_entry['titleInfo']=current_titleInfo
                    lang=mod_element.attrib.get('lang')
                    for elem in mod_element:
                        if elem.tag.count("}title") == 1:
                            #current_title=current_titleInfo.get('title', dict())
                            current_title=diva_entry.get('title', dict())
                            current_title.update({lang: elem.text})
                            diva_entry['title']=current_title
                        elif elem.tag.count("}subTitle") == 1:
                            #current_subtitle=current_titleInfo.get('subtitle', dict())
                            current_subtitle=diva_entry.get('subtitle', dict())
                            current_subtitle.update({lang: elem.text})
                            diva_entry['subtitle']=current_subtitle
                        else:
                            print("Unhandled case in titleInfo")
                elif mod_element.tag.count("}language") == 1:
                    i=0
                    temp_dict=dict()
                    for elem in mod_element:
                        i=i+1
                        if elem.tag.count("}languageTerm") == 1:
                            if elem.text is not None:
                                name='languageTerm_{0}'.format(i)
                                temp_dict[name]=[elem.text]
                        elif elem.tag.count("}dateIssued") == 1:
                            if elem.text is not None:
                                verbosePrint("dateIssued: ".format(elem.text))
                                name='dateIssued_{0}'.format(i)
                                temp_dict[name]=[elem.text]
                        elif elem.tag.count("}dateOther") == 1:
                            if elem.text is not None:
                                verbosePrint("dateOther: {0}{1}".format(elem.attrib, elem.text))
                                name='dateOther_{0}'.format(i)
                                temp_dict[name]=[elem.text]
                        else:
                            verbosePrint("mod_emem[" + str(i) +"]".format(elem))
                            name='language_{0}'.format(i)
                            temp_dict[name]=[elem.text]
                    verbosePrint("temp_dict={}".format(temp_dict))
                elif mod_element.tag.count("}originInfo") == 1:
                    diva_entry['originInfo']=collect_originInfo(mod_element)
                    diva_entry['Year']=diva_entry['originInfo'].get("CreatedDate", 'Unknown year')
                elif mod_element.tag.count("}identifier") == 1:
                    if mod_element.text is not None and mod_element.attrib.get('type') in ['libris', 'articleId', 'url', 'doi', 'pmid', 'isi', 'scopus', 'uri', 'isrn', 'isbn', 'local']:
                        diva_entry[mod_element.attrib.get('type')]=mod_element.text
                    else:
                        print(f"unexpected identifier type: {mod_element.attrib.get('type')}")
                elif mod_element.tag.count("}abstract") == 1:
                    current_abstracts=diva_entry.get('abstract', dict())
                    lang=mod_element.attrib.get('lang')
                    if lang == '-1' or not isinstance(lang , str):
                        # print(f"{lang=} in {diva_entry=}")
                        lang='swe' # corect the entry in  diva2:905489
                    current_abstracts.update({lang: mod_element.text})
                    diva_entry['abstract']=current_abstracts
                elif mod_element.tag.count("}subject") == 1:
                    #<subject lang="eng" xlink:href="9895"><topic>Master of Science - Computer Science</topic><genre>Educational program</genre></subject>
                    #<subject lang="swe" xlink:href="9895"><topic>Teknologie masterexamen - Datalogi</topic><genre>Educational program</genre></subject>
                    #<subject lang="eng" xlink:href="10280"><topic>Computer Science</topic><genre>Subject/course</genre></subject>
                    #<subject lang="swe" xlink:href="10280"><topic>Datalogi</topic><genre>Subject/course</genre></subject>
                    xlink=mod_element.attrib.get('{http://www.w3.org/1999/xlink}href', None)
                    xlinks=mod_element.attrib.get('xlink', None)
                    authority=mod_element.attrib.get('authority', None)
                    if authority:
                        subject=f'{authority}'
                    elif xlink:    
                        subject='xlink' 
                    elif xlinks:    
                        subject='xlinks' 
                    else:
                        subject='keywords'
                    lang=mod_element.attrib.get('lang')
                    current_subject=diva_entry.get(subject, dict())
                    topics=current_subject.get(lang, list())
                    for elem in mod_element:
                        if elem.tag.count("}topic") == 1:
                            topics.append(elem.text)
                            current_subject.update({lang: topics})
                    diva_entry[subject]=current_subject
                elif mod_element.tag.count("}recordInfo") == 1:
                    #<recordInfo>
                    #<recordOrigin>u1d13i2c</recordOrigin>
                    #<recordContentSource>kth</recordContentSource>
                    #<recordCreationDate>2019-06-26</recordCreationDate>
                    #<recordChangeDate>2022-06-26</recordChangeDate>
                    #<recordIdentifier>diva2:1330685</recordIdentifier>
                    #</recordInfo>
                    current_recordInfo=diva_entry.get('recordInfo', dict())
                    for elem in mod_element:
                        #if elem.text is not None:
                        #    print(f"in recordInfo {elem.text}")
                        if elem.tag.count("}recordOrigin") == 1:
                            current_recordInfo["recordOrigin"]=elem.text
                        elif elem.tag.count("}recordContentSource") == 1:
                            current_recordInfo["recordContentSource"]=elem.text
                        elif elem.tag.count("}recordCreationDate") == 1:
                            current_recordInfo["recordCreationDate"]=elem.text
                        elif elem.tag.count("}recordChangeDate") == 1:
                            current_recordInfo["recordChangeDate"]=elem.text
                        elif elem.tag.count("}recordIdentifier") == 1:
                            current_recordInfo["recordIdentifier"]=elem.text
                        else:
                            print("unhandled case in recordInfo")
                        diva_entry['recordInfo']=current_recordInfo
                elif mod_element.tag.count("}location") == 1:
                    # <location><url displayLabel="fulltext" note="free" access="raw object">http://kth.diva-portal.org/smash/get/diva2:821850/FULLTEXT01.pdf</url></location>
                    current_location=diva_entry.get('location', dict())
                    for elem in mod_element:
                        if elem.tag.count("}url") == 1:
                            label=elem.attrib.get('displayLabel', None)
                            if label:
                                if elem.text:
                                    if elem.text == diva_entry['title'].get('eng', None) or elem.text == diva_entry['title'].get('swe', None):
                                        cl=current_location.get('url', list())
                                        cl.append(elem.text)
                                        current_location['url']=cl
                                    elif label.find(',') >= 0 or label.startswith('Betydelsen av skuggning'):
                                        cl=current_location.get('url', list())
                                        cl.append(elem.text)
                                        current_location['url']=cl
                                    elif label == 'Fulltext' or label == 'fulltext' or label == 'Kandidatexjobb i Medieteknik (DM129X) år 2010':
                                        cl=current_location.get('url', list())
                                        cl.append(elem.text)
                                        current_location['url']=cl
                                    else:
                                        cl=current_location.get('other_links', list())
                                        cl.append(label+' url: '+elem.text)
                                        current_location['other_links']=cl
                                else:
                                    cl=current_location.get(label, list())
                                    cl.append("")
                                    current_location[label]=cl
                            else:
                                if elem.text:
                                    cl=current_location.get('url', list())
                                    cl.append(elem.text)
                                    current_location['url']=cl
                                else:
                                    cl=current_location.get('url', list())
                                    cl.append("")
                                    current_location['url']=cl
                        else:
                            print("Unhandled case in }location")
                        diva_entry['location']=current_location
                elif mod_element.tag.count("}typeOfResource") == 1:
                    # <typeOfResource>text</typeOfResource>
                    diva_entry['typeOfResource']=mod_element.text
                elif mod_element.tag.count("}relatedItem") == 1:            
                    #<relatedItem type="series">
                    #  <titleInfo><title>Trita-ICT-COS</title></titleInfo>
                    #  <identifier type="issn">1653-6347</identifier>
                    #  <identifier type="local">247</identifier>
                    #  <identifier type="issue number">COS/CCS 2007-24</identifier>
                    #</relatedItem>
                    relatedItemType=mod_element.attrib.get('type', None)
                    if not relatedItemType:
                        relatedItemType='relatedItem'
                    titleInfo=dict()
                    for elem in mod_element:
                        if relatedItemType == 'host':
                            titleInfo["host"]='host'
                        if elem is not None:
                            if elem.tag.count("}titleInfo") == 1:
                                for title in elem:
                                    if title is not None and title.text:
                                        titleInfo["title"]=title.text
                            elif elem.tag.count("}identifier") >= 1:
                                for subelem in elem:
                                    identifier_type=subelem.attrib.get('type', None)
                                    titleInfo[f"{identifier_type}"]=subelem.text
                            elif elem.tag.count("}note") == 1:
                                note_type=elem.attrib.get('type', None)
                                if note_type is None:
                                    note_type='note'
                                if elem.text is not None:
                                    titleInfo[note_type]=elem.text
                            elif elem.tag.count("}part") == 1:
                                for subelem in elem:
                                    if subelem.tag == 'detail':
                                        detail_type=subelem.attrib.get('type', None)
                                        if detail_type == 'volume':
                                            for subsubelem in subelem:
                                                if subsubelem.tag == 'number':
                                                    titleInfo[f"volume"]=subsubelem.text
                                        if detail_type == 'issue':
                                            for subsubelem in subelem:
                                                if subsubelem.tag == 'number':
                                                    titleInfo[f"issue"]=subsubelem.text
                            elif elem.tag.count("}genre") == 1:
                                if elem.text is not None:
                                    titleInfo['genre']=elem.text
                                else:
                                    elemattr=elem.attrib.get('type', None)
                                    if elemattr:
                                        print(f"genre case in relatedItem for {elem=} with type {elemattr}")
                                    else:
                                        print(f"genre case in relatedItem for {elem=}")
                            else:
                                print(f"Unhanded case in relatedItem for {elem=}")
                    diva_entry[relatedItemType]=titleInfo
                elif mod_element.tag.count("}physicalDescription") == 1:
                    #<physicalDescription>
                    #  <form authority="marcform">electronic</form>
                    #  <extent>xii,74</extent></physicalDescription>
                    for elem in mod_element:
                        if elem.tag.count("}extent") == 1:
                            diva_entry['Pages']=elem.text
                elif mod_element.tag.count("}note") == 1:
                    #<note type="level" lang="swe">Självständigt arbete på avancerad nivå (masterexamen)</note>
                    #<note type="universityCredits" lang="swe">20 poäng / 30 hp</note>
                    #<note type="cooperation">Saab AB</note>
                    notetype=mod_element.attrib.get('type', None)
                    if not notetype:
                        continue
                    elif notetype in ['level', 'universityCredits', 'cooperation', 'venue',  'funder', 'papers',
                                      'sustainableDevelopment', 'publicationStatus', 'creatorCount',
                                      'version identification'
]:
                        diva_entry[notetype]=mod_element.text

                    elif notetype in ['degree', 'thesis', 'patent', 'project']:
                        diva_entry[f"{notetype}_note"]=mod_element.text
                    elif notetype == 'publicationChannel':
                        diva_entry["degree_publicationChannel"]=mod_element.text
                    else:
                        print(f"unhandled case of note for {notetype}")
                else:
                    print("Unhandled case in mod mod_element={}".format(mod_element))
        try:
            df2 = pd.json_normalize(diva_entry)
        except BaseException as err:
            print(f"Unexpected {err=}, {type(err)=} with {diva_entry=}")
            continue

        try:
            df=pd.concat([df, df2], axis=0, join='outer', ignore_index = True, verify_integrity=False)
        except BaseException as err:
            print(f"Unexpected {err=}, {type(err)=} with {df2=}")
            continue

    return df


# Get data from the fordiva.json file

In [87]:
# Read the author metadata from fordiva.json and pick out the
# 'Local User Id' of 'Author1' as the KTH id used for the DiVA lookup.
fordiva_filename=directory_prefix+'fordiva.json'
user_info=dict()
try:
    with open(fordiva_filename, 'r', encoding='utf-8') as json_FH:
        user_info=json.load(json_FH)
except FileNotFoundError:
    print(f"File not found: {fordiva_filename}")

# Use .get() so that a missing file (user_info is still empty) or a file
# without an 'Author1' / 'Local User Id' entry gives None plus a clear message
# instead of a KeyError right after the "File not found" message.
kthid=user_info.get('Author1', {}).get('Local User Id')
if kthid is None:
    print(f"No 'Local User Id' for 'Author1' available from {fordiva_filename}")
print(f"{kthid}")

u1XXXXXX


# For testing purposes we will use my KTHID (this overrides the value read from fordiva.json)

In [88]:
# Fixed KTH id used for testing; this replaces the kthid value that was
# read from fordiva.json in the previous cell.
kthid='u1d13i2c'

# Get information from the citedtags.bib file

In [89]:
# Read the BibTeX file with the cited publications into a string.
bibtext_filename=directory_prefix+'citedtags.bib'
bibtex_string=''
try:
    with open(bibtext_filename, 'r', encoding='utf-8') as bibtex_FH:
        bibtex_string=bibtex_FH.read()
except FileNotFoundError:
    print(f"File not found: {bibtext_filename}")

if Verbose_Flag:
    # Report which file was read and how much text it contained
    print("read bibtex file: {} ({} characters)".format(bibtext_filename, len(bibtex_string)))

# Create a parser object; ignore_nonstandard_types=False keeps entries whose
# type is not one of the standard BibTeX types (the matching code later looks
# for entries of type 'software').
parser = bibtexparser.bparser.BibTexParser(ignore_nonstandard_types=False)

# Use this custom parser to load the data
bib_database = bibtexparser.loads(bibtex_string, parser=parser)

# Show the parsed entries (a list of dicts, one per BibTeX entry)
pprint(bib_database.entries)

[{'ENTRYTYPE': 'article',
  'ID': 'ioannidis_coherent_1991_pub',
  'abstract': 'This memo describes the Coherent File Distribution Protocol '
              '(CFDP). This is an Experimental Protocol for the Internet '
              'community. It does not specify an Internet standard.',
  'author': 'Ioannidis, J. and Maguire, G.',
  'doi': '10.17487/RFC1235',
  'issn': '2070-1721',
  'journal': 'Internet Request for Comments',
  'month': 'June',
  'title': 'Coherent {File} {Distribution} {Protocol}',
  'url': 'http://www.rfc-editor.org/rfc/rfc1235.txt',
  'volume': 'RFC 1235 (Experimental)',
  'year': '1991'},
 {'ENTRYTYPE': 'article',
  'ID': 'maguire_jr_new_2014_pub',
  'abstract': 'As the most advantageous total hip arthroplasty (THA) operation '
              'is the first, timely replacement of only the liner is socially '
              'and economically important because the utilization of THA is '
              'increasing as younger and more active patients are receiving '
     

# Fetch a user's data from DiVA

In [90]:
def get_mods_kthid(kthid, filename):
    """Download a person's publications from KTH's DiVA as MODS and open them.

    Parameters:
        kthid: value placed in the "personId" filter of the export query.
        filename: path the raw response bytes are written to (binary mode).

    Returns:
        The pymods.MODSReader created for filename.

    Raises:
        urllib.error.HTTPError: re-raised after printing the status code and
        the response body.
    """
    query_head = 'https://kth.diva-portal.org/smash/export.jsf?format=mods&addFilename=true&aq=[[{"personId":"'
    query_tail = '"}]]&aqe=[]&aq2=[[]]&onlyFullText=false&noOfRows=5000&sortOrder=title_sort_asc&sortOrder2=title_sort_asc'
    url = f"{query_head}{kthid}{query_tail}"
    print(f"url={url}")

    request = urllib.request.Request(url)
    try:
        with urllib.request.urlopen(request) as response:
            payload = response.read()
    except urllib.error.HTTPError as http_err:
        # Show what the server said before propagating the failure
        print(http_err.code)
        print(http_err.read())
        raise

    # MODSReader is given a file name, so store the download on disk first
    with open(filename, "wb") as out_fh:
        out_fh.write(payload)
    return pymods.MODSReader(filename)


# Store the downloaded MODS export under the configured output directory
# (directory_prefix) rather than a hard-coded path, consistent with how the
# fordiva.json and citedtags.bib file names are built.
filename=f"{directory_prefix}{kthid}-diva-mods"
mods_records=get_mods_kthid(kthid, filename)

# Convert the MODS records to a DataFrame and show its (rows, columns) shape
user_df=mods_records_to_dataframe(mods_records)
user_df.shape

url=https://kth.diva-portal.org/smash/export.jsf?format=mods&addFilename=true&aq=[[{"personId":"u1d13i2c"}]]&aqe=[]&aq2=[[]]&onlyFullText=false&noOfRows=5000&sortOrder=title_sort_asc&sortOrder2=title_sort_asc


(336, 61)

# Functions to do normalization and matching

In [91]:
# --- 1. More Robust Normalization Function ---
def normalize_identifier(identifier):
    """Return a canonical lower-case form of an identifier such as a DOI.

    Surrounding whitespace is removed first, then an optional leading
    "http://" / "https://", an optional "doi.org/" or "dx.doi.org/" host and
    an optional "doi:" label are stripped (case-insensitively).

    Parameters:
        identifier: the raw identifier; anything that is not a str
            (e.g. NaN or None) is treated as missing.

    Returns:
        The normalized identifier, or None when the input is missing or
        nothing is left after normalization.
    """
    if not isinstance(identifier, str):
        return None # Return None for non-string data
    # Strip before removing the prefix: the pattern is anchored at the start,
    # so leading whitespace would otherwise keep the prefix from matching.
    cleaned = identifier.strip()
    cleaned = re.sub(r'^(?:https?://)?(?:(?:dx\.)?doi\.org/)?(?:doi:)?', '', cleaned, flags=re.IGNORECASE)
    cleaned = cleaned.lower().strip()
    return cleaned or None

def normalize_title(title, subtitle=None, delimiter_pattern=r'[:—-]' ):
    """Build a comparison-friendly version of a title (plus optional subtitle).

    The title and subtitle are joined with a space, the text is split once at
    the first match of delimiter_pattern, punctuation is removed from each
    part, and the parts are lower-cased, stripped and re-joined with a space.

    Parameters:
        title: the main title; a missing value (None/NaN) is treated as empty
            rather than being turned into the text "None" or "nan".
        subtitle: optional subtitle; ignored when missing (None/NaN).
        delimiter_pattern: regular expression for the title/subtitle delimiter.

    Returns:
        The normalized title as a str (empty when there is no title text).
    """
    pieces = [str(piece) for piece in (title, subtitle) if pd.notna(piece)]
    full_title = ' '.join(pieces)

    # maxsplit is passed by keyword; passing it positionally is deprecated
    parts = re.split(delimiter_pattern, full_title, maxsplit=1)
    cleaned_parts = [re.sub(r'[^\w\s]', '', part).lower().strip() for part in parts]

    return ' '.join(cleaned_parts)

# --- 2. Pre-process the DataFrame ---
# Identifier fields that find_diva_ids() tries when matching a BibTeX entry
# against the corresponding norm_<field> column of the DataFrame.
# Note: preprocess_dataframe() uses its own local list to decide which
# norm_<field> columns to create.
identifier_cols = ['doi'] # Add 'url', 'isbn', etc. as needed
def preprocess_dataframe(df_to_process):
    """
    Return a copy of the DataFrame extended with normalized search columns.

    For each of 'doi', 'url', 'isbn' and 'pmid' that exists as a column, a
    'norm_<name>' column is added using normalize_identifier().  For each
    language ('eng', 'swe') with a 'title.<lang>' column, a
    'norm_title_<lang>' column is added using normalize_title(), including
    'subtitle.<lang>' when that column exists.  The input is not modified.
    """
    # Work on an explicit copy so the caller's frame is untouched and no
    # SettingWithCopyWarning is triggered by the assignments below.
    result = df_to_process.copy()

    # Normalized identifier columns
    for id_col in ('doi', 'url', 'isbn', 'pmid'):
        if id_col not in result.columns:
            continue
        result[f'norm_{id_col}'] = result[id_col].apply(normalize_identifier)

    # Normalized title columns, one per language
    language_columns = {
        'eng': ('title.eng', 'subtitle.eng'),
        'swe': ('title.swe', 'subtitle.swe')
    }
    for lang, (title_col, sub_col) in language_columns.items():
        if title_col not in result.columns:
            continue
        target = f'norm_title_{lang}'
        if sub_col in result.columns:
            result[target] = result.apply(lambda row: normalize_title(row[title_col], row[sub_col]), axis=1)
        else:
            result[target] = result[title_col].apply(normalize_title)

    return result

    
# --- 3. Main Matching Logic with Debugging ---
def _match_by_identifier(entry, dataframe):
    """Method 1: return (diva_id, method) for the first identifier that matches, else None."""
    for id_type in identifier_cols:
        norm_col = f'norm_{id_type}'
        if id_type not in entry or norm_col not in dataframe.columns:
            continue
        norm_bib_id = normalize_identifier(entry[id_type])

        # --- DEBUGGING PRINTS ---
        print(f"  Attempting to match on '{id_type}'...")
        print(f"    - Raw .bib value:      '{entry.get(id_type)}'")
        print(f"    - Normalized .bib value: '{norm_bib_id}'")

        if not norm_bib_id:
            print("    -> SKIPPED: Identifier is empty after normalization.")
            continue
        id_match = dataframe[dataframe[norm_col] == norm_bib_id]
        if not id_match.empty:
            diva_id = id_match.iloc[0]['recordInfo.recordIdentifier']
            print(f"    -> SUCCESS: Found DiVA ID {diva_id}")
            return (diva_id, f"Identifier ({id_type})")
    return None


def _match_by_fuzzy_title(entry, dataframe, threshold):
    """Method 2: return (diva_id, method) for the best title match scoring above threshold, else None."""
    if 'title' not in entry:
        return None
    norm_bib_title = normalize_title(entry['title'])
    best_score = 0
    best_match_id = None
    for lang in ('eng', 'swe'):
        norm_col = f'norm_title_{lang}'
        if norm_col not in dataframe.columns:
            continue
        scores = dataframe[norm_col].apply(lambda df_title: fuzz.ratio(norm_bib_title, df_title))
        top_score = scores.max()
        # For an empty frame top_score is NaN and this comparison is False
        if top_score > best_score:
            best_score = top_score
            best_match_id = dataframe.loc[scores.idxmax()]['recordInfo.recordIdentifier']
    if best_score > threshold:
        return (best_match_id, f"Fuzzy Title (Score: {best_score})")
    return None


def _match_by_software_link(entry, dataframe):
    """Method 3: for 'software' entries, find a record whose 'Software url: ' link contains the entry's DOI."""
    if entry.get('ENTRYTYPE') != 'software':
        return None
    # Example 'url': 'https://doi.org/10.5281/zenodo.4435970'
    doi_prefix = 'https://doi.org/'
    bib_url = entry.get('url')
    if not isinstance(bib_url, str) or not bib_url.startswith(doi_prefix):
        return None  # no DOI-style URL to look for
    bib_pseudo_doi = bib_url[len(doi_prefix):]
    if not bib_pseudo_doi:
        return None
    needed_cols = ('recordInfo.recordIdentifier', 'location.other_links')
    if any(col not in dataframe.columns for col in needed_cols):
        return None

    # Example other_links value:
    # ['Software url: https://dl.acm.org/do/10.5281/zenodo.4435970/abs/',
    #  'fulltext:postprint url: https://kth.diva-portal.org/smash/get/diva2:1527198/FULLTEXT01.pdf']
    software_url_prefix = 'Software url: '
    for diva_id, other_links in zip(dataframe['recordInfo.recordIdentifier'],
                                    dataframe['location.other_links']):
        if not isinstance(other_links, list):
            continue  # records without other links hold a non-list (e.g. NaN)
        for link in other_links:
            if not isinstance(link, str) or not link.startswith(software_url_prefix):
                continue
            software_url = link[len(software_url_prefix):]
            if bib_pseudo_doi in software_url:
                # Returning ends the scan over both the links and the records
                return (diva_id, f"Software link (url: {software_url})")
    return None


def find_diva_ids(bib_database, dataframe, fuzzy_threshold=90):
    """Look up a DiVA id for every entry of a parsed BibTeX database.

    Three methods are tried in order for each entry: (1) exact match on a
    normalized identifier listed in identifier_cols, (2) fuzzy match of the
    normalized title against the norm_title_<lang> columns, and (3) for
    entries of type 'software', a match of the DOI taken from the entry's
    'url' against the records' 'Software url: ' links.

    Parameters:
        bib_database: object with an 'entries' list of dicts; each entry must
            have an 'ID' key.
        dataframe: DataFrame with a 'recordInfo.recordIdentifier' column plus
            the norm_* columns added by preprocess_dataframe().
        fuzzy_threshold: a title match is accepted only when its score is
            strictly greater than this value.

    Returns:
        dict mapping each BibTeX key to {'DiVA_ID': ..., 'match_method': ...};
        unmatched entries get 'Not found in database' and 'None'.
    """
    results = {}
    print("--- Starting Match Process ---\n")

    for entry in bib_database.entries:
        bib_key = entry['ID']
        print(f"Processing BibTeX key: {bib_key}")

        # Methods 1 and 2: identifiers first, then fall back to fuzzy titles
        found_match = (_match_by_identifier(entry, dataframe)
                       or _match_by_fuzzy_title(entry, dataframe, fuzzy_threshold))
        if found_match:
            results[bib_key] = {'DiVA_ID': found_match[0], 'match_method': found_match[1]}
            continue

        # Method 3: brute force check for a matching software link
        found_match = _match_by_software_link(entry, dataframe)
        if found_match:
            results[bib_key] = {'DiVA_ID': found_match[0], 'match_method': found_match[1]}
        else:
            print("  -> FAILED: No match found for this entry.")
            results[bib_key] = {'DiVA_ID': 'Not found in database', 'match_method': 'None'}
        print("-" * 20)

    return results

# --- 4. Run and print final results ---

# Add the normalized search columns to a copy of the user's publications
df1 = preprocess_dataframe(user_df.copy())

# Match every BibTeX entry against the preprocessed publications
found_entries = find_diva_ids(bib_database, df1)

print("\n--- Final Matching Results ---")
pprint(found_entries)

--- Starting Match Process ---

Processing BibTeX key: ioannidis_coherent_1991_pub
  Attempting to match on 'doi'...
    - Raw .bib value:      '10.17487/RFC1235'
    - Normalized .bib value: '10.17487/rfc1235'
    -> SUCCESS: Found DiVA ID diva2:461420
Processing BibTeX key: maguire_jr_new_2014_pub
  Attempting to match on 'doi'...
    - Raw .bib value:      '10.1155/2014/528407'
    - Normalized .bib value: '10.1155/2014/528407'
    -> SUCCESS: Found DiVA ID diva2:690828
Processing BibTeX key: farshin_make_2019_pub
  Attempting to match on 'doi'...
    - Raw .bib value:      '10.1145/3302424.3303977'
    - Normalized .bib value: '10.1145/3302424.3303977'
    -> SUCCESS: Found DiVA ID diva2:1291291
Processing BibTeX key: kim_small-mass_2016_pub
  Attempting to match on 'doi'...
    - Raw .bib value:      '10.1155/2016/1847620'
    - Normalized .bib value: '10.1155/2016/1847620'
    -> SUCCESS: Found DiVA ID diva2:948742
Processing BibTeX key: verardo2023fmmheadenhancingautoencoderbase

In [92]:
def included_pubs_DiVA_ids(bib_database, found_entries):
    """Print a tab-separated table of the DiVA id found for each BibTeX entry.

    Parameters:
        bib_database: object with an 'entries' list of dicts, each with an 'ID' key.
        found_entries: dict mapping each BibTeX key to a dict; its 'DiVA_ID'
            value is printed (None when that key is absent).

    One row is printed per entry, numbered from 1 in the order of the entries.
    """
    print("Publication:\tDiVA Id\tBibliographic key")
    for position, bib_entry in enumerate(bib_database.entries, start=1):
        key = bib_entry['ID']
        diva_id = found_entries[key].get('DiVA_ID', None)
        print(f"{position}:\t{diva_id}\t{key} ")

# Print the DiVA id found for each BibTeX entry, in the order of bib_database.entries
included_pubs_DiVA_ids(bib_database, found_entries)

Publication:	DiVA Id	Bibliographic key
1:	diva2:461420	ioannidis_coherent_1991_pub 
2:	diva2:690828	maguire_jr_new_2014_pub 
3:	diva2:1291291	farshin_make_2019_pub 
4:	diva2:948742	kim_small-mass_2016_pub 
5:	diva2:1944107	verardo2023fmmheadenhancingautoencoderbasedecg_pub 
6:	diva2:456465	US7107453B2_pub 
7:	diva2:1926019	US12111768B2_pub 
8:	diva2:1527198	10.5281/zenodo.4435970_pub 
