In [None]:
import csv
import datetime
import gzip
import os
import urllib
import urllib.parse

import pandas as pd
import tqdm
from lxml import etree
from rdflib import Graph, plugin, URIRef, Literal
from rdflib.namespace import Namespace, XSD, RDF, RDFS
from rdflib.serializer import Serializer

In [None]:
# Directory with the annotation files; it is listed for "PMC*.ann.unique" files further below.
tag_path = "/mnt/storage1/nlp/PMC_OA_PREPRO_LINKS"
# Directory with the article XML (.nxml) files the metadata is extracted from.
pmc_path = "/mnt/storage1/nlp/PMC_OA_XML/PMC_OA_XML_LINKS/"

# final graph URL
base_url = "http://data.gesis.org/softwarekg/PMC/"

# Prefix -> namespace mapping. It is handed to the JSON-LD serializer as `context`
# when the generated graphs are written out.
context = {
    "schema": Namespace("http://schema.org/"),
    "sms": Namespace("http://data.gesis.org/somesci/"),
    "nif" : Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#"),
    "wd" : Namespace("http://www.wikidata.org/entity/"),
    "xsd": Namespace("http://www.w3.org/2001/XMLSchema#"),
    "rdfs": Namespace("http://www.w3.org/2000/01/rdf-schema#"),
    # NOTE(review): unlike the other entries this maps a prefix onto a single
    # property IRI (rdfs:comment) rather than a namespace - confirm this is intended.
    "comment": Namespace("http://www.w3.org/2000/01/rdf-schema#comment"),
    "datacite" : Namespace("http://purl.org/spar/datacite/"),
    "rdf" : Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
    "doi" : Namespace("https://doi.org/"),
    # Same IRI as base_url; the "skg:" prefix is used for all article/journal/publisher/software nodes.
    "skg" : Namespace("http://data.gesis.org/softwarekg/PMC/"),
    "skgv" : Namespace("http://data.gesis.org/softwarekg/vocab/"),
    "bibo": Namespace("http://purl.org/ontology/bibo/"),
    "skos": Namespace("http://www.w3.org/2004/02/skos/core#"),
    "dct": Namespace("http://purl.org/dc/terms/"),
    "dbpedia-owl": Namespace("http://dbpedia.org/ontology/"),
    "irao": Namespace("http://ontology.ethereal.cz/")
}



# Load existing information

The graph `common_g_out.jsonld` was generated from the publicly available information from the PubmedKG.

In [None]:
%%time
# Load the previously generated graph from the JSON-LD file in the working directory.
common_g = Graph()
common_g.parse("common_g_out.jsonld", format='json-ld')

# Select every paper that is part of a journal which has a name.
# e-ISSN, ISSN and publisher (together with its name) are OPTIONAL, so these
# columns may be unbound for a result row.
# The column order (paper, journal, journal_name, eissn, issn, publisher,
# publisher_name) matters: the following cell accesses the rows by position.
papers_with_journal = common_g.query("""
SELECT
    ?paper 
    ?journal
    ?journal_name
    ?eissn
    ?issn
    ?publisher
    ?publisher_name
where{
    ?paper <http://purl.org/dc/terms/isPartOf> ?journal.
    ?journal <http://schema.org/name> ?journal_name.
    OPTIONAL { ?journal <http://purl.org/ontology/bibo/eissn> ?eissn }
    OPTIONAL { ?journal <http://purl.org/ontology/bibo/issn> ?issn }
    OPTIONAL { ?journal <http://purl.org/dc/terms/publisher> ?publisher.
                ?publisher <http://schema.org/name> ?publisher_name}
}""")

# Number of result rows (one per paper/journal/identifier combination returned by the query).
print("Papers with Journal: {}".format(len(papers_with_journal)))

In [None]:
%%time
# Lookup tables built from the papers/journals/publishers already present in common_g.
common_papers = {}       # PMC id (without the "PMC" prefix) -> journal URI (str)
journal_by_issn = {}     # ISSN without hyphens -> journal record
journal_by_eissn = {}    # e-ISSN without hyphens -> journal record
journal_by_title = {}    # normalised journal title -> journal record
common_journals = []     # journal ids (last URI segment) in first-seen order
common_publishers = {}   # publisher name -> publisher URI (str)

# Set companion of common_journals: membership tests on the list would make the
# loop quadratic in the number of journals.
_seen_journal_ids = set()


for row in tqdm.tqdm(papers_with_journal):
    # Column order follows the SELECT clause of the query that produced papers_with_journal.
    paper_uri, journal_uri, journal_name, eissn, issn, publisher_uri, publisher_name = row

    # just store all papers we already know, as we can ignore those information later on
    paper_pmc_id = paper_uri.toPython().split('/')[-1][3:]
    common_papers[paper_pmc_id] = journal_uri.toPython()

    # in order to re-use the journals we should save the journal information
    journal_id = journal_uri.toPython().split('/')[-1]
    if journal_id not in _seen_journal_ids:
        _seen_journal_ids.add(journal_id)
        common_journals.append(journal_id)

        journal = {
            "id" : journal_uri,
            "name" : journal_name,
            "eissn" : eissn,
            "issn" : issn
        }
        # ISSN / e-ISSN are OPTIONAL in the query. Only index bound values:
        # str(None) would register every journal without identifier under the
        # bogus key 'None'. Hyphens are removed because the lookups performed
        # later use hyphen-free identifiers.
        if issn is not None:
            issn_key = str(issn).replace('-', '').strip()
            if issn_key:
                journal_by_issn[issn_key] = journal
        if eissn is not None:
            eissn_key = str(eissn).replace('-', '').strip()
            if eissn_key:
                journal_by_eissn[eissn_key] = journal
        journal_by_title[journal_name.lower().replace('&', 'and').strip()] = journal

    if publisher_name:
        known_name = publisher_name.toPython()
        if known_name not in common_publishers:
            common_publishers[known_name] = publisher_uri.toPython()


print("Number of Journals {}".format(len(common_journals)))
# Next free running number for journals that are not part of common_g yet.
journal_count = len(common_journals)

print("Number of Papers {}".format(len(common_papers)))

print("Number of publishers {}".format(len(common_publishers)))

The file `PMC-ids.csv.gz` contains a mapping from *PMCID* to the journal, including some journal information such as title, ISSN, and eISSN. The file was downloaded from https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/

In [None]:
%%time

# PMC id (without the "PMC" prefix) -> journal information taken from the mapping table.
pmc_info = {}

with gzip.open("/mnt/storage1/nlp/PMC-ids.csv.gz", 'rt') as a_in:
    reader = csv.reader(a_in, delimiter=',', quotechar='"', escapechar='\\')
    for row_idx, row in enumerate(tqdm.tqdm(reader)):
        if row_idx == 0:
            # the first row is the header: show it once and skip it
            print(row)
            continue
        pmc_id = row[8]
        pmc_info[pmc_id[3:]] = {
            'id' : pmc_id,
            'title' : row[0],
            # ISSNs are stored without hyphens and surrounding whitespace
            'issn' : row[1].replace('-', '').strip(),
            'eissn' : row[2].replace('-', '').strip(),
            'doi' : row[7]
        }

print("Read {} rows from journal mapping.".format(len(pmc_info)))

# Helper Functions

Some URLs contain characters that are not allowed in a URI; check them before use.

In [None]:
#copied from RDFlib
#https://rdflib.readthedocs.io/en/stable/_modules/rdflib/term.html
_invalid_uri_chars = '<>" {}|\\^`'

def _is_valid_uri(uri):
    """Return True if `uri` contains none of the characters in `_invalid_uri_chars`."""
    return not any(bad_char in uri for bad_char in _invalid_uri_chars)

In [None]:
def entities_from_file(fn):
    """Read a tab-separated annotation file and return its entities.

    Lines starting with "T" describe an entity
    (``<id>\\t<type> <start> <end>\\t<mention>[\\t<disambiguation id>]``);
    lines containing "SoftwareCoreference" are skipped. Lines starting with
    "R" describe a relation (``<id>\\t<rel> Arg1:<e1> Arg2:<e2>``) which is
    stored on the entity ``e2`` as ``entity[<rel up to the first '_'>] = e1``
    if ``e2`` has been read before. Anything else is reported as invalid.

    Returns a dict mapping the entity id to a dict with the keys ``type``
    (list of the '_'-separated type parts), ``mention``, optionally ``id``,
    and one key per relation pointing at the entity.
    """
    found = {}
    with open(fn,'r') as handle:
        for raw in handle:
            columns = raw.strip().split('\t')

            if raw.startswith("T"):
                if "SoftwareCoreference" in raw:
                    #print("Skip Coreference in {}".format(fn))
                    continue
                span_info = columns[1].split(' ')
                record = {
                    'type': span_info[0].split('_'),
                    'mention': columns[2],
                }
                # the annotated span must be as long as the mention text
                expected_len = int(span_info[2]) - int(span_info[1])
                actual_len = len(record['mention'])
                if actual_len != expected_len:
                    print("Invalid entity length {} ({})".format(expected_len,actual_len))
                if len(columns) == 4:
                    # found disambiguation identifier
                    record['id'] = columns[3]
                found[columns[0]] = record
                continue

            if not raw.startswith("R"):
                print("Invalid line in {}:{}".format(fn,raw))
                continue

            parts = columns[1].split(' ')
            if len(parts) != 3:
                print("Invalid relation {}".format(raw))
                continue

            relation = parts[0].split('_')[0]
            role_a, source = parts[1].split(':')
            role_b, target = parts[2].split(':')
            if not (role_a == 'Arg1' and role_b == 'Arg2'):
                print("Invalid relation arguments in {}".format(raw))
            if target in found:
                found[target][relation] = source
    return found

In [None]:
def get_meta_from_article(fn):
    """Extract bibliographic metadata from a PMC article XML (``.nxml``) file.

    Parameters
    ----------
    fn : str
        Path of the article XML file.

    Returns
    -------
    dict or None
        ``None`` (after printing a warning) if ``fn`` does not exist.
        Otherwise a dict that always contains the lists ``keywords``,
        ``affiliations`` (dicts with ``id`` and ``name``) and ``authors``
        (dicts with optional ``firstname``, ``lastname``, ``email``, ``orcid``,
        ``corresponding`` and a list ``affiliation-id``) and, when present in
        the document, ``journal``, ``title``, ``pmc-id``, ``publisher``,
        ``doi``, ``pm-id``, ``issn``, ``eissn``, ``type`` and ``date``
        (ISO formatted ``YYYY-MM-DD`` string).
    """
    # values taken from the first matching text node
    general_info_paths = {
        'journal' : "//journal-meta//journal-title//text()",
        'pmc-id' : "//article-meta/article-id[@pub-id-type='pmc']//text()",
        'publisher' : "//publisher/publisher-name//text()",
    }

    # values taken from the text of the first matching element (URL-unquoted)
    id_info_paths = {
        'doi' : "//article-meta/article-id[@pub-id-type='doi']",
        'pm-id' : "//article-meta/article-id[@pub-id-type='pmid']",
        'issn' : "//journal-meta/issn[@pub-type='ppub']",
        'eissn' : "//journal-meta/issn[@pub-type='epub']",
    }

    def add_node_text(target, key, node, path):
        """Store the text of the first element matching `path`, if it has any."""
        res_node = node.xpath(path)
        # an element without text would store None, which breaks later string handling
        if len(res_node) >= 1 and res_node[0].text is not None:
            target[key] = res_node[0].text
        return target

    def add_attrib(target, key, node, path):
        """Store the (non-empty) XPath result for `path` as-is."""
        res_node = node.xpath(path)
        if len(res_node) > 0:
            target[key] = res_node
        return target

    def pub_date_to_iso(date_node):
        """Convert a <pub-date> element to an ISO date string; None if unusable.

        Missing month/day default to 1. An invalid day, and after that an
        invalid month, is reset to 1 instead of discarding the whole date.
        """
        yn = date_node.xpath("year")
        if len(yn) == 0:
            return None
        mn = date_node.xpath("month")
        dn = date_node.xpath("day")
        year = yn[0].text
        #TODO: current date XXXX-XX-XX -> invalid date is set to 1
        month = mn[0].text if len(mn) > 0 else 1
        day = dn[0].text if len(dn) > 0 else 1

        warned = False
        for m, d in ((month, day), (month, 1), (1, 1)):
            try:
                # int() raises TypeError for empty elements (text is None) and
                # ValueError for non-numeric text; date() raises ValueError for
                # out-of-range values
                return datetime.date(int(year), int(m), int(d)).isoformat()
            except (TypeError, ValueError):
                if not warned:
                    print("Warning: Invalid date: {}-{}-{} in document {}. (Resetting day to 1)".format(year,month,day, fn))
                    warned = True
        return None

    if not os.path.exists(fn):
        print("Warning: Expected file {} missing.".format(fn))
        return None

    # Hand the path to lxml so that the parser reads the bytes itself and honours
    # the encoding declared in the XML instead of the locale's default text encoding.
    tree = etree.parse(fn)

    meta = {}

    # get general information
    for k,p in general_info_paths.items():
        tree_res = tree.xpath(p)
        if len(tree_res) > 0:
            meta[k] = tree_res[0]

    # The title may contain inline markup (e.g. <italic>); taking only the first
    # text node would cut it off there, so use the full string value of the
    # first article-title element.
    title = tree.xpath("string((//title-group/article-title)[1])")
    if title:
        meta['title'] = title

    for k,p in id_info_paths.items():
        tree_res = tree.xpath(p)

        if len(tree_res) > 0 and tree_res[0].text:
            meta[k] = urllib.parse.unquote(tree_res[0].text)
            if k == 'doi' and not _is_valid_uri(meta[k]):
                print("Warning: Found potentially problematic doi {} in {}".format(meta[k], fn))

    # string() also covers keywords whose text is wrapped in markup (where .text
    # would be None or truncated); empty keywords are dropped
    meta['keywords'] = []
    for kwd in tree.xpath("//article-meta/kwd-group/kwd"):
        kwd_text = kwd.xpath("string()")
        if kwd_text:
            meta['keywords'].append(kwd_text)

    # the article type is optional for callers, so do not fail if it is missing
    article_types = tree.xpath("//article/@article-type")
    if len(article_types) > 0:
        meta['type'] = article_types[0]

    #get date: use the first publication date type that yields a usable date
    for att in ['epub', 'ppub','epub-ppub','pub']:
        date_node = tree.xpath("//pub-date[@pub-type='{}']".format(att))
        if len(date_node) == 0:
            date_node = tree.xpath("//pub-date[@date-type='{}']".format(att))
            if len(date_node) == 0:
                continue
        iso_date = pub_date_to_iso(date_node[0])
        if iso_date is None:
            continue
        meta['date'] = iso_date
        break
    if 'date' not in meta:
        print("Warning: No publication date for document {}.".format(fn))

    # get affiliation information
    meta['affiliations'] = []
    for idx, aff_tree in enumerate(tree.xpath("//article-meta/aff")):
        aff = {}
        add_attrib(aff, 'id',aff_tree, "@id")
        if 'id' not in aff:
            # no id attribute: fall back to the position in the document
            aff['id'] = [idx]
        #TODO: there is still some problem with the label/sub
        add_attrib(aff,'name',aff_tree,"string()")
        if 'name' not in aff:
            # in some weird cases the affiliations are empty
            aff['name'] = ''
        aff['name'] = aff['name'].strip()

        # remove a leading label / superscript marker from the affiliation name
        for t in ['label','sup']:
            label_node = aff_tree.xpath(t)
            label_text = label_node[0].text if len(label_node) > 0 else ''
            if label_text and aff['name'].startswith(label_text):
                aff['name'] = aff['name'][len(label_text):]
        meta['affiliations'].append(aff)

    #get author information
    meta['authors'] = []
    for author_tree in tree.xpath("//contrib-group/contrib[@contrib-type='author']"):
        author = {}
        add_node_text(author, 'firstname', author_tree, "name/given-names")
        add_node_text(author, 'lastname', author_tree,"name/surname")
        add_node_text(author, 'email', author_tree, "email")
        # TODO: Add corresponding author
        add_node_text(author, 'orcid', author_tree,"contrib-id[@contrib-id-type='orcid']")
        if 'orcid' in author:
            author['orcid'] = author['orcid'].strip()
            if not _is_valid_uri(author['orcid']):
                # the ORCID is used as a URI later on, drop it if it is not a valid one
                print("Warning: found potentially invalid orcid {} in {}".format(author['orcid'], fn))
                author.pop('orcid')
        add_attrib(author, 'affiliation-id', author_tree, "xref[@ref-type='aff']/@rid")
        add_attrib(author, 'corresponding', author_tree, "@corresp")

        meta['authors'].append(author)

    # fix affiliation assignment
    # Sometimes authors do not have an affiliation assigned.
    # We assign them the affiliations which are not assigned to any other author.
    assigned_ids = {
        aff_id
        for author in meta['authors'] if 'affiliation-id' in author
        for aff_id in author['affiliation-id']
    }
    # compare the individual ids: aff['id'] is a list and would never be equal
    # to one of the id strings referenced by the authors
    unassigned_ids = [
        aff_id
        for aff in meta['affiliations']
        for aff_id in aff['id']
        if aff_id not in assigned_ids
    ]

    for author in meta['authors']:
        if 'affiliation-id' not in author:
            # author has no affiliation yet, set list of un-assigned affiliations
            author['affiliation-id'] = list(unassigned_ids)

    return meta

In [None]:
# Counters updated by add_paperinfo_to_graph (how journals were matched / where
# the journal information of a paper came from).
issn_match = 0
eissn_match = 0
title_match = 0
in_pmc = 0
not_in_pmc=0

# Annotation type -> term describing the mention type.
_MENTION_TYPE_TERMS = {
    'Creation':'skgv:MentionType_Creation',
    'Usage' : 'skgv:MentionType_Usage',
    'Deposition' : 'skgv:MentionType_Deposition',
    'Mention' : 'skgv:MentionType_Allusion'
}
# Annotation types describing the kind of software.
_SOFTWARE_TYPES = ('ProgrammingEnvironment','Application','OperatingSystem','PlugIn')
# Annotation types describing additional information attached to a software mention.
_INFORMATION_TYPES = ('Version', 'Developer', 'URL', 'Citation', 'Extension', 'Release', 'Abbreviation',
                      'Fullname', 'License','AlternativeName')


def _journal_title_key(title):
    """Normalise a journal title for the title lookup table."""
    return title.lower().replace('&', 'and').strip()


def add_paperinfo_to_graph(g, f, infos):
    """Add article metadata and software mentions of one annotated paper to `g`.

    Parameters
    ----------
    g : rdflib.Graph
        Graph the triples are added to.
    f : str
        Name of the annotation file inside `tag_path`; the article XML is
        expected in `pmc_path` under the same name up to the first '.', with
        the extension ".nxml".
    infos : list
        A summary dict (``article``, ``title``, ``type``, ``journal``, ``date``,
        ``mentions``) is appended for every paper that was added.

    Nothing is appended to `infos` if the article XML is missing, has no PMC id,
    or an affiliation without id is encountered among several affiliations.

    Side effects: updates the module level match counters, `journal_count`,
    the journal lookup tables and `common_publishers`.

    Raises
    ------
    Exception
        Errors from the metadata extraction are re-raised after a warning;
        an unknown annotation type raises ``Exception``.
    """
    global journal_count
    global issn_match
    global eissn_match
    global title_match
    global in_pmc
    global not_in_pmc

    software_info = {}

    pmcfn = f.split(".")[0] + ".nxml"
    entities = entities_from_file(os.path.join(tag_path,f))
    try:
        meta = get_meta_from_article(os.path.join(pmc_path,pmcfn))
    except Exception:
        print("Warning: Cannot get meta data from {}".format(pmcfn))
        raise
    if not meta:
        return
    if 'pmc-id' not in meta:
        # every URI below is derived from the PMC id
        print("Warning: No PMC id found in {}, skipping document.".format(pmcfn))
        return

    software_info['article'] = meta['pmc-id']

    article_id = "skg:article/PMC{}".format(meta['pmc-id'])
    article_uri = URIRef(article_id)
    g.add((article_uri, RDF.type, URIRef("schema:ScholarlyArticle")))
    if 'title' in meta:
        g.add((article_uri, URIRef("schema:name"), Literal(meta['title'])))
        software_info['title'] = meta['title']

    if 'type' in meta:
        g.add((article_uri, URIRef("skgv:documentType"), Literal(meta['type'])))
        software_info['type'] = meta['type']

    if meta['pmc-id'] in pmc_info:
        # we do have all information from the PMC mapping table and actually trust it more than the XML extraction
        info = pmc_info[meta['pmc-id']]
        in_pmc +=1
    else:
        not_in_pmc += 1
        info = {}
        if 'doi' in meta:
            info['doi'] = meta['doi']
        if 'journal' in meta:
            info['title'] = meta['journal']
        if 'issn' in meta:
            info['issn'] = meta['issn'].replace('-','').strip()
        if 'eissn' in meta:
            info['eissn'] = meta['eissn'].replace('-','').strip()

    # Add DOI either from PMC mapping or from XML.
    # The mapping table contains empty strings for unknown values, so test for
    # a non-empty value instead of the mere presence of the key.
    if info.get('doi'):
        g.add((article_uri, URIRef("datacite:doi"), URIRef("doi:{}".format(info['doi']))))

    # We now need to add information about the journal
    # two possible cases arise here
    # 1.) journal already available from common_g -> no action necessary
    # 2.) not available -> take information from pmc mapping table

    if meta['pmc-id'] not in common_papers:
        # we do not have all information about the paper from common_g
        # we have to add the information from either pmc_info or from XML
        # Empty identifiers/titles must not be used as lookup keys: otherwise
        # all papers lacking e.g. an e-ISSN would end up in the same journal.
        eissn = info.get('eissn')
        issn = info.get('issn')
        title_key = _journal_title_key(info['title']) if info.get('title') else None

        journal = None
        if eissn and eissn in journal_by_eissn:
            eissn_match += 1
            journal = journal_by_eissn[eissn]
        elif issn and issn in journal_by_issn:
            issn_match += 1
            journal = journal_by_issn[issn]
        elif title_key and title_key in journal_by_title:
            title_match += 1
            journal = journal_by_title[title_key]

        journal_uri = None
        if journal is not None:
            # Journals known from common_g carry their complete URI as id, journals
            # created below a running number: the last path segment is the local
            # id in both cases.
            journal_uri = URIRef("skg:journal/{}".format(str(journal['id']).split('/')[-1]))
        elif eissn or issn or title_key:
            # unknown journal: register it under the next free running number
            journal = {'id': journal_count}
            journal_count += 1
            if issn:
                journal_by_issn[issn] = journal
            if eissn:
                journal_by_eissn[eissn] = journal
            journal_uri = URIRef("skg:journal/{}".format(journal['id']))
            if title_key:
                journal['title'] = info['title']
                journal_by_title[title_key] = journal
                g.add((journal_uri, URIRef("schema:name"), Literal(journal['title'])))
            g.add((journal_uri, RDF.type, URIRef("bibo:Journal")))
        else:
            print("Warning: No journal information for {}, journal is not linked.".format(pmcfn))

        if journal_uri is not None:
            g.add((article_uri, URIRef("dct:isPartOf"), journal_uri))
            software_info['journal'] = journal_uri

            # we need the publisher now
            if 'publisher' in meta:
                if meta['publisher'] in common_publishers:
                    publisher_uri = URIRef(common_publishers[meta['publisher']])
                else:
                    publisher_url = "skg:publisher/{}".format(len(common_publishers))
                    publisher_uri = URIRef(publisher_url)
                    g.add((publisher_uri, RDF.type, URIRef("schema:Organization")))
                    g.add((publisher_uri, URIRef("schema:name"), Literal(meta['publisher'])))
                    common_publishers[meta['publisher']] = publisher_url
                g.add((journal_uri, URIRef("dct:publisher"), publisher_uri))
    else:
        software_info['journal'] = common_papers[meta['pmc-id']]

    if 'keywords' in meta:
        for k in meta['keywords']:
            g.add((article_uri, URIRef("schema:keywords"), Literal(k)))
    if 'date' in meta:
        g.add((article_uri, URIRef("schema:datePublished"), Literal(meta['date'], datatype= XSD.date)))
        software_info['date'] = meta['date']

    # affiliation id used in the XML -> affiliation URI
    affs = {}
    if 'affiliations' in meta:
        for idx, a in enumerate(meta['affiliations']):
            # NOTE(review): "affilation" is misspelled, but it is part of the generated
            # URIs; changing it would change the identifiers of already published data.
            aff_id = "{}/affilation/A{}".format(article_id,idx)
            if 'id' not in a:
                if len(meta['affiliations']) == 1:
                    affs['dummy'] = aff_id
                else:
                    print("Warning: Affiliation ID for {} missing in {} but multiple affiliations given".format(a,pmcfn))
                    return
            else:
                # the id is stored as a list (XPath attribute result), use its first entry
                affs[a['id'][0]] = aff_id
            g.add((URIRef(aff_id), RDF.type, URIRef("schema:Organization")))
            g.add((URIRef(aff_id), URIRef("schema:name"), Literal(a['name'])))
    if 'authors' in meta:
        for idx, author in enumerate(meta['authors']):
            author_uri = URIRef("{}/author/A{}".format(article_id,idx))
            g.add((article_uri, URIRef("schema:author"), author_uri))
            g.add((author_uri, RDF.type, URIRef("schema:Person")))
            if 'firstname' in author:
                g.add((author_uri, URIRef("schema:givenName"), Literal(author['firstname'])))
            if 'lastname' in author:
                g.add((author_uri, URIRef("schema:familyName"), Literal(author['lastname'])))
            if 'orcid' in author:
                g.add((author_uri, URIRef("dbpedia-owl:orcidId"), URIRef(author['orcid'])))
            if 'affiliation-id' in author:
                for author_aff in author['affiliation-id']:
                    if author_aff in affs:
                        g.add((author_uri, URIRef("schema:affiliation"), URIRef(affs[author_aff])))
            elif len(affs) == 1 and 'dummy' in affs: # no affiliation id given at all, assume that the only one belongs to each author
                g.add((author_uri, URIRef("schema:affiliation"), URIRef(affs['dummy'])))

    # create entities
    software_info['mentions'] = []
    for eid, entity in entities.items():
        mention_uri = URIRef("{}/mentions/{}".format(article_id,eid))
        g.add((article_uri, URIRef("schema:mentions"), mention_uri))
        g.add((mention_uri, RDF.type, URIRef("nif:String")))

        if 'id' in entity:
            # disambiguated software: link the mention to the software node
            software_uri = URIRef("skg:software/{}".format(entity['id']))
            g.add((mention_uri, URIRef("skgv:software"), software_uri))
            g.add((software_uri, RDF.type, URIRef("irao:Software")))
            software_info['mentions'].append(entity['id'])

        for key, value in entity.items():
            if key == 'mention':
                g.add((mention_uri, URIRef("nif:isString"), Literal(value)))
            elif key == 'id':
                continue
            elif key == 'type':
                for t in value:
                    if t == 'Unknown':
                        # Unknown -> we were not able to determine the type for sure
                        continue
                    if t in _MENTION_TYPE_TERMS:
                        g.add((mention_uri, URIRef("skgv:mentionType"),URIRef(_MENTION_TYPE_TERMS[t])))
                    elif t in _SOFTWARE_TYPES:
                        g.add((mention_uri, URIRef("skgv:softwareType"),URIRef("skgv:{}".format(t))))
                    elif t in _INFORMATION_TYPES:
                        g.add((mention_uri, URIRef("skgv:informationType"),URIRef("skgv:{}".format(t))))
                    else:
                        raise Exception('Unknown mention type: {}'.format(t))
            else:
                # every remaining key is a relation name pointing at another mention of this article
                g.add((mention_uri, URIRef("skgv:referredToBy{}".format(key)), URIRef("{}/mentions/{}".format(article_id, value))))
    infos.append(software_info)

In [None]:
#%%time
g = Graph()

# os.listdir returns the entries in arbitrary order. Sort them so that the
# content of every output chunk and the running numbers given to new
# journals/publishers are reproducible between runs.
list_of_files = sorted(fn for fn in os.listdir(tag_path) if fn.startswith("PMC") and fn.endswith(".ann.unique"))


In [None]:
%%time
def gen_n_papers(start, n, files):
    """Build and write the graph for the annotation files ``files[start:n]``.

    Parameters
    ----------
    start : int
        Index of the first file to process (negative values are clamped to 0).
    n : int
        Exclusive end index (clamped to ``len(files)``).
    files : list of str
        Annotation file names as expected by `add_paperinfo_to_graph`.

    Writes ``softwarekg_<start>-<end>.jsonld`` and, if at least one paper was
    processed, ``software_in_articles_<start>-<end>.csv`` to the working
    directory, where ``<end>`` is the clamped end index.
    """
    end = min(n, len(files))
    start = max(start, 0)

    g = Graph()
    data_list = []
    for f in tqdm.tqdm(files[start:end]):
        add_paperinfo_to_graph(g, f, data_list)

    g.serialize(format="json-ld", context=context, destination="softwarekg_{}-{}.jsonld".format(start,end))

    frames = [pd.DataFrame(article).drop_duplicates() for article in data_list]
    if frames:
        pd.concat(frames).to_csv('software_in_articles_{}-{}.csv'.format(start,end))
    else:
        # pd.concat raises a ValueError for an empty list
        print("Warning: No article information for files {}-{}, no CSV written.".format(start,end))



In [None]:
%%time
# number of chunks the files are (at most) split into
n = 50
# Ceiling division so that n chunks cover all files; at least 1 so that the
# loop also makes progress when there are fewer than n files.
step = max(1, -(-len(list_of_files) // n))

# gen_n_papers treats its second argument as an exclusive end index, so the
# next chunk has to start exactly there (starting at end + 1 would skip one
# file per chunk).
for i in range(0, len(list_of_files), step):
    gen_n_papers(i, i + step, list_of_files)
