In [None]:
import csv
import json
import os
import re

from collections import Counter

from rdflib import Graph, plugin, URIRef, Literal
from rdflib.serializer import Serializer
from rdflib.namespace import XSD, RDF, RDFS

In [None]:
# Directory holding the exported CSV files. It defaults to the original
# location; set the SKG_DATA_DIR environment variable to run the notebook
# against a different directory without editing this cell.
# os.path.join(..., '') guarantees the trailing separator that the
# `base_path + <file name>` concatenations in the loader cells rely on.
base_path = os.path.join(os.environ.get('SKG_DATA_DIR', '/mnt/storage1/docker/files/'), '')

# File names of the individual exports, all located in `base_path`.
article_file = 'PMCArticleData.csv'
scimago_file = 'ScimagoData.csv'
citation_file = 'CitationData.csv'
pmc_mapping_file = 'pmc_pm_map.csv'

PMCArticleData.csv, ScimagoData.csv and CitationData.csv were created by downloading the corresponding data (A01_Articles, B14_Scimago, C04_ReferenceList) from http://er.tacc.utexas.edu/datasets/ped, importing them into a local MySQL database and exporting them as CSV with:

SELECT field_of_interest_1, field_of_interest_2
INTO OUTFILE 'base_path/filename'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
FROM table;

The article data was filtered to contain only articles from our PMC OA corpus, based on the IDs in our corpus and the PMCID-to-PMID mapping available from https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/

Get information on articles

In [None]:
# Load the exported article table into `articles`, one list of 7 fields per row.
articles = []
# newline='' is what the csv module expects for file objects: it lets the
# reader handle line endings itself, so line breaks embedded in quoted fields
# are not rewritten by Python's universal-newline translation.
with open(base_path + article_file, 'r', newline='') as a_in:
    read = csv.reader(a_in, delimiter=',', quotechar='"', escapechar='\\')
    for row in read:
        if len(row) != 7:
            # Unexpected column count: show the offending row and stop reading.
            # Rows collected so far are kept, so `articles` may be incomplete.
            print(row)
            break
        articles.append(row)

Get information on journals

In [None]:
# Build three lookup tables from the Scimago export:
#   scimago_issn_index  : ISSN (row[4] or row[5], stripped)  -> source id
#   scimago_title_index : normalised journal title            -> source id
#   scimago_index       : source id -> journal record with one entry per year
# When several rows share an ISSN or a normalised title, the last row wins.
scimago_title_index = {}
scimago_issn_index = {}
scimago_index = {}
count = 0
# newline='' is what the csv module expects for file objects: it lets the
# reader handle line endings itself, so line breaks embedded in quoted fields
# are not rewritten by Python's universal-newline translation.
with open(base_path + scimago_file, 'r', newline='') as a_in:
    read = csv.reader(a_in, delimiter=',', quotechar='"', escapechar='\\')
    for row in read:
        if len(row) != 13:
            # Unexpected column count: show the offending row and stop reading.
            # Everything indexed so far is kept, so the tables may be incomplete.
            print(row)
            break

        source_id = row[1].strip()
        raw_title = row[2]
        eissn = row[4].strip()
        pissn = row[5].strip()
        # Normalisation used for title lookups: lower case, '&' spelled out,
        # surrounding whitespace removed.
        title = raw_title.lower().replace('&', 'and').strip()
        year = row[-1]

        for issn in (eissn, pissn):
            if issn:
                scimago_issn_index[issn] = source_id
        if title:
            scimago_title_index[title] = source_id

        # The first row seen for a source id fixes its ISSNs and display title;
        # later rows for the same id (even with a different title) only add
        # further years.
        journal = scimago_index.setdefault(source_id, {
            'eissn': eissn,
            'pissn': pissn,
            'title': raw_title,
            'years': {}
        })

        year_domains = {}
        journal['years'][year] = {
            'Rank': row[6],
            'Quartile': row[7],
            'Hindex': row[8],
            'Country': row[9],
            'Publisher': row[10],
            'Domains': year_domains
        }
        # row[11] is a '; '-separated list of entries, each optionally
        # carrying a '(Q<n>)' suffix.
        for domain in row[11].split('; '):
            if re.search(r'\d{4}', domain):
                # An entry containing four consecutive digits flags the whole
                # year as 'WEIRD'; such years are ignored during aggregation.
                year_domains['WEIRD'] = 'None'
            elif domain.strip():
                if '(Q' in domain:
                    name, quartile = domain.rstrip(')').rsplit('(', maxsplit=1)
                    year_domains[name.rstrip()] = quartile.strip()
                else:
                    year_domains[domain.rstrip()] = 'None'
            

In [None]:
# Collapse the per-year Scimago records into journal-level attributes:
# the most frequent publisher and H-index over all usable years, plus the
# list of every domain seen in those years.
# The per-year dicts are consumed destructively (pop), leaving only the
# remaining keys (e.g. 'Rank' and 'Quartile') behind; running this cell a
# second time without reloading `scimago_index` therefore raises KeyError.
for journal_no, (source_key, journal) in enumerate(scimago_index.items()):
    category_votes = Counter()
    publisher_votes = Counter()
    hindex_votes = Counter()
    for year_info in journal['years'].values():
        year_publisher = year_info.pop('Publisher')
        year_domains = year_info.pop('Domains')
        year_hindex = year_info.pop('Hindex')
        year_info.pop('Country')

        # Years flagged 'WEIRD' by the loader do not contribute any votes.
        if 'WEIRD' in year_domains:
            continue
        publisher_votes[year_publisher] += 1
        hindex_votes[year_hindex] += 1
        category_votes.update(year_domains.keys())

    # The journal-level keys are only set when at least one year was usable.
    if publisher_votes:
        journal['Publisher'] = publisher_votes.most_common(1)[0][0]
    if hindex_votes:
        journal['Hindex'] = hindex_votes.most_common(1)[0][0]
    if category_votes:
        journal['categories'] = list(category_votes)
    journal['id'] = journal_no

In [None]:
# Match every article to a Scimago source id: first by ISSN, then by
# normalised journal title. Results:
#   matches    : [article index, article[0], source id] per matched article
#   source_ids : set of all source ids that were matched at least once
#   years      : number of unmatched articles per article[3] value
title_matches = 0
issn_matches = 0
years = {}
source_ids = set()
matches = []
for idx, article in enumerate(articles):
    issn = article[-2].replace('-', '').strip()
    if not issn:
        # NOTE(review): articles without an ISSN are skipped entirely; they
        # get no title fallback and are not counted in `years`. Confirm this
        # is intended.
        continue

    match = ''
    if issn in scimago_issn_index:
        issn_matches += 1
        match = scimago_issn_index[issn]
    else:
        # Apply the same normalisation as the keys of scimago_title_index
        # (lower case, '&' -> 'and', surrounding whitespace stripped).
        # Without the strip, titles with leading or trailing whitespace could
        # never match.
        j_title = article[4].lower().replace('&', 'and').strip()
        if j_title in scimago_title_index:
            title_matches += 1
            match = scimago_title_index[j_title]
        else:
            years[article[3]] = years.get(article[3], 0) + 1

    if match:
        matches.append([idx, article[0], match])
        source_ids.add(match)
print(issn_matches)
print(title_matches)

In [None]:
# Give every matched Scimago source id a compact integer journal id.
# `source_ids` is a set of strings, whose iteration order can change between
# kernel sessions (string hash randomisation). Enumerating it in sorted order
# keeps the journal ids, and the URIs built from them, stable across runs.
source_id_mapping = {
    s_id: journal_id for journal_id, s_id in enumerate(sorted(source_ids))
}

In [None]:
# Count, over all matched journals, how many journals carry each publisher
# and each domain, then give every publisher and domain an integer id:
#   domains[name]    = {'num': <journal count>, 'id': <int>}
#   publishers[name] = {'num': <journal count>, 'id': <int>}
# The journals are visited in sorted order because `source_ids` is a set whose
# iteration order can change between kernel sessions; ids are assigned in
# first-seen order, so sorting keeps them stable across runs.
# (The previous `years = {}` was removed: it was never used here and only
# discarded the unmatched-article tally computed by the matching cell.)
publisher_counts = Counter()
domain_counts = Counter()
for s_id in sorted(source_ids):
    entry = scimago_index[s_id]
    if 'Publisher' in entry:
        publisher_counts[entry['Publisher']] += 1
    domain_counts.update(entry.get('categories', ()))

domains = {
    name: {'num': num, 'id': idx}
    for idx, (name, num) in enumerate(domain_counts.items())
}
publishers = {
    name: {'num': num, 'id': idx}
    for idx, (name, num) in enumerate(publisher_counts.items())
}

In [None]:
# Persist the domain overview (journal count and id per domain) for inspection.
# `json` is imported in the notebook's import cell.
with open('domain_overview.json', 'w') as j_out:
    json.dump(domains, j_out, indent=4)

Writing the Graph

In [None]:
# Two separate graphs are filled by the following cells and serialized to
# separate files:
#   scimago_g : domain concepts, H-index, subjects and per-year journal information
#   common_g  : article-to-journal links, publishers and basic journal data
scimago_g = Graph()
common_g = Graph()

In [None]:
# Link every matched article to its journal node via dct:isPartOf.
is_part_of = URIRef("dct:isPartOf")
for _, article_id, s_id in matches:
    journal_ref = URIRef('skg:journal/' + str(source_id_mapping[s_id]))
    common_g.add((URIRef('skg:article/' + article_id), is_part_of, journal_ref))

In [None]:
# Emit one skos:Concept node per domain, carrying the domain label as its name.
concept_type = URIRef("skos:Concept")
name_predicate = URIRef('schema:name')
for domain_label, domain_info in domains.items():
    domain_ref = URIRef('skg:domain/' + str(domain_info['id']))
    scimago_g.add((domain_ref, RDF.type, concept_type))
    scimago_g.add((domain_ref, name_predicate, Literal(domain_label, datatype=XSD.string)))

In [None]:
# Emit one schema:Organization node per publisher, carrying its name.
organization_type = URIRef("schema:Organization")
name_predicate = URIRef('schema:name')
for publisher_label, publisher_info in publishers.items():
    publisher_ref = URIRef('skg:publisher/' + str(publisher_info['id']))
    common_g.add((publisher_ref, RDF.type, organization_type))
    common_g.add((publisher_ref, name_predicate, Literal(publisher_label, datatype=XSD.string)))

In [None]:
# Emit the journal nodes. Basic bibliographic data goes to common_g; the
# Scimago-derived data (H-index, subjects, per-year rank and quartile) goes
# to scimago_g.
add_info_count = 0
# `source_ids` is a set whose iteration order can change between kernel
# sessions. Visiting it in sorted order keeps the running journalInformation
# ids reproducible across runs.
for s_id in sorted(source_ids):
    entry = scimago_index[s_id]
    journal = URIRef('skg:journal/' + str(source_id_mapping[s_id]))

    # Common entries
    common_g.add((journal, RDF.type, URIRef('bibo:Journal')))
    common_g.add((journal, URIRef('schema:sameAs'), URIRef('skg:scimago/' + s_id)))
    if entry['eissn'].strip():
        common_g.add((journal, URIRef('bibo:eissn'), Literal(entry['eissn'])))
    if entry['pissn'].strip():
        common_g.add((journal, URIRef('bibo:issn'), Literal(entry['pissn'])))
    common_g.add((journal, URIRef('schema:name'), Literal(entry['title'])))
    if 'Publisher' in entry:
        publisher_ref = URIRef("skg:publisher/" + str(publishers[entry['Publisher']]['id']))
        common_g.add((journal, URIRef("dct:publisher"), publisher_ref))

    # Scimago entries
    if 'Hindex' in entry:
        scimago_g.add((journal, URIRef("skgv:H-Index"), Literal(int(entry['Hindex']), datatype=XSD.nonNegativeInteger)))

    for domain in entry.get('categories', ()):
        domain_ref = URIRef('skg:domain/' + str(domains[domain]['id']))
        scimago_g.add((journal, URIRef('dct:subject'), domain_ref))

    for year, v_year in entry['years'].items():
        rank = v_year['Rank']
        # A rank counts as present only if it is non-empty and non-zero;
        # a quartile counts as present only if it contains no '-'.
        has_rank = bool(rank) and float(rank) != 0
        has_quartile = '-' not in v_year['Quartile']
        if not has_rank and not has_quartile:
            # Nothing to report for this year: emit no information node.
            continue

        info = URIRef('skg:journalInformation/' + str(add_info_count))
        add_info_count += 1
        scimago_g.add((journal, URIRef("skgv:hasJournalInformation"), info))
        scimago_g.add((info, RDF.type, URIRef("skgv:JournalInformation")))
        # NOTE(review): `year` is typed xsd:date here; if the values are bare
        # years, xsd:gYear would be the matching datatype. Left unchanged
        # because consumers of the output may rely on it; please confirm.
        scimago_g.add((info, URIRef("dct:date"), Literal(year, datatype=XSD.date)))
        if has_rank:
            scimago_g.add((info, URIRef("skgv:rank"), Literal(float(rank), datatype=XSD.float)))
        if has_quartile:
            # The quartile is stored as the integer after the last 'Q'.
            scimago_g.add((info, URIRef("skgv:bestQuartile"), Literal(int(v_year['Quartile'].split('Q')[-1]), datatype=XSD.nonNegativeInteger)))
        

In [None]:
# Prefix-to-namespace table handed to the JSON-LD serializer as its context.
# It is reused when the citation graph is serialized.
context = dict(
    bibo="http://purl.org/ontology/bibo/",
    skos="http://www.w3.org/2004/02/skos/core#",
    dct="http://purl.org/dc/terms/",
    schema="http://schema.org/",
    rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    skg="http://data.gesis.org/softwarekg/PMC/",
    skgv="http://data.gesis.org/softwarekg/vocab/",
)

# Write both graphs as JSON-LD files into the current working directory.
common_g.serialize(destination="common_g_out.jsonld", format="json-ld", context=context)
scimago_g.serialize(destination="scimago_g_out.jsonld", format="json-ld", context=context)

# Citations

In [None]:
# Load the exported citation table into `citations`, one list of 3 fields per row.
citations = []
# newline='' is what the csv module expects for file objects: it lets the
# reader handle line endings itself, so line breaks embedded in quoted fields
# are not rewritten by Python's universal-newline translation.
with open(base_path + citation_file, 'r', newline='') as a_in:
    read = csv.reader(a_in, delimiter=',', quotechar='"', escapechar='\\')
    for row in read:
        if len(row) != 3:
            # Unexpected column count: show the offending row and stop reading.
            # Rows collected so far are kept, so `citations` may be incomplete.
            print(row)
            break
        citations.append(row)

In [None]:
# Read the two-column id mapping file and build:
#   pmc_pm_mapping : first-column id -> {'pm_id', 'cited_by_all', 'cited_by_pmc'}
#   pm_pmc_map     : second-column id -> first-column id (only for non-zero ids)
# Presumably the first column is the PMC id and the second the PubMed id, as
# the variable and file names suggest; verify against the mapping file.
pmc_pm_mapping = {}
pm_pmc_map = {}
# newline='' is what the csv module expects for file objects: it lets the
# reader handle line endings itself, so line breaks embedded in quoted fields
# are not rewritten by Python's universal-newline translation.
with open(base_path + pmc_mapping_file, 'r', newline='') as a_in:
    read = csv.reader(a_in, delimiter=',', quotechar='"', escapechar='\\')
    for row in read:
        if len(row) != 2:
            # Unexpected column count: show the offending row and stop reading.
            print(row)
            break
        pmc_id, pm_id = row
        pmc_pm_mapping[pmc_id] = {
            'pm_id': pm_id,
            'cited_by_all': [],
            'cited_by_pmc': []
        }
        # An id of 0 is excluded from the reverse lookup.
        if int(pm_id) != 0:
            pm_pmc_map[pm_id] = pmc_id

In [None]:
# Attach the citing ids to every cited article.
# The lists are reset first so the cell is idempotent: they are created in the
# mapping cell, and without the reset a second run of this cell would append
# every citation again and silently double the citation counts.
for record in pmc_pm_mapping.values():
    record['cited_by_all'] = []
    record['cited_by_pmc'] = []

for citation in citations:
    # citation[-1] identifies the cited article (a key of pmc_pm_mapping),
    # citation[0] the citing one (looked up in pm_pmc_map).
    cited = pmc_pm_mapping[citation[-1]]
    citing = citation[0]
    cited['cited_by_all'].append(citing)
    # Citing ids that have an entry in pm_pmc_map are also recorded under
    # their mapped id.
    if citing in pm_pmc_map:
        cited['cited_by_pmc'].append(pm_pmc_map[citing])


In [None]:
# Persist the per-article citation lists for inspection.
# `json` is imported in the notebook's import cell.
with open('pmc_citation_count_mapping.json', 'w') as j_out:
    json.dump(pmc_pm_mapping, j_out, indent=4)

In [None]:
# Separate graph for the citation counts and the article-to-article citation links.
citation_g = Graph()

In [None]:
# For every article emit its total citation count, plus one schema:citation
# edge from each citing article recorded in 'cited_by_pmc'.
count_predicate = URIRef("skgv:citationCount")
cites_predicate = URIRef("schema:citation")
for cited_id, record in pmc_pm_mapping.items():
    cited_ref = URIRef('skg:article/' + cited_id)
    total = Literal(len(record['cited_by_all']), datatype=XSD.nonNegativeInteger)
    citation_g.add((cited_ref, count_predicate, total))
    for citing_id in record['cited_by_pmc']:
        citation_g.add((URIRef('skg:article/' + citing_id), cites_predicate, cited_ref))


In [None]:
# Write the citation graph as JSON-LD, reusing the prefix `context` defined
# when the other two graphs were serialized.
citation_g.serialize(format="json-ld", context=context, destination="citation_g_out.jsonld")