In [None]:
!pip install rdflib
from rdflib import Graph, URIRef, RDF, Namespace, Literal
from rdflib.namespace import XSD, RDFS

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdflib
  Downloading rdflib-6.1.1-py3-none-any.whl (482 kB)
[K     |████████████████████████████████| 482 kB 4.5 MB/s 
[?25hCollecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 301 kB/s 
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-6.1.1


In [None]:
from rdflib import Graph, URIRef, RDF, Namespace, Literal, BNode
from rdflib.namespace import XSD, RDFS
import pandas as pd
import json
from pprint import pprint
from json import load


# Single in-memory RDF graph; the cells below add all their triples to it
my_graph = Graph()

In [None]:
# Namespaces of the vocabularies used to describe the data
FABIO = Namespace("http://purl.org/spar/fabio/")
PRISM = Namespace("http://prismstandard.org/namespaces/basic/2.0/")
RES = Namespace("https://allorapy.github.io/res/")
CITO = Namespace("http://purl.org/spar/cito/")
DATACITE = Namespace("http://purl.org/spar/datacite/")
DCTERMS = Namespace("http://purl.org/dc/terms/")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")
SCHEMA = Namespace("https://schema.org/")
# Fixed: the host was misspelled as "widata.org". Wikidata entity URIs
# live under http://www.wikidata.org/entity/.
# NOTE(review): the Wikidata classes declared further down use the
# "https://www.wikidata.org/wiki/Q..." page URLs, so the "wd" prefix will not
# abbreviate them unless they are switched to WD["Q..."] -- confirm which
# form the rest of the project expects before changing them.
WD = Namespace("http://www.wikidata.org/entity/")

# Prefixes used when the graph is serialised; the empty prefix is the
# namespace of the resources created by this notebook
my_graph.bind('fabio', FABIO)
my_graph.bind('prism', PRISM)
my_graph.bind('wd', WD)
my_graph.bind('', RES)

# Classes of resources, built from the namespaces declared above.
# Item access on a Namespace (NS["name"]) returns the URIRef NS + "name".
JournalArticle = FABIO["JournalArticle"]
BookChapter = FABIO["BookChapter"]
ProceedingsPaper = FABIO["ProceedingsPaper"]
Journal = FABIO["Journal"]
Book = SCHEMA["Book"]
Proceedings = FABIO["ConferenceProceedings"]
Organization = SCHEMA["Organization"]
Person = FOAF["Person"]

# Classes identified through Wikidata page URLs (kept exactly as they were)
Publication = URIRef("https://www.wikidata.org/wiki/Q732577")
Venue = URIRef("https://www.wikidata.org/wiki/Q2085381")
IdentifiableEntity = URIRef("https://www.wikidata.org/wiki/Q35120")

# Attributes related to classes, built from the namespaces declared above.
# Bracket access is used on purpose: Namespace is a str subclass, so
# attribute access such as DCTERMS.title would hit str.title instead.
# NOTE(review): "volume" and "event" point at fabio:JournalVolume and
# schema:Event, which look like classes rather than properties -- confirm.
# NOTE(review): "id" shadows the Python builtin of the same name.
issue = PRISM["issueIdentifier"]
volume = FABIO["JournalVolume"]
chapterNumber = FABIO["hasSequenceIdentifier"]
event = SCHEMA["Event"]
publicationYear = FABIO["hasPublicationYear"]
title = DCTERMS["title"]
givenName = FOAF["givenName"]
familyName = FOAF["familyName"]
id = SCHEMA["identifier"]
# sub-property of http://purl.org/dc/terms/identifier --> see https://sparontologies.github.io/fabio/current/fabio.html#d4e1044
doi = PRISM["doi"]

# Relations among classes, built from the namespaces declared above
cites = SCHEMA["citation"]
publicationVenue = SCHEMA["isPartOf"]
author = DCTERMS["creator"]
publisher = DCTERMS["publisher"]

# Base URL used to define the URLs of all the resources
# created from the data
base_url = "https://allorapy.github.io/res/"


In [None]:
# Import the authors from the JSON file and add one foaf:Person per row

with open("/content/drive/MyDrive/Colab Notebooks/graph_other_data.json", "r", encoding="utf-8") as f:
    json_doc = load(f)
print(type(json_doc))

# Keep only the "authors" section, which maps each DOI to a list of authors
authors = json_doc["authors"]

# One row per (doi, author) pair: explode() repeats the DOI for each author
# of the same publication
df_authors = pd.DataFrame(authors.items(), columns=['doi', 'author']).explode('author')

# Round-tripping through JSON records flattens the nested author dicts into
# "author.*" columns and yields a fresh 0..n-1 index, which is used below
# to number the person resources
df_person = pd.json_normalize(json.loads(df_authors.to_json(orient="records"))).rename(
    columns={"author.family": "family_name", "author.given": "given_name", "author.orcid": "orc_id"}
)

# The DOIs are used as they appear in the file, "doi:" prefix included.

# Maps every DOI to the list of person resources (URIRef) that authored it.
# A publication can have several authors, hence one key with many values;
# the publications cell turns each value into one author triple.
# NOTE(review): a person who authored several publications gets one resource
# per (doi, author) row; deduplicating on "orc_id" would avoid that.
person_internal_id = {}

for idx, row in df_person.iterrows():
    local_id = "person-" + str(idx)

    # The shape of the new resources that are people is
    # 'https://allorapy.github.io/res/person-<integer>'
    subj = URIRef(base_url + local_id)

    # Fixed: the previous membership test compared the person id against a
    # dict keyed by DOI, so it was always true and every DOI was reset to an
    # empty list. setdefault() creates the list once and then appends.
    person_internal_id.setdefault(row["doi"], []).append(subj)

    my_graph.add((subj, RDF.type, Person))

    my_graph.add((subj, FOAF["familyName"], Literal(row["family_name"])))
    my_graph.add((subj, FOAF["givenName"], Literal(row["given_name"])))

print(person_internal_id)

df_person


#print(my_graph.serialize(format="turtle"))


<class 'dict'>
{'doi:10.1016/j.websem.2021.100655': [], 'doi:10.1007/s10115-017-1100-y': [], 'doi:10.1016/j.websem.2014.03.003': [], 'doi:10.1093/nar/gkz997': [], 'doi:10.3390/publications7030050': [], 'doi:10.1017/s0269888920000065': [], 'doi:10.3390/info11030129': [], 'doi:10.1007/s00778-018-0528-3': [], 'doi:10.21105/joss.02731': [], 'doi:10.1016/j.websem.2014.06.002': [], 'doi:10.1007/s10115-019-01401-x': [], 'doi:10.1007/978-3-030-30793-6_22': [], 'doi:10.1007/978-3-030-33220-4_25': [], 'doi:10.1080/17538947.2020.1738568': [], 'doi:10.1007/s10462-020-09826-5': [], 'doi:10.1007/978-3-030-49461-2_25': [], 'doi:10.1007/s10462-020-09866-x': [], 'doi:10.1007/s10796-020-10035-2': [], 'doi:10.1007/978-3-030-60276-5_25': [], 'doi:10.1007/978-3-030-54956-5_2': [], 'doi:10.1007/s11280-020-00836-5': [], 'doi:10.3390/a13090217': [], 'doi:10.1007/978-3-030-62466-8_1': [], 'doi:10.1016/j.eswa.2020.113205': [], 'doi:10.1007/s00521-020-05491-5': [], 'doi:10.1002/bse.2855': [], 'doi:10.3390/info12

Unnamed: 0,doi,family_name,given_name,orc_id
0,doi:10.1016/j.websem.2021.100655,Espinoza-Arias,Paola,0000-0002-3938-2064
1,doi:10.1016/j.websem.2021.100655,Garijo,Daniel,0000-0003-0454-7145
2,doi:10.1016/j.websem.2021.100655,Corcho,Oscar,0000-0002-9260-0753
3,doi:10.1007/s10115-017-1100-y,Diefenbach,Dennis,0000-0002-0046-2219
4,doi:10.1016/j.websem.2014.03.003,Groth,Paul,0000-0003-0183-6910
...,...,...,...,...
1427,doi:10.1145/3309547,Wang,Xiang,0000-0002-6148-6329
1428,doi:10.1007/978-3-030-58285-2_27,Martin,Leon,0000-0002-6747-5524
1429,doi:10.1007/978-3-030-58285-2_27,Boockmann,Jan H.,0000-0001-6816-8393
1430,doi:10.1007/978-3-030-58285-2_27,Henrich,Andreas,0000-0002-5074-3254


In [None]:
# Import the publishers from the JSON file

with open("/content/drive/MyDrive/Colab Notebooks/graph_other_data.json", "r", encoding="utf-8") as f:
    json_doc = load(f)


# Keep only the "publishers" section.
# Fixed: this used to be stored in a variable called `publisher`, which
# overwrote the dcterms:publisher predicate defined with the other relations
# and made it unusable in the following cells.
publishers_json = json_doc["publishers"]

# One row per publisher; the JSON round trip flattens the nested dict into
# "name.id" / "name.name" columns, renamed to "id" / "name"
df_publishers = pd.DataFrame(publishers_json.items(), columns=['crossref', 'name'])

df_publishers = pd.json_normalize(json.loads(df_publishers.to_json(orient="records"))).rename(
    columns={"name.id": "id", "name.name": "name"}
)

# Maps every Crossref id to the URL of the publisher resource, shaped as
# 'https://allorapy.github.io/res/publisher-<integer>'
# TODO: no triple describing the publishers is added to the graph yet
publisher_internal_id = {}
for idx, row in df_publishers.iterrows():
    local_id = "publisher-" + str(idx)

    subj = URIRef(base_url + local_id)

    publisher_internal_id[row["crossref"]] = subj


df_publishers
#publisher_internal_id

Unnamed: 0,crossref,id,name
0,crossref:735,crossref:735,Thomas Telford Ltd.
1,crossref:2780,crossref:2780,The Korean Society of Medical Informatics
2,crossref:2373,crossref:2373,Editorial CSIC
3,crossref:2560,crossref:2560,F1000 Research Ltd
4,crossref:311,crossref:311,Wiley
5,crossref:320,crossref:320,Association for Computing Machinery (ACM)
6,crossref:98,crossref:98,Hindawi Limited
7,crossref:1968,crossref:1968,MDPI AG
8,crossref:22695,crossref:22695,Polish Librarians' Association
9,crossref:2258,crossref:2258,Pensoft Publishers


## Struggling with the connections between publications and authors, and between publications and publishers

In [None]:
from pandas import read_csv, Series

# keep_default_na=False keeps empty cells as empty strings instead of NaN
publications = read_csv("/content/drive/MyDrive/Colab Notebooks/graph_publications.csv", 
                  keep_default_na=False,
                  dtype={
                      "id": "string",
                      "title": "string",
                      "type": "string",
                      "publication_year": "int",
                      "issue": "string",
                      "volume": "string",
                      "chapter": "string",
                      "publication_venue": "string",
                      "venue_type": "string",
                      "publisher": "string",
                      "event": "string"
                  })

# The DOIs in the "id" column are used as they appear in the file.

# Defining subclasses. These triples do not depend on any row, so they are
# asserted once here instead of being re-added at every iteration.
my_graph.add((JournalArticle, RDFS.subClassOf, Publication))
my_graph.add((BookChapter, RDFS.subClassOf, Publication))
my_graph.add((ProceedingsPaper, RDFS.subClassOf, Publication))

publication_internal_id = {}
for idx, row in publications.iterrows():
    local_id = "publication-" + str(idx)

    # The shape of the new resources that are publications is
    # 'https://allorapy.github.io/res/publication-<integer>'
    subj = URIRef(base_url + local_id)

    # We keep the new publication resources here, keyed by DOI, so that
    # they can be looked up later
    publication_internal_id[row["id"]] = subj

    if row["type"] == "journal-article":
        # RDF.type is the URIRef already provided by rdflib of the property 
        # 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
        my_graph.add((subj, RDF.type, JournalArticle))

        # These two statements apply only to journal articles
        my_graph.add((subj, issue, Literal(row["issue"])))
        my_graph.add((subj, volume, Literal(row["volume"])))
    elif row["type"] == "book-chapter":
        my_graph.add((subj, RDF.type, BookChapter))

        # This statement applies only to book chapters
        my_graph.add((subj, chapterNumber, Literal(row["chapter"])))
    else:
        my_graph.add((subj, RDF.type, ProceedingsPaper))

        # This statement applies only to proceedings papers
        my_graph.add((subj, event, Literal(row["event"])))

    my_graph.add((subj, DCTERMS['title'], Literal(row["title"])))
    my_graph.add((subj, PRISM['doi'], Literal(row["id"])))

    # Defining relations
    # NOTE(review): the venue is linked through its title (a literal), not
    # through the 'venue-<integer>' resource created below.
    my_graph.add((subj, publicationVenue, Literal(row["publication_venue"])))

    # Fixed: the dictionary built when processing the authors holds a list
    # per DOI, and a Python list is not a valid triple object (rdflib raised
    # an AssertionError). One triple is added per author; publications with
    # no known author are simply left without an author triple.
    for person in person_internal_id.get(row["id"], []):
        my_graph.add((subj, author, person))

    # Link the publication to its publisher when the value in the
    # "publisher" column is a key of the publishers dictionary.
    # DCTERMS["publisher"] is spelled out so that the link does not depend
    # on the module-level name `publisher`.
    publisher_uri = publisher_internal_id.get(row["publisher"])
    if publisher_uri is not None:
        my_graph.add((subj, DCTERMS["publisher"], publisher_uri))

    # The original value here has been cast to string since the Date type
    # in schema.org ('https://schema.org/Date') is actually a string-like value
    my_graph.add((subj, publicationYear, Literal(str(row["publication_year"]))))

# Distinct venue titles, so that no venue is created more than once

venues_series = publications["publication_venue"] 

venues_set = set(venues_series)

# DataFrame to iterate on in order to create the "venues" subjects.
# Fixed: the rows used to come from iterating the set, whose order is not
# stable between runs, so the same venue could get a different
# 'venue-<integer>' every time. drop_duplicates() keeps the order of first
# appearance in the CSV. It also replaces Series.iteritems(), which was
# removed in pandas 2.0.

df_venues = venues_series.drop_duplicates().reset_index(drop=True).to_frame(name="title")


for idx, row in df_venues.iterrows():
    venue_local_id = "venue-" + str(idx)

    subj = URIRef(base_url + venue_local_id)

    # I assign the RDF.type and the title and add them to the graph
    my_graph.add((subj, RDF.type, Venue))
    my_graph.add((subj, DCTERMS['title'], Literal(row["title"])))

    # TODO: link the venue to its publisher. df_venues has no "publisher"
    # column, so this needs a venue -> publisher mapping taken from
    # `publications` first.

publications

AssertionError: ignored

In [None]:
# Report the size of the graph, then dump it in Turtle syntax
triple_count = len(my_graph)
print("-- Number of triples added to the graph after processing the publications")
print(triple_count)

turtle_dump = my_graph.serialize(format="turtle")
print(turtle_dump)

# To list the namespaces used in the graph:
# for prefix, ns in my_graph.namespaces():
#   print(ns)

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
:person-957 a foaf:Person ;
    foaf:familyName "Anzalone" ;
    foaf:givenName "Alfred J" .

:person-958 a foaf:Person ;
    foaf:familyName "Callahan" ;
    foaf:givenName "Tiffany J" .

:person-959 a foaf:Person ;
    foaf:familyName "Bramante" ;
    foaf:givenName "Carolyn T" .

:person-96 a foaf:Person ;
    foaf:familyName "Vargas-Lombardo" ;
    foaf:givenName "Miguel" .

:person-960 a foaf:Person ;
    foaf:familyName "Greene" ;
    foaf:givenName "Casey S" .

:person-961 a foaf:Person ;
    foaf:familyName "Chu" ;
    foaf:givenName "Haitao" .

:person-962 a foaf:Person ;
    foaf:familyName "Koraishy" ;
    foaf:givenName "Farrukh M" .

:person-963 a foaf:Person ;
    foaf:familyName "Liang" ;
    foaf:givenName "Chen" .

:person-964 a foaf:Person ;
    foaf:familyName "Liu" ;
    foaf:givenName "Feifan" .

:person-965 a foaf:Person ;
    foaf:familyName "Mcnair" ;
    foaf:givenName "Douglas S" .

:person-966 a 