In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import pandas as pd

class DBLP:
    """Container holding one pandas DataFrame per DBLP CSV table.

    Every attribute is read from ``<path>/<attribute name>.csv`` with a comma
    separator when the object is created.
    """

    def __init__(self, path):
        """Read all tables found under ``path`` (a directory given as a string)."""
        table_names = (
            # Main table
            'papers',
            # Relational side tables
            'authors',
            'volumes',
            'editions',
            'journals',
            'conferences',
            'reviewers',
            'reviews',
            'keywords',
        )
        for table in table_names:
            frame = pd.read_csv(f'{path}/{table}.csv', sep=',')
            setattr(self, table, frame)

In [22]:
# Read every CSV table from the relative 'data' directory into a DBLP container.
dblp = DBLP('data')

In [23]:
# Preview the first rows of the papers table as read from CSV.
dblp.papers.head()

Unnamed: 0,id,pages,title,author_ids,volume_id,edition_id,abstract,doi,keyword_ids
0,0,49-61,Estimating Traffic Disruption Patterns with Vo...,2685|70|3102|76|1426,13.0,,"In this paper, we present an approach to devel...",jvlywkquewps,43|16|17|2|25
1,1,28-34,A Speaker-aware Parallel Hierarchical Attentiv...,1326|1181|136|1158,4.0,,"In this paper, we present an empirical analysi...",ffthtdqohkxj,29|33|28|26|43
2,2,26-43,Robust Dialog State Tracking for Large Ontolog...,228|501|230|503|1692,93.0,,"In this paper, we present an approach to devel...",yfwkbkvjolvv,30|10|11|5|9
3,3,23-30,From compression to compressed sensing,1716|628,98.0,,"In this paper, we show that a simple system th...",lomhkwmufiws,3|17|21|2|10
4,4,10668-10688,Using Class Probabilities to Map Gradual Trans...,1155|1967|1866|1917|1859|1159,54.0,,"In this paper, we propose three proposed hypot...",geohcjlqsdai,35|44|14|5|39


In [24]:
def impute_na(df):
    """Return a new frame in which every missing value of ``df`` is replaced by -1.

    The fill is applied to the whole frame, so non-numeric columns receive the
    integer -1 as well.
    """
    sentinel = -1
    filled = df.fillna(value=sentinel)
    return filled

def convert_to_int(df):
    """Return a copy of ``df`` whose float and int columns are cast to ``int``.

    Columns of any other dtype are carried over unchanged. The input frame is
    not modified: the cast is done through ``DataFrame.astype``, which builds a
    new frame, instead of assigning the converted columns back onto ``df``.

    Float values are truncated by the cast, and values that cannot be
    represented as integers (NaN, inf) make ``astype`` raise, so missing values
    must be filled before calling this.
    """
    numeric_columns = df.select_dtypes(include=[float, int]).columns
    return df.astype({column: int for column in numeric_columns})

In [25]:
# Clean every table stored on `dblp`: fill missing values with -1 first, then cast
# the numeric columns to int, and store the result back under the same attribute.
for df_key in vars(dblp):
    cleaned = convert_to_int(impute_na(getattr(dblp, df_key)))
    setattr(dblp, df_key, cleaned)

In [26]:
# Preview the papers table again after imputation and integer casting.
dblp.papers.head()

Unnamed: 0,id,pages,title,author_ids,volume_id,edition_id,abstract,doi,keyword_ids
0,0,49-61,Estimating Traffic Disruption Patterns with Vo...,2685|70|3102|76|1426,13,-1,"In this paper, we present an approach to devel...",jvlywkquewps,43|16|17|2|25
1,1,28-34,A Speaker-aware Parallel Hierarchical Attentiv...,1326|1181|136|1158,4,-1,"In this paper, we present an empirical analysi...",ffthtdqohkxj,29|33|28|26|43
2,2,26-43,Robust Dialog State Tracking for Large Ontolog...,228|501|230|503|1692,93,-1,"In this paper, we present an approach to devel...",yfwkbkvjolvv,30|10|11|5|9
3,3,23-30,From compression to compressed sensing,1716|628,98,-1,"In this paper, we show that a simple system th...",lomhkwmufiws,3|17|21|2|10
4,4,10668-10688,Using Class Probabilities to Map Gradual Trans...,1155|1967|1866|1917|1859|1159,54,-1,"In this paper, we propose three proposed hypot...",geohcjlqsdai,35|44|14|5|39


In [27]:
# Helper functions
from functools import partial

def assign_category(observation, ref_col, available_categories, col_name='category'):
    """Pick a category for ``observation`` from the hash of one of its fields.

    The value stored under ``ref_col`` is hashed with the built-in ``hash`` and
    reduced modulo ``len(available_categories)``; the category at that position
    is written to ``observation[col_name]``. The same (mutated) observation is
    returned.

    Note: Python salts ``str`` hashes per process unless PYTHONHASHSEED is set,
    so the choice is only repeatable across runs for values such as integers.
    """
    slot = hash(observation[ref_col]) % len(available_categories)
    observation[col_name] = available_categories[slot]
    return observation

def random_string(length):
    """Return ``length`` lowercase ASCII letters, each drawn with ``random.choice``."""
    import random
    import string

    alphabet = string.ascii_lowercase
    letters = []
    for _ in range(length):
        letters.append(random.choice(alphabet))
    return ''.join(letters)

def add_extra_attribute(observation, length=10, col_name='attribute'):
    """Store a fresh random string of ``length`` letters under ``observation[col_name]``.

    The observation is modified in place and returned.
    """
    filler = random_string(length)
    observation[col_name] = filler
    return observation

# Synthetic enrichment: the pickers below are assign_category pre-bound to the id
# column they hash and to the labels they choose from.
assign_paper_subtype_conf = partial(
    assign_category,
    ref_col='edition_id',
    available_categories=['DemoPaper', 'ShortPaper', 'FullPaper', 'Poster'],
)
assign_paper_subtype_jour = partial(
    assign_category,
    ref_col='volume_id',
    available_categories=['DemoPaper', 'ShortPaper', 'FullPaper'],
)
assign_conf_subtype = partial(
    assign_category,
    ref_col='id',
    available_categories=['Workshop', 'Symposium', 'ExpertGroup'],
)
assign_conf_related = partial(
    assign_category,
    ref_col='id',
    available_categories=[0, 1, 5, 6],
    col_name='relatedTo',
)


def _assign_paper_subtype(paper_row):
    """Use the journal picker when the paper has a volume (volume_id != -1), else the conference one."""
    if paper_row.volume_id != -1:
        return assign_paper_subtype_jour(paper_row)
    return assign_paper_subtype_conf(paper_row)


# Row-wise passes that add the 'category' column (papers, conferences) and the
# 'relatedTo' column (conferences).
dblp.papers = dblp.papers.apply(_assign_paper_subtype, axis=1)
dblp.conferences = dblp.conferences.apply(assign_conf_subtype, axis=1)
dblp.conferences = dblp.conferences.apply(assign_conf_related, axis=1)

# Random 10-letter placeholder attributes: two per paper, one per conference.
dblp.papers = dblp.papers.apply(partial(add_extra_attribute, length=10, col_name='subpaper_attr'), axis=1)
dblp.papers = dblp.papers.apply(partial(add_extra_attribute, length=10, col_name='paper_type_attr'), axis=1)

dblp.conferences = dblp.conferences.apply(partial(add_extra_attribute, length=10, col_name='subconf_attr'), axis=1)

In [28]:
import csv
from rdflib import Graph, Literal, BNode, Namespace, RDF, URIRef
from rdflib.namespace import XSD
import numpy as np

# Namespace under which every ontology term and individual of this cell is minted
n = Namespace("http://www.sdm.com/ontology#")

# Graph that collects the ABox triples built from the DataFrames below
g = Graph()

## Chair and editor: a single individual of each kind is created (kept to one for simplicity)
chair = n['chair']
editor = n['editor']
for individual, rdf_class, predicate, label in (
    (chair, n.Chair, n.role, 'Chair'),
    (editor, n.Editor, n.house, 'Editor'),
):
    g.add((individual, RDF.type, rdf_class))
    g.add((individual, predicate, Literal(label, datatype=XSD.string)))

## Load authors and reviewers
# Every row of the authors table produces two individuals that share the row's id
# and name: an Author and a Reviewer.
for _, person in dblp.authors.iterrows():
    suffix = str(person['id'])
    author = n['author' + suffix]
    reviewer = n['reviewer' + suffix]

    # Synthetic metrics: hindex is a random float in [0, 1), experience a random integer in [1, 100).
    # NOTE(review): no seed is set in the visible cells, so these values differ between runs.
    hindex = np.random.random()
    experience = np.random.randint(1, 100)

    facts = (
        (author, RDF.type, n.Author),
        (reviewer, RDF.type, n.Reviewer),
        (author, n.hindex, Literal(hindex, datatype=XSD.float)),
        (reviewer, n.experience, Literal(experience, datatype=XSD.integer)),
        # The same n.name predicate is used for both individuals
        # (original note: the ontology resolves it per class - TODO confirm against the TBox).
        (author, n.name, Literal(person['author'], datatype=XSD.string)),
        (reviewer, n.name, Literal(person['author'], datatype=XSD.string)),
        # Simplification: the single chair and the single editor both assign every reviewer.
        (chair, n.assigns, reviewer),
        (editor, n.assigns, reviewer),
    )
    for fact in facts:
        g.add(fact)

## Load keywords
# Each keyword row becomes a KnowledgeArea individual ('area<id>') named after the keyword text.
for _, kw_row in dblp.keywords.iterrows():
    keyword = n['area' + str(kw_row['id'])]
    label = Literal(kw_row['keyword'], datatype=XSD.string)
    g.add((keyword, RDF.type, n.KnowledgeArea))
    g.add((keyword, n.name, label))

## Load journals
# Values shared by every journal: a fixed publisher literal and a fixed knowledge area (area0).
journal_publisher = Literal('IEEE', datatype=XSD.string)
journal_area = n['area' + str(0)]

for _, journal_row in dblp.journals.iterrows():
    journal = n['journal' + str(journal_row['id'])]
    for predicate, value in (
        (RDF.type, n.Journal),
        (n.name, Literal(journal_row['journal'], datatype=XSD.string)),
        (n.publisher, journal_publisher),
        (n.relatedTo, journal_area),
    ):
        g.add((journal, predicate, value))

## Load conferences
# Each category maps to its RDF class and to the predicate that stores the random 'subconf_attr' value.
conference_kinds = {
    'Workshop': (n.Workshop, n.organizers),
    'Symposium': (n.Symposium, n.program),
    'ExpertGroup': (n.ExpertGroup, n.domain),
}
conference_publisher = Literal('IEEE', datatype=XSD.string)

for _, conf_row in dblp.conferences.iterrows():
    conference = n['conference' + str(conf_row['id'])]

    # A row whose category is not listed above gets neither an rdf:type nor a subtype attribute.
    kind = conference_kinds.get(conf_row['category'])
    if kind is not None:
        conf_class, attr_predicate = kind
        g.add((conference, RDF.type, conf_class))
        g.add((conference, attr_predicate, Literal(conf_row['subconf_attr'], datatype=XSD.string)))

    g.add((conference, n.name, Literal(conf_row['conference'], datatype=XSD.string)))
    g.add((conference, n.publisher, conference_publisher))
    # 'relatedTo' holds a keyword id; link to the matching 'area<id>' individual.
    g.add((conference, n.relatedTo, n['area' + str(conf_row['relatedTo'])]))

## Load editions
# Each edition row becomes a ConferenceProceedings individual linked to its conference.
for _, row in dblp.editions.iterrows():
    edition = URIRef(n + 'edition' + str(row['id']))
    g.add((edition, RDF.type, n.ConferenceProceedings))

    g.add((edition, n.heldIn, URIRef(n + 'conference' + str(int(row['conference_id'])))))

    # The label is the city followed by the year. Both parts are formatted as text so
    # that a non-string city (e.g. the -1 that impute_na writes into missing cells)
    # no longer raises TypeError on concatenation; for string cities the label is unchanged.
    edition_label = f"{row['city']}{row['year']}"
    g.add((edition, n.edition, Literal(edition_label, datatype=XSD.string)))

## Load volumes
# Each volume row becomes a JournalVolume individual linked to its journal.
for _, volume_row in dblp.volumes.iterrows():
    volume = n['volume' + str(volume_row['id'])]
    parent_journal = n['journal' + str(int(volume_row['journal_id']))]
    volume_label = Literal(volume_row['volume'], datatype=XSD.string)

    for predicate, value in (
        (RDF.type, n.JournalVolume),
        (n.heldIn, parent_journal),
        (n.volume, volume_label),
    ):
        g.add((volume, predicate, value))

## Load papers
# Per category: the RDF class, the predicate that stores 'subpaper_attr', and the
# predicate that stores 'paper_type_attr'.
paper_kinds = {
    'DemoPaper': (n.DemoPaper, n.demo, n.background),
    'ShortPaper': (n.ShortPaper, n.conciseness, n.background),
    'FullPaper': (n.FullPaper, n.discussion, n.background),
    'Poster': (n.Poster, n.dimensions, n.track),
}


def _split_ids(cell):
    """Split a '|'-separated id cell into id strings.

    A cell holding the -1 sentinel (written by impute_na for missing values)
    yields no ids. A cell that pandas parsed as a number is formatted as text
    first, so neither case raises AttributeError on ``.split``.
    """
    if cell == -1:
        return []
    return str(cell).split('|')


for _, row in dblp.papers.iterrows():
    paper = URIRef(n + 'paper' + str(row['id']))

    # A paper whose category is not listed above gets no rdf:type and no subtype attributes.
    kind = paper_kinds.get(row['category'])
    if kind is not None:
        paper_class, subtype_predicate, type_predicate = kind
        g.add((paper, RDF.type, paper_class))
        g.add((paper, subtype_predicate, Literal(row['subpaper_attr'], datatype=XSD.string)))
        g.add((paper, type_predicate, Literal(row['paper_type_attr'], datatype=XSD.string)))

    g.add((paper, n.title, Literal(row['title'], datatype=XSD.string)))
    g.add((paper, n.abstract, Literal(row['abstract'], datatype=XSD.string)))

    # Link with authors
    for author in _split_ids(row['author_ids']):
        g.add((paper, n.authoredBy, URIRef(n + 'author' + author)))

    # Link with areas
    for keyword in _split_ids(row['keyword_ids']):
        g.add((paper, n.relatedTo, URIRef(n + 'area' + keyword)))

    # Link with publications: a volume takes precedence over an edition; -1 means "none".
    if row['volume_id'] != -1:
        g.add((paper, n.publishedIn, URIRef(n + 'volume' + str(row['volume_id']))))

    elif row['edition_id'] != -1:
        g.add((paper, n.publishedIn, URIRef(n + 'edition' + str(row['edition_id']))))

## Load reviews
### Open question kept from the original: should reviewers and authors be different entities?
# NOTE(review): the submitter is linked to the 'author<id>' individual, not to the
# 'reviewer<id>' individual created in the authors loop - confirm which one is intended.
for i, row in dblp.reviews.iterrows():
    # Reviews are numbered by the DataFrame index rather than by an id column.
    review = n['review' + str(i)]
    reviewed_paper = n['paper' + str(row['article_id'])]
    submitter = n['author' + str(row['author_id'])]

    for predicate, value in (
        (RDF.type, n.Review),
        (n.ReviewDecision, Literal(row['decision'], datatype=XSD.string)),
        (n.ReviewText, Literal(row['content'], datatype=XSD.string)),
        (n.reviewedPaper, reviewed_paper),
        (n.submitedBy, submitter),
    ):
        g.add((review, predicate, value))


# Serialize the ABox graph to 'abox.nt' in N-Triples format (not RDF/XML), UTF-8 encoded
g.serialize(destination='abox.nt', format='nt', encoding='utf-8')

<Graph identifier=N8dcb7b6336094bfbbb48f61797222312 (<class 'rdflib.graph.Graph'>)>

In [29]:
# Fresh, empty graph; the next cell parses tbox_grafo.owl and abox.nt into it.
f = Graph()

In [30]:
# Load tbox_grafo.owl (parsed as RDF/XML) and the abox.nt file written earlier
# (N-Triples) into the same graph.
f.parse('tbox_grafo.owl', format='xml')
f.parse('abox.nt', format='nt')

<Graph identifier=N77a02e792b57471da5e16bb7d4151b8b (<class 'rdflib.graph.Graph'>)>

In [31]:
# Number of triples now held in the combined graph.
print(len(f))

61341


In [32]:
# Write the combined graph to 'linked_data.nt' as N-Triples, UTF-8 encoded.
f.serialize(destination='linked_data.nt', format='nt', encoding='utf-8')

<Graph identifier=N77a02e792b57471da5e16bb7d4151b8b (<class 'rdflib.graph.Graph'>)>