In [None]:
import pandas as pd
import numpy as np
import re
import sys

In [None]:
# Load the input records from the comma-separated file 'data_formatted.csv'
# into a DataFrame; the rows are turned into triplets further down.
df = pd.read_csv('data_formatted.csv', sep=',')

In [None]:
class DataStore():
    """Registry that hands out one stable id per (namespace, value) pair.

    Attributes:
        counterMap: per-namespace count of ids generated so far.
        data: per-namespace mapping from a value to the id assigned to it.
    """

    def __init__(self):
        self.counterMap = {}
        self.data = {}

    def getId(self, namespace):
        """Generate the next id of `namespace`, shaped 'id_<namespace>_<n>'.

        The counter of the namespace starts at 0 and grows by one per call.
        """
        sequence = self.counterMap.setdefault(namespace, 0)
        fresh_id = '_'.join(('id', namespace, str(sequence)))
        self.counterMap[namespace] = sequence + 1
        return fresh_id

    def has(self, namespace, value):
        """Return the id stored for `value` in `namespace`, or None.

        The namespace is created (empty) when it is seen for the first time.
        """
        return self.data.setdefault(namespace, {}).get(value)

    def addIfMissing(self, namespace, value):
        """Return the id of `value` in `namespace`, registering it if needed."""
        known_id = self.has(namespace, value)
        if known_id:
            return known_id
        created_id = self.getId(namespace)
        self.data[namespace][value] = created_id
        return created_id


# Single shared registry used by the rest of the notebook.
dataStore = DataStore()

In [None]:
# Properties whose value is a URI (the id of another item).
uri_props = {
    "instance of": "P31",
    "author": "P50",
    "editor": "P98",
    "publisher": "P123",
    "place_of_publication": "P291",
    "type": "P_1",
    "oldDepartment": "P_2",
    "journal": "P_3",
    "conference": "P_4",
    "series": "P_5",
    "keyword": "P_6",
}

# Properties whose value is a plain literal.
literal_props = {
    "publication_date": "P577",
    "DOI": "P356",
    "issue": "P433",
    "volume": "P478",
    "pages": "P304",
    # The 'booktitle' column is stored under this property; the column is
    # filled in for more than just books.
    "title": "P1476",
    "reference_URL": "P854",
}

# Output accumulators, one per kind of property value.
uri_triplets, literal_triplets = [], []


def is_nan(x):
    """Tell whether `x` is a missing value (NaN).

    `x` counts as NaN when it is the `np.nan` object itself or when it does
    not compare equal to itself.
    """
    if x is np.nan:
        return True
    return x != x

def _split_multi(raw):
    """Return the clean, non-empty entries of a ';'-separated cell.

    Entries are stripped so that 'A; B' and 'A;B' resolve to the same items,
    and empty entries (e.g. produced by a trailing ';') are dropped instead
    of being registered as an item of their own. A missing cell yields [].
    """
    if is_nan(raw):
        return []
    entries = (entry.strip() for entry in str(raw).split(';'))
    return [entry for entry in entries if entry]


def addLiteral(value, prop):
    """Append a literal triplet for the row currently being processed.

    Reads the module-level `_id` and `work_id` set by the loop below.
    Missing values (NaN) are skipped.
    """
    if not is_nan(value):
        literal_triplets.append([_id, work_id, prop, value])


def addItem(value, namespace, prop):
    """Append a URI triplet pointing at the item registered for `value`.

    The item id comes from `dataStore` (created in `namespace` on first use).
    Reads the module-level `_id` and `work_id` set by the loop below.
    Missing values (NaN) are skipped.
    """
    if not is_nan(value):
        item_id = dataStore.addIfMissing(namespace, value)
        uri_triplets.append([_id, work_id, prop, item_id])


def _add_people(raw, role):
    """Link every person listed in `raw` to the current work under `role`.

    `role` is both the dataStore namespace and the `uri_props` key
    ('author' or 'editor').
    """
    for name in _split_multi(raw):
        # Register in the shared 'human' namespace first: the same person may
        # be an author and an editor, and must keep a single id.
        person_id = dataStore.addIfMissing('human', name)
        # Also keep track of the person in the role-specific namespace.
        dataStore.addIfMissing(role, name)
        uri_triplets.append([_id, work_id, uri_props[role], person_id])


for _, row in df.iterrows():
    _id = row['id']
    # NOTE(review): works are keyed by title only, and a missing title is not
    # checked here; confirm that this is intended.
    work_id = dataStore.addIfMissing('work', row['title'])

    addItem(row['type'], 'type', uri_props['type'])

    _add_people(row['authors'], 'author')
    _add_people(row['editors'], 'editor')

    addItem(row['oldDepartmentNames'], 'oldDepartment', uri_props['oldDepartment'])
    addItem(row['publisher'], 'publisher', uri_props['publisher'])
    addItem(row['journal'], 'journal', uri_props['journal'])
    addLiteral(row['booktitle'], literal_props['title'])

    # A page range is only emitted when both ends are known.
    # NOTE(review): if pandas parsed these columns as floats the range reads
    # like '12.0-15.0'; verify the column dtypes.
    if (not is_nan(row['startpage'])) and (not is_nan(row['endpage'])):
        page_range = str(row['startpage']) + '-' + str(row['endpage'])
        addLiteral(page_range, literal_props['pages'])

    addLiteral(row['issue'], literal_props['issue'])
    addLiteral(row['vol'], literal_props['volume'])
    addLiteral(row['year'], literal_props['publication_date'])

    addItem(row['place'], 'place', uri_props['place_of_publication'])
    addItem(row['conference'], 'conference', uri_props['conference'])
    addItem(row['Serie'], 'series', uri_props['series'])

    addLiteral(row['link'], literal_props['reference_URL'])

    for keyword in _split_multi(row['keywordsAndPeerReview']):
        addItem(keyword, 'keyword', uri_props['keyword'])
            
    


Questions:
- Are 'Items' and 'Properties' already predefined, or should we also define them?
- How can we define that a property accepts an item as its value? Or a literal? Or multiple values?
- transitive properties?
- What are the different data types, and how can we make it clear that a given field should hold a given type of data?