In [1]:
import pandas as pd
import numpy as np
import re
import os

In [2]:

# Make sure the output directory exists. exist_ok=True replaces the separate
# exists() check, which was racy and redundant.
os.makedirs('./sparql/', exist_ok=True)

# Load the formatted publication table that all later cells read from.
df = pd.read_csv('data_formatted.csv', sep=',')

# First Chunk

### Properties

In [3]:
# Property catalogue: one (id, label, data_type) record per property.
# data_type is one of 'multiple_items', 'item' or 'string'.
props = pd.DataFrame(
    data=[
        ('id_prop_instance_of', 'instance of', 'multiple_items'),
        ('id_prop_author', 'author', 'multiple_items'),
        ('id_prop_editor', 'editor', 'multiple_items'),
        ('id_prop_keyword', 'keyword', 'multiple_items'),
        ('id_prop_publisher', 'publisher', 'item'),
        ('id_prop_place_of_publication', 'place of publication', 'item'),
        ('id_prop_publication_type', 'publication type', 'item'),
        ('id_prop_journal', 'journal', 'item'),
        ('id_prop_conference', 'conference', 'item'),
        ('id_prop_series', 'series', 'item'),
        ('id_prop_publication_date', 'publication date', 'string'),
        ('id_prop_DOI', 'DOI', 'string'),
        ('id_prop_issue', 'issue', 'string'),
        ('id_prop_volume', 'volume', 'string'),
        ('id_prop_pages', 'pages', 'string'),
        ('id_prop_title', 'title', 'string'),
        ('id_prop_reference_URL', 'reference URL', 'string'),
    ],
    columns=['id', 'label', 'data_type'],
)

# Persist the catalogue next to the item files.
props.to_csv('./sparql/properties.csv', index=False)

### Base Items

In [4]:
# Root items (id, label) that every later item is declared an instance of.
base_items = pd.DataFrame(
    data=[
        ('id_base_work', 'Work'),
        ('id_base_work_type', 'Work Type'),
        ('id_base_creator', 'Creator'),
        ('id_base_author', 'Author'),
        ('id_base_editor', 'Editor'),
        ('id_base_journal', 'Journal'),
        ('id_base_publisher', 'Publisher'),
        ('id_base_place_of_publication', 'Place of Publication'),
        ('id_base_conference', 'Conference'),
        ('id_base_series', 'Series'),
        ('id_base_keyword', 'Keyword'),
    ],
    columns=['id', 'label'],
)

# First item chunk: has no 'instance of' column because these are the roots.
base_items.to_csv('./sparql/items_0.csv', index=False)

# Second chunk

### Publication Types (first level)

In [5]:
# Shared column layout for the item chunks written from here on:
# item id, human-readable label, and the id(s) the item is an instance of.
COLUMNS = [
    'id',
    'label',
    'id_prop_instance_of',
]

In [6]:
# First-level publication types. Every one of them is an instance of the
# 'Work Type' base item, so only (id, label) is spelled out per entry and the
# parent id is attached while building the rows.
pd.DataFrame(
    [
        [type_id, type_label, 'id_base_work_type']
        for type_id, type_label in (
            ('id_type_book', 'Book'),
            ('id_type_newspaper', 'Newspaper Publication'),
            ('id_type_article', 'Article'),
            ('id_type_pik_series', 'PIK Series'),
            ('id_type_report', 'Report'),
            ('id_type_habilitation', 'Habilitation'),
            ('id_type_diploma', 'Diploma'),
            ('id_type_lecture', 'Lecture'),
            ('id_type_conference_paper', 'Conference Paper'),
            ('id_type_thesis', 'Thesis'),
            ('id_type_software', 'Software Publication'),
            ('id_type_data', 'Data Publication'),
            ('id_type_epub', 'Electronic Publication'),
            ('id_type_conference_proceedings', 'Conference Proceedings'),
        )
    ],
    columns=COLUMNS,
).to_csv('./sparql/items_1.csv', index=False)

# Third Chunk

### Publication Types (second level)

In [7]:
# Second-level publication types. The third column is a ';'-separated list of
# ids: the 'Work Type' base item plus the first-level type(s) each one refines.
inherited_types = [
    ['id_type_article_in_book', 'Article in Book', 'id_base_work_type;id_type_book;id_type_article'],
    ['id_type_isi_article', 'ISI Article', 'id_base_work_type;id_type_article'],
    ['id_type_other_article', 'Other Article', 'id_base_work_type;id_type_article'],
    # Label must describe the report variant; it must not repeat 'Article in Book'.
    ['id_type_article_in_report', 'Article in Report', 'id_base_work_type;id_type_report;id_type_article'],
    # NOTE: the id is misspelled ('editied') but is kept as-is because
    # publication_type_map refers to it with exactly this spelling.
    ['id_type_editied_book', 'Edited Book', 'id_base_work_type;id_type_book'],
]

pd.DataFrame(inherited_types, columns=COLUMNS).to_csv('./sparql/items_2.csv', index=False)
del inherited_types

# Fourth Chunk

### helpers

In [8]:
def is_nan(x):
    """Tell whether ``x`` is a NaN marker.

    True when ``x`` is the ``np.nan`` object itself; otherwise the result of
    ``x != x``, which only holds for NaN-like values.
    """
    if x is np.nan:
        return True
    return x != x

def split_and_flatten(input_array, sep = ';', splitter = None, transform = str.strip):
    """Split every non-NaN entry of ``input_array`` and return one flat list.

    input_array -- iterable of strings; NaN entries are skipped.
    sep         -- separator used by the default splitter.
    splitter    -- optional callable mapping one entry to a list of parts;
                   when given, ``sep`` and ``transform`` are not used.
    transform   -- applied to each part produced by the default splitter.
    """
    if not splitter:
        # Default behaviour: cut on `sep` and clean each piece with `transform`.
        def splitter(items):
            return [transform(piece) for piece in items.split(sep)]

    flattened = []
    for entry in input_array:
        if is_nan(entry):
            continue
        flattened += splitter(entry)
    return flattened

def get_id_map(arr, prefix = '_'):
    """Map each element of ``arr`` to an id of the form ``id_<prefix>_<index>``.

    The index is the element's position in ``arr``. If an element occurs more
    than once, the position of its last occurrence is used.
    """
    return {
        element: 'id_' + prefix + '_' + str(position)
        for position, element in enumerate(arr)
    }


### Creators (Authors & Editors)

In [9]:
# Individual names from the ';'-separated author and editor columns.
author_names = split_and_flatten(df['authors'].tolist())
editor_names = split_and_flatten(df['editors'].tolist())

# One id per distinct name, whatever the role (authors first, then editors).
creator_map = get_id_map(author_names + editor_names, 'creator')

# Only membership matters for the role checks, so sets are sufficient.
author_set = set(author_names)
editor_set = set(editor_names)

creator_rows = []
for person, person_id in creator_map.items():
    # Everybody is a creator; add the role item(s) the name was seen in.
    classes = ['id_base_creator']
    if person in author_set:
        classes.append('id_base_author')
    if person in editor_set:
        classes.append('id_base_editor')
    creator_rows.append([person_id, person, ';'.join(classes)])

data = pd.DataFrame(creator_rows, columns=COLUMNS)

# clean up the namespace; creator_map is still needed later
del author_names, editor_names, author_set, editor_set, creator_rows

### Keywords

In [10]:
def split_keywords(string):
    """Split a keyword string on ';' or ',' into normalised keywords.

    Each fragment is stripped and lower-cased. Fragments that are empty after
    stripping (e.g. from a trailing or doubled separator) are dropped.

    The pattern is a character class rather than a capturing group: with a
    capturing group, re.split also returns the separators themselves, which
    would turn ';' and ',' into keywords.
    """
    fragments = (fragment.strip().lower() for fragment in re.split('[;,]', string))
    return [fragment for fragment in fragments if fragment]

# All keywords from every row, split and normalised by split_keywords.
keywords = split_and_flatten(df['keywordsAndPeerReview'].tolist(), splitter=split_keywords)

# keywords_map is needed again later, so it is not deleted below.
keywords_map = get_id_map(keywords, 'keyword')
keyword_data = [[_id, name, 'id_base_keyword'] for name, _id in keywords_map.items()]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
data = pd.concat([data, pd.DataFrame(keyword_data, columns=COLUMNS)])

del keywords, keyword_data

### helper

In [11]:
def row_instance_of(series, parent, id_prefix):
    """Build ids and item rows for the distinct non-null values of ``series``.

    series    -- pandas Series holding the labels.
    parent    -- id written into the 'instance of' slot of every row.
    id_prefix -- prefix handed to get_id_map for the generated ids.

    Returns ``(items_map, items_data)``: a label -> id dict and a list of
    ``[id, label, parent]`` rows.
    """
    distinct_labels = series.dropna().unique().tolist()
    items_map = get_id_map(distinct_labels, id_prefix)
    items_data = [[item_id, label, parent] for label, item_id in items_map.items()]
    return (items_map, items_data)

### Publisher

In [12]:
# Distinct publishers become items; publisher_map is kept for the works cell.
publisher_map, publisher_data = row_instance_of(df['publisher'], 'id_base_publisher', 'publisher')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
data = pd.concat([data, pd.DataFrame(publisher_data, columns=COLUMNS)])
del publisher_data

### Journal

In [13]:
# Distinct journals become items; journal_map is kept for the works cell.
journal_map, journal_data = row_instance_of(df['journal'], 'id_base_journal', 'journal')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
data = pd.concat([data, pd.DataFrame(journal_data, columns=COLUMNS)])
del journal_data

### Place of Publication

In [14]:
# Distinct places of publication become items; place_map is kept for the works cell.
place_map, place_data = row_instance_of(df['place'], 'id_base_place_of_publication', 'place')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
data = pd.concat([data, pd.DataFrame(place_data, columns=COLUMNS)])
del place_data

### Conference

In [15]:
# Distinct conferences become items; conference_map is kept for the works cell.
conference_map, conference_data = row_instance_of(df['conference'], 'id_base_conference', 'conference')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
data = pd.concat([data, pd.DataFrame(conference_data, columns=COLUMNS)])
del conference_data

### Series

In [16]:
# Distinct series become items; series_map is kept for the works cell.
series_map, series_data = row_instance_of(df['series'], 'id_base_series', 'series')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
data = pd.concat([data, pd.DataFrame(series_data, columns=COLUMNS)])
del series_data

In [17]:
# This chunk of items is complete: write it out as the fourth item file,
# then drop the accumulated frame from the namespace.
data.to_csv(path_or_buf='./sparql/items_3.csv', index=False)
del data

# Fifth chunk

In [18]:
# Maps the raw value of the 'type' column to the id of a publication type item.
publication_type_map = dict(
    inbook='id_type_article_in_book',
    confpaper='id_type_conference_paper',
    lecture='id_type_lecture',
    paperr='id_type_isi_article',
    papern='id_type_other_article',
    instseries='id_type_pik_series',
    epup='id_type_epub',
    book='id_type_book',
    inreport='id_type_article_in_report',
    report='id_type_report',
    edbook='id_type_editied_book',
    thesis='id_type_thesis',
    proceedings='id_type_conference_proceedings',
    newspaper='id_type_newspaper',
    dipl='id_type_diploma',
    habil='id_type_habilitation',
    data='id_type_data',
    software='id_type_software',
)

def _format_page(value):
    """Render a page number as text.

    A numeric column that contains NaN is read as float, so page 12 arrives
    as 12.0; integer-valued floats are written without the trailing '.0'.
    Anything else is passed through str() unchanged.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


works = []
# we need these maps for the following part
# series_map, conference_map, place_map, journal_map, publisher_map, keywords_map, creator_map

# (source column, target property, label -> id map) for each single-valued
# item link, in the order the properties are added to a work.
single_item_links = [
    ('publisher', 'id_prop_publisher', publisher_map),
    ('journal', 'id_prop_journal', journal_map),
    ('place', 'id_prop_place_of_publication', place_map),
    ('conference', 'id_prop_conference', conference_map),
    ('series', 'id_prop_series', series_map),
]

for _, row in df.iterrows():
    work = {
        'id': 'id_work_'+ str(row['id']),
        'label': row['title'],
        'id_prop_instance_of': 'id_base_work'
    }

    # An unknown type raises KeyError on purpose: it means the map is incomplete.
    if not is_nan(row['type']):
        work['id_prop_publication_type'] = publication_type_map[row['type']]

    # Multi-valued links are stored as ';'-joined ids.
    if not is_nan(row['authors']):
        work['id_prop_author'] = ';'.join(
            [creator_map[name.strip()] for name in row['authors'].split(';')]
        )

    if not is_nan(row['editors']):
        work['id_prop_editor'] = ';'.join(
            [creator_map[name.strip()] for name in row['editors'].split(';')]
        )

    if not is_nan(row['keywordsAndPeerReview']):
        # Same splitter that built keywords_map, so every lookup key exists.
        work['id_prop_keyword'] = ';'.join(
            [keywords_map[name] for name in split_keywords(row['keywordsAndPeerReview'])]
        )

    for column, prop, id_map in single_item_links:
        if not is_nan(row[column]):
            work[prop] = id_map[row[column]]

    # Pages are only recorded when both ends of the range are known.
    if (not is_nan(row['startpage'])) and (not is_nan(row['endpage'])):
        work['id_prop_pages'] = _format_page(row['startpage']) + '-' + _format_page(row['endpage'])

    # Plain value properties are copied as they are (missing values stay NaN).
    work['id_prop_title'] = row['booktitle']
    work['id_prop_issue'] = row['issue']
    work['id_prop_volume'] = row['vol']
    work['id_prop_publication_date'] = row['year']
    work['id_prop_reference_URL'] = row['link']

    works.append(work)

In [19]:
# Build the frame from the list of dicts in one call instead of concatenating
# one single-row DataFrame per work. Columns are the union of all keys in
# order of first appearance; properties a work lacks become NaN.
works_data = pd.DataFrame(works)

In [20]:
# Final chunk: one row per work, written as the fifth item file.
works_data.to_csv(path_or_buf='./sparql/items_4.csv', index=False)

Questions:
- transitive properties?
- What are the different data types, and how can we make it clear which type of data a given field expects?