In [1]:
import pandas as pd
import numpy as np
import re
import os

In [2]:

# Make sure the output directory for the generated CSV files exists.
# exist_ok=True replaces the separate existence check and keeps the cell re-runnable.
os.makedirs('./sparql/', exist_ok=True)

# Load the source table that the later cells read from.
df = pd.read_csv('../pik_output.csv', sep=',')

In [3]:
def filter_label(label, max_length=245):
    """Truncate ``label`` to at most ``max_length`` characters.

    Parameters
    ----------
    label : str
        The label to shorten. Any sized, sliceable value works.
    max_length : int, default 245
        Maximum length of the returned label.
        NOTE(review): 245 presumably keeps labels under a length limit of the
        import target - confirm.

    Returns
    -------
    The first ``max_length`` characters of ``label``, or ``label`` itself when
    it already fits.
    """
    if len(label) > max_length:
        return label[:max_length]
    return label

# First Chunk

### Properties

In [4]:
# Properties to create, one record per property: (placeholder id, label, data type).
# The trailing comment on each record is the reference property number.
props = pd.DataFrame(
    [
        ('id_prop_instance_of', 'instance of', 'wikibase-item'),  # P31
        ('id_prop_author', 'author', 'wikibase-item'),  # P50
        ('id_prop_editor', 'editor', 'wikibase-item'),  # P98
        ('id_prop_main_subject', 'main subject', 'wikibase-item'),  # P921
        ('id_prop_publisher', 'publisher', 'wikibase-item'),  # P123
        ('id_prop_place_of_publication', 'place of publication', 'wikibase-item'),  # P291
        ('id_prop_published_in', 'published in', 'wikibase-item'),  # P1433 (for journals)
        ('id_prop_part_of_the_series', 'part of the series', 'wikibase-item'),  # P179 (for series)
        ('id_prop_publication_date', 'publication date', 'string'),  # P577
        ('id_prop_DOI', 'DOI', 'string'),  # P356
        ('id_prop_issue', 'issue', 'string'),  # P433
        ('id_prop_volume', 'volume', 'string'),  # P478
        ('id_prop_number_of_pages', 'number of pages', 'string'),  # P1104
    ],
    columns=['id', 'label', 'data_type'],
)

# Truncate over-long labels before writing the table.
props['label'] = [filter_label(label) for label in props['label']]
props.to_csv('./sparql/properties.csv', index=False)

### Base Items

In [5]:
# Base classes that every later item points to through "instance of":
# one record per class as [placeholder id, label]. The trailing comment on
# each record is the reference item number where one exists.
base_items = [
    # publication types
    ['id_base_article', 'article'], # Q191067 (for paperr and papern)
    ['id_base_chapter', 'chapter'], #Q1980247 (for inbook)
    ['id_base_confpaper', 'conference paper'], #Q23927052
    ['id_base_lecture', 'lecture'], #Q603773
    ['id_base_report', 'report'], #Q10870555
    ['id_base_epub', 'electronic publication'], #Q21572908
    ['id_base_inreport', 'research report'], #Q59387148
    ['id_base_intseries', 'technical report'], #Q3099732
    ['id_base_book', 'book'], #Q571
    ['id_base_newspaper', 'newspaper article'], #Q2495037
    ['id_base_edbook', 'edited volume'], #Q1711593
    ['id_base_data', 'data publication'], #Q17051824
    ['id_base_software', 'software project'], #Q63437139
    ['id_base_dipl', 'diploma thesis'], #Q30749496
    ['id_base_habil', 'habilitation thesis'], #Q144362
    ['id_base_thesis', 'doctoral thesis'], #Q187685
    ['id_base_proceedings', 'proceedings'], #Q1143604
    # the rest
    ['id_base_author', 'author'], #Q482980
    ['id_base_editor', 'editor'], #Q1607826
    ['id_base_journal', 'academic journal'], #Q737498
    ['id_base_publisher', 'publisher'], #Q2085381
    ['id_base_place_of_publication', 'place of publication'], # does not exist in wikidata, we will use our own base class
    ['id_base_series', 'series'], # does not exist in wikidata, we will use our own base class
    ['id_base_main_subject', 'main subject'] # does not exist in wikidata, we will use our own base class
]

base_items = pd.DataFrame(base_items, columns = ['id', 'label'])
base_items['label'] = base_items['label'].map(filter_label)
# No trailing comma here: it would wrap the call in a tuple and make the cell
# display "(None,)".
base_items.to_csv('./sparql/items_0.csv', index=False)

(None,)

# Second Chunk

### Publication Types (first level)

# Third Chunk

### Publication Types (second level)

# Fourth Chunk

### Helpers

In [6]:
def is_nan(x):
    """Tell whether ``x`` is a missing value (NaN).

    The ``np.nan`` singleton is recognised by identity; any other value counts
    as NaN when it compares unequal to itself.
    """
    if x is np.nan:
        return True
    return x != x

def split_and_flatten(input_array, sep = ';', splitter = None, transform = str.strip):
    """Split every entry of ``input_array`` and return the distinct parts.

    Parameters
    ----------
    input_array : iterable
        Raw cell values; NaN entries are skipped.
    sep : str, default ';'
        Separator used by the default splitter.
    splitter : callable, optional
        Takes one entry and returns an iterable of parts. When given, ``sep``
        and ``transform`` are not used.
    transform : callable, default ``str.strip``
        Applied to each part by the default splitter.

    Returns
    -------
    list
        The non-empty parts without duplicates, in order of first appearance.
        The order is deterministic so that ids derived from list positions
        stay the same from one run to the next.
    """
    if not splitter:
        def splitter(items):
            return [transform(item) for item in items.split(sep)]

    array = []
    for items in input_array:
        if is_nan(items):
            continue
        array.extend(part for part in splitter(items) if len(part) > 0)

    # dict keys keep insertion order, unlike a set of strings whose iteration
    # order changes with hash randomisation.
    return list(dict.fromkeys(array))

def get_id_map(arr, prefix = '_'):
    """Map each item of ``arr`` to the id ``'id_<prefix>_<position>'``.

    An item that occurs more than once keeps the id of its last position.
    """
    return {
        item: '_'.join(('id', prefix, str(position)))
        for position, item in enumerate(arr)
    }


### Creators (Authors & Editors)

In [7]:
# Column layout shared by the item tables built in the cells below:
# item id, label, and the id of the base item the row is an instance of.
COLUMNS = ['id', 'label', 'id_prop_instance_of']

In [8]:
# Distinct creator names per role.
authors = split_and_flatten(df['authors'].tolist())
editors = split_and_flatten(df['editors'].tolist())
creators = authors + editors

# Authors and editors share one id space; a person with both roles gets one id.
creator_map = get_id_map(creators, 'creator')

author_names = set(authors)
editor_names = set(editors)
role_membership = (
    ('id_base_author', author_names),
    ('id_base_editor', editor_names),
)

creators_data = []
for name, _id in creator_map.items():
    # ';'-joined list of the base classes this person belongs to
    roles = [role for role, members in role_membership if name in members]
    creators_data.append([_id, name, ';'.join(roles)])

cr = pd.DataFrame(creators_data, columns=COLUMNS)
cr['label'] = [filter_label(label) for label in cr['label']]
cr.to_csv('./sparql/items_1.csv', index=False)

#clean up namespace, we still need the creator_map later
del authors, editors, creators, author_names, editor_names, role_membership, creators_data, cr

# Fifth Chunk

### Keywords

In [9]:
def split_keywords(string, min_length=3):
    """Split a keyword cell on ';' or ',' into normalised keywords.

    Each part is stripped and lower-cased; parts shorter than ``min_length``
    characters are dropped.

    Parameters
    ----------
    string : str
        Raw cell content.
    min_length : int, default 3
        Minimum length a keyword must have to be kept.

    Returns
    -------
    list of str
        The surviving keywords in their original order (duplicates are kept).
    """
    # A character class instead of a capturing group: re.split would otherwise
    # return the delimiters themselves as extra list entries.
    parts = (part.strip().lower() for part in re.split(r'[;,]', string))
    return [part for part in parts if len(part) >= min_length]

# Distinct keywords over the whole table, each with its own id.
keywords = split_and_flatten(df['keywordsAndPeerReview'].tolist(), splitter=split_keywords)
keywords_map = get_id_map(keywords, 'keyword')

# Every keyword becomes an item that is an instance of the "main subject" base class.
keyword_data = [
    [_id, name, 'id_base_main_subject']
    for name, _id in keywords_map.items()
]

# `data` collects the rows of this chunk; the following cells add to it.
data = pd.DataFrame(keyword_data, columns=COLUMNS)

del keywords, keyword_data

### Helper

In [10]:
def row_instance_of(series, parent, id_prefix):
    """Turn the distinct values of a column into item rows.

    Parameters
    ----------
    series : pandas.Series
        Column whose distinct non-missing values become items.
    parent : str
        Id of the base item placed in the third field of every row.
    id_prefix : str
        Prefix handed to ``get_id_map`` for the generated ids.

    Returns
    -------
    tuple
        ``(items_map, items_data)``: the value -> id mapping and the list of
        ``[id, value, parent]`` rows.
    """
    distinct_values = series.dropna().unique().tolist()
    items_map = get_id_map(distinct_values, id_prefix)
    items_data = [[item_id, value, parent] for value, item_id in items_map.items()]
    return (items_map, items_data)

### Publisher

In [11]:
# One item per distinct publisher; publisher_map (name -> id) is kept for the works table.
publisher_map, publisher_data = row_instance_of(df['publisher'], 'id_base_publisher', 'publisher')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
data = pd.concat([data, pd.DataFrame(publisher_data, columns=COLUMNS)])
del publisher_data

### Journal

In [12]:
# One item per distinct journal; journal_map (name -> id) is kept for the works table.
journal_map, journal_data = row_instance_of(df['journal'], 'id_base_journal', 'journal')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
data = pd.concat([data, pd.DataFrame(journal_data, columns=COLUMNS)])
del journal_data

### Place of Publication

In [13]:
# One item per distinct place of publication; place_map (name -> id) is kept for the works table.
place_map, place_data = row_instance_of(df['place'], 'id_base_place_of_publication', 'place')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
data = pd.concat([data, pd.DataFrame(place_data, columns=COLUMNS)])
del place_data

### Conference

### Series

In [14]:
# One item per distinct series; series_map (name -> id) is kept for the works table.
series_map, series_data = row_instance_of(df['Serie'], 'id_base_series', 'series')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
data = pd.concat([data, pd.DataFrame(series_data, columns=COLUMNS)])
del series_data

In [15]:
# This chunk is complete: truncate over-long labels, write the file and
# release the frame.
data['label'] = [filter_label(label) for label in data['label']]
data.to_csv('./sparql/items_2.csv', index=False)
del data

# Sixth Chunk

In [16]:
# Maps each raw value of the `type` column onto the base item id used as the
# work's "instance of" target.
# NOTE(review): 'epup' looks like a typo for 'epub' - check the values that
# actually occur in df['type'] before changing the key.
publication_type_map = {
    'inbook': 'id_base_chapter',
    'confpaper':'id_base_confpaper',
    'lecture':'id_base_lecture',
    'paperr':'id_base_article',
    'papern':'id_base_article',
    'instseries':'id_base_intseries',
    'epup':'id_base_epub',
    'book':'id_base_book',
    'inreport':'id_base_inreport',
    'report' :'id_base_report',
    'edbook' : 'id_base_edbook',
    'thesis':'id_base_thesis',
    'proceedings':'id_base_proceedings',
    'newspaper':'id_base_newspaper',
    'dipl':'id_base_dipl', 
    'habil':'id_base_habil',
    'data':'id_base_data',
    'software':'id_base_software'
}


def _as_text(value):
    """Render a cell value as text, writing integral floats without '.0'.

    Numeric columns that contain missing values are read as floats, so a page
    number 12 arrives as 12.0; strings and other values go through ``str``.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _creator_ids(names):
    """Translate a ';'-separated name list into ';'-joined creator ids.

    Empty segments (for example from a trailing ';') are skipped: they are
    never registered in ``creator_map`` and would raise ``KeyError('')``.
    """
    stripped = (name.strip() for name in names.split(';'))
    return ';'.join(creator_map[name] for name in stripped if name)


# Columns whose value is replaced by the id of an item created earlier:
# (source column, target property, value -> id map). The order matters: it
# fixes the column order of the resulting table.
_item_lookups = (
    ('publisher', 'id_prop_publisher', publisher_map),
    ('journal', 'id_prop_published_in', journal_map),
    ('place', 'id_prop_place_of_publication', place_map),
#     ('conference', 'id_prop_conference', conference_map),
    ('Serie', 'id_prop_part_of_the_series', series_map),
)

works = []
# we need these maps for the following part
# series_map, conference_map, place_map, journal_map, publisher_map, keywords_map, creator_map

# Rows without a title cannot get a label; they are set aside here.
illegal = []
for index, row in df.iterrows():
    if is_nan(row['title']):
        illegal.append(row)
        continue
    work = {
        'id': 'id_work_'+ str(row['id']),
        'label': row['title']
    }
    if not is_nan(row['type']):
        # A type missing from publication_type_map raises KeyError here.
        work['id_prop_instance_of'] = publication_type_map[row['type']]

    if not is_nan(row['authors']):
        author_ids = _creator_ids(row['authors'])
        if author_ids:
            work['id_prop_author'] = author_ids

    if not is_nan(row['editors']):
        editor_ids = _creator_ids(row['editors'])
        if editor_ids:
            work['id_prop_editor'] = editor_ids

    if not is_nan(row['keywordsAndPeerReview']):
        keywords = split_keywords(row['keywordsAndPeerReview'])
        work['id_prop_main_subject'] = ';'.join(keywords_map[name] for name in keywords)

    for column, prop, lookup in _item_lookups:
        if not is_nan(row[column]):
            work[prop] = lookup[row[column]]

    # NOTE(review): this stores a "start-end" page range under the
    # "number of pages" property - confirm that this is the intended property.
    if (not is_nan(row['startpage'])) and (not is_nan(row['endpage'])):
        val = _as_text(row['startpage']) + '-' + _as_text(row['endpage'])
        work['id_prop_number_of_pages'] = val

#     work['id_prop_title'] = row['booktitle']
    if not is_nan(row['issue']):
        work['id_prop_issue'] = _as_text(row['issue'])

    if not is_nan(row['vol']):
        work['id_prop_volume'] = _as_text(row['vol'])

    if not is_nan(row['year']):
        work['id_prop_publication_date'] = str(int(row['year']))
#     work['id_prop_reference_URL'] = row['link']

    works.append(work)

In [17]:
# Build the table in one step from the list of row dicts instead of creating
# and concatenating one single-row DataFrame per work. Columns appear in the
# order the keys are first seen; keys missing from a work become NaN.
works_data = pd.DataFrame(works)

In [18]:
# Truncate over-long titles, then write the works table.
works_data['label'] = [filter_label(title) for title in works_data['label']]
works_data.to_csv('./sparql/items_3.csv', index=False)