# exhibition catalogs

In [12]:
# importing required modules 
import PyPDF2
import csv
import math
import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords

In [6]:
dirname = '../data/catalogs/'

ROME_EN = 'Rome_EN_LR_compleet'
EGYPTE_NL = 'Eeuwig_Egypte_NL_LR'
EGYPTE_EN = 'Eternal_Egypt_LR'
CROSSROADS_NL = 'CrossRoads_NEDERLANDS_LR_compleet'
CROSSROADS_EN = 'CrossRoads_ENGELS_LR_compleet'
SICILY_EN = 'sicily_en'

output_dir = '../raw_extract/'

books = [ROME_EN, EGYPTE_NL, EGYPTE_EN, CROSSROADS_EN, CROSSROADS_NL, SICILY_EN]

In [8]:
# Obtain the raw extract from all exhibition catalogs considered
fields = ['data']

for b in books: 
    
    with open(output_dir + 'extract_' + b + '.csv', 'w') as csvfile:

        # creating a csv writer object 
        csvwriter = csv.writer(csvfile) 
         
        # writing the fields 
        csvwriter.writerow(fields) 

        reader = PyPDF2.PdfReader(dirname + b + '.pdf')
        for pagei in range(len(reader.pages)):
            page = reader.pages[pagei]
            csvwriter.writerow([page.extract_text()])

KeyboardInterrupt: 

In [10]:
# preprocess raw text extracts from exhibition catalogs

df_egypte = pd.read_csv('../raw_extract/extract_' + EGYPTE_NL + '.csv')
df_crossroads = pd.read_csv('../raw_extract/extract_' + CROSSROADS_EN + '.csv')

credits_egypte = df_egypte.iloc[91]

df_egypte

Unnamed: 0,data
0,EEuwig EgyptE
1,AllArd \npierson \nmuseum\nBen van den Bercke...
2,eeuwig egypte 5 4
3,7 6 Deze uitgave van het Allard Pierson Museum...
4,9 8inhoud\n \n Voorwoord – 11\n Chronologie ...
...,...
88,eeuwig egypte 177 176
89,eeuwig egypte 179 178\nAllerheiligste (of Heil...
90,eeuwig egypte 181 180 Soennieten: volgelingen...
91,eeuwig egypte 183 182nEdErlands-VlaaMs institu...


In [434]:
# function for preprocessing the data in a book

def preprocess(book):
    
    df = pd.read_csv('extract_' + book + '.csv')
    df = df.dropna()
    
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))

    corpus = ''

    for i in range(len(df.index)):

        # tokenize
        query = tokenizer.tokenize(df['data'].iloc[i])

        # filter stop words convert to lower
        filtered_sentence = [w.lower() for w in query if not w.lower() in stop_words]

        # filter digits
        filtered_sentence = [w for w in filtered_sentence if not w.isdigit()]

        df['data'].iloc[i] = " ".join(filtered_sentence)
        corpus += " ".join(filtered_sentence)
        
    return df, corpus

In [13]:
# preprocess the data
def preprocess_document(document):
    # Tokenization
    tokens = word_tokenize(document)
    # Lowercase conversion
    tokens = [token.lower() for token in tokens] 
    # Punctuation removal
    tokens = [token for token in tokens if token not in string.punctuation]
    # Stop word removal
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]

    return " ".join(tokens)

In [14]:
preprocessed_documents = [preprocess_document(document) for document in documents]
print(preprocessed_documents)

NameError: name 'documents' is not defined

In [15]:
# df, corpus = preprocess(EGYPTE_NL)

split = round(0.8 * len(df_egypte))

train = df_egypte.iloc[:split]
test = df_egypte.iloc[split:]

In [16]:
train.to_csv('train_corpus.csv')
test.to_csv('test_corpus.csv')

Unnamed: 0,data
0,eeuwig egypte
1,allard pierson museum ben van den bercken will...
2,eeuwig egypte
3,deze uitgave van het allard pierson museum gep...
4,8inhoud voorwoord chronologie prehistorisch eg...
...,...
69,de romeinse periode eeuwig egypte verder naar ...
70,eeuwig egypte het zuiden lag een ommuurd hof m...
71,eeuwig egypte heid maakte de tempels afhankeli...
72,eeuwig egypte het gezin egypte vóór de komst v...


## label extraction
### so far: egypt

In [52]:
# extract apm numbers from eternal egypt credits page
def extract_apm(credits_page):
    
    # check input type
    if type(credits_page) != str:
        raise('invalid input')
    
    # tokenize input
    tokenizer = RegexpTokenizer(r'\w+')
    credits = tokenizer.tokenize(credits_page)
    
    # find section within page with apm labels
    credits = credits[credits.index('omslag'): credits.index('nederlands')]
    credits = ' '.join(credits)
    
    # segment into; (blz), (page num), (apm references)
    pages_apm_egypte = re.findall('(blz\s)(\d+)(.*?)blz', credits)
    
    # process regex extract 
    labels_egypte = []
    
    for _, p, apm in pages_apm_egypte:
        apm_refs = re.findall('\d{4,}', apm)     
        apm_refs = ['0' + r if len(r) <= 4 else r for r in apm_refs]
        labels_egypte += [(int(p), apm_refs)]
    
    return pd.DataFrame(labels_egypte, columns=['page', 'apm']).explode('apm')

In [5]:
egypt_raw = '../raw_extract/extract_' + EGYPTE_NL + '.csv'
egypt = pd.read_csv(egypt_raw)

index= -2
credits = egypt.data.iloc[index]

matches = re.findall("(?:blz.)\s(\d+):\s+(?:[a-zA-Z]*)\s*(\d+)(?:onder)*\s*\W*(\d*)(?:onder)*\W*\s*(\d*)\W*\s*(\d*)(?:boven)*(?:onder)*\s*(\d*)\W*(\d*)(?:onder)*\s*(\d*)\W*(\d*)\W*(\d*)", credits)
matches

# pd.DataFrame(matches, columns=['page', '1', '2', '3', '4', '5', '6', '7', '8', '9']).to_csv('egypt.csv', index=False)

[('17', '196', '1', '2', '', '4170', '4173', '', '4222', ''),
 ('18', '4206', '4143', '4145', '', '', '', '', '', ''),
 ('21', '15290', '', '', '', '', '', '', '', ''),
 ('22', '4162', '4218', '4164', '4219', '', '', '3974', '3863', '3972'),
 ('23', '12637', '', '', '', '', '', '', '', ''),
 ('33', '12720', '15276', '3943', '', '', '', '', '', ''),
 ('34', '3858', '', '12676', '12678', '', '', '', '', ''),
 ('35', '4044', '', '15999', '16476', '', '', '', '', ''),
 ('37', '15302', '15301', '', '', '', '', '', '', ''),
 ('39', '7298', '', '4306', '', '', '', '', '', ''),
 ('40', '3635', '12683', '', '', '', '', '', '', ''),
 ('42', '8752', '14021', '', '', '', '', '', '', ''),
 ('43', '9274', '15592', '', '', '', '', '', '', ''),
 ('46', '3933', '', '', '', '', '', '', '', ''),
 ('47', '8850', '', '', '', '', '', '', '', ''),
 ('48', '3400', '', '', '', '', '', '', '', ''),
 ('49', '16000', '', '', '', '', '', '', '', ''),
 ('51', '8539', '', '', '', '', '', '', '', ''),
 ('53', '12698'

In [108]:
labels = pd.read_csv('labels/egypt.csv', converters={i: str for i in range(100)})

In [113]:
labels.drop(['8', '9'], axis=1).to_csv('labels/egypt.csv')

## filter pages

In [439]:
def filter_pages(refs, book):
    # create labels similar to index of df
    mapping = create_mapping(book)[1:-2]
    pages = list(refs['page'])
    idx = cross_reference(pages, mapping)
    
    return df.iloc[idx]

In [45]:
# helper function for egypte splitting up pages
def create_mapping(df):
    start = 2
    mapping = []
    mapping += [1]
    
    for i in range(1, len(df.index) - 1):
        mapping += [tuple((start, start + 1))]
        start += 2
    mapping += [start]
    return mapping

def cross_reference(pages, mapping):
    idx = []
    for p in pages:
        counter = 1
        for m in mapping:
            if p in m:
                idx.append(counter)
            counter +=1
    return idx

In [46]:
# pages = list(refs['page'])
mapping = create_mapping(df_egypte)
pd.DataFrame(mapping, columns=['page_per_id']).to_csv('mapping.csv')

In [442]:
idx = cross_reference(pages, mapping[1:-2])

In [443]:
filtered_book = filter_pages(refs, df)

# labels crossroads
obtaining the apm numbers from the 'illustratie verantwoording'

In [444]:
def apm(descr):
    return re.findall('(?:APM)\s*(\d+)', descr)

In [445]:
credits_crossroads = ' 10 above (APM16324),  \n13 (APM13822, APM9370),  \n29 above (APM7855),  51 (APM9276, APM9278, APM9280),  52 (APM16772),  \n66 (APM7468),  \n67 (APM12995),  \n69, 71 (APM7798),  \n72 (APM16388),  73 (APM3830),  \n74 (APM7798),  \n103 below (APM12974), \n146 (APM8471),  147 (APM8107),  \n162 (APM7071),  163 left (APM09163),  \n163 right (APM15589),  \n176 above (APM16369), 180 (APM3831, APM10998)'
page_numbers = [10, 13, 13, 29, 51, 51, 51, 52, 66, 67, 69, 72, 73, 74, 103, 146, 147, 162, 163, 163, 176, 180, 180]

In [446]:
references_crossroads = apm(credits_crossroads)

In [447]:
idx_crossroads = [x + -2 for x in page_numbers]

## create labels and save

In [4]:
def labels(df, idx, refs, outputfilename):
    labels = np.zeros_like(df.index)
    
    mapping = zip(idx, refs)
    
    for pageid, apm in mapping:
        labels[pageid] = apm
        
    np.savetxt(outputfilename, labels, delimiter=",")
    
    return labels

In [5]:
labels_crossroads = labels(df_crossroads, page_numbers, references_crossroads, 'crossroads.csv')
labels_egypte = labels(df_egypte, idx, list(refs.explode('apm')['apm']), 'egypt.csv')

NameError: name 'page_numbers' is not defined

In [49]:
pd.read_csv('labels/crossroads.csv')

ParserError: Error tokenizing data. C error: Expected 1 fields in line 14, saw 2


In [450]:
"Inventory numbers Allard
Pierson Museum
Cover: 14232
p. 6: 16751
p. 28: 3493
p. 31: 3271
p. 33: 7802
p. 35: above 7164, below 7316
p. 37: 7971
p. 38: 9227
p. 40: 16883
p. 41: 16228
p. 43: 13055
p. 45: 1379
p. 48: 13937
p. 50: above 12378, below
10167
p. 52: 13825
p. 53: 1627
p. 55: 7347 and 7349
p. 56: 7286
p. 57: 7359 and 13963
p. 59: 7326
p. 61: 2907
p. 62: 1786 (photo Restauratieatelier
Restaura)
p. 64: above 3239 and 2845,
below 1785
p. 65: 788
p. 69: 1892
p. 72: 8343
p. 73: above 13946,
below 15758
p. 74: above 15369 and 15370
p. 75: 14005
p. 76: 6349
p. 77: 12428
p. 78: 7592
p. 80: 15396
p. 81: 8188
p. 82: 3242 and 3243
p. 84: above 9374, below 1774
p. 85: 8180
p. 86: 3269
p. 87: 3422
p. 92: 12
p. 93 above 1606
p. 95: 8552
p. 96: 35
p. 97: above 12417, below
12534
p. 100: above 7066, 8124, 8116,
below 8117, 7065, 8120
p. 101: 7974
p. 102: 7757
p. 103: 8133
p. 104: left 7288, 7290, 7874
and 8023, right 725
p. 105: 7768
p. 107: 8146.001-009
p. 108: 8169
p. 109: 16217
p. 110: 16166
p. 115: 9234
p. 118: 1674
p. 119: 15076
p. 122: 1402
p. 123: 8016
p. 124: 7799
p. 126: 8175
p. 128: 7946
p. 131: 11972
p. 132: 1765
p. 133: 15914
p. 137: 5205, 5208, 5216, 5220,
5222, 5230
p. 139: 15927
p. 140: 9894-9900, 10675
p. 141: 9350
p. 142: 8133
p. 143: 7022
p. 144: 6295, 6296, 7304,
7308, 14165
p. 145: 7001, 7003, 7004
p. 146: 724
p. 148: 1687
p. 151: above 15746, below
1722
p. 153: 8363, 319, 1681, 6319,
p. 156: 12.324
p. 157: left 16618, right 12481
p. 158: above 15689,
below 15999
p. 161: 451
p. 162: 3579
p. 163: 7163
p. 164: 7310
p. 165: 9224
p. 167: 7379
p. 174: 14.409
p. 176: 5180
p. 177: 16763
p. 178-179: 10854
p. 180: 10.854
p. 181: 9241
p. 182: 6287
p. 183: 16882
p. 185: 16604, 16607, 16612,
16614, 16616, 16610

SyntaxError: EOL while scanning string literal (1299147726.py, line 1)

In [451]:
Inventarisnummers
Allard Pierson Museum
omslag: 4076
blz. 17: boven 196-1/2; onder
4170-4173, 4222
blz. 18: 4206, 4143, 4145
blz. 21: 15290
blz. 22: boven 4162, 4218, 4164, 4219;
onder 3974/3863, 3972A
blz. 23: 12637
blz. 33: 12720, 15276, 3943
blz. 34: boven 3858; onder 12676,
12678
blz. 35: boven 4044; onder 15999,
16476
blz. 37: 15302, 15301
blz. 39: boven 7298; onder 4306
blz. 40: 3635, 12683
blz. 42: boven 8752/14021; onder
Schriftmuseum Dortmond
P. Amsterdam 22
blz. 43: 9274, 15592
blz. 46: 3933
blz. 47: 8850
blz. 48: 3400
blz. 49: 16000
blz. 51: 8539
blz. 53: 12698
blz. 54: 12647, 14238
blz. 63: 15350
blz. 64: 9237
blz. 65: 8789
blz. 69: 12978
blz. 73: APM 9115
blz. 75: 1387
blz. 76: boven 3408, 360, 3799;
onder 11960
blz. 77: 12718, 8537
blz. 78: 8851; Schriftmuseum
Dortmond, no. 115
blz. 79: boven 9114; onder 8875, 1676
blz. 86: 16500
blz. 87: boven 9223; onder 8811
blz. 88: 13283a-j, 8800
blz. 99: 391
blz. 100: boven 8065; onder 13292
blz. 102: 7774
blz. 103: 12760, 4307, 15326
blz. 104: 12977
blz. 105: 9475, 9492, 9502
blz. 106: 8562, 8563, 8417
blz. 107: 8831
blz. 108: 20
blz. 109: 8837
blz. 111: 13219
blz. 115: 8795/6
blz. 116: 7126
blz. 117: 7772
blz. 118: 6289
blz. 119: 7993
blz. 120: 8846
blz. 124: 7216, 7238, 7272
blz. 125: 13158
blz. 129: 7758
blz. 130: 7763
blz. 131: 9369
blz. 132: 8517
blz. 133: 7796
blz. 135: 14232
blz. 136: 7860, 7861
blz. 137: 9353
blz. 138: 7874, 9227
blz. 139: 7803
blz. 141: 7766
blz. 142: boven 7757; onder 7974
blz. 143: 7761
blz. 144: 8188
blz. 145: 6286
blz. 150: 12995, 14513
blz. 152: boven 16750; onder 14510
blz. 153: 8189
blz. 159: 16385

SyntaxError: invalid syntax (1808091653.py, line 2)

## stopword removal in multiple languages

In [2]:
# author to keywords (from titles)
french = set(stopwords.words('french'))
english = set(stopwords.words('english'))
german = set(stopwords.words('german'))
dutch = set(stopwords.words('dutch'))

punctuation = set(list(punctuation))

keywords = {}

for author in alma_beeldbank['contributor'].unique():

    body = " ".join(alma_beeldbank[alma_beeldbank['contributor'] == author]['title']).lower()
    values = word_tokenize(body)

    lang = alma_beeldbank[alma_beeldbank['contributor'] == author]['language']
    print(lang)

    if author in keywords: 
        print('entry already present!')
        keywords[author][0] += values 
    else:
        keywords[author] = (values, lang)

    keywords[author] = (set(keywords[author][0]), keywords[author][1])
    filtered = []

    for w in keywords[author][0]:

        if keywords[author][1] == 'fre':
            if w not in french:
                if w not in punctuation:
                    filtered.append(w)
        elif keywords[author][1] == 'eng':
            if w not in english:
                if w not in punctuation:
                    filtered.append(w)
        elif keywords[author][1] == 'ger':
            if w not in german:
                if w not in punctuation:
                    filtered.append(w)
        elif keywords[author][1] == 'dut':
            if w not in dutch:
                if w not in punctuation:
                    filtered.append(w)
        else:
            if w not in punctuation:
                filtered.append(w)

    # potentially add language 
    keywords[author] = filtered

NameError: name 'stopwords' is not defined

## metadata preprocessing

In [41]:
# dc export
uva_alma_archobjects = pd.read_xml('../data/CUE/uva_alma_archobjects_dc_new.xml')
uva_alma_archobjects

Unnamed: 0,schemaLocation,title,description,publisher,date,type,format,identifier,language,relation,subject,contributor
0,http://www.openarchives.org/OAI/2.0/oai_dc/ ht...,Amfoor aardewerk; vaatwerk,,,900-600 voor Christus IJzertijd II,,op wiel gedraaid,https://pid.uba.uva.nl/ark:/88238/b19900378131...,xxx,,,
1,http://www.openarchives.org/OAI/2.0/oai_dc/ ht...,vrouwenkop gips; menselijke figuur,,,1860,,gegoten,https://pid.uba.uva.nl/ark:/88238/b19900377732...,xxx,,,
2,http://www.openarchives.org/OAI/2.0/oai_dc/ ht...,vrouwenkop gips; menselijke figuur,,,1860,,gegoten,https://pid.uba.uva.nl/ark:/88238/b19900377732...,xxx,,,
3,http://www.openarchives.org/OAI/2.0/oai_dc/ ht...,hangertje glas: sieraad,,,200 n. Chr.,,gegoten,https://pid.uba.uva.nl/ark:/88238/b19900377732...,xxx,,,
4,http://www.openarchives.org/OAI/2.0/oai_dc/ ht...,hangertje glas: sieraad,,,200 n. Chr.,,gegoten,https://pid.uba.uva.nl/ark:/88238/b19900377732...,xxx,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
18268,http://www.openarchives.org/OAI/2.0/oai_dc/ ht...,fragmenten kom vaas tegel aardewerk: architect...,Architectural Terracottas in the Allard Pierso...,,keizertijd,,aardewerkmateriaal,https://pid.uba.uva.nl/ark:/88238/b19900325330...,xxx,,,
18269,http://www.openarchives.org/OAI/2.0/oai_dc/ ht...,baksteen aardewerk: architecturaal,Architectural Terracottas in the Allard Pierso...,,keizertijd,,vorm incisie,https://pid.uba.uva.nl/ark:/88238/b19900325330...,xxx,,,
18270,http://www.openarchives.org/OAI/2.0/oai_dc/ ht...,legioenbaksteen aardewerk: architecturaal,Architectural Terracottas in the Allard Pierso...,,keizertijd,,vorm,https://pid.uba.uva.nl/ark:/88238/b19900325329...,xxx,,,
18271,http://www.openarchives.org/OAI/2.0/oai_dc/ ht...,lansmodel organisch: wapen,,,1800,,houtmateriaal,https://pid.uba.uva.nl/ark:/88238/b19900325329...,xxx,,,


In [6]:
def merge(df):
    
    colnames = df.columns.values.tolist()
    
    for i in range(len(df.index)):
        document = ''
        for c in colnames: 
            if type(df[c].iloc[i]) == str:
                document += df[c].iloc[i] + ', '
                
        df.document['document'].iloc[i] == document
    

In [10]:
uva_alma_archobjects['document'] = uva_alma_archobjects.apply()

TypeError: apply() missing 1 required positional argument: 'func'

In [42]:
uva_alma_archobjects.drop(['schemaLocation', 'identifier'], axis=1)

Unnamed: 0,title,description,publisher,date,type,format,language,relation,subject,contributor
0,Amfoor aardewerk; vaatwerk,,,900-600 voor Christus IJzertijd II,,op wiel gedraaid,xxx,,,
1,vrouwenkop gips; menselijke figuur,,,1860,,gegoten,xxx,,,
2,vrouwenkop gips; menselijke figuur,,,1860,,gegoten,xxx,,,
3,hangertje glas: sieraad,,,200 n. Chr.,,gegoten,xxx,,,
4,hangertje glas: sieraad,,,200 n. Chr.,,gegoten,xxx,,,
...,...,...,...,...,...,...,...,...,...,...
18268,fragmenten kom vaas tegel aardewerk: architect...,Architectural Terracottas in the Allard Pierso...,,keizertijd,,aardewerkmateriaal,xxx,,,
18269,baksteen aardewerk: architecturaal,Architectural Terracottas in the Allard Pierso...,,keizertijd,,vorm incisie,xxx,,,
18270,legioenbaksteen aardewerk: architecturaal,Architectural Terracottas in the Allard Pierso...,,keizertijd,,vorm,xxx,,,
18271,lansmodel organisch: wapen,,,1800,,houtmateriaal,xxx,,,


In [43]:
# join all columns
uva_alma_archobjects['documents'] = uva_alma_archobjects.apply(lambda x: ', '.join(x.drop(['schemaLocation', 'identifier']).dropna().values.astype(str).tolist()), axis=1)


In [44]:
# row and document column
uva_alma_archobjects.iloc[0], uva_alma_archobjects['documents'].iloc[0]

(schemaLocation    http://www.openarchives.org/OAI/2.0/oai_dc/ ht...
 title                                    Amfoor aardewerk; vaatwerk
 description                                                    None
 publisher                                                      None
 date                             900-600 voor Christus IJzertijd II
 type                                                            NaN
 format                                             op wiel gedraaid
 identifier        https://pid.uba.uva.nl/ark:/88238/b19900378131...
 language                                                        xxx
 relation                                                       None
 subject                                                        None
 contributor                                                    None
 documents         Amfoor aardewerk; vaatwerk, 900-600 voor Chris...
 Name: 0, dtype: object,
 'Amfoor aardewerk; vaatwerk, 900-600 voor Christus IJzertijd II, op wiel gedr

In [46]:
uva_alma_archobjects.documents.to_csv('metadata_arcobjects.csv')