# Models file for text matching

## Doc2Vec

In [1]:
import gensim
import pandas as pd
import numpy as np

dirname = '../processed/'
EGYPT_EN = 'processed_Eternal_Egypt_LR.csv'

dirname_metadata = '../metadata/'
queries = 'evaluation_metadata_eng.csv'

In [2]:
# prepare data
raw_corpus = pd.read_csv(dirname + EGYPT_EN)
raw_corpus = raw_corpus.replace(np.nan,'',regex=True)

documents = list(raw_corpus['data'])
documents

['',
 'eternal egypt',
 '',
 'allard pierson museum ben van den bercken willem van haarlem contributions h. willems d. huygem.j ravenf choël k. innemée',
 'eternal egypt4',
 '',
 'published allard pierson museum coincide eternal egypt experience exhibition july january exhibition publication result close cooperation allard pierson museum bibliotheca alexandrina/cultnat netherlands- flemish institute cairo nvic netherlands organisation scientific research nwo sponsors exhibition publication realized generous support djoser reizen friends allard pierson museum university amsterdam',
 '',
 '8contents foreword – chronology – prehistoric egypt 1,600,000-4,000 bc – ben van den bercken climate climate change religion magic material culture social organization excav ation rock art el-hosh qurta predynastic early dynastic periods 4,000-2707 bc – willem van haarlem material culture religion writing kingship excav ation tell ibrahim awad old kingdom first intermediate period 2707-2046 bc – willem

In [3]:
import smart_open

def read_corpus(fname, tokens_only=False):

    for i, row in pd.read_csv(fname).iterrows():
        try:
            tokens = gensim.utils.simple_preprocess(row['data'])
        except:
            tokens = ''
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [8]:
train_set = list(read_corpus(dirname + EGYPT_EN ))
eval_set = list(read_corpus('../metadata/processed_evaluation_metadata_eng.csv'))

In [9]:
train_set, eval_set
print(len(train_set), len(eval_set))

185 116


In [10]:
train_set[17]

TaggedDocument(words=['prehistoric', 'egyptthe', 'first', 'hominids', 'entered', 'egypt', 'years', 'ago', 'settled', 'along', 'fertile', 'banks', 'river', 'nile', 'million', 'years', 'ago', 'proto', 'nile', 'flowed', 'western', 'desert', 'formed', 'basis', 'present', 'day', 'nile', 'proto', 'nile', 'nile', 'gradually', 'shifted', 'towards', 'east', 'result', 'traces', 'left', 'habitation', 'earliest', 'humans', 'east', 'bank', 'likely', 'obliterated', 'nile', 'probably', 'acquired', 'current', 'form', 'start', 'pliocene', 'era', 'million', 'years', 'ago', 'river', 'fed', 'smaller', 'rivers', 'atbara', 'white', 'nile', 'linking', 'lake', 'victoria', 'blue', 'nile', 'linking', 'lake', 'tana', 'inundated', 'regularly', 'due', 'meltwaters', 'moun', 'tain', 'ranges', 'central', 'africa', 'periodic', 'inundation', 'sulted', 'deposition', 'fertile', 'silt', 'nile', 'could', 'also', 'wash', 'earth', 'valley', 'river', 'eroded', 'passage', 'landscape', 'creating', 'higher', 'areas', 'terraces',

In [11]:
eval_set

[TaggedDocument(words=['th', 'century', 'bc', 'mm', 'fragment', 'leech', 'fibula', 'call', 'nr', 'apm', 'fragment', 'leech', 'fibula', 'metal', 'object', 'ornament', 'bracket', 'leech', 'fibula', 'decoration', 'incised', 'lines', 'hollow', 'gray', 'green', 'patina', 'bronze', 'material', 'incision', 'italy', 'italy', 'presumably', 'allard', 'pierson', 'archaeological', 'collection', 'prehistoric', 'italic'], tags=[0]),
 TaggedDocument(words=['th', 'th', 'century', 'bc', 'mm', 'amphora', 'call', 'nr', 'apm', 'amphora', 'pottery', 'object', 'vessel', 'small', 'wide', 'necked', 'amphora', 'ears', 'quite', 'angular', 'lip', 'shoulder', 'groups', 'concentric', 'circles', 'neck', 'shoulder', 'bundle', 'narrow', 'horizontal', 'bands', 'around', 'waist', 'broad', 'band', 'ground', 'bright', 'red', 'algemeene', 'gids', 'corpus', 'vasorum', 'antiquorum', 'iicb', 'pl', 'vg', 'myres', 'cesnola', 'coll', 'pottery', 'material', 'cyprus', 'allard', 'pierson', 'archaeological', 'collection', 'greek', 

In [12]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [13]:
model.build_vocab(train_set)

In [14]:
model.train(train_set, total_examples=model.corpus_count, epochs=model.epochs)

In [16]:
# Predict the page for the first metadata entry

inferred_vector = model.infer_vector(eval_set[0][0])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(0, ' '.join(eval_set[0][0])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_set[sims[index][0]].words)))

Test Document (0): «th century bc mm fragment leech fibula call nr apm fragment leech fibula metal object ornament bracket leech fibula decoration incised lines hollow gray green patina bronze material incision italy italy presumably allard pierson archaeological collection prehistoric italic»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>:

MOST (22, 0.7229280471801758): «eternal egypt ivory copper objects reveal materials come immediate environment perhaps also away exchanges variety objects underlines sedentary character groups time work objects early villages also houses store belongings with black rim similar famil iar us later pottery naqada pottery shapes become enclosed moving dishes towards pots may related need store food only late neolithic period varied material culture emerge badarian cul ture jewellery made shells stone beads minerals worked bone material culture prehistoric times mainly flint culture one least perishable materi als known prehist

In [34]:
model.dv

<gensim.models.keyedvectors.KeyedVectors at 0x7f8e8ea1fd90>

In [46]:
predictions = []
pred_y_scores = []

for doc_id in range(len(eval_set)):
    inferred_vector = model.infer_vector(eval_set[doc_id][0])
    # sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    sims = model.dv.n_similarity([inferred_vector], model.dv.vectors_for_all(range(0, keys)))

    p, score = sims[0]

    predictions += [p]
    
    pred_y_scores += [sims]

NameError: name 'keys' is not defined

In [47]:
sims = model.score(eval_set)

RuntimeError: We have currently only implemented score for the hierarchical softmax scheme, so you need to have run word2vec with hs=1 and negative=0 for this to work.

In [26]:
pd.DataFrame(pred_y_scores)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,175,176,177,178,179,180,181,182,183,184
0,"(22, 0.7060326337814331)","(173, 0.7026574015617371)","(34, 0.6968171000480652)","(184, 0.6966260671615601)","(21, 0.6852335333824158)","(159, 0.6748228073120117)","(183, 0.6711011528968811)","(126, 0.6618577241897583)","(170, 0.6506900787353516)","(6, 0.6339038610458374)",...,"(81, -0.1163705512881279)","(106, -0.12477529048919678)","(75, -0.13090023398399353)","(99, -0.13231708109378815)","(104, -0.15329872071743011)","(101, -0.1549879014492035)","(84, -0.16974851489067078)","(83, -0.1876322329044342)","(113, -0.18965648114681244)","(95, -0.25154909491539)"
1,"(124, 0.757013738155365)","(184, 0.7550918459892273)","(134, 0.7539668083190918)","(126, 0.746526837348938)","(34, 0.7458354234695435)","(22, 0.7432971596717834)","(6, 0.742488443851471)","(183, 0.7187865972518921)","(173, 0.7121946215629578)","(49, 0.7099859118461609)",...,"(80, 0.03606289625167847)","(38, 0.03273846209049225)","(54, 0.021143672987818718)","(83, -0.014292151667177677)","(177, -0.02161725051701069)","(52, -0.025662260130047798)","(67, -0.027779286727309227)","(15, -0.028976866975426674)","(95, -0.08743083477020264)","(7, -0.1340595781803131)"
2,"(22, 0.778319239616394)","(27, 0.7437394857406616)","(34, 0.711214005947113)","(24, 0.6961179971694946)","(25, 0.6913199424743652)","(21, 0.689603865146637)","(45, 0.6708071231842041)","(18, 0.6695645451545715)","(29, 0.6361473202705383)","(17, 0.6240460872650146)",...,"(99, -0.059825967997312546)","(67, -0.07584772258996964)","(80, -0.08641411364078522)","(139, -0.08735759556293488)","(104, -0.09660336375236511)","(7, -0.1035970076918602)","(83, -0.11027776449918747)","(110, -0.1259814202785492)","(95, -0.15450696647167206)","(117, -0.15934765338897705)"
3,"(22, 0.7682510614395142)","(27, 0.7399616837501526)","(34, 0.6893167495727539)","(24, 0.6876035332679749)","(21, 0.6793449521064758)","(25, 0.6773473620414734)","(45, 0.6608039140701294)","(18, 0.6494408249855042)","(29, 0.613862931728363)","(184, 0.6117404699325562)",...,"(99, -0.07508720457553864)","(80, -0.08048135787248611)","(139, -0.0975002869963646)","(67, -0.09804143756628036)","(104, -0.09906316548585892)","(7, -0.10559669882059097)","(83, -0.12152772396802902)","(110, -0.12447535991668701)","(95, -0.16995061933994293)","(117, -0.17544680833816528)"
4,"(27, 0.760059654712677)","(22, 0.751402735710144)","(25, 0.707380473613739)","(34, 0.6858484148979187)","(24, 0.6806279420852661)","(18, 0.672053873538971)","(21, 0.6689184308052063)","(45, 0.6558712720870972)","(20, 0.6376522183418274)","(184, 0.6333317756652832)",...,"(0, -0.0479915514588356)","(15, -0.05869031324982643)","(99, -0.06756459176540375)","(67, -0.06759833544492722)","(50, -0.0841737911105156)","(104, -0.08499957621097565)","(7, -0.08727802336215973)","(110, -0.10187996923923492)","(95, -0.1133391335606575)","(117, -0.12205470353364944)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,"(126, 0.5988263487815857)","(11, 0.5983354449272156)","(170, 0.5873609781265259)","(184, 0.5768709778785706)","(173, 0.568692684173584)","(174, 0.562102735042572)","(183, 0.5223689675331116)","(182, 0.4937308132648468)","(169, 0.48956313729286194)","(181, 0.4590037167072296)",...,"(64, -0.3792493939399719)","(83, -0.3838755190372467)","(65, -0.39517050981521606)","(76, -0.3998854458332062)","(85, -0.40792182087898254)","(75, -0.41176775097846985)","(38, -0.4252280294895172)","(54, -0.4397358000278473)","(91, -0.44029778242111206)","(79, -0.4807552993297577)"
112,"(184, 0.7693772912025452)","(6, 0.7047033905982971)","(183, 0.7030156850814819)","(173, 0.6993416547775269)","(11, 0.6947557926177979)","(182, 0.6765574216842651)","(126, 0.6722748875617981)","(152, 0.6716979742050171)","(3, 0.6554381251335144)","(159, 0.647588849067688)",...,"(123, -0.008160630241036415)","(7, -0.02231680043041706)","(84, -0.023651305586099625)","(92, -0.02758900821208954)","(83, -0.04289646074175835)","(95, -0.07267769426107407)","(101, -0.07451941072940826)","(15, -0.08598615974187851)","(113, -0.09343944489955902)","(67, -0.10993031412363052)"
113,"(126, 0.6779747605323792)","(159, 0.6444694995880127)","(173, 0.6383281350135803)","(124, 0.6311562061309814)","(184, 0.6300539374351501)","(134, 0.6218796372413635)","(152, 0.6179438829421997)","(170, 0.6086859703063965)","(11, 0.6077624559402466)","(9, 0.6020785570144653)",...,"(90, -0.1538849025964737)","(7, -0.17161595821380615)","(89, -0.1770641803741455)","(113, -0.18671244382858276)","(92, -0.18880701065063477)","(75, -0.19457948207855225)","(91, -0.19629977643489838)","(95, -0.23079442977905273)","(84, -0.24001409113407135)","(83, -0.24417923390865326)"
114,"(159, 0.6134270429611206)","(134, 0.5538808107376099)","(126, 0.5468470454216003)","(35, 0.5420143604278564)","(33, 0.5227313041687012)","(152, 0.5127876400947571)","(22, 0.5064452886581421)","(34, 0.5014567971229553)","(45, 0.49997764825820923)","(42, 0.47625532746315)",...,"(7, -0.21018581092357635)","(89, -0.2116410732269287)","(75, -0.2169867902994156)","(83, -0.2405436933040619)","(38, -0.2539156377315521)","(104, -0.2548913359642029)","(113, -0.26310300827026367)","(95, -0.26525890827178955)","(84, -0.3080902695655823)","(99, -0.31928181648254395)"


In [19]:
# process labels
labels = pd.read_csv('../labels/egypt.csv')
evaluation_metadata = pd.read_csv('../metadata/evaluation_metadata_eng.csv')

In [20]:
# create the ground truth so that the metadata
# lines up with the corresponding ground truth page

ground_truth = {'apm_code':[], 'page':[]}

for i in range(len(evaluation_metadata)):
    code = evaluation_metadata['apm_code'].iloc[i]
    page = labels.loc[labels['apm_code'] == code]['page']

    ground_truth['apm_code'] += [code]
    ground_truth['page'] += [int(page)]

In [21]:
labels = pd.DataFrame(ground_truth)
labels

Unnamed: 0,apm_code,page
0,1961,17
1,1962,17
2,4170,17
3,4171,17
4,4172,17
...,...,...
111,14513,150
112,16750,152
113,14510,152
114,8189,153


In [22]:
len(ground_truth['page'])

116

In [23]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score

print('accuracy of the Doc2Vec model:')
print(accuracy_score(ground_truth['page'], predictions,normalize=True))

print('top-k accuracy of the Doc2Vec model:')
print(top_k_accuracy_score(list(ground_truth['page']), np.array(pred_y_scores), k=10,normalize=True, labels=list(range(0,185))))

accuracy of the Doc2Vec model:
0.05172413793103448
top-k accuracy of the Doc2Vec model:


ValueError: Found array with dim 3. None expected <= 2.

In [None]:
correct_predictions = 0

for id in range(len(pred_y)):

    p = pred_y['pred_page'][id]

    apm_code = evaluation.iloc[0]['apm_code']

    for l in range(len(labels)):
            if id in list(labels.iloc[l][2:]):
                correct_page = labels.iloc[l][1]

    for m in range(1, len(mapping) -1):
            if correct_page == eval(mapping.iloc[m]['page_per_id'])[0] or p == eval(mapping.iloc[m]['page_per_id'])[1]:
                correct_pageid = m
            else:
                correct = -1
    if p == correct_pageid:
        print('prediction correct')
        correct_predictions += 1

correct_predictions

0

### Evaluation

In [None]:
labels = pd.read_csv('../labels/egypt.csv')
labels

Unnamed: 0.1,Unnamed: 0,page,1,2,3,4,5,6,7
0,0,17,1961,1962.0,4170.0,4171.0,4172.0,4173.0,4222.0
1,2,21,15290,,,,,,
2,3,22,4162,4218.0,4164.0,4219.0,3974.0,3863.0,3972.0
3,4,23,12637,,,,,,
4,5,33,12720,15276.0,3943.0,,,,
...,...,...,...,...,...,...,...,...,...
65,66,145,6286,,,,,,
66,67,150,12995,14513.0,,,,,
67,68,152,16750,14510.0,,,,,
68,69,153,8189,,,,,,


In [None]:
evaluation = pd.read_csv('../metadata/evaluation_metadata.csv')

In [None]:
def get_pageid(id):
    for l in range(len(labels)):
        if id in list(labels.iloc[l][2:]):
            correct_page = labels.iloc[l][1]

    for m in range(1, len(mapping) -1):
        if correct_page == eval(mapping.iloc[m]['page_per_id'])[0] or p == eval(mapping.iloc[m]['page_per_id'])[1]:
            correct_pageid = m
        else:
            correct = -1
    return correct_pageid

In [None]:
mapping = pd.read_csv('mapping.csv')
mapping

Unnamed: 0.1,Unnamed: 0,page_per_id
0,0,1
1,1,"(2, 3)"
2,2,"(4, 5)"
3,3,"(6, 7)"
4,4,"(8, 9)"
...,...,...
88,88,"(176, 177)"
89,89,"(178, 179)"
90,90,"(180, 181)"
91,91,"(182, 183)"


In [None]:
# helper function to determine whether a prediction is correct

def is_correct(p, ground_truth):

    for l in range(len(ground_truth)):
        if p in list(ground_truth.iloc[l][2:]):
            p = ground_truth.iloc[l][1]
            print(p)

    for m in range(1, len(mapping) -1):
        if str(round(p)) == eval(mapping.iloc[m]['page_per_id'])[0] or p == eval(mapping.iloc[m]['page_per_id'])[1]:
            correct = m
        else:
            correct = -1
    return p == correct

In [None]:
# function to evaluate predictions
def evaluate(pred_y, ground_truth_file):

    ground_truth = pd.read_csv(ground_truth_file)
    predictions = pred_y['pred_page']

    correct_predictions = 0
    for p in range(len(predictions)):

        prediction = predictions[p]
        correct = get_pageid(evaluation[p])

        if prediction == correct:
            correct_predictions +=1

    return correct_predictions


In [None]:
evaluate(pred_y[:3], '../labels/egypt.csv')

KeyError: 0

## TF-IDF

TF-IDF for matching objects to pages at museum catalogs

In [None]:
import csv
import string
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

dirname = '../data/catalogs/'
fname = 'Eternal_Egypt_LR'

output_dir = '../raw_extract/'

In [None]:
# prepare data
raw_corpus = pd.read_csv(output_dir + 'extract_Eternal_Egypt_LR.csv')
raw_corpus = raw_corpus.replace(np.nan,'',regex=True)

documents = list(raw_corpus['data'])
documents

['',
 'EtErnal Egypt',
 '',
 'AllArd  \npierson \nmuseum\nBen van den Bercken\nWillem van Haarlem\nWith contributions by\nH. Willems\nD. HuygeM.J. RavenF. Choël\nK. Innemée',
 'eternal egypt4',
 '5',
 '6 Published by the Allard Pierson Museum to coincide with \nthe Eternal Egypt Experience exhibition from 12 July 2013 \nto 5 January 2014. Exhibition and publication are the result of close cooperation between the Allard Pierson Museum, Bibliotheca Alexandrina/Cultnat, the Netherlands- \nFlemish Institute in Cairo (NVIC) and the Netherlands \nOrganisation for Scientific Research (NWO).\nSponsors\nThe exhibition and publication have been realized with  \nthe generous support of DJOSER Reizen, the Friends of the Allard Pierson Museum and the University of Amsterdam.',
 '7',
 '8Contents\n Foreword  –  11\n Chronology  –  12\n \n1 Prehistoric Egypt 1,600,000-4,000 bc   –  16\n Ben van den Bercken\n Climate and climate change\n Religion and Magic\n Material culture\n Social organization\n exc

In [None]:
# preprocess the data
def preprocess_document(document):
    # Tokenization
    tokens = word_tokenize(document)
    # Lowercase conversion
    tokens = [token.lower() for token in tokens]
    # Punctuation removal
    tokens = [token for token in tokens if token not in string.punctuation]
    # Stop word removal
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]

    return " ".join(tokens)

In [None]:
preprocessed_documents = [preprocess_document(document) for document in documents]
print(preprocessed_documents)

['', 'eternal egypt', '', 'allard pierson museum ben van den bercken willem van haarlem contributions h. willems d. huygem.j ravenf choël k. innemée', 'eternal egypt4', '5', '6 published allard pierson museum coincide eternal egypt experience exhibition 12 july 2013 5 january 2014 exhibition publication result close cooperation allard pierson museum bibliotheca alexandrina/cultnat netherlands- flemish institute cairo nvic netherlands organisation scientific research nwo sponsors exhibition publication realized generous support djoser reizen friends allard pierson museum university amsterdam', '7', '8contents foreword – 11 chronology – 12 1 prehistoric egypt 1,600,000-4,000 bc – 16 ben van den bercken climate climate change religion magic material culture social organization excav ation rock art el-hosh qurta 2 predynastic early dynastic periods 4,000-2707 bc – 28 willem van haarlem material culture religion writing kingship excav ation tell ibrahim awad 3 old kingdom first intermediate

In [None]:
# process labels
labels = pd.read_csv('../labels/egypt.csv')
evaluation_metadata = pd.read_csv('../metadata/evaluation_metadata_eng.csv')

In [None]:
ground_truth = {'apm_code':[], 'page':[]}

for i in range(len(evaluation_metadata)):
    code = evaluation_metadata['apm_code'].iloc[i]
    page = labels.loc[labels['apm_code'] == code]['page']

    ground_truth['apm_code'] += [code]
    ground_truth['page'] += [int(page)]

In [None]:
labels = pd.DataFrame(ground_truth)
labels

Unnamed: 0,apm_code,page
0,1961,17
1,1962,17
2,4170,17
3,4171,17
4,4172,17
...,...,...
111,14513,150
112,16750,152
113,14510,152
114,8189,153


In [None]:
# Step 4: Compute TF-IDF values
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

# Convert TF-IDF document term matrix to DataFrame
feature_names = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Print the TF-IDF DataFrame
print("\nTF-IDF DataFrame:")
print(df_tfidf)


TF-IDF DataFrame:
          000      0001  050      0573      0626      0750      0793  \
0    0.000000  0.000000  0.0  0.000000  0.000000  0.000000  0.000000   
1    0.000000  0.000000  0.0  0.000000  0.000000  0.000000  0.000000   
2    0.000000  0.000000  0.0  0.000000  0.000000  0.000000  0.000000   
3    0.000000  0.000000  0.0  0.000000  0.000000  0.000000  0.000000   
4    0.000000  0.000000  0.0  0.000000  0.000000  0.000000  0.000000   
..        ...       ...  ...       ...       ...       ...       ...   
180  0.030222  0.000000  0.0  0.000000  0.000000  0.000000  0.000000   
181  0.000000  0.000000  0.0  0.000000  0.000000  0.000000  0.000000   
182  0.000000  0.000000  0.0  0.000000  0.000000  0.000000  0.000000   
183  0.000000  0.000000  0.0  0.000000  0.000000  0.000000  0.000000   
184  0.000000  0.049887  0.0  0.049887  0.049887  0.049887  0.049887   

           10       100  101the  ...  zones  zoos  zulaq  zuweila    zwolle  \
0    0.000000  0.000000     0.0  ... 

In [None]:
# Predict
predictions = []
pred_y_scores = []

for i in range(len(evaluation_metadata)):
    new_document = evaluation_metadata['data'].iloc[i]

    new_preprocessed_document = preprocess_document(new_document)
    new_tfidf_vector = vectorizer.transform([new_preprocessed_document])

    # Convert TF-IDF matrix to DataFrame
    feature_names = vectorizer.get_feature_names_out()
    df_tfidf_new = pd.DataFrame(new_tfidf_vector.toarray(), columns=feature_names)

    similarity_scores = cosine_similarity(new_tfidf_vector, tfidf_matrix)

    p = similarity_scores.argmax()
    predictions += [p]

    pred_y_scores += list(similarity_scores)

In [None]:
predictions

[16,
 16,
 6,
 6,
 6,
 6,
 6,
 6,
 18,
 6,
 177,
 11,
 6,
 11,
 6,
 22,
 22,
 34,
 35,
 33,
 6,
 3,
 36,
 6,
 6,
 6,
 182,
 37,
 120,
 6,
 6,
 135,
 3,
 42,
 6,
 11,
 15,
 12,
 3,
 48,
 51,
 6,
 135,
 135,
 6,
 48,
 6,
 6,
 6,
 79,
 11,
 76,
 170,
 6,
 77,
 6,
 78,
 3,
 6,
 6,
 6,
 87,
 74,
 6,
 98,
 16,
 100,
 102,
 6,
 103,
 103,
 98,
 145,
 98,
 15,
 11,
 11,
 98,
 98,
 112,
 98,
 6,
 6,
 6,
 132,
 114,
 1,
 98,
 6,
 141,
 124,
 124,
 98,
 6,
 130,
 118,
 6,
 15,
 3,
 6,
 6,
 3,
 6,
 128,
 141,
 125,
 6,
 143,
 6,
 11,
 11,
 150,
 108,
 152,
 11,
 33]

In [None]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score

print('accuracy of the TF-IDF model:')
print(accuracy_score(list(ground_truth['page']), predictions,normalize=True))

print('top-k accuracy of the TF-IDF model:')
print(top_k_accuracy_score(list(ground_truth['page']), np.array(pred_y_scores), k=10,normalize=True, labels=list(range(0,185))))

accuracy of the TF-IDF model:
0.1896551724137931
top-k accuracy of the TF-IDF model:
0.6896551724137931


## BM25

In [None]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
import string
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi


dirname = '../data/catalogs/'
fname = 'Eternal_Egypt_LR'

output_dir = '../raw_extract/'

In [None]:
# prepare data
raw_corpus = pd.read_csv(output_dir + 'extract_Eternal_Egypt_LR.csv')
raw_corpus

Unnamed: 0,data
0,
1,EtErnal Egypt
2,
3,AllArd \npierson \nmuseum\nBen van den Bercke...
4,eternal egypt4
...,...
180,eternal egypt180 Sultan: noble title – later ...
181,"181BiBliograpHyDerriks, C., L. Delvaux, Antiqu..."
182,eternal egypt182\nPhotos of objects in the All...
183,183tHE nEtHErlands-FlEmisH institutE in Cairo\...


In [None]:
raw_corpus = raw_corpus.replace(np.nan,'',regex=True)

documents = list(raw_corpus['data'])
documents

['',
 'EtErnal Egypt',
 '',
 'AllArd  \npierson \nmuseum\nBen van den Bercken\nWillem van Haarlem\nWith contributions by\nH. Willems\nD. HuygeM.J. RavenF. Choël\nK. Innemée',
 'eternal egypt4',
 '5',
 '6 Published by the Allard Pierson Museum to coincide with \nthe Eternal Egypt Experience exhibition from 12 July 2013 \nto 5 January 2014. Exhibition and publication are the result of close cooperation between the Allard Pierson Museum, Bibliotheca Alexandrina/Cultnat, the Netherlands- \nFlemish Institute in Cairo (NVIC) and the Netherlands \nOrganisation for Scientific Research (NWO).\nSponsors\nThe exhibition and publication have been realized with  \nthe generous support of DJOSER Reizen, the Friends of the Allard Pierson Museum and the University of Amsterdam.',
 '7',
 '8Contents\n Foreword  –  11\n Chronology  –  12\n \n1 Prehistoric Egypt 1,600,000-4,000 bc   –  16\n Ben van den Bercken\n Climate and climate change\n Religion and Magic\n Material culture\n Social organization\n exc

In [None]:
# process ground truth so that it is correct

labels = pd.read_csv('../labels/egypt.csv')
evaluation_metadata = pd.read_csv('../metadata/evaluation_metadata_eng.csv')

ground_truth = {'apm_code':[], 'page':[]}

for i in range(len(evaluation_metadata)):
    code = evaluation_metadata['apm_code'].iloc[i]
    page = labels.loc[labels['apm_code'] == code]['page']

    ground_truth['apm_code'] += [code]
    ground_truth['page'] += [int(page)]

labels = pd.DataFrame(ground_truth)
labels

FileNotFoundError: ignored

In [None]:
# preprocess the data
def preprocess_document(document):
    # Tokenization
    tokens = word_tokenize(document)
    # Lowercase conversion
    tokens = [token.lower() for token in tokens]
    # Punctuation removal
    tokens = [token for token in tokens if token not in string.punctuation]
    # Stop word removal
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]

    return " ".join(tokens)

In [None]:
preprocessed_documents = [preprocess_document(document) for document in documents]
print(preprocessed_documents)

['', 'eternal egypt', '', 'allard pierson museum ben van den bercken willem van haarlem contributions h. willems d. huygem.j ravenf choël k. innemée', 'eternal egypt4', '5', '6 published allard pierson museum coincide eternal egypt experience exhibition 12 july 2013 5 january 2014 exhibition publication result close cooperation allard pierson museum bibliotheca alexandrina/cultnat netherlands- flemish institute cairo nvic netherlands organisation scientific research nwo sponsors exhibition publication realized generous support djoser reizen friends allard pierson museum university amsterdam', '7', '8contents foreword – 11 chronology – 12 1 prehistoric egypt 1,600,000-4,000 bc – 16 ben van den bercken climate climate change religion magic material culture social organization excav ation rock art el-hosh qurta 2 predynastic early dynastic periods 4,000-2707 bc – 28 willem van haarlem material culture religion writing kingship excav ation tell ibrahim awad 3 old kingdom first intermediate

In [None]:
tokenized_corpus = [doc.split(" ") for doc in preprocessed_documents]

bm25 = BM25Okapi(tokenized_corpus)

In [None]:
evaluation_metadata = pd.read_csv('../metadata/evaluation_metadata_eng.csv')
query = evaluation_metadata.iloc[0]['data']

tokenized_query = query.split(" ")

doc_scores = bm25.get_scores(tokenized_query)

In [None]:
doc_scores.shape

(185,)

In [None]:
# evaluate
predictions = []
pred_y_scores = []

for i in range(len(evaluation_metadata)):
    new_document = evaluation_metadata['data'].iloc[i]

    query = preprocess_document(new_document)
    tokenized_query = query.split(" ")

    similarity_scores = bm25.get_scores(tokenized_query)

    p = similarity_scores.argmax()
    predictions += [p]
    pred_y_scores += [list(similarity_scores)]

In [None]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score

print('accuracy of the BM25 model:')
print(accuracy_score(list(ground_truth['page']), predictions, normalize=True))

print('top-k accuracy of the BM25 model:')
print(top_k_accuracy_score(ground_truth['page'], np.array(pred_y_scores), k=10,normalize=True, labels=list(range(0,185))))

accuracy of the BM25 model:
0.27586206896551724
top-k accuracy of the BM25 model:
0.7672413793103449


## BERT

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np

In [2]:
# raw_corpus = pd.read_csv('/content/drive/MyDrive/STUDIE/MASTER/thesis/data/extract_Eternal_Egypt_LR.csv')

raw_corpus = pd.read_csv('../raw_extract/extract_Eternal_Egypt_LR.csv')


In [3]:
raw_corpus = raw_corpus.replace(np.nan,'',regex=True)
documents = list(raw_corpus['data'])

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

# tokenizer = AutoTokenizer.from_pretrained("ohshimalab/bert-base-minpaku")
# model = AutoModelForMaskedLM.from_pretrained("ohshimalab/bert-base-minpaku")

In [6]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# model.to(device)

In [7]:
# initialize dictionary to store tokenized sentences
tokens = {'input_ids': [], 'attention_mask': []}

for sentence in documents:
    # encode each sentence and append to dictionary
    new_tokens = tokenizer.encode_plus(sentence, max_length=128,
                                       truncation=True, padding='max_length',
                                       return_tensors='pt')

    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

# reformat list of tensors into single tensor
tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

In [8]:
outputs = model(**tokens)

embeddings = outputs.last_hidden_state
attention_mask = tokens['attention_mask']
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
masked_embeddings = embeddings * mask
summed = torch.sum(masked_embeddings, 1)
summed_mask = torch.clamp(mask.sum(1), min=1e-9)

mean_pooled = summed / summed_mask


: 

: 

In [None]:
# store mean pooled in other variable
embeddings_book = mean_pooled

In [None]:
# store locally
embeddings_book = embeddings_book.cpu().detach().numpy()

In [None]:
torch.save(mean_pooled, 'book_embedded.t')

In [None]:
#  = mean_pooled.cpu()
# mean_pooled = mean_pooled.detach().numpy().cpu()

# calculate
cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
).argmax()

1

In [None]:
evaluation_metadata = pd.read_csv('/content/drive/MyDrive/STUDIE/MASTER/thesis/data/evaluation_metadata_eng.csv')

In [None]:
evaluation_corpus = evaluation_metadata.replace(np.nan,'',regex=True)
documents_evaluation = list(evaluation_corpus['data'])

In [None]:
# initialize dictionary to store tokenized sentences
tokens_eval = {'input_ids': [], 'attention_mask': []}

for sentence in documents_evaluation:
    # encode each sentence and append to dictionary
    new_tokens = tokenizer.encode_plus(sentence, max_length=128,
                                       truncation=True, padding='max_length',
                                       return_tensors='pt')
    new_tokens.to('cpu')

    tokens_eval['input_ids'].append(new_tokens['input_ids'][0])
    tokens_eval['attention_mask'].append(new_tokens['attention_mask'][0])

# reformat list of tensors into single tensor
tokens_eval['input_ids'] = torch.stack(tokens_eval['input_ids'])
tokens_eval['attention_mask'] = torch.stack(tokens_eval['attention_mask'])

In [None]:
outputs_eval = model(**tokens_eval)

In [None]:
embeddings = outputs_eval.last_hidden_state

In [None]:
attention_mask = tokens_eval['attention_mask']

In [None]:
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()

In [None]:
masked_embeddings = embeddings * mask

In [None]:
summed = torch.sum(masked_embeddings, 1)

In [None]:
summed_mask = torch.clamp(mask.sum(1), min=1e-9)

In [None]:
mean_pooled = summed / summed_mask

In [None]:
evaluation = mean_pooled

In [None]:
evaluation = evaluation.cpu().detach().numpy()

In [None]:
torch.save(evaluation, '/content/evaluation_embeddings.t')

In [None]:
evaluation.shape, embeddings_book.shape

(torch.Size([116, 768]), torch.Size([185, 768]))

In [None]:
torch.cuda.empty_cache()

In [None]:
embeddings_book = embeddings_book.detach().numpy()

In [None]:
device.reset()

AttributeError: ignored

In [None]:
!nvidia-smi

Mon Jun 12 19:06:47 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    57W / 400W |  40507MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
torch.cuda.clear_cache()

AttributeError: ignored

In [None]:
# evaluate
predictions = []
pred_y_scores = []

for i in range(evaluation.shape[0]):
    embedding = evaluation[i]

    p = cosine_similarity(
        [embedding],
        embeddings_book).argmax()

    predictions += [p]


In [None]:
predictions

[179,
 20,
 20,
 20,
 20,
 20,
 8,
 179,
 179,
 179,
 8,
 8,
 179,
 20,
 179,
 59,
 134,
 8,
 31,
 8,
 8,
 182,
 8,
 181,
 8,
 126,
 59,
 126,
 126,
 8,
 8,
 8,
 181,
 123,
 9,
 160,
 47,
 91,
 48,
 134,
 123,
 179,
 8,
 146,
 47,
 8,
 54,
 9,
 70,
 8,
 8,
 36,
 8,
 8,
 8,
 9,
 48,
 9,
 179,
 181,
 91,
 181,
 8,
 8,
 107,
 59,
 8,
 179,
 8,
 8,
 8,
 134,
 8,
 8,
 8,
 59,
 59,
 181,
 181,
 109,
 8,
 8,
 141,
 123,
 8,
 141,
 181,
 134,
 8,
 107,
 48,
 107,
 107,
 179,
 90,
 134,
 109,
 134,
 184,
 182,
 47,
 182,
 181,
 36,
 48,
 182,
 107,
 38,
 181,
 53,
 8,
 8,
 91,
 59,
 146,
 125]

In [None]:
# process ground truth so that it is correct

labels = pd.read_csv('/content/drive/MyDrive/STUDIE/MASTER/thesis/data/egypt.csv')
evaluation_metadata = pd.read_csv('/content/drive/MyDrive/STUDIE/MASTER/thesis/data/evaluation_metadata_eng.csv')

ground_truth = {'apm_code':[], 'page':[]}

for i in range(len(evaluation_metadata)):
    code = evaluation_metadata['apm_code'].iloc[i]
    page = labels.loc[labels['apm_code'] == code]['page']

    ground_truth['apm_code'] += [code]
    ground_truth['page'] += [int(page)]

labels = pd.DataFrame(ground_truth)
labels

Unnamed: 0,apm_code,page
0,1961,17
1,1962,17
2,4170,17
3,4171,17
4,4172,17
...,...,...
111,14513,150
112,16750,152
113,14510,152
114,8189,153


In [None]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score

print('accuracy of the BERT model:')
print(accuracy_score(list(ground_truth['page']), predictions, normalize=True))

# print('top-k accuracy of the BM25 model:')
# print(top_k_accuracy_score(ground_truth['page'], np.array(pred_y_scores), k=10,normalize=True, labels=list(range(0,185))))

accuracy of the BERT model:
0.008620689655172414
