# Models file for query-document matching

## Doc2Vec

In [1]:
import gensim
import pandas as pd
import numpy as np

dirname = '../processed/'
EGYPT_EN = 'processed_Eternal_Egypt_LR.csv'

dirname_metadata = '../metadata/'
queries = 'evaluation_metadata_eng.csv' 

In [2]:
# prepare data
raw_corpus = pd.read_csv(dirname + EGYPT_EN)
raw_corpus = raw_corpus.replace(np.nan,'',regex=True)

documents = list(raw_corpus['data'])

In [3]:
def read_corpus(fname, tokens_only=False):

    for i, row in pd.read_csv(fname).iterrows():
        try:
            tokens = gensim.utils.simple_preprocess(row['data'])
        except:
            tokens = ''
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [4]:
train_set = list(read_corpus(dirname + EGYPT_EN ))
eval_set = list(read_corpus('../metadata/processed_evaluation_metadata_eng.csv'))

In [5]:
train_set, eval_set
print(len(train_set), len(eval_set))

185 116


In [6]:
#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec

max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1,
                epochs=max_epochs)

In [39]:
model.build_vocab(train_set)

In [40]:
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(train_set,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [15]:
model.save("models/d2v.model")
print("Model Saved")

Model Saved


In [44]:
# Predict the page for the first metadata entry
inferred_vector = model.infer_vector(eval_set[0][0])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(0, ' '.join(eval_set[0][0])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_set[sims[index][0]].words)))

Test Document (0): «th century bc mm fragment leech fibula call nr apm fragment leech fibula metal object ornament bracket leech fibula decoration incised lines hollow gray green patina bronze material incision italy italy presumably allard pierson archaeological collection prehistoric italic»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d20,n5,w5,s0.001,t3>:

MOST (11, 0.751389741897583): «eternal egypt unfurls fascinating history egypt goes deeper pharaoh mummy stereotypes focuses long history egypt earliest prehis tory bc modern coptic islamic times offer readers overview rich egyptian collection allard pierson museum archaeological fieldwork conducted egypt dutch flemish universities ancient egyptian culture archaeological collection museum illustrates inspiring multifaceted history egypt prehistory including early middle ages archaeological fieldwork egypt resulted surprising new perspec tives many years last least publication also highlights excavations conducted allard pierso

In [45]:
predictions = []
pred_y_scores = []

for doc_id in range(len(eval_set)):
    inferred_vector = model.infer_vector(eval_set[doc_id][0])
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

    p, score = sims[0]
    predictions += [p]

    sims.sort(key=lambda x: x[0])

    res = [[i for i, j in sims],
       [j for i, j in sims]]
    
    pred_y_scores += [res[1]] 

In [46]:
scores = pd.DataFrame(pred_y_scores)

In [7]:
# process labels
labels = pd.read_csv('../labels/egypt.csv')
evaluation_metadata = pd.read_csv('../metadata/evaluation_metadata_eng.csv')

In [8]:
# create the ground truth so that the metadata
# lines up with the corresponding ground truth page

ground_truth = {'apm_code':[], 'page':[]}

for i in range(len(evaluation_metadata)):
    code = evaluation_metadata['apm_code'].iloc[i]
    page = labels.loc[labels['apm_code'] == code]['page']

    ground_truth['apm_code'] += [code]
    ground_truth['page'] += [int(page)]

In [9]:
labels = pd.DataFrame(ground_truth)

### Evaluation

In [50]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score, precision_score, recall_score, f1_score

print('Accuracy of the Doc2Vec model:')
print(accuracy_score(ground_truth['page'], predictions,normalize=True))

print('precision of the Doc2Vec model:')
print(precision_score(ground_truth['page'], predictions, average='macro'))

print('recall of the Doc2Vec model:')
print(recall_score(ground_truth['page'], predictions, average='micro'))

print('f1 score of the Doc2Vec model:')
print(f1_score(ground_truth['page'], predictions, average='micro'))

print('top-k accuracy of the Doc2Vec model:')
print(top_k_accuracy_score(list(ground_truth['page']), np.array(pred_y_scores), k=10,normalize=True, labels=list(range(0,185))))

Accuracy of the Doc2Vec model:
0.034482758620689655
precision of the Doc2Vec model:
0.031914893617021274
recall of the Doc2Vec model:
0.034482758620689655
f1 score of the Doc2Vec model:
0.034482758620689655
top-k accuracy of the Doc2Vec model:
0.2413793103448276


  _warn_prf(average, modifier, msg_start, len(result))


## TF-IDF

TF-IDF for matching objects to pages at museum catalogs

In [10]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
# prepare data
raw_corpus = pd.read_csv(dirname + EGYPT_EN)
raw_corpus = raw_corpus.replace(np.nan,'',regex=True)

documents = list(raw_corpus['data'])

# preprocessed_documents = pd.read_csv(dirname + EGYPT_EN)
preprocessed_metadata = pd.read_csv('../metadata/processed_evaluation_metadata_eng.csv')

In [12]:
# Step 4: Compute TF-IDF values
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Convert TF-IDF document term matrix to DataFrame
feature_names = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Print the TF-IDF DataFrame
print("\nTF-IDF DataFrame:")
print(df_tfidf)


TF-IDF DataFrame:
         000  050        10  100  101the       104  105the  106  107  1070  \
0    0.00000  0.0  0.000000  0.0     0.0  0.000000     0.0  0.0  0.0   0.0   
1    0.00000  0.0  0.000000  0.0     0.0  0.000000     0.0  0.0  0.0   0.0   
2    0.00000  0.0  0.000000  0.0     0.0  0.000000     0.0  0.0  0.0   0.0   
3    0.00000  0.0  0.000000  0.0     0.0  0.000000     0.0  0.0  0.0   0.0   
4    0.00000  0.0  0.000000  0.0     0.0  0.000000     0.0  0.0  0.0   0.0   
..       ...  ...       ...  ...     ...       ...     ...  ...  ...   ...   
180  0.03161  0.0  0.039296  0.0     0.0  0.052434     0.0  0.0  0.0   0.0   
181  0.00000  0.0  0.000000  0.0     0.0  0.000000     0.0  0.0  0.0   0.0   
182  0.00000  0.0  0.000000  0.0     0.0  0.000000     0.0  0.0  0.0   0.0   
183  0.00000  0.0  0.000000  0.0     0.0  0.000000     0.0  0.0  0.0   0.0   
184  0.00000  0.0  0.000000  0.0     0.0  0.000000     0.0  0.0  0.0   0.0   

     ...  zones  zoos  zulaq  zuweila    zwo

In [13]:
# Predict
predictions = []
pred_y_scores = []

for i in range(len(evaluation_metadata)):
    new_document = preprocessed_metadata['data'].iloc[i]
    new_tfidf_vector = vectorizer.transform([new_document])

    # Convert TF-IDF matrix to DataFrame
    feature_names = vectorizer.get_feature_names_out()
    df_tfidf_new = pd.DataFrame(new_tfidf_vector.toarray(), columns=feature_names)

    similarity_scores = cosine_similarity(new_tfidf_vector, tfidf_matrix)

    p = similarity_scores.argmax()
    predictions += [p]
    pred_y_scores += list(similarity_scores)

In [14]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score, precision_score, recall_score, f1_score

print('Accuracy of the TF-IDF model:')
print(accuracy_score(ground_truth['page'], predictions,normalize=True))

print('precision of the TF-IDF model:')
print(precision_score(ground_truth['page'], predictions, average='macro'))

print('recall of the TF-IDF model:')
print(recall_score(ground_truth['page'], predictions, average='macro'))

print('f1 score of the TF-IDF model:')
print(f1_score(ground_truth['page'], predictions, average='macro'))

print('top-k accuracy of the TF-IDF model:')
print(top_k_accuracy_score(list(ground_truth['page']), np.array(pred_y_scores), k=10,normalize=True, labels=list(range(0,185))))

Accuracy of the TF-IDF model:
0.16379310344827586
precision of the TF-IDF model:
0.18154761904761904
recall of the TF-IDF model:
0.12202380952380952
f1 score of the TF-IDF model:
0.14030612244897958
top-k accuracy of the TF-IDF model:
0.6982758620689655


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## BM25

In [56]:
from rank_bm25 import BM25Okapi

In [57]:
tokenized_corpus = [doc.split(" ") for doc in documents]
bm25 = BM25Okapi(tokenized_corpus)

In [58]:
query = evaluation_metadata.iloc[0]['data']

tokenized_query = query.split(" ")

doc_scores = bm25.get_scores(tokenized_query)

In [59]:
doc_scores.shape

(185,)

In [60]:
# evaluate
predictions = []
pred_y_scores = []

for i in range(len(evaluation_metadata)):
    query = evaluation_metadata['data'].iloc[i]
    tokenized_query = query.split(" ")

    similarity_scores = bm25.get_scores(tokenized_query)

    p = similarity_scores.argmax()
    predictions += [p]
    pred_y_scores += [list(similarity_scores)]

In [61]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score, precision_score, recall_score, f1_score

print('Accuracy of the BM25 model:')
print(accuracy_score(ground_truth['page'], predictions,normalize=True))

print('precision of the BM25 model:')
print(precision_score(ground_truth['page'], predictions, average='macro'))

print('recall of the BM25 model:')
print(recall_score(ground_truth['page'], predictions, average='micro'))

print('f1 score of the BM25 model:')
print(f1_score(ground_truth['page'], predictions, average='micro'))

print('top-k accuracy of the BM25 model:')
print(top_k_accuracy_score(list(ground_truth['page']), np.array(pred_y_scores), k=10,normalize=True, labels=list(range(0,185))))

Accuracy of the BM25 model:
0.2413793103448276
precision of the BM25 model:
0.16218944099378882
recall of the BM25 model:
0.2413793103448276
f1 score of the BM25 model:
0.2413793103448276
top-k accuracy of the BM25 model:
0.6206896551724138


  _warn_prf(average, modifier, msg_start, len(result))
