# Text-Matching for Eternal Egypt
TF-IDF as a pilot for matching pages to objects at museum catalogs

## P_z = Prepare the data for text-matching pilot
page processing


In [2]:
import csv
import string
import pandas as pd
import numpy as np
import nltk

from PyPDF2 import PdfReader
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

dirname_metadata='../metadata/'
fname_metadata='evaluation_metadata_eng.csv'

In [3]:
raw_corpus = pd.read_csv(dirname_metadata + fname_metadata)

In [4]:
raw_corpus = raw_corpus.replace(np.nan,'',regex=True)

In [5]:
documents = list(raw_corpus['data'])

In [6]:
documents

['6th century BC, d 055 (mm), fragment leech fibula, -600/-500, Call Nr:APM01961, fragment leech fibula, metal, object, ornament, bracket of leech fibula; decoration incised lines; hollow; gray green patina, bronze material, incision, Italy, Italy (presumably), Allard Pierson Archaeological Collection, -600, -500, prehistoric Italic,',
 '8th-7th century BC, h 117 (mm), amphora, -800/-600, Call Nr:APM01962, amphora, pottery, object, vessel, small wide-necked amphora; ears quite angular, from lip to on shoulder, groups of concentric circles on neck and shoulder, bundle of 5 narrow horizontal bands around the waist, a broad band below, ground bright red, Algemeene Gids 181, Corpus Vasorum Antiquorum I. IICb, pl 4:7, vg Myres , Cesnola coll. p.105, pottery material, Cyprus, Allard Pierson Archaeological Collection, -800, -600, Greek Greek, geometric, ',
 'prehistoric or earlier, d 034 (mm), knife, microlith, -3000, Call Nr:APM04170, knife, microlith, utensil, object, stone, microlith; narr

In [7]:
# Step 3: Preprocessing the documents
def preprocess_document(document):
    # Tokenization
    tokens = word_tokenize(document)
    # Lowercase conversion
    tokens = [token.lower() for token in tokens] 
    # Punctuation removal
    tokens = [token for token in tokens if token not in string.punctuation]
    # Stop word removal
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]

    return " ".join(tokens)
 
preprocessed_documents = [preprocess_document(document) for document in documents]
print(preprocessed_documents)

['6th century bc 055 mm fragment leech fibula -600/-500 call nr apm01961 fragment leech fibula metal object ornament bracket leech fibula decoration incised lines hollow gray green patina bronze material incision italy italy presumably allard pierson archaeological collection -600 -500 prehistoric italic', '8th-7th century bc h 117 mm amphora -800/-600 call nr apm01962 amphora pottery object vessel small wide-necked amphora ears quite angular lip shoulder groups concentric circles neck shoulder bundle 5 narrow horizontal bands around waist broad band ground bright red algemeene gids 181 corpus vasorum antiquorum i. iicb pl 4:7 vg myres cesnola coll p.105 pottery material cyprus allard pierson archaeological collection -800 -600 greek greek geometric', 'prehistoric earlier 034 mm knife microlith -3000 call nr apm04170 knife microlith utensil object stone microlith narrow dark knife back sides turn sharp see front inv card drawing 3 sides n shirai july 21 2004 exit flint material egypt f

## P_x=process evaluation pages

In [55]:
evaluation_raw = pd.read_csv('../raw_extract/extract_Eternal_Egypt_LR.csv')
labels = pd.read_csv('../labels/egypt.csv')

In [15]:
ids = labels['page'].unique()
evaluation = evaluation_raw.iloc[ids]

In [52]:
evaluation.to_csv('evaluation.csv')

## P_prepare label file

In [56]:
evaluation, labels

(                                                  data
 17   eternal egypt18\nfound were unfortunately not ...
 18   1919\nThe development of the prehistoric \ncul...
 21   22\neternal egypt\nand ivory and copper object...
 22   23prEHistoriC Egypt\ning and gathering also to...
 23   eternal egypt24 Badarian sites (4,400­4,000 BC...
 ..                                                 ...
 145  146\neternal egypt\nIsmant el-Kharab, Kellis \...
 150  151\nright from the start, many Christians \ni...
 152  153CoptiC Egypt\nthe second. As a result the c...
 153  154\neternal egypt\nmonasteries had survived. ...
 159    eternal egypt160160EtErnal EgyptIslamic \nEgypt
 
 [71 rows x 1 columns],
      page  apm_code
 0      17      1961
 1      17      1962
 2      17      4170
 3      17      4171
 4      17      4172
 ..    ...       ...
 114   150     14513
 115   152     16750
 116   152     14510
 117   153      8189
 118   159     16385
 
 [119 rows x 2 columns])

## P_b=train_vectorized_model

In [16]:
# train vectorizer

# Step 4: Compute TF-IDF values
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)
 
# Convert TF-IDF document term matrix to DataFrame
feature_names = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
 
# Print the TF-IDF DataFrame
print("\nTF-IDF DataFrame:")
print(df_tfidf)


TF-IDF DataFrame:
     000  0000  0014  0037  0100  018  0200       022  023       025  ...  \
0    0.0   0.0   0.0   0.0   0.0  0.0   0.0  0.000000  0.0  0.000000  ...   
1    0.0   0.0   0.0   0.0   0.0  0.0   0.0  0.000000  0.0  0.000000  ...   
2    0.0   0.0   0.0   0.0   0.0  0.0   0.0  0.000000  0.0  0.000000  ...   
3    0.0   0.0   0.0   0.0   0.0  0.0   0.0  0.192411  0.0  0.000000  ...   
4    0.0   0.0   0.0   0.0   0.0  0.0   0.0  0.000000  0.0  0.172718  ...   
..   ...   ...   ...   ...   ...  ...   ...       ...  ...       ...  ...   
111  0.0   0.0   0.0   0.0   0.0  0.0   0.0  0.000000  0.0  0.000000  ...   
112  0.0   0.0   0.0   0.0   0.0  0.0   0.0  0.000000  0.0  0.000000  ...   
113  0.0   0.0   0.0   0.0   0.0  0.0   0.0  0.000000  0.0  0.000000  ...   
114  0.0   0.0   0.0   0.0   0.0  0.0   0.0  0.000000  0.0  0.000000  ...   
115  0.0   0.0   0.0   0.0   0.0  0.0   0.0  0.000000  0.0  0.000000  ...   

     young  youth  zandee  zeitschrift  zigzag  zijderve

In [18]:
new_document = evaluation['data'].iloc[0]

new_preprocessed_document = preprocess_document(new_document)
new_tfidf_vector = vectorizer.transform([new_preprocessed_document])
 
# Convert TF-IDF matrix to DataFrame
feature_names = vectorizer.get_feature_names_out()
df_tfidf_new = pd.DataFrame(new_tfidf_vector.toarray(), columns=feature_names)
 
# Print the TF-IDF DataFrame
print("\nTF-IDF DataFrame:")
print(df_tfidf_new)


TF-IDF DataFrame:
        000  0000  0014  0037  0100  018  0200  022  023  025  ...  young  \
0  0.473922   0.0   0.0   0.0   0.0  0.0   0.0  0.0  0.0  0.0  ...    0.0   

   youth  zandee  zeitschrift  zigzag  zijderveld  zwolle   äg  égypte  \
0    0.0     0.0          0.0     0.0         0.0     0.0  0.0     0.0   

   étoffes  
0      0.0  

[1 rows x 1718 columns]


In [41]:
# predict
# Step 6: Measure document similarity
 
# Compute cosine similarity between the new document and all other documents
similarity_scores = np.array(cosine_similarity(new_tfidf_vector, tfidf_matrix)[0])
similarity_scores.argmax()

56

In [43]:
raw_corpus.iloc[similarity_scores.argmax()]['apm_code']

8851

In [48]:
# evaluate
predictions = []
pred_y_scores = []

for i in range(len(evaluation)):
    new_document = evaluation['data'].iloc[i]
    
    new_preprocessed_document = preprocess_document(new_document)
    new_tfidf_vector = vectorizer.transform([new_preprocessed_document])
    
    # Convert TF-IDF matrix to DataFrame
    feature_names = vectorizer.get_feature_names_out()
    df_tfidf_new = pd.DataFrame(new_tfidf_vector.toarray(), columns=feature_names)

    similarity_scores = cosine_similarity(new_tfidf_vector, tfidf_matrix)

    p = similarity_scores.argmax()
    predictions += [p]

    pred_y_scores += list(similarity_scores)


In [51]:
predictions

71

In [40]:
labels=pd.read_csv('../labels/labels_exp_2.csv', header=None)
# predictions, labels, pred_y_scores
list(labels.iloc[0])

[1961, 1962, 4170, 4171, 4172, 4173, 4222]

In [39]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score, multilabel_confusion_matrix

print('accuracy of the TF-IDF model:')
print(multilabel_confusion_matrix(list(labels.iloc[0]), pred_y_scores))

print('top-k accuracy of the TF-IDF model:')
print(top_k_accuracy_score(list(ground_truth['page']), np.array(pred_y_scores), k=10,normalize=True, labels=list(range(0,185))))

accuracy of the TF-IDF model:


ValueError: Found input variables with inconsistent numbers of samples: [7, 1]