# Imports

In [1]:
import pandas as pd
import numpy as np

# Loading the dataset

In [2]:
df = pd.read_csv("./without_links.csv")

In [3]:
df.head()

Unnamed: 0,fact
0,Roman Empire. Dacia is invaded by barbarians.
1,Roman Empire. End of the war with Parthia : ...
2,Asia. King Chogo of Baekje succeeds to the t...
3,Religion. Pope Soter succeeds Pope Anicetus...
4,Byzantine Empire. Emperor Maurice regains th...


In [111]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40356 entries, 0 to 40355
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   fact    40356 non-null  object
dtypes: object(1)
memory usage: 315.4+ KB


In [112]:
df.shape

(40356, 1)

# Preprocessing

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
vectorizer = TfidfVectorizer(strip_accents="unicode", lowercase=True, stop_words="english", smooth_idf=True)

In [6]:
tfidf_wm = vectorizer.fit_transform(df.values.flatten())

In [7]:
type(tfidf_wm)

scipy.sparse._csr.csr_matrix

In [117]:
tfidf_wm[0]

<1x49471 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [118]:
len(vectorizer.get_feature_names_out())

49471

In [8]:
import pickle

In [120]:
pickle.dump(vectorizer, open("tfidf_vectorizer.pkl", "wb"))

In [9]:
pickle.dump(tfidf_wm, open("tdidf_wm", "wb"))

In [121]:
from sklearn.metrics.pairwise import cosine_similarity

In [141]:
def retrieve(query):
    query_tfidf = vectorizer.transform([query])
    similarity_arr = cosine_similarity(query_tfidf, tfidf_wm).flatten()
    best_doc_index_arr = np.argpartition(similarity_arr, -10)[-10:]
    ret = []
    for ind in best_doc_index_arr:
        ret.append(df.iloc[ind].values[0])
    return ret

In [142]:
retrieve("Roman Empire")

['Roman Empire.  Armenia  and  Mesopotamia  becomes a  protectorate  of the  Roman Empire .',
 'Roman Empire. The Isle of  Rhodes  returns to the Roman Empire.',
 'Roman Empire.  Noricum  is incorporated into the  Roman Empire .',
 'Roman Empire.  Roman emperor   Nero  is also a  Roman consul .',
 'Roman Empire. The kingdom of  Osroene  becomes a  province  of the  Roman Empire .',
 'Roman Empire.  Roman emperor   Domitian  becomes  Roman Consul .',
 'Roman Empire.  Roman emperor   Nero  is also a  Roman Consul .',
 'Roman Empire.  Roman emperor   Domitian  becomes a  Roman Consul .',
 'Roman Empire. Fire in  Rome .',
 'Roman Empire. A fifteen-year  plague  begins in the  Roman Empire .']