In [2]:
def load_sentences(path: str, encoding: str = "utf-8") -> list:
    """Read a text file and return its non-blank lines.

    Each line is stripped of leading/trailing whitespace; lines that are
    empty after stripping are dropped.

    Args:
        path: Location of the text file to read.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        The stripped, non-empty lines in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(path, encoding=encoding) as file:
        # Iterate the handle lazily instead of materialising every line first.
        stripped_lines = (line.strip() for line in file)
        return [line for line in stripped_lines if line]

# Path to the source text, relative to the notebook's working directory.
meditations = '../data/meditations.txt'
# One entry per non-blank line of the file; later cells treat each entry as
# one retrievable chunk.
chunks = load_sentences(meditations)

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation

# Download the NLTK resources this notebook relies on: the 'punkt' tokenizer
# data, the stop-word lists, and the WordNet data.
# NOTE(review): presumably 'punkt' backs word_tokenize and 'wordnet' backs
# WordNetLemmatizer - confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Module-level instances shared by every clean_text call.
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token membership test in clean_text is a hash lookup.
stop_words = set(stopwords.words('english'))

def clean_text(chunk: str):
    """Normalise a passage into a space-separated string of lemmatised words.

    Steps: lowercase the text, replace every character other than ``a-z`` and
    whitespace with a space, tokenise with ``word_tokenize``, drop tokens found
    in ``stop_words``, and lemmatise what remains with ``lemmatizer``.

    Args:
        chunk: Raw text to clean.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        nothing survives).
    """
    chunk = chunk.lower()
    # Replace non-letters with a space rather than deleting them. Deleting
    # fuses words joined only by punctuation ("well-being" -> "wellbeing",
    # "i'll" -> "ill"), which creates junk tokens that escape the stop-word
    # filter. With a space, the fragments become separate tokens instead.
    chunk = re.sub(r'[^a-z\s]', ' ', chunk)
    tokens = word_tokenize(chunk)
    # No punctuation check is needed: after the substitution above, tokens
    # can only contain the letters a-z.
    cleaned_tokens = [
        lemmatizer.lemmatize(word) for word in tokens
        if word not in stop_words
    ]
    return ' '.join(cleaned_tokens)

# Cleaned counterpart of `chunks`, index-aligned with it.
chunks_clean = [clean_text(raw_chunk) for raw_chunk in chunks]


In [None]:
from gensim.models import Word2Vec

# Each cleaned chunk becomes one training sentence: a list of its tokens.
corpus = [cleaned.split() for cleaned in chunks_clean]

# Train 100-dimensional vectors over the tokenised corpus. min_count=1 means
# no word is discarded for being rare.
model = Word2Vec(
    sentences=corpus,
    vector_size=100, window=5, min_count=1,
    sg=0, workers=4,
)

# Persist the trained model so later cells can reload it from disk.
model.save("word2vec_model")

In [9]:
import numpy as np

def embed_chunk(chunk: str, model):
    """Embed a cleaned chunk as the mean of its in-vocabulary word vectors.

    Args:
        chunk: Whitespace-separated tokens (e.g. the output of ``clean_text``).
        model: Object exposing a gensim-style ``wv`` attribute that supports
            ``word in wv``, ``wv[word]`` and ``wv.vector_size``.

    Returns:
        A 1-D array of length ``model.wv.vector_size``. It is all zeros when
        the chunk is empty or none of its tokens are in the vocabulary.
    """
    # Skip unknown words: indexing wv with one raises KeyError, which would
    # crash on any query term the model never saw during training.
    embeddings = [model.wv[word] for word in chunk.split() if word in model.wv]
    if not embeddings:
        # np.mean([]) would return a scalar NaN (with a RuntimeWarning) and
        # break every downstream consumer that expects a fixed-length vector.
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return np.mean(embeddings, axis=0)


# Reload the persisted model from disk, then embed every cleaned chunk.
model = Word2Vec.load("word2vec_model")
vectors = [embed_chunk(cleaned, model) for cleaned in chunks_clean]


In [None]:
import pandas as pd

# One row per chunk: the raw text, its cleaned form, and its embedding.
# The three lists are index-aligned, so each row describes a single chunk.
database = pd.DataFrame.from_dict(
    dict(chunk=chunks, chunk_clean=chunks_clean, vector=vectors)
)

database.head()

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

query = "food"
top_k = 3

# Embed the query through the same cleaning + averaging pipeline as the corpus
# so that query and chunk vectors are comparable.
query_vector = embed_chunk(clean_text(query), model)

# Cosine similarity of the single query vector against every stored chunk
# vector; [0] selects the one row of scores that belongs to the query.
similarity_scores = cosine_similarity([query_vector], database.vector.to_list())[0]
similarity_pairs = list(zip(database.chunk, similarity_scores))
# Most similar chunks first.
results = sorted(similarity_pairs, key=lambda x: x[1], reverse=True)

# Slice rather than index: results[k] raised IndexError whenever the corpus
# held fewer than top_k chunks.
for result in results[:top_k]:
    print(result)

("With food and drinks and cunning magic arts  Turning the channel's course to 'scape from death.  The breeze which heaven has sent  We must endure, and toil without complaining.", 0.9948866)
('I go through the things which happen according to nature until I shall fall and rest, breathing out my breath into that element out of which I daily draw it in, and falling upon that earth out of which my father collected the seed, and my mother the blood, and my nurse the milk; out of which during so many years I have been supplied with food and drink; which bears me when I tread on it and abuse it for so many purposes.', 0.99469537)
('The ruling faculty does not disturb itself; I mean, does not frighten itself or cause itself pain. But if any one else can frighten or pain it, let him do so. For the faculty itself will not by its own opinion turn itself into such ways. Let the body itself take care, if it can, that is suffer nothing, and let it speak, if it suffers. But the soul itself, that wh