In [2]:
def load_sentences(path: str):
    """Read a text file and return its non-blank lines, stripped of surrounding whitespace.

    Args:
        path: Location of the text file to read.

    Returns:
        list[str]: One entry per line that still has content after stripping,
        in file order.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as file:
        # Iterating the handle streams the file line by line instead of
        # materialising every line first with readlines().
        stripped_lines = (line.strip() for line in file)
        return [line for line in stripped_lines if line]

# Location of the raw text, relative to the notebook's working directory.
meditations = '../data/meditations.txt'
# One entry per non-blank line of the file (see load_sentences).
chunks = load_sentences(meditations)

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation

# Fetch the NLTK resources used below; each call reports its status in the cell output.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Built once at module level and shared by every clean_text call.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(chunk: str):
    """Normalise a passage into a space-separated string of lemmatised tokens.

    The text is lower-cased, every character that is not a-z or whitespace is
    deleted (not replaced by a space), the remainder is tokenised, stop words
    are dropped and each surviving token is lemmatised.

    Args:
        chunk: Raw passage text.

    Returns:
        str: The cleaned tokens joined by single spaces.
    """
    lowered = chunk.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    kept = []
    for token in word_tokenize(letters_only):
        # Skip stop words and any token that is a piece of punctuation.
        if token in stop_words or token in punctuation:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return ' '.join(kept)

# Cleaned counterpart of every raw chunk, kept in the same order as `chunks`.
chunks_clean = [clean_text(chunk) for chunk in chunks]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/garrett.partenza/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/garrett.partenza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrett.partenza/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from gensim.models import FastText, KeyedVectors

# Tokenised corpus: one list of whitespace-separated tokens per cleaned chunk.
corpus = list(chunk.split() for chunk in chunks_clean)

# Train a FastText model directly on the tokenised chunks.
# NOTE(review): no seed is passed and workers=4, so retraining presumably does
# not reproduce the exact vectors shown in the outputs below — confirm if
# reproducibility matters.
model = FastText(
    corpus,
    vector_size=300,
    window=8,
    min_count=1,
    sg=0,
    workers=4,
    bucket=50000
)

# NOTE(review): the file names say "word2vec" but the object saved is the FastText model above.
model.save("word2vec_model")
# NOTE(review): presumably this export carries only whole-word vectors and not
# the n-gram table — confirm against the gensim docs before using it for
# out-of-vocabulary lookups.
model.wv.save_word2vec_format('word2vec_model.bin', binary=True)

In [5]:
import numpy as np

def embed_chunk(chunk: str, model):
    """Embed a cleaned passage as the mean of its token vectors.

    Args:
        chunk: Space-separated tokens (typically the output of clean_text).
        model: Object exposing ``model.wv[token]`` for vector lookup and
            ``model.wv.vector_size`` for the embedding width.

    Returns:
        numpy.ndarray: A 1-D vector of length ``model.wv.vector_size``. When
        ``chunk`` holds no tokens (empty or whitespace only) a zero vector is
        returned.
    """
    tokens = chunk.split()
    if not tokens:
        # np.mean over an empty list yields a NaN scalar, which would make the
        # list of chunk vectors ragged and break the similarity computation.
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return np.mean([model.wv[token] for token in tokens], axis=0)


# Reload the model from the file written by the training cell.
model = FastText.load("word2vec_model")
# One averaged vector per cleaned chunk, in the same order as chunks_clean.
vectors = list(embed_chunk(chunk, model) for chunk in chunks_clean)
# Sanity check: dimensionality of the first chunk vector.
print(vectors[0].shape)

(300,)


In [6]:
import pandas as pd

# Retrieval table: raw text, cleaned text and embedding, one row per chunk.
database = pd.DataFrame(
    {"chunk": chunks, "chunk_clean": chunks_clean, "vector": vectors}
)

database.head()

Unnamed: 0,chunk,chunk_clean,vector
0,From my grandfather Verus I learned good moral...,grandfather verus learned good moral governmen...,"[0.028837962, -0.52616304, -0.07005191, 0.1680..."
1,From the reputation and remembrance of my fath...,reputation remembrance father modesty manly ch...,"[0.036602926, -0.67389375, -0.08977281, 0.2152..."
2,"From my mother, piety and beneficence, and abs...",mother piety beneficence abstinence evil deed ...,"[0.03358567, -0.6202404, -0.08298099, 0.198239..."
3,"From my great-grandfather, not to have frequen...",greatgrandfather frequented public school good...,"[0.03419673, -0.63160354, -0.084649496, 0.2017..."
4,"From my governor, to be neither of the green n...",governor neither green blue party game circus ...,"[0.02666075, -0.49113876, -0.065551996, 0.1568..."


In [7]:
# Export only the raw text and its embedding; the cleaned text is left out.
# NOTE(review): each "vector" cell holds an array object, so the CSV will
# presumably contain its string form — confirm that whatever reads this file
# can parse it back into numbers.
database[["chunk", "vector"]].to_csv("meditations.csv")

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

query = "planets and the universe"
top_k = 3

# Clean once and reuse, so the text that is printed is exactly the text that is embedded.
query_clean = clean_text(query)
print(query_clean)

query_vector = embed_chunk(query_clean, model)

# Cosine similarity of the single query vector against every chunk vector;
# [0] selects the one row of scores that belongs to the query.
similarity_scores = cosine_similarity([query_vector], database.vector.to_list())[0]
similarity_pairs = list(zip(database.chunk, similarity_scores))
results = sorted(similarity_pairs, key=lambda x: x[1], reverse=True)

# Slicing, unlike indexing 0..top_k-1, cannot raise IndexError when there are
# fewer than top_k chunks to choose from.
top_k_results = [passage for passage, _score in results[:top_k]]
for passage in top_k_results:
    print(passage)

planet universe
Since it is possible that thou mayest depart from life this very moment, regulate every act and thought accordingly. But to go away from among men, if there are gods, is not a thing to be afraid of, for the gods will not involve thee in evil; but if indeed they do not exist, or if they have no concern about human affairs, what is it to me to live in a universe devoid of gods or devoid of Providence? But in truth they do exist, and they do care for human things, and they have put all the means in man's power to enable him not to fall into real evils. And as to the rest, if there was anything evil, they would have provided for this also, that it should be altogether in a man's power not to fall into it. Now that which does not make a man worse, how can it make a man's life worse? But neither through ignorance, nor having the knowledge, but not the power to guard against or correct these things, is it possible that the nature of the universe has overlooked them; nor is it 

In [9]:
import ollama

def generate_prompts(query, results):
    """Build the system and user prompts for the Stoic retrieval-augmented request.

    The user prompt wraps three parts in XML-like tags: the fixed task
    description, the user's query and the retrieved passages.

    Args:
        query: The user's free-text question.
        results: Iterable of retrieved passages to offer as candidate quotes.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    # Local import keeps this definition self-contained within the notebook cell.
    from xml.sax.saxutils import escape

    system_prompt = (
        "<system_prompt>"
        "You are a Stoic AI assistant, deeply versed in the teachings of Marcus Aurelius. "
        "Your job is to follow the user's task exactly, not straying from any of the directions provided to you."
        "</system_prompt>"
    )

    task_xml = (
        "<task>"
        "Analyze the following user query and the provided quotes from Marcus Aurelius' Meditations. "
        "Select the most relevant quote that addresses the user's concern. Structure your response as follows:"
        "<instructions>"
        "<step>Quote: Begin with the chosen quote, enclosed in quotation marks.</step>"
        "<step>Do not hallucinate the chosen quote, you must choose one from the given results.</step>"
        "<step>Interpretation: In 2-3 sentences, why you chose this quote, given the users original query.</step>"
        "<step>Write from the point of view that, the user is trusting that this is the most relevant quote.</step>"
        "<step>Advice: In 4-5 sentences, offer practical guidance based on the quote and Stoic principles.</step>"
        "<step>Do not write more than a few sentences outside of the selected quote.</step>"
        "<step>Do not discuss anything about stoicism outside of the quote and query.</step>"
        "</instructions>"
        "Maintain a wise and compassionate tone throughout your response. Aside from citing your chosen quote, use language that assumes you are speaking to the original user personally. Use language and style that mirrors that of a modern day philosopher spreading stoic wisdom to a student. Construct your response in parsable XML format with <quote>, <interpretation>, and <advice> for the keys mentioned in the afformentioned steps, including a <root> key for the root of the entire response."
        "</task>"
    )

    # Escape &, < and > in caller-supplied text so it cannot close the
    # surrounding tag or smuggle extra markup into the prompt. str() mirrors
    # the implicit conversion an f-string would apply to non-string values.
    query_xml = f"<rag_query>{escape(str(query))}</rag_query>"

    search_results_xml = "<search_results>" + "".join(
        f"<search_result>{escape(str(result))}</search_result>" for result in results
    ) + "</search_results>"

    user_prompt = f"<user_prompt>{task_xml}{query_xml}{search_results_xml}</user_prompt>"

    return system_prompt, user_prompt

def stoic_guide(system_prompt, user_prompt):
    """Send the two prompts to the local chat model and return its reply text.

    Args:
        system_prompt: Content of the "system" message.
        user_prompt: Content of the "user" message.

    Returns:
        The ``content`` field of the ``message`` entry in the chat response.
    """
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=user_prompt),
    ]
    reply = ollama.chat(model='llama3:instruct', messages=conversation)
    return reply['message']['content']

# Build the prompts from the query and passages selected in the search cell,
# then print the model's reply.
system_prompt, user_prompt = generate_prompts(query, top_k_results)
print(stoic_guide(system_prompt, user_prompt))

<root>
<quote>Since it is possible that thou mayest depart from life this very moment, regulate every act and thought accordingly. But to go away from among men, if there are gods, is not a thing to be afraid of, for the gods will not involve thee in evil; but if indeed they do not exist, or if they have no concern about human affairs, what is it to me to live in a universe devoid of gods or devoid of Providence?</quote>

<interpretation>I chose this quote because you were curious about planets and the universe. This passage addresses the idea that our existence is connected to the larger cosmic order, whether we believe in a divine presence or not. It's a reminder to consider our place within the grand scheme and adjust our actions accordingly.</interpretation>

<advice>As you ponder the mysteries of the universe, take this quote as a gentle reminder to cultivate inner wisdom. Regulate your thoughts and actions today, knowing that every moment is a precious opportunity to align yourse

In [17]:
from gensim.models.fasttext_inner import ft_hash_bytes
import json

def custom_ft_hash_bytes(bytez: bytes) -> int:
    """Pure-Python 32-bit FNV-1a style hash intended to mirror the fastText n-gram hash.

    Each byte is treated as a signed 8-bit value and sign-extended to 32 bits
    before the XOR, which is how the reference fastText hash handles bytes.
    This only matters for bytes >= 0x80 (non-ASCII UTF-8); ASCII input hashes
    exactly as with a plain unsigned XOR.

    Args:
        bytez: The bytes to hash (typically a UTF-8 encoded n-gram).

    Returns:
        int: The hash, in the range [0, 2**32).
    """
    h = 2166136261  # FNV offset basis
    for b in bytez:
        if b >= 0x80:
            # Sign-extend: a negative int8 becomes 0xFFFFFFxx as a uint32.
            b |= 0xFFFFFF00
        # XOR then multiply by the FNV prime, wrapping to 32 bits every step
        # so the intermediate integer never grows beyond what is needed.
        h = ((h ^ b) * 16777619) & 0xFFFFFFFF
    return h

def generate_char_ngrams(text, n):
    """
    Slide a window of width ``n`` over ``text`` and collect every window.

    Args:
        text (str): The input text.
        n (int): The length of the n-grams.

    Returns:
        list: The character n-grams in left-to-right order; empty when
        ``text`` is shorter than ``n``.
    """
    last_start = len(text) - n
    windows = []
    start = 0
    while start <= last_start:
        windows.append(text[start:start + n])
        start += 1
    return windows

def generate_ngrams(word, min=3, max=6):
    """Collect the character n-grams of ``word`` for every length from ``min`` to ``max``.

    Args:
        word (str): The text to split into n-grams.
        min (int): Shortest n-gram length, inclusive.
        max (int): Longest n-gram length, inclusive.

    Returns:
        list: All n-grams, grouped by increasing length and ordered left to
        right within each length.
    """
    collected = []
    for size in range(min, max + 1):
        collected.extend(generate_char_ngrams(word, size))
    return collected

def custom_embed(word: str):
    """Look up or reconstruct the embedding of ``word`` from the global ``model``.

    A word present in the model's vocabulary is returned straight from the
    stored vector table. Any other word is embedded as the average of the
    n-gram vectors selected by hashing each character n-gram of
    ``"<" + word + ">"`` into the model's n-gram table.

    Args:
        word: The token to embed.

    Returns:
        numpy.ndarray: The stored vector for a known word; otherwise the
        averaged n-gram vector, or a zero vector when the bracketed word is
        too short to yield any n-gram.
    """
    if word in model.wv.key_to_index:
        print("Whole word found")
        return model.wv.vectors[model.wv.key_to_index[word]]

    # Read the bucket count and embedding width from the table itself, so this
    # function keeps working if the model is retrained with other settings.
    ngram_table = model.wv.vectors_ngrams
    num_buckets, dim = ngram_table.shape

    ngrams = generate_ngrams("<"+word+">")
    res = np.zeros(dim)
    if not ngrams:
        # No n-grams: return zeros rather than dividing by zero below.
        return res
    for ngram in ngrams:
        bucket_index = ft_hash_bytes(ngram.encode('utf-8')) % num_buckets
        res += ngram_table[bucket_index]
    return res / len(ngrams)

word = "thou"
custom_embedding = custom_embed(word)
gensim_embedding = model.wv[word]

# Fail loudly on a mismatch. A bare np.isclose(...) in the middle of a cell is
# neither displayed nor checked, so it would verify nothing.
# NOTE(review): "thou" is in the vocabulary (the cell prints "Whole word
# found"), so this only exercises the whole-word branch of custom_embed.
assert np.allclose(custom_embedding, gensim_embedding), (
    f"custom_embed disagrees with gensim for {word!r}"
)

# Export the n-gram table and whole-word table as raw float32 dumps, plus the
# word -> row index mapping as JSON.
model.wv.vectors_ngrams.astype(np.float32).tofile("ngrams.bin")
model.wv.vectors.astype(np.float32).tofile("vectors.bin")
with open("vocab.json", "w", encoding="utf-8") as file:
    json.dump(model.wv.key_to_index, file)

Whole word found
