In [29]:
# imports for langchain, plotly and Chroma
import os
import requests
from bs4 import BeautifulSoup
from typing import List
from dotenv import load_dotenv
from openai import OpenAI
import google.generativeai
import anthropic
import glob
import gradio as gr
from IPython.display import Markdown, display, update_display
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
import pandas as pd
import google.generativeai as genai
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [30]:
# Read the provider credentials from the local .env file (override=True lets
# the file replace values that are already set in the environment), then echo
# a short prefix of each key so a missing or wrong key is easy to spot.

load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')

# (provider label, key value, number of leading characters to display)
for provider, secret, prefix_len in (
    ("OpenAI", openai_api_key, 8),
    ("Anthropic", anthropic_api_key, 7),
    ("Google", google_api_key, 8),
):
    if not secret:
        print(f"{provider} API Key not set")
    else:
        print(f"{provider} API Key exists and begins {secret[:prefix_len]}")


OpenAI API Key exists and begins sk-proj-
Anthropic API Key exists and begins sk-ant-
Google API Key exists and begins AIzaSyBX


In [3]:
# Load .env file (default options here, unlike the override=True call in the
# key-check cell).
load_dotenv()
# Configure the google.generativeai SDK; no api_key argument is passed.
# NOTE(review): presumably the SDK then falls back to GOOGLE_API_KEY from the
# environment — confirm against the SDK documentation.
google.generativeai.configure()

In [28]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Load and Clean Data
# The CSV location can be overridden with the HIEROGLYPH_CSV environment
# variable; the fallback is the path this notebook was originally written
# against, so existing setups keep working unchanged.
hieroglyph_csv_path = os.getenv(
    "HIEROGLYPH_CSV",
    r"C:/Users/Lucian/LLM_Platform/llm_engineering/projects/Hieroglyph_Data_CSV.csv",
)
raw_csv = pd.read_csv(hieroglyph_csv_path)
csv = raw_csv.drop(['Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Description', 'Gardiner'], axis=1).set_index("Ideogram")
hier_dict = csv.to_dict()  # {remaining column: {"Ideogram" value: cell value}}
# Flatten the per-column dictionaries into one {"Ideogram" value: cell value} mapping
hifix_dict = dict(ele for sub in hier_dict.values() for ele in sub.items())

# Initialize Model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Clean the lookup text first so every embedding can be computed in one batch
glyph_by_key = {}
for key, glyph in hifix_dict.items():
    # Non-string keys (e.g. NaN from a blank cell) have no .replace(); treat them as empty
    cleaned_key = key.replace("\xa0", " ").strip() if isinstance(key, str) else ""
    if cleaned_key:  # Ensure text is not empty
        glyph_by_key[cleaned_key] = glyph
    else:
        print(f"Skipping invalid entry: {key}")

# Ensure we have valid vectors
if not glyph_by_key:
    raise ValueError("No valid vectors generated. Check your input data.")

# One batched encode call instead of one call per row; row i belongs to keys[i]
keys = list(glyph_by_key)
embeddings = np.asarray(model.encode(keys), dtype=np.float32)

# cleaned text -> (embedding, hieroglyph); query_knowledgebase reads the glyph from here
vector_database = {
    key: (embedding, glyph_by_key[key]) for key, embedding in zip(keys, embeddings)
}

# Work on a copy so the in-place normalisation below leaves vector_database untouched
vectors = np.array(embeddings, dtype=np.float32)

# Use Cosine Similarity Instead of L2 Distance
index = faiss.IndexFlatIP(vectors.shape[1])  # IP (Inner Product) = Cosine Similarity when normalized
faiss.normalize_L2(vectors)  # Normalize for cosine similarity (in place)
index.add(vectors)

print(f"FAISS index created with {len(keys)} vectors.")

# Query Knowledgebase
def query_knowledgebase(query, k=6):
    """Return the index entries most similar to ``query``.

    The query is embedded with the module-level ``model``, L2-normalised and
    searched against the module-level inner-product ``index``, so the scores
    are cosine similarities (higher means more similar).

    Args:
        query: Text to look up.
        k: Maximum number of neighbours to return. Defaults to 6, the value
            that used to be hard-coded. It is capped at the number of
            indexed keys.

    Returns:
        A list of ``(key, glyph, score)`` tuples in the order returned by the
        index search. Empty when ``k`` is not positive.
    """
    k = min(k, len(keys))  # Ensure k does not exceed available vectors
    if k <= 0:
        return []

    # FAISS expects a 2-D float32 matrix; normalize_L2 modifies it in place
    query_matrix = model.encode(query).astype(np.float32).reshape(1, -1)
    faiss.normalize_L2(query_matrix)

    scores, indices = index.search(query_matrix, k)

    results = []
    for idx, score in zip(indices.flatten(), scores.flatten()):
        if idx < 0:  # FAISS reports -1 when it has fewer than k neighbours
            continue
        key = keys[idx]
        results.append((key, vector_database[key][1], score))
    return results

# Smoke-test the index with a sample sentence and list each hit with its score
response = query_knowledgebase("a donkey gets spanked")
print("Retrieved results:")
for hit in response:
    gloss, symbol, similarity = hit
    print(f"{gloss} → {symbol} (Similarity: {similarity:.4f})")


FAISS index created with 944 vectors.
Retrieved results:
Donkey (ꜥꜣ) → 𓃘 (Similarity: 0.6122)
donkey (ꜥꜣ) → 𓂸 (Similarity: 0.6122)
Kid (goat) (jb) → 𓃛 (Similarity: 0.3828)
Snake (ḥfꜣw) → 𓆚 (Similarity: 0.3537)
Bull (kꜣ) → 𓄀 (Similarity: 0.3423)
bull (kꜣ) → 𓂺 (Similarity: 0.3423)


In [31]:
# Instructions given to the chat models as the system prompt.
# Adjacent string literals are concatenated; each piece ends with a space so
# the sentences no longer run together (the previous backslash continuations
# joined them as "translation.Break", "References.Always", "word.Write").
system_prompt = (
    "You are an expert in egyptian Hieroglyphs. Your job is to take in English text and then use your knowledge base to create the best Hieroglyphic translation. "
    "Break down the text into words and phrases and then prioritize symbols in the References. "
    "Always try to use symbols from the References first. Try not to use the same symbol too many times. Never Use Modern Emojis. If you do not find a good symbol to match the exact word then use the symbols to phonetically sound out the word. "
    "Write a small explanation first but always write the completed translation using only hieroglyphs. Please write from left to right"
)

In [23]:
# Generate a response using ChatGPT with the retrieved data
def generate_response_gpt(user_query, model_name="gpt-4o-mini"):
    """Translate ``user_query`` with an OpenAI chat model, grounded on retrieved glyphs.

    The nearest knowledge-base entries are looked up with
    ``query_knowledgebase`` and sent to the model together with the question,
    using the same prompt layout as ``generate_response_gemini``.

    Args:
        user_query: English text to translate.
        model_name: OpenAI chat model to call. The notebook never defines the
            ``MODEL`` name the earlier version referenced, so a default is
            supplied here; change it to whichever model you have access to.

    Returns:
        The text content of the model's first choice.
    """
    # query_knowledgebase returns a list of (text, glyph, score) tuples
    retrieved_data = query_knowledgebase(user_query)

    # Construct a context based on the retrieved entries
    if retrieved_data:
        context = "Here are some references:\n"
        for text, glyph, _ in retrieved_data:
            context += f"- {text}: {glyph}\n"
    else:
        context = "No relevant hieroglyphs found."

    # The references have to be part of the message that is actually sent
    prompt = f"{context}\n\nUser question: {user_query}"

    client = OpenAI()  # picks up OPENAI_API_KEY from the environment
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
    )

    return response.choices[0].message.content

def generate_response_gemini(user_query, model_name='gemini-2.0-flash'):
    """
    Generates a response using Gemini, incorporating retrieved data from a knowledge base.

    Args:
        user_query: English text to translate.
        model_name: Gemini model to call; defaults to the model that was
            previously hard-coded.

    Returns:
        The text of Gemini's reply.
    """
    # Retrieve relevant information based on the user's query
    retrieved_data = query_knowledgebase(user_query)

    # Construct a context based on the retrieved data
    if retrieved_data:
        context = "Here are some relevant references:\n"
        for text, glyph, _ in retrieved_data:  # Extract text and hieroglyphs
            context += f"- {text}: {glyph}\n"
    else:
        context = "No relevant hieroglyphs found."

    # Pass the guidance as a real system instruction instead of sending it as
    # a separate chat turn: that saves one request whose reply was discarded.
    gemini = genai.GenerativeModel(model_name, system_instruction=system_prompt)

    # Send user query along with hieroglyph references
    final_query = f"{context}\n\nUser question: {user_query}"
    response = gemini.generate_content(final_query)

    return response.text

In [24]:
# Retrieval-only check: print the raw (text, glyph, score) tuples for a longer sentence
response = query_knowledgebase(
    "Khesekh entered the room already filled with soldiers fresh from battle"
)
print("Retrieved keys: " + str(response))

Retrieved keys: [('Khepresh (ḫprš)', '𓋙', 0.32896325), ('Darkness (kkw)', '𓇱', 0.28117758), ('Sobek (sbk)', '𓆍', 0.2712364), ('injure (nkn)', '𓂿', 0.26553637), ('soldier, company/unit of soldiers (mnfyt), army/expedition (mšꜥ)', '𓀎', 0.2645527), ('Kheker-frieze', '𓐮', 0.26224443)]


In [25]:
# Sample sentence to translate; passed to generate_response_gemini in the following cell.
user_query = "“By Whom?” The king cut him off looking almost angry"

In [26]:
# Ask Gemini for a translation of the sample sentence and show the reply
response = generate_response_gemini(user_query)
print("Gemini says: " + str(response))

Gemini says: Alright, I'll incorporate those references to translate "By Whom? The king cut him off looking almost angry."

**Explanation:**

*   **By:** I will use "mj" (in,from,by) **𓅓**
*   **Whom?:** As there is no direct translation for "Whom" I will try to use "Who" which can be expressed as "jm" **𓇋𓅓** which can be further modified to **𓇋𓅓𓈖**
*   **The:** reed leaf (**𓇋**)
*   **king:** I will use King (nswt): 𓇓
*   **cut:** I will use the glyph for "to cut" or "to slaughter" (**𓂺**)
*   **him:** Use the suffix pronoun "f" (**𓆑**) for "him" as a direct object.
*   **off:** Since there is no direct translation for "off" in this context, I will attempt to convey the idea of abruptness or interruption using **𓈖** (n), which can indicate negation or a sudden stop.
*   **looking:** I will phonetically spell this out using: **𓄛𓊹𓏏𓈖** (lu-ki-in-g)
*   **Almost:** No direct translation. I will use "close to" which is "ek" **𓇋𓎡**
*   **angry:** I will use the glyph "grr" for angry, bad **

In [27]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200).
# NOTE(review): `documents` is first assigned in the loader cell further down,
# so running the notebook top to bottom fails here with the NameError recorded
# below — move this cell after the loader cell, or run the loader first.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

NameError: name 'documents' is not defined

In [None]:
# Read every markdown file under each folder and tag it with the folder name.
# NOTE(review): `folders` is not assigned in any cell visible in this notebook;
# it has to exist before this cell runs.
text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

documents = []
for folder in folders:
    # The last path component doubles as the document type label
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
    documents.extend(folder_docs)

In [20]:
# Report how many vectors the store holds and how wide each one is.
# NOTE(review): `vectorstore` is not created in any cell visible in this
# notebook (the recorded run ended in a NameError).

collection = vectorstore._collection
count = collection.count()

# Fetch a single record just to measure the embedding length
sample = collection.get(limit=1, include=["embeddings"])
sample_embedding = sample["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

NameError: name 'vectorstore' is not defined

## Visualizing the Vector Store

Let's take a minute to look at the documents and their embedding vectors to see what's going on.

In [21]:
# Prework (with thanks to Jon R for identifying and fixing a bug in this!)
# Pull embeddings, texts and metadata out of the collection and derive one
# colour per document from its doc_type, ready for the scatter plots.

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]

# Parallel lists: the colour at position i belongs to the doc type at position i
type_order = ['products', 'employees', 'contracts', 'company']
palette = ['blue', 'green', 'red', 'orange']
colors = [palette[type_order.index(t)] for t in doc_types]

NameError: name 'collection' is not defined

In [6]:
# We humans find it easier to visualize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)
# Uses `vectors`, `colors`, `doc_types` and `documents` from the prework cell.

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# `scene` only configures 3D subplots; a 2D scatter takes its axis titles
# from the top-level xaxis/yaxis settings.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



NameError: name 'colors' is not defined

In [17]:
# Let's try 3D!
# Same idea as the 2D plot, but t-SNE keeps three components.
# Uses `vectors`, `colors`, `doc_types` and `documents` from the prework cell.

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
marker_style = {'size': 5, 'color': colors, 'opacity': 0.8}
hover_labels = [f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)]
point_cloud = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=marker_style,
    text=hover_labels,
    hoverinfo='text',
)
fig = go.Figure(data=[point_cloud])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()

NameError: name 'colors' is not defined