In [1]:
import os
from dotenv import load_dotenv
from supabase import create_client, Client
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Load environment variables from .env file
load_dotenv()

# Read the Supabase credentials from the environment.
# os.environ.get returns None for an unset variable, so check explicitly and
# fail with a message that names exactly which variable is missing instead of
# handing None to create_client.
url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_KEY")

_missing_vars = [
    name
    for name, value in (("SUPABASE_URL", url), ("SUPABASE_KEY", key))
    if not value
]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_vars)
        + ". Set them in the environment or in the .env file."
    )

# Initialize Supabase client
supabase: Client = create_client(url, key)

print("Libraries imported and Supabase client initialized.")

Libraries imported and Supabase client initialized.


In [2]:
# Identifier of the sentence-transformer checkpoint to load
EMBEDDING_MODEL_NAME = 'mixedbread-ai/mxbai-embed-large-v1'

# Instantiate the embedding model from that identifier
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

print("Embedding model loaded.")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Embedding model loaded.


In [3]:
# Sample document text: a short, multi-line passage about the Solar System
# that is passed to the text splitter in the next cell.
# The triple-quoted literal starts and ends with a newline character.
sample_document = """
The Solar System is the gravitationally bound system of the Sun and the objects that orbit it.
It was formed 4.6 billion years ago from the gravitational collapse of a giant interstellar molecular cloud.
The vast majority (99.86%) of the system's mass is in the Sun, with most of the remaining mass contained in the planet Jupiter.
The four inner terrestrial planets—Mercury, Venus, Earth, and Mars—are composed primarily of rock and metal.
The four outer giant planets are substantially more massive than the terrestrials.
The two largest, Jupiter and Saturn, are gas giants, being composed mainly of hydrogen and helium; the two outermost planets, Uranus and Neptune, are ice giants, being composed mostly of substances with relatively high melting points compared with hydrogen and helium, called volatiles, such as water, ammonia, and methane.
All eight planets have nearly circular orbits that lie within a nearly flat disc called the ecliptic.
"""

print("Sample document prepared.")

Sample document prepared.


In [4]:
# Splitter settings: a chunk size of 250 and an overlap of 50,
# both measured with the built-in len (i.e. in characters).
splitter_options = {
    "chunk_size": 250,
    "chunk_overlap": 50,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the sample document into chunks
chunks = text_splitter.split_text(sample_document)

# Show every chunk under a 1-based header so the split can be inspected
for number, text in enumerate(chunks, start=1):
    print(f"--- Chunk {number} ---")
    print(text)
    print()

--- Chunk 1 ---
The Solar System is the gravitationally bound system of the Sun and the objects that orbit it.
It was formed 4.6 billion years ago from the gravitational collapse of a giant interstellar molecular cloud.

--- Chunk 2 ---
The vast majority (99.86%) of the system's mass is in the Sun, with most of the remaining mass contained in the planet Jupiter.
The four inner terrestrial planets—Mercury, Venus, Earth, and Mars—are composed primarily of rock and metal.

--- Chunk 3 ---
The four outer giant planets are substantially more massive than the terrestrials.

--- Chunk 4 ---
The two largest, Jupiter and Saturn, are gas giants, being composed mainly of hydrogen and helium; the two outermost planets, Uranus and Neptune, are ice giants, being composed mostly of substances with relatively high melting points compared with

--- Chunk 5 ---
with relatively high melting points compared with hydrogen and helium, called volatiles, such as water, ammonia, and methane.

--- Chunk 6 ---
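All eight planets have nearly circular orbits that lie within a nearly flat disc called the ecliptic.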

In [5]:
# Generate embeddings for all chunks and prepare the rows for Supabase.
# The chunks are encoded in one batched call rather than one call per chunk;
# .tolist() turns the result into plain Python lists for the request payload.
# An empty chunk list skips the model call entirely.
embeddings = model.encode(chunks).tolist() if chunks else []

# One row per chunk, pairing the chunk text with its embedding
documents_to_insert = [
    {'content': chunk, 'embedding': embedding}
    for chunk, embedding in zip(chunks, embeddings)
]

# Insert all documents into the Supabase table.
# NOTE(review): this is a plain insert, so re-running the cell will presumably
# add the same rows again unless the table enforces uniqueness — TODO confirm.
if not documents_to_insert:
    # Nothing to send; avoid issuing an insert with an empty payload
    print("No chunks to insert.")
else:
    try:
        insert_response = supabase.table('documents').insert(documents_to_insert).execute()
        # Read the inserted rows from the response's .data attribute (the same
        # access pattern the search cell uses) instead of tuple-unpacking the
        # response object and indexing into the result.
        print(f"Successfully inserted {len(insert_response.data)} chunks into the database.")
    except Exception as e:
        print(f"An error occurred: {e}")

Successfully inserted 6 chunks into the database.


In [6]:
# Natural-language question to look up among the stored chunks
query = "What are the inner planets made of?"

# Embed the question with the loaded model and convert it to a plain list
# NOTE(review): the model card for mxbai-embed-large-v1 reportedly recommends a
# retrieval prompt prefix for queries — confirm whether it should be applied here.
query_embedding = model.encode(query).tolist()

# Arguments passed to the 'match_documents' RPC function
search_params = {
    'query_embedding': query_embedding,
    'match_threshold': 0.5,  # Adjust this threshold as needed
    'match_count': 3,        # Get the top 3 matches
}

# Run the search through the RPC function and report what came back
try:
    response = supabase.rpc('match_documents', search_params).execute()
    matches = response.data

    # Summary line first, then one block per returned row
    print(f"Found {len(matches)} matching documents for the query: '{query}'\n")
    for match in matches:
        print(f"--- Similarity: {match['similarity']:.4f} ---")
        print(match['content'])
        print()

except Exception as error:
    print(f"An error occurred during search: {error}")

Found 3 matching documents for the query: 'What are the inner planets made of?'

--- Similarity: 0.7482 ---
The vast majority (99.86%) of the system's mass is in the Sun, with most of the remaining mass contained in the planet Jupiter.
The four inner terrestrial planets—Mercury, Venus, Earth, and Mars—are composed primarily of rock and metal.

--- Similarity: 0.6878 ---
The two largest, Jupiter and Saturn, are gas giants, being composed mainly of hydrogen and helium; the two outermost planets, Uranus and Neptune, are ice giants, being composed mostly of substances with relatively high melting points compared with

--- Similarity: 0.6396 ---
The Solar System is the gravitationally bound system of the Sun and the objects that orbit it.
It was formed 4.6 billion years ago from the gravitational collapse of a giant interstellar molecular cloud.

