In [1]:
# DIRECTORY SET
# Move the working directory to the parent of the directory the kernel started in
# (the recorded output shows this resolves to the `serverproject` folder).
# NOTE(review): not idempotent -- every re-run of this cell climbs one more level,
# because base_dir is derived from the *current* working directory.
import os
import sys
from pathlib import Path
base_dir=Path(os.getcwd()).parent
# os.chdir(os.path.join(base_dir, 'serverproject'))
os.chdir(base_dir)
print(os.getcwd())

# Load dotenv
# Runs after the chdir above and before django.setup() below.
# NOTE(review): presumably this picks up the project's .env file so the Django
# settings and API clients can read their configuration -- confirm.
import dotenv
dotenv.load_dotenv()

# DJANGO SETUP
# Append the absolute path of the current working directory to sys.path
# (presumably so `serverproject.settings` can be imported), then initialise Django.
import django
sys.path.append(os.path.abspath(''))
# setdefault: a DJANGO_SETTINGS_MODULE already present in the environment wins.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "serverproject.settings")
django.setup()

# Import async modules
import asyncio
from asgiref.sync import sync_to_async

# Import display modules
from IPython.display import display, Markdown

# Import other modules
import faiss
import time
import numpy as np


d:\DestinyFolder\DestinyRecaps\DestinyRecapsApi\serverproject


In [2]:
# Reload imported modules automatically before each cell execution, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# NOTE(review): these look like Django models, which would require the
# django.setup() call from the first cell to have run -- confirm.
from destinyapp.models import StreamRecapData, FastRecapData

# Project packages; `utils.async_openai_client` is what the embedding helper calls.
from core import services
from core import utils
from core import controller

In [5]:
# Embedding model used throughout this notebook.
DEFAULT_EMBEDDING_MODEL="text-embedding-3-large"
# Kept as a notebook-level name: later cells pass `model` to generate_embeddings_async.
model=DEFAULT_EMBEDDING_MODEL

async def fetch_embedding(chunk, model=DEFAULT_EMBEDDING_MODEL, max_retries=5, client=None):
    """Request an embedding for ``chunk``, retrying when the call raises.

    Args:
        chunk: Text passed as ``input`` to the embeddings endpoint.
        model: Name of the embedding model. Defaults to ``DEFAULT_EMBEDDING_MODEL``.
        max_retries: Total number of attempts before giving up. With a value
            of 0 or less no request is made and ``None`` is returned.
        client: Object whose ``embeddings.create(input=..., model=...)`` is a
            coroutine. Defaults to ``utils.async_openai_client``.

    Returns:
        Whatever ``client.embeddings.create`` returns, or ``None`` when every
        attempt failed. Callers must check for ``None``.
    """
    if client is None:
        client = utils.async_openai_client

    for attempt in range(1, max_retries + 1):
        try:
            return await client.embeddings.create(input=chunk, model=model)
        except Exception as e:
            print("Emedding Fail Retrying:",e)
            # Back off 12s, 14s, ... between attempts. Skip the wait after the
            # final attempt: nothing is retried, so the delay would be wasted.
            if attempt < max_retries:
                await asyncio.sleep(10+(attempt*2))
    return None

async def generate_embeddings_async(text_chunks, model=DEFAULT_EMBEDDING_MODEL, client=None):
    """Embed every chunk concurrently and stack the vectors into one array.

    Args:
        text_chunks: Iterable of texts; one request is issued per chunk.
        model: Embedding model name, forwarded to ``fetch_embedding``.
        client: Optional client forwarded to ``fetch_embedding``.

    Returns:
        ``np.ndarray`` with one row per chunk, in the same order as ``text_chunks``.

    Raises:
        RuntimeError: If any chunk still has no embedding after all retries.
    """
    responses = await asyncio.gather(
        *(fetch_embedding(chunk, model, client=client) for chunk in text_chunks)
    )

    # fetch_embedding signals exhausted retries with None; report which chunks
    # failed instead of crashing later on `None.data`.
    failed = [position for position, response in enumerate(responses) if response is None]
    if failed:
        raise RuntimeError(
            f"Embedding request failed for chunk index(es) {failed} after all retries"
        )

    return np.array([response.data[0].embedding for response in responses])


In [15]:
# Length of the returned embedding vector (the recorded output is 3072).
# NOTE(review): `embedding` is only assigned in cells further down the notebook,
# so this cell fails on a fresh kernel when run top to bottom.
len(embedding.data[0].embedding)


3072

In [22]:
import os

# Folder holding the saved index files, resolved against the current working directory.
working_dir = os.getcwd()
print(working_dir)

indexes_path = os.path.join(working_dir, "search_dev_notebooks", "working", "indexes")
print(indexes_path)

# Only reports whether the folder is present; nothing is created here.
print(os.path.isdir(indexes_path))


d:\DestinyFolder\DestinyRecaps\DestinyRecapsApi\serverproject
d:\DestinyFolder\DestinyRecaps\DestinyRecapsApi\serverproject\search_dev_notebooks\working\indexes
True


In [None]:
# Sample sentence used to try out the embedding helper.
text_to_embed="This is a test"

# Top-level `await` is a notebook feature (the cell is run inside an event loop).
# fetch_embedding returns None once its retries are exhausted, so `embedding`
# may be None if the API kept failing.
embedding=await fetch_embedding(text_to_embed)


In [25]:
# Vector length of the returned embedding; used to size the FAISS index.
embedding_size=len(embedding.data[0].embedding)

# The same vector as a 1-D NumPy array (np.array() is called without a dtype here).
embedding_data=np.array(embedding.data[0].embedding)


In [26]:
# Create a flat L2 index sized to the embedding dimensionality.
vector_db=faiss.IndexFlatL2(embedding_size)
# FAISS works on float32 matrices of shape (n, d). embedding_data was built
# without a dtype, so cast explicitly here -- the query cells already do the
# same -- instead of relying on the bindings to convert float64 input.
vector_db.add(np.array([embedding_data], dtype='float32'))

# larger index creation

In [38]:
# One throw-away API call whose only purpose is to learn the embedding length.
embedding=await fetch_embedding("get embedding size")
embedding_size=len(embedding.data[0].embedding)
# Start from a fresh, empty index; this rebinds vector_db and drops whatever it held before.
vector_db=faiss.IndexFlatL2(embedding_size)

In [39]:
# create bigger faiss index
texts=["This is a test", "The quick brown fox jumps over the lazy dog", "I love eating pizza on rainy days", "Machine learning is fascinating", "The sunset painted the sky in brilliant colors", "She danced through fields of wildflowers", "The old library held countless untold stories", "Coffee is essential for Monday mornings", "Waves crashed against the rocky shore", "Dragons soared through cloudy skies"]
embeddings=await generate_embeddings_async(texts, model)


In [40]:
# FAISS works on float32 matrices; the embeddings array is built with NumPy's
# default dtype, so convert explicitly (as the query cells do) before adding.
vector_db.add(np.asarray(embeddings, dtype='float32'))


In [42]:
# save the index to a file
faiss.write_index(vector_db, os.path.join(indexes_path, "test_index.faiss"))

In [43]:
# load the index from a file
vector_db=faiss.read_index(os.path.join(indexes_path, "test_index.faiss"))

In [44]:
# query the index
query_text="This is a test"
query_embedding=await fetch_embedding(query_text)
query_embedding_np = np.array(query_embedding.data[0].embedding).astype('float32').reshape(1, -1)
k_size=5
k=vector_db.ntotal
if k>k_size:
    k=k_size

D, I = vector_db.search(query_embedding_np, k)

In [45]:
# D holds the distances reported by the index, I the positions of the matching vectors.
# The recorded D values are the squares of the Euclidean distances computed with
# NumPy in the cross-check cell (e.g. 1.0885**2 ~= 1.1849), i.e. the index
# reports squared L2 distances.
print("D:",D)
print("I:",I)


D: [[2.1857386e-06 1.1849377e+00 1.6127548e+00 1.6167562e+00 1.6696446e+00]]
I: [[0 1 3 7 8]]


In [47]:
# Cross-check of the index results using plain NumPy.
query = np.asarray(query_embedding.data[0].embedding)
embedding_matrix = np.asarray(embeddings)

# Flatten the query so it broadcasts against every row of the matrix.
if query.ndim > 1:
    query = query.reshape(-1)

# Vectorised Euclidean distance from the query to each stored embedding (no Python loop).
distances = np.sqrt(((embedding_matrix - query) ** 2).sum(axis=1))

# Positions of the k closest embeddings; k comes from the query cell.
nearest_indices = distances.argsort()[:k]

In [49]:
# Display the Euclidean distance from the query to each row of embedding_matrix.
distances

array([0.00147842, 1.08854847, 1.34954656, 1.26994282, 1.33518695,
       1.34190124, 1.35120326, 1.27151728, 1.29214726, 1.29558511])

In [48]:
# Expected to agree with the I row returned by the index search; the recorded
# outputs of both cells show [0 1 3 7 8].
print("nearest_indices:",nearest_indices)

nearest_indices: [0 1 3 7 8]


# end of larger index creation

In [27]:
# save the index to a file
faiss.write_index(vector_db, os.path.join(indexes_path, "test_index.faiss"))


In [None]:
# load the index from a file
vector_db=faiss.read_index(os.path.join(indexes_path, "test_index.faiss"))


In [36]:
# query the index
query_text="This is a test"
query_embedding=await fetch_embedding(query_text)
query_embedding_np = np.array(query_embedding.data[0].embedding).astype('float32').reshape(1, -1)
k_size=5
k=vector_db.ntotal
if k>k_size:
    k=k_size

D, I = vector_db.search(query_embedding_np, k)

In [37]:
# D: distances reported by the index; I: positions of the matching vectors.
# The recorded output shows a single hit (index 0, distance ~0), i.e. the index
# held one vector when this cell was run.
print("D:",D)
print("I:",I)


D: [[2.1857386e-06]]
I: [[0]]
