In [1]:
# Bootstrap Django before any project model is imported in the cells below.
# NOTE(review): `setup` is a project-local helper; presumably init_django()
# configures the settings module and calls django.setup() — confirm in setup.py.
import setup

setup.init_django()

In [2]:
from decouple import config
from blog.models import BlogPost
from blog import services

In [3]:
# qs = BlogPost.objects.filter(can_delete=True)
# qs

In [4]:
# !pip install llama-index sqlalchemy llama-index-vector-stores-postgres

In [5]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

In [6]:
# Model names, embedding size and the API key are read through `config`
# so that nothing secret is hard-coded in the notebook.
# NOTE: the "EMEDDING" spelling is kept on purpose — both the environment keys
# and later cells refer to these exact names.
LLM_MODEL = config("LLM_MODEL", default="gpt-4o")  # chat model used to build `llm` below
EMEDDING_LENGTH = config("EMEDDING_LENGTH", default=1536, cast=int)
EMEDDING_MODEL = config("EMEDDING_MODEL", default="text-embedding-3-small")
OPENAI_API_KEY = config("OPENAI_API_KEY")  # no default is supplied for the key

# OpenAI clients shared by the rest of the notebook.
llm = OpenAI(
    api_key=OPENAI_API_KEY,
    model=LLM_MODEL,
)
embed_model = OpenAIEmbedding(
    api_key=OPENAI_API_KEY,
    model=EMEDDING_MODEL,
)

In [40]:
from typing import List

class MyOpenAIEmbedding(OpenAIEmbedding):
    """OpenAIEmbedding variant that prints each input before embedding it.

    Every hook writes its argument to stdout and then defers to the parent
    class, so the vectors returned are exactly those of OpenAIEmbedding.
    """

    def _get_query_embedding(self, query: str) -> List[float]:
        """Print the query text, then return the parent's embedding for it."""
        print('my query', query)
        # Idea kept from the original author (not active):
        # obj, created = Query.objects.get_or_create(query=query)
        # obj.get_query_embedding()
        query_vector = super()._get_query_embedding(query)
        return query_vector

    def _get_text_embedding(self, text: str) -> List[float]:
        """Print a single text, then return the parent's embedding for it."""
        print("texts", text)
        text_vector = super()._get_text_embedding(text)
        return text_vector

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Print a batch of texts, then return the parent's embeddings.

        The batch is forwarded unchanged to the parent implementation.
        """
        print("texts", texts)
        text_vectors = super()._get_text_embeddings(texts)
        return text_vectors
        
# Rebind `embed_model` to the printing subclass defined above, so the
# embedding model used from here on logs every input it is asked to embed.
embed_model = MyOpenAIEmbedding(model=EMEDDING_MODEL, api_key=OPENAI_API_KEY)

In [41]:
from llama_index.core import Settings

# Register the LLM and the embedding model on llama-index's `Settings` object.
# NOTE(review): presumably these act as process-wide defaults for the index
# and query engine built later — confirm against the llama-index docs.
Settings.llm = llm
Settings.embed_model = embed_model

In [42]:
# Name of the dedicated Postgres database that holds the vectors.
vector_db_name = "vector_db"
# Logical table name handed to the vector store; later in this notebook the
# table is queried as data_blogpost (i.e. with a "data_" prefix).
vector_db_table_name = "blogpost" # -> data_blogpost

In [43]:
# Read the pooled connection string and rewrite a legacy "postgres://" scheme
# to "postgresql://", the form handed to SQLAlchemy in the following cells.
DATABASE_URL = config("DATABASE_URL_POOL")
DATABASE_URL = (
    "postgresql://" + DATABASE_URL[len("postgres://"):]
    if DATABASE_URL.startswith("postgres://")
    else DATABASE_URL
)

In [44]:
# create a new database
# Create the vector database on first run and enable pgvector inside it.
from sqlalchemy import create_engine, make_url, text

# AUTOCOMMIT: CREATE DATABASE cannot be executed inside a transaction block.
engine = create_engine(DATABASE_URL, isolation_level="AUTOCOMMIT")
with engine.connect() as connection:
    result = connection.execute(
        text("SELECT 1 FROM pg_database WHERE datname = :db_name"),
        {"db_name": vector_db_name},
    )
    db_exists = result.scalar() == 1
    if not db_exists:
        # Identifiers cannot be bound parameters, so quote the name instead.
        # Quoting also keeps the created name identical to the one looked up
        # in pg_database above (no case folding).
        quoted_db_name = '"' + vector_db_name.replace('"', '""') + '"'
        connection.execute(text(f"CREATE DATABASE {quoted_db_name}"))

if not db_exists:
    # Extensions are installed per database, so the statement has to run on a
    # connection to the new database rather than the one in DATABASE_URL.
    # (The original called an undefined `session` here and raised NameError
    # before the database was ever created.)
    vector_engine = create_engine(
        make_url(DATABASE_URL).set(database=vector_db_name),
        isolation_level="AUTOCOMMIT",
    )
    try:
        with vector_engine.connect() as vector_connection:
            vector_connection.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
    finally:
        # This engine is only needed for the one-off statement above.
        vector_engine.dispose()

In [45]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

# Take host and credentials from DATABASE_URL, but point the store at the
# dedicated vector database and table configured above.
url = make_url(DATABASE_URL)
vector_store = PGVectorStore.from_params(
    # where to connect
    host=url.host,
    port=url.port or 5432,  # fall back to 5432 when the URL carries no port
    user=url.username,
    password=url.password,
    # what to store and where
    database=vector_db_name,
    table_name=vector_db_table_name,
    embed_dim=EMEDDING_LENGTH,
)

In [46]:
from llama_index.core import VectorStoreIndex, StorageContext

# Build an index on top of the existing vector store and derive the query
# engine that the later cells call.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)
index = VectorStoreIndex.from_vector_store(
    vector_store,
    storage_context=storage_context,
)
query_engine = index.as_query_engine()

In [52]:
from llama_index.core import Document

# One Document per BlogPost flagged can_delete=True. Each document reuses the
# embedding already stored on the model and records the post's pk and title
# as metadata.
qs = BlogPost.objects.filter(can_delete=True)
docs = [
    Document(
        text=f"{obj.get_embedding_text_raw()}",
        doc_id=str(obj.id),
        embedding=obj.embedding.tolist(),
        metadata={"pk": obj.pk, "title": obj.title},
    )
    for obj in qs
]

# docs

In [53]:
# Upsert: drop whatever is stored under each document's id, then insert the
# fresh copy, so re-running this cell should not accumulate duplicates.
# NOTE(review): assumes delete_ref_doc is a no-op for ids not yet stored — confirm.
for doc in docs:
    index.delete_ref_doc(f"{doc.id_}", delete_from_docstore=True)
    index.insert(doc)

In [49]:
# Run a sample query through the engine; the "my query ..." line printed
# below comes from MyOpenAIEmbedding._get_query_embedding.
response = query_engine.query("The dog jumped")

my query The dog jumped


In [16]:
# Print every metadata field of every entry attached to the response,
# one "name value" pair per line.
for entry_metadata in response.metadata.values():
    for field_name, field_value in entry_metadata.items():
        print(field_name, field_value)

pk 34
title Blog Post 1
pk 35
title Blog Post 2


In [54]:
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
import numpy as np

# Build the connection string for the vector database from the parsed URL.
# URL.create escapes special characters in the credentials (e.g. "@" or "/"
# in the password) and copes with a missing password, both of which plain
# string formatting gets wrong.
port = url.port or 5432
db_url = URL.create(
    "postgresql",
    username=url.username,
    password=url.password,
    host=url.host,
    port=port,
    database=vector_db_name,
).render_as_string(hide_password=False)

# Create the SQLAlchemy engine
engine = create_engine(db_url)

with engine.connect() as connection:
    # Only the metadata and embedding columns are needed by the comparison
    # in the following cells.
    query = text(f"SELECT metadata_, embedding FROM data_{vector_db_table_name}")

    # Execute the query and materialise every row before the connection closes.
    result = connection.execute(query)
    rows = result.fetchall()

In [55]:
def calculate_cosine_metrics(v1, v2):
    """Return cosine similarity and cosine distance as whole-number percentages.

    Args:
        v1: First vector (any 1-D sequence of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        A ``(similarity, distance)`` tuple of ints, where ``similarity`` is
        the cosine similarity times 100 and ``distance`` is
        ``(1 - similarity)`` times 100, each rounded to the nearest integer.

    Raises:
        ValueError: If either vector has zero magnitude, because the cosine
            is undefined in that case.
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)

    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        raise ValueError("cosine similarity is undefined for a zero-magnitude vector")

    cosine_similarity = float(np.dot(a, b) / norm_product)
    cosine_distance = 1.0 - cosine_similarity

    # Round rather than truncate: floating-point error routinely yields
    # 0.9999999999999998 for identical vectors, which int() would report as 99.
    return int(round(cosine_similarity * 100)), int(round(cosine_distance * 100))

In [56]:
# Sanity check: the embedding stored in the vector table should match the one
# stored on the corresponding BlogPost.
for row in rows:
    metadata_, embedding = row[0], row[1]
    blog_post_pk = metadata_.get("pk")
    obj = BlogPost.objects.get(pk=blog_post_pk)
    # The fetched embedding is handled as text of the form "[x,y,...]":
    # strip the brackets and parse the comma-separated values as floats.
    embedding_array = np.array(embedding.strip('[]').split(','), dtype=float)
    obj_embedding_array = np.array(obj.embedding, dtype=float)
    # Compare the vectors themselves. Passing `.shape` here, as the original
    # did, only compares two identical length tuples and always prints
    # (100, 0) no matter what the embeddings contain.
    print(calculate_cosine_metrics(embedding_array, obj_embedding_array))

(100, 0)
(100, 0)
(100, 0)
(100, 0)
