In [1]:
# Project-local bootstrap module for running Django code from this notebook.
import setup

# Run this cell first: later cells import Django models (blog.models) and use the ORM.
# Presumably this configures settings and calls django.setup() — confirm in setup.py.
setup.init_django()

In [2]:
from decouple import config
from django.db import transaction
from openai import OpenAI

from blog.models import BlogPost
# Name of the embedding model, looked up through decouple's config() under the
# key "EMEDDING_MODEL", with a fallback default when the key is not set.
# NOTE(review): "EMEDDING" looks like a typo of "EMBEDDING". The key string is
# what is looked up, so renaming it also means updating the environment/.env.
EMEDDING_MODEL = config("EMEDDING_MODEL", default="text-embedding-3-small")

# When True, the sample BlogPost rows are deleted and re-inserted further down.
RECREATE_DATA = True

In [3]:
# Small sample corpus; each sentence becomes the content of one BlogPost.
# The first two sentences use the same words in a different order.
docs = [
    "The dog jumped over the cat",
    "The cat jumped over the dog",
    "It is very warm today",
    "The cat is yellow and the dog is red",
]

In [4]:
# Build one unsaved BlogPost per sample sentence, titled "Blog Post 1", "Blog Post 2", ...
# can_delete=True marks the rows so the reset below only removes rows flagged this way.
new_data = [
    BlogPost(title=f"Blog Post {number}", content=text, can_delete=True)
    for number, text in enumerate(docs, start=1)
]

if RECREATE_DATA:
    # Delete and re-insert inside a single transaction: if bulk_create() fails,
    # the delete is rolled back instead of leaving the sample rows missing.
    with transaction.atomic():
        qs = BlogPost.objects.filter(can_delete=True)
        qs.delete()
        BlogPost.objects.bulk_create(new_data)

In [5]:
# Re-query the deletable sample posts; this queryset is iterated again by the
# embedding loop in a later cell.
qs = BlogPost.objects.filter(can_delete=True)
# Bare last expression: the row count is displayed as the cell output.
qs.count()

4

In [6]:
# OpenAI API client; the key is read from configuration via decouple rather
# than being hardcoded in the notebook.
client = OpenAI(api_key=config("OPENAI_API_KEY"))

In [7]:
def get_embedding(text, model=EMEDDING_MODEL):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced by spaces and surrounding whitespace is
    stripped before the request is sent.

    Args:
        text: The string to embed.
        model: Embedding model name. The default is the value EMEDDING_MODEL
            had when this function was defined.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = " ".join(text.split("\n")).strip()
    # The input is sent as a one-element list, so the result is at data[0].
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [8]:
# Display the embedding model name that was resolved from configuration.
EMEDDING_MODEL

'text-embedding-3-small'

In [9]:
# Compute and store an embedding for every post in qs that does not have one.
for post in qs:
    if post.embedding is not None:
        continue  # already embedded; skip the API call
    raw_text = post.get_embedding_text_raw()
    post.embedding = get_embedding(raw_text)
    post.save()

In [21]:
# Free-text probe for the similarity search; it matches none of the sample
# sentences verbatim.
query = "The dog jumped over the green cow"
# Alternative probe that is identical to one of the sample sentences:
# query = "The dog jumped over the cat"
# Embed the probe with the same helper (and default model) used for the posts.
query_embedding = get_embedding(query)

In [22]:
from pgvector.django import CosineDistance
from django.db.models import F

# Rank posts by cosine distance between their stored embedding and the query
# embedding (smaller distance first) and keep the two closest.
# similarity is derived in the database as 1 - distance.
qs = (
    BlogPost.objects
    .annotate(distance=CosineDistance("embedding", query_embedding))
    .annotate(similarity=1 - F("distance"))
    .order_by("distance")[:2]
)
for obj in qs:
    # Columns: title, cosine distance, similarity as a percentage.
    print(f"{obj.title} {obj.distance} {obj.similarity * 100}")

Blog Post 2 0.2821284104662174 71.78715895337827
Blog Post 1 0.2862588550525559 71.37411449474442
