In [1]:
import asyncio, json, os, instructor, lancedb
from typing import List
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from openai import AsyncOpenAI, OpenAI
from pydantic import BaseModel
from dotenv import load_dotenv
load_dotenv(override=True)

  from .autonotebook import tqdm as notebook_tqdm


True

### 1. Generating synthetic products

In [2]:
# Synchronous OpenAI client wrapped by instructor; the key is read from the environment
# (a missing OPENAI_API_KEY raises KeyError here rather than failing later).
client = instructor.from_openai(OpenAI(api_key=os.environ['OPENAI_API_KEY']))

In [3]:
# Schema for one synthetic product. Documented with `#` comments rather than a docstring
# because the class is handed to the LLM client as part of a `response_model`.
class Product(BaseModel):
    title: str  # product name; the prompt asks for each title to be repeated 2-3 times
    description: str  # free-text description that distinguishes same-titled products

In [4]:
def read_txt(path, encoding="utf-8"):
    """Return the entire contents of a text file as a single string.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid in ``encoding``.
    """
    with open(path, "r", encoding=encoding) as f:
        return f.read()

In [5]:
# Load the product-generation prompt; it contains an `{n_products}` placeholder.
prompt_template = read_txt("products_prompt.txt")
# Preview only the first 300 characters of the filled-in prompt.
print(prompt_template.format(n_products=5)[:300] + "...")

Create a list of 5 products someone might buy at a hardware store based on the following instructions:
- Each product title should be repeated 2-3 times. Do not have any with duplicate product descriptions.
- Each product with a given title should have some small distinctions apparent from the descr...


#### Temperature=0.0

> Temperature is a technique used to redistribute the probabilities of the possible values. Intuitively, it reduces the probabilities of common tokens, and as a result, increases the probabilities of rarer tokens. This enables models to create more creative responses.

From [this article](https://huyenchip.com/2024/01/16/sampling.html#temperature)

In [6]:
def generate_products(n_products=100, model="gpt-4o", temperature=0.0) -> List[Product]:
    """Ask the LLM for a list of synthetic products.

    The prompt is built from the module-level ``prompt_template`` and sent
    through the module-level instructor ``client``, which parses the reply
    into ``Product`` objects.

    Args:
        n_products: Number of products requested in the prompt. The model does
            not honour this exactly, so the returned list may be shorter or
            longer than requested.
        model: Chat model to call.
        temperature: Sampling temperature passed to the API.

    Returns:
        The parsed products, or an empty list if the request or parsing failed
        (the error is printed rather than raised so the notebook keeps running).
    """
    prompt = prompt_template.format(n_products=n_products)
    try:
        return client.chat.completions.create(
            model=model,
            response_model=List[Product],
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
        )
    except Exception as e:
        # Deliberate best-effort: report what went wrong and hand back an empty list.
        print(f"Error generating products ({type(e).__name__}): {e}")
        return []

In [7]:
# Small test run with n_products=1. Because the prompt asks for each title to be
# repeated 2-3 times, the model returns several products even when asked for one
# (six in the recorded run: two titles with three variants each).
products = generate_products(n_products=1)

In [8]:
# Show every generated product: full title plus the first 100 characters of its description.
for prod in products:
    print(f"Title: {prod.title}")
    print(f"Desc: {prod.description[:100]}...")

Title: Cordless Drill
Desc: This 18V cordless drill offers a perfect balance of power and portability, making it ideal for both ...
Title: Cordless Drill
Desc: Our 20V cordless drill provides superior torque and extended battery life, perfect for heavy-duty ta...
Title: Cordless Drill
Desc: Experience the convenience of our 12V cordless drill, designed for light to medium tasks around the ...
Title: Adjustable Wrench
Desc: This 10-inch adjustable wrench is crafted from high-quality steel for durability and strength. The s...
Title: Adjustable Wrench
Desc: Our 8-inch adjustable wrench is lightweight yet robust, perfect for tight spaces and smaller tasks. ...
Title: Adjustable Wrench
Desc: The 12-inch adjustable wrench offers maximum leverage and is ideal for larger nuts and bolts. Its wi...


That was a small test run to check the output format. Next we generate the full list of products, which we will then use to create the product reviews.

In [9]:
# Full run with the default n_products=100; this rebinds `products`, replacing the test list.
products = generate_products()

In [10]:
# the model does not honour the requested count exactly: we asked for 100 products
# but this run produced 87
len(products)

87

### 2. Generating synthetic reviews for products

In [13]:
# Load the review-generation prompt; it is formatted with `n` and `product` below.
r_prompt_template = read_txt("reviews_prompt.txt")

In [15]:
# Fill the review prompt with a dummy product to preview the first 300 characters.
p = Product(title="chainsaw", description="something about chainsaw")
print(r_prompt_template.format(n=5, product=p)[:300] + "...")

Write 5 realistic but detailed / specific product reviews that might show up on a hardware store's website.

The reviews should be about the following product:
Product Title: chainsaw
Product Description: something about chainsaw

Add many relevant and concrete facts about the products (this is for ...


In [16]:
# Async instructor-wrapped client, awaited by the review-generation coroutine.
# NOTE(review): no api_key is passed here, unlike the sync `client` — presumably the
# SDK picks the key up from the environment; confirm.
async_client = instructor.from_openai(AsyncOpenAI())

In [22]:
# Shape of a single review as returned by the LLM (used as the `response_model` element type).
# Kept free of docstrings on purpose: only `#` comments, so the model class itself is unchanged.
class Review(BaseModel):
    review: str  # the review text

# A review joined with the product it was generated for.
class ProductReviews(BaseModel):
    product_title: str  # copied from Product.title
    product_description: str  # copied from Product.description
    review: str  # the review text

In [23]:
reviews_per_product = 10


async def make_reviews(
    product: Product,
    semaphore: asyncio.Semaphore,
    model: str = "gpt-4o",
    temperature: float = 0.0,
) -> List[ProductReviews]:
    """Generate synthetic reviews for one product.

    The request is made while holding ``semaphore``, so the number of
    concurrent API calls is bounded by whoever created the semaphore. The
    number of reviews asked for is the module-level ``reviews_per_product``,
    read at call time.

    Args:
        product: Product the reviews should be about.
        semaphore: Shared semaphore limiting concurrent requests.
        model: Chat model to call.
        temperature: Sampling temperature passed to the API.

    Returns:
        One ``ProductReviews`` per review returned by the model, each carrying
        the product's title and description. An empty list is returned if the
        request or parsing fails; the error is printed, not raised.
    """
    async with semaphore:
        prompt = r_prompt_template.format(n=reviews_per_product, product=product)
        try:
            result = await async_client.chat.completions.create(
                model=model,
                response_model=List[Review],
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
            )
            return [
                ProductReviews(
                    product_title=product.title,
                    product_description=product.description,
                    review=r.review,
                )
                for r in result
            ]

        except Exception as e:
            # Many of these run concurrently, so say which product failed.
            print(f"Error generating reviews for {product.title!r}: {e}")
            return []

In [24]:
async def create_synthetic_reviews(max_concurrency: int = 4) -> List[ProductReviews]:
    """Generate reviews for every item in the module-level ``products`` list.

    One ``make_reviews`` task is created per product; a shared semaphore keeps
    at most ``max_concurrency`` requests in flight at once.

    Args:
        max_concurrency: Maximum number of simultaneous review requests.

    Returns:
        A flat list with the reviews of all products whose task succeeded.
        Tasks that ended in an exception are reported and skipped.
    """
    semaphore = asyncio.Semaphore(max_concurrency)
    tasks = [make_reviews(o, semaphore) for o in products]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    out = []
    failed = 0
    for r in results:
        # With return_exceptions=True a cancelled task shows up as CancelledError,
        # which derives from BaseException (not Exception); test for the base class
        # so it is skipped instead of being passed to extend().
        if isinstance(r, BaseException):
            failed += 1
            print(f"Error: review task failed with {type(r).__name__}: {r}")
            continue
        out.extend(r)
    if failed:
        print(f"{failed} of {len(results)} review tasks failed and were skipped")
    return out

# Generate reviews for all products with the default concurrency limit.
# Top-level `await` is used directly in the cell, so this line only works in an
# environment that supports it (such as a notebook kernel), not in a plain script.
reviews = await create_synthetic_reviews()

In [31]:
# Spot-check the text of one generated review.
reviews[2].review

"I purchased this 18V cordless drill for some home improvement tasks, and it has been fantastic. The motor is strong enough to handle any job I've thrown at it, from drilling into brick to assembling furniture. The battery life is excellent, and it charges quickly. The comfortable grip and lightweight design make it easy to use for extended periods without fatigue. A great investment for any DIY enthusiast."

### 3. Storing the items to be retrieved in LanceDB

In [32]:
# Connect to a LanceDB database stored in the local ./lancedb directory.
db = lancedb.connect("./lancedb")
# OpenAI embedding function (text-embedding-3-small) referenced by the table schemas.
func = get_registry().get("openai").create(name="text-embedding-3-small")

In [33]:
# LanceDB table schema for products, tied to the embedding function `func`.
# NOTE(review): `id`, `title` and `description` are all declared with `func.SourceField()`
# while there is a single `vector` column — confirm which column LanceDB actually embeds
# into `vector`, and whether `id` is meant to be an embedding source at all.
class Products(LanceModel):
    id: str = func.SourceField()
    title: str = func.SourceField()
    description: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()  # vector width comes from the embedding function

In [36]:
# (Re)create the products table from the generated products; ids are the list positions as strings.
products_table = db.create_table("products", schema=Products, mode="overwrite")
products_data = [
    {"id": f"{i}", "title": obj.title, "description": obj.description}
    for i, obj in enumerate(products)
]
products_table.add(products_data)
# Full-text search index over the product descriptions.
products_table.create_fts_index("description", replace=True)
# Build the title -> id lookup from the rows just inserted instead of reading the whole
# table (vectors included) back through pandas; the resulting mapping is the same.
# NOTE: titles are repeated on purpose (2-3 products per title), so this mapping keeps
# only the last id seen for each title. Do not use it to identify an individual product.
product_id_map = {row["title"]: row["id"] for row in products_data}

In [37]:
# LanceDB table schema for reviews, tied to the embedding function `func`.
# NOTE(review): four fields are declared with `func.SourceField()` while there is a
# single `vector` column — confirm which column LanceDB actually embeds into `vector`
# (presumably `review` is the intended one).
class Reviews(LanceModel):
    id: str = func.SourceField()
    product_title: str = func.SourceField()
    product_description: str = func.SourceField()
    review: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()  # vector width comes from the embedding function

In [38]:
# (Re)create the reviews table using the Reviews schema.
reviews_table = db.create_table("reviews", schema=Reviews, mode="overwrite")

# One record per generated review; the id is the review's position in `reviews` as a string.
# NOTE(review): despite the variable name, these records carry the product title and
# description but no product id — confirm whether an id was meant to be included.
reviews_with_product_id = [
    {
        "id": f"{i}",
        "product_title": review.product_title,
        "product_description": review.product_description,
        "review": review.review,
    }
    for i, review in enumerate(reviews)
]
reviews_table.add(reviews_with_product_id)
# Full-text search index over the review text.
reviews_table.create_fts_index("review", replace=True)

[2024-08-26T19:19:41Z WARN  lance::dataset] No existing dataset at /Users/dipamvasani/Desktop/Desktop - Dipam’s MacBook Air/coding/systematically-improving-rag/week1_bootstrap_evals/lancedb/reviews.lance, it will be created


In case you want to see the data quickly in a text editor, we also store the data in JSON.

In [39]:
# Save all reviews as a JSON array of objects for quick inspection in a text editor.
with open("./reviews.json", "w", encoding="utf-8") as f:
    json.dump(
        # `.dict()` is deprecated in Pydantic v2; prefer `model_dump()` when it exists
        # and fall back to `.dict()` on older versions.
        [r.model_dump() if hasattr(r, "model_dump") else r.dict() for r in reviews],
        f,
    )

In [None]:
# TODO:
# - learn more about LanceDB through its documentation and review how we created the tables
# - check whether these tables can be viewed in the UI
# - push the notebook to GitHub