In [27]:
import chromadb
from openai import OpenAI
import pandas as pd
import json
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

In [28]:
# chroma_client = chromadb.Client()
# collection = chroma_client.get_or_create_collection(name='documents')
# collection.add(documents=[],ids=[],metadatas=[])

In [29]:
def load_reddit_json(path):
    with open(path, "r") as f:
        return json.load(f)

def reddit_to_documents(data, source_name):
    documents = []
    doc_id = 0

    for post in data:
        # --- Create post chunk ---
        post_text = f"Title: {post['title']}\n\nPost: {post['selftext']}"
        documents.append({
            "id": f"{source_name}_post_{doc_id}",
            "source": source_name,
            "type": "post",
            "text": post_text
        })
        doc_id += 1

        # --- Create comment chunks (group 3–5 comments each) ---
        comments = post.get("comments", [])
        group_size = 5  # you can experiment with 3–8

        for i in range(0, len(comments), group_size):
            group = comments[i:i+group_size]
            comment_text = ""

            for c in group:
                comment_text += f"{c['author']}: {c['body']}\n"

            documents.append({
                "id": f"{source_name}_comments_{doc_id}",
                "source": source_name,
                "type": "comments",
                "text": f"Comments for post '{post['title']}':\n" + comment_text
            })

            doc_id += 1

    return documents

In [30]:
def chunk_documents(docs):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "\n", ".", "!", "?", " "]
    )

    chunked = []
    for d in docs:
        for i, chunk in enumerate(splitter.split_text(d["text"])):
            chunked.append({
                "id": f"{d['id']}_chunk_{i}",
                "source": d["source"],
                "type": d["type"],
                "text": chunk
            })
    return chunked

In [39]:
def add_in_batches(collection, texts, embeddings, ids, metadatas, batch_size=5000):
    for i in range(0, len(texts), batch_size):
        end = i + batch_size
        print(f"Adding batch {i} to {end} ...")

        collection.add(
            documents=texts[i:end],
            embeddings=embeddings[i:end],
            ids=ids[i:end],
            metadatas=metadatas[i:end]
        )

    print("All batches successfully added!")

In [31]:
model = SentenceTransformer("all-MiniLM-L6-v2")

def embed(texts):
    return model.encode(texts, show_progress_bar=True)

In [32]:
def store_in_chroma(chunks):
    texts = [c["text"] for c in chunks]
    ids = [c["id"] for c in chunks]

    embeddings = embed(texts)

    collection.add(
        documents=texts,
        embeddings=embeddings,
        ids=ids,
        metadatas=chunks
    )

In [33]:
json_path = "/Users/Suvethika/Downloads/cookingforbeginners_data.json"#"/Users/Suvethika/Downloads/cookingforbeginners_data(limit=1000).json"

with open(json_path, "r") as f:
    data = json.load(f)

print(f"Loaded {len(data)} Reddit posts.")

Loaded 250 Reddit posts.


In [34]:
documents = reddit_to_documents(data, source_name="cookingforbeginners")
print(f"Created {len(documents)} document blocks.")

Created 500 document blocks.


In [35]:
chunks = chunk_documents(documents)
print(f"Created {len(chunks)} chunks.")

Created 914 chunks.


In [36]:
texts = [c["text"] for c in chunks]
ids = [c["id"] for c in chunks]
metadatas = [{"source": c["source"], "type": c["type"]} for c in chunks]

In [37]:
embeddings = embed(texts)
print(f"Created {len(embeddings)} embeddings.")

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

Created 914 embeddings.


In [40]:
chroma_client = chromadb.Client()

collection = chroma_client.get_or_create_collection(
    name="documents",
    metadata={"hnsw:space": "cosine"}
)

In [41]:
add_in_batches(
    collection,
    texts,
    embeddings,
    ids,
    metadatas,
    batch_size=5000
)

Adding batch 0 to 5000 ...
All batches successfully added!


In [42]:
results = collection.query(
    query_texts=["How do I stop onions from burning?"],
    n_results=5
)
print(results)

{'ids': [['cookingforbeginners_post_4_chunk_0', 'cookingforbeginners_comments_5_chunk_0', 'AskCulinary_post_108_chunk_2', 'cookingforbeginners_post_418_chunk_3', 'AskCulinary_comments_198_chunk_0']], 'embeddings': None, 'documents': [['Title: Helpful guide for onions!\n\nPost:', "Comments for post 'Helpful guide for onions!':\nClubLegend_Theater: Okay so I just started cooking about 2 months ago.  I'm learning a lot \n\n1. I have started growing my green onions.  It's been going well but I'm a few weeks in and today I chopped one for my rice and it was very sticky and wet.  I did some concerned googling and apparently they can get sappy if they are fresh and damp.", ". The two onions that were in there pretty much totally dissolved during the simmer - there were almost no traces that there had been onion in there at all after cooking everything - so I'm thinking that may be partially to blame.", 'While the chicken is browning, dice the onion roughly and mince the garlic.\n\nOnce the ch

In [43]:
json_path_2 = "/Users/Suvethika/Downloads/AskCulinary_data.json"

with open(json_path_2, "r") as f:
    data2 = json.load(f)

print(f"Loaded {len(data2)} Reddit posts from AskCulinary.")

Loaded 251 Reddit posts from AskCulinary.


In [44]:
documents2 = reddit_to_documents(data2, source_name="AskCulinary")
print(f"Created {len(documents2)} document blocks.")

Created 498 document blocks.


In [45]:
chunks2 = chunk_documents(documents2)
print(f"Created {len(chunks2)} chunks.")

Created 1123 chunks.


In [46]:
texts2 = [c["text"] for c in chunks2]
ids2 = [c["id"] for c in chunks2]
metadatas2 = [{"source": c["source"], "type": c["type"]} for c in chunks2]

In [47]:
embeddings2 = embed(texts2)
print(f"Created {len(embeddings2)} embeddings.")

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

Created 1123 embeddings.


In [48]:
add_in_batches(
    collection,
    texts2,
    embeddings2,
    ids2,
    metadatas2,
    batch_size=5000
)

Adding batch 0 to 5000 ...
All batches successfully added!


In [49]:
results = collection.query(
    query_texts=["What is the best way to mince garlic?"],
    n_results=5
)

print(results)

{'ids': [['cookingforbeginners_post_240_chunk_1', 'AskCulinary_comments_47_chunk_1', 'AskCulinary_post_462_chunk_1', 'AskCulinary_post_307_chunk_1', 'cookingforbeginners_post_240_chunk_0']], 'embeddings': None, 'documents': [["3 garlic bulbs, peeled to the cloves\n\n2 cups of olive oil (doesn't even matter if it's the cheap stuff, it's gonna taste amazing)\n\n\nThrow all of that in a saucepan on low low low heat for 20 minutes, or until you start seeing little bubbles start to come up in your oil.  Don't cook it so hot though that it starts to burn or carmalize the garlic.", 'Remove from heat and strain garlic. Reserve garlic for another use. Allow oil to cool and add remaining ingredients to oil. Use an immersion blender to disperse dry ingredients. Allow to sit 24 hours before use. Contents will settle so stir really well before using.', 'In Western cooking when frying anything you either dump the garlic with every other vegetable or only at the end after they soften.', "Saute the ve

In [50]:
results = collection.query(
    query_texts=["How to make a meringue?"],
    n_results=5
)
print(results)

{'ids': [['AskCulinary_comments_248_chunk_0', 'cookingforbeginners_comments_487_chunk_9', 'AskCulinary_comments_93_chunk_10', 'AskCulinary_comments_351_chunk_3', 'cookingforbeginners_post_274_chunk_0']], 'embeddings': None, 'documents': [["Comments for post 'I’m Kristen Miglore, Food52 founding editor ＆ Genius Recipes writer and host—I’m here to talk all things home cooking, from fixing kitchen fails to holiday meal prep. AMA!':\nTogapr33: So i made lemon meringue pie over the weekend for the first time...and Ive got two questions - Why was the lemond curd somewhat runny instead of gelatonous? and why are you supposed to slowly pour sugar into the egg whites for meringue?", 'Turn the heat down to low. Drizzle 1 TSP olive oil on the other side and flip. Add the toppings below and throw into the broiler on high for 2-3 minutes until cheese is melted and bubbly. Top with fresh dill!\n\n# Toppings & Sauce\n\n***Sauce:***\n\n* 1/4 cup Siggi’s 0% plain skyr\n* 1 TSP of Harissa (I used Trader

In [51]:
json_path = "/Users/Suvethika/Downloads/cookinghacks_data.json"#"/Users/Suvethika/Downloads/cookingforbeginners_data(limit=1000).json"
with open(json_path, "r") as f:
    data3 = json.load(f)
print(f"Loaded {len(data)} Reddit posts.")
documents3 = reddit_to_documents(data3, source_name="CookingHacks")
print(f"Created {len(documents3)} document blocks.")
chunks3 = chunk_documents(documents3)
print(f"Created {len(chunks3)} chunks.")
texts3 = [c["text"] for c in chunks3]
ids3 = [c["id"] for c in chunks3]
metadatas3 = [{"source": c["source"], "type": c["type"]} for c in chunks3]
embeddings3 = embed(texts3)
print(f"Created {len(embeddings3)} embeddings.")

Loaded 250 Reddit posts.
Created 200 document blocks.
Created 855 chunks.


Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Created 855 embeddings.


In [52]:
add_in_batches(
    collection,
    texts3,
    embeddings3,
    ids3,
    metadatas3,
    batch_size=5000
)

Adding batch 0 to 5000 ...
All batches successfully added!


In [53]:
json_path = "/Users/Suvethika/Downloads/AskBaking_data.json"#"/Users/Suvethika/Downloads/cookingforbeginners_data(limit=1000).json"
with open(json_path, "r") as f:
    data4 = json.load(f)
print(f"Loaded {len(data4)} Reddit posts.")
documents4 = reddit_to_documents(data4, source_name="CookingHacks")
print(f"Created {len(documents4)} document blocks.")
chunks4 = chunk_documents(documents4)
print(f"Created {len(chunks4)} chunks.")
texts4 = [c["text"] for c in chunks4]
ids4 = [c["id"] for c in chunks4]
metadatas4 = [{"source": c["source"], "type": c["type"]} for c in chunks4]
embeddings4 = embed(texts4)
print(f"Created {len(embeddings4)} embeddings.")

Loaded 250 Reddit posts.
Created 500 document blocks.
Created 760 chunks.


Batches:   0%|          | 0/24 [00:00<?, ?it/s]

Created 760 embeddings.


In [54]:
add_in_batches(
    collection,
    texts4,
    embeddings4,
    ids4,
    metadatas4,
    batch_size=5000
)

Adding batch 0 to 5000 ...
All batches successfully added!


In [55]:
json_path = "/Users/Suvethika/Downloads/Cooking_data.json"#"/Users/Suvethika/Downloads/cookingforbeginners_data(limit=1000).json"
with open(json_path, "r") as f:
    data5 = json.load(f)
print(f"Loaded {len(data5)} Reddit posts.")
documents5 = reddit_to_documents(data5, source_name="CookingHacks")
print(f"Created {len(documents5)} document blocks.")
chunks5 = chunk_documents(documents5)
print(f"Created {len(chunks5)} chunks.")
texts5 = [c["text"] for c in chunks5]
ids5 = [c["id"] for c in chunks5]
metadatas5 = [{"source": c["source"], "type": c["type"]} for c in chunks5]
embeddings5 = embed(texts5)
print(f"Created {len(embeddings5)} embeddings.")

Loaded 100 Reddit posts.
Created 199 document blocks.
Created 459 chunks.


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

Created 459 embeddings.


In [56]:
add_in_batches(
    collection,
    texts5,
    embeddings5,
    ids5,
    metadatas5,
    batch_size=5000
)

Adding batch 0 to 5000 ...
All batches successfully added!


In [57]:
results = collection.query(
    query_texts=["How to make a meringue?"],
    n_results=5
)
print(results)

{'ids': [['CookingHacks_post_282_chunk_0', 'CookingHacks_post_384_chunk_0', 'CookingHacks_post_282_chunk_1', 'CookingHacks_post_324_chunk_1', 'CookingHacks_post_282_chunk_2']], 'embeddings': None, 'documents': [['Title: What am I doing wrong my the meringue ?', 'Title: Help! Trying to make Swiss Meringue but got glue', "Post: I made lemon meringue pies but the meringue looks kinda sad :') \nWhat can I do differently to get the classic torched look with an oven (if it's even possible) ?\nIt's a French meringue with a tsp of lemon juice (I don't have creme of tartar).", 'Recipe: https://chelsweets.com/swiss-meringue-buttercream-frosting/', 'I used this recipe https://youtu.be/lwtxluLYqOI?si=VNSv6Qn5k6Z8vPuq (Preppy Kitchen - "The PERFECT lemon meringue pie recipe") but my meringue never got the consistency shown in the video even after mixing it for ~25 min, I suspect this was the issue but I\'m not sure why, should I\'ve just kept going ?']], 'uris': None, 'included': ['metadatas', 'doc