In [1]:
import json

# Load JSON data
with open('WHO.json', 'r') as f:
    data = json.load(f)

# Check the structure of the data 
print(data[:2])  

[{'question': 'What are effective treatments for depression?', 'answer': 'Psychological treatment and medications.'}, {'question': 'What is the first line of treatment for depression?', 'answer': 'Psychological treatments.'}]


In [5]:
from tqdm import tqdm
def batch_data(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]
                

In [15]:
import google.generativeai as genai
import re
import json
import os
from dotenv import load_dotenv

load_dotenv()

def augment_data_with_gemini(batch):
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        return {"error": "API key not found. Please set GOOGLE_API_KEY in your .env file."}

    genai.configure(api_key=api_key)
    model=genai.GenerativeModel("gemini-1.5-flash")
    
       
    prompt = "Please add the following fields to this dataset: 'id' (unique identifier), 'category' (based on the question and answer), 'source' (placeholder: 'WHO'), and 'context' (a brief explanation or background for the answer). Example format:\n\n" \
             "Input:\n" \
             "[{'question': 'What are effective treatments for depression?', 'answer': 'Psychological treatment and medications.'}]\n" \
             "Output:\n" \
             "[{'id': '1', 'question': 'What are effective treatments for depression?', 'answer': 'Psychological treatment and medications.', 'category': 'Depression', 'source': 'WHO', 'context': 'Psychological treatments include cognitive-behavioral therapy and interpersonal therapy. Medications are prescribed based on severity.'}]\n\n" \
             "Now process the following data:\n" \
             f"{json.dumps(batch)}"
    
    response = model.generate_content(
        prompt,
            generation_config = genai.GenerationConfig(
            max_output_tokens=1000,
            temperature=0.1,
    ))

    try:
        # Extract the text part from the response
        text_part = response.candidates[0].content.parts[0].text
        
        # Use regex to extract the JSON content between [ and ]
        json_match = re.search(r'\[.*?\]', text_part, re.DOTALL)
        
        if json_match:
            # Extract the matched JSON string and parse it
            json_text = json_match.group(0)
            return json.loads(json_text)
        else:
            print("No JSON content found in the response.")
            return None
    
    except Exception as e:
        print(f"Error parsing response: {e}")
        return None



# Process the dataset in batches
augmented_data = []
batch_size = 5  
for batch in tqdm(batch_data(data, batch_size), desc="Processing Batches"):
    result = augment_data_with_gemini(batch)
    if result:
        augmented_data.extend(result)

# Save the augmented data to a new file
with open("WHO_augmented_data.json", "w") as f:
    json.dump(augmented_data, f, indent=4)

print("Augmentation completed. Augmented data saved to 'WHO_augmented_data.json'.")



Processing Batches: 153it [10:42,  4.20s/it]

Augmentation completed. Augmented data saved to 'augmented_data.json'.





In [5]:
import json
from sentence_transformers import SentenceTransformer
from chromadb import PersistentClient
from chromadb.config import Settings

# Load JSON data
with open("WHO_augmented_data.json", "r") as f:
    data = json.load(f)

# Initialize the embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize ChromaDB
client = PersistentClient(path="chroma_db")  # Directory to store the database

# Create or get a collection
collection = client.get_or_create_collection("eunoia")

# Generate embeddings and add to ChromaDB
for item in data:
    existing = collection.query(ids=[item_id], n_results=1)
    if existing["ids"]:
        print(f"Embedding with ID {item_id} already exists. Skipping.")
        continue  # Skip if the ID already exist
    # Combine question, answer, and context to form the embedding input
    text_to_embed = f"Q: {item['question']} A: {item['answer']} Context: {item['context']}"
    embedding = model.encode(text_to_embed)  # Generate the embedding

    # Add to ChromaDB collection
    collection.add(
        embeddings=[embedding],
        metadatas=[{
            "id": item["id"],
            "category": item["category"],
            "source": item["source"],
            "question": item["question"],
            "answer": item["answer"],
            "context": item["context"]
        }],
        ids=[item["id"]]  # Use the "id" field as the unique identifier
    )

print("Data successfully stored in ChromaDB.")


Insert of existing embedding ID: 1
Add of existing embedding ID: 1
Insert of existing embedding ID: 2
Add of existing embedding ID: 2
Insert of existing embedding ID: 3
Add of existing embedding ID: 3
Insert of existing embedding ID: 4
Add of existing embedding ID: 4
Insert of existing embedding ID: 5
Add of existing embedding ID: 5
Insert of existing embedding ID: 6
Add of existing embedding ID: 6
Insert of existing embedding ID: 7
Add of existing embedding ID: 7
Insert of existing embedding ID: 8
Add of existing embedding ID: 8
Insert of existing embedding ID: 9
Add of existing embedding ID: 9
Insert of existing embedding ID: 10
Add of existing embedding ID: 10
Insert of existing embedding ID: 11
Add of existing embedding ID: 11
Insert of existing embedding ID: 12
Add of existing embedding ID: 12
Insert of existing embedding ID: 13
Add of existing embedding ID: 13
Insert of existing embedding ID: 14
Add of existing embedding ID: 14
Insert of existing embedding ID: 15
Add of existing 

Data successfully stored in ChromaDB.
