# Intro to ChromaDB 

In [2]:
import os
import chromadb
import json

import shutil  # ✅ Properly handles folder deletion

import requests
import time


### Step 1: ChromaDB setup
Check for an existing ChromaDB instance and create one if it's not there.

Optionally purge the database and rebuild it from the `news.json` file.

In [None]:
# Purpose of this cell: make sure a persistent ChromaDB collection exists
# (optionally wiping it first) and load every article from news.json into it,
# using the article URL as the document ID.

# 🔹 CONFIG: Set `purge_db = True` to DELETE & RESET the database
purge_db = False  # Set to True to delete existing DB and start fresh

# ✅ Define ChromaDB path
CHROMA_DB_PATH = "./chroma_db"
COLLECTION_NAME = "news_articles"

# 🔍 Check if ChromaDB exists
if not os.path.exists(CHROMA_DB_PATH):
    print("⚠️ ChromaDB instance not found. Creating a new one...")
    # exist_ok guards against the directory appearing between the check and the call
    os.makedirs(CHROMA_DB_PATH, exist_ok=True)
else:
    print("✅ ChromaDB instance found. Using existing database.")

# 🔥 Purge Database If `purge_db` is Enabled
if purge_db:
    print("⚠️ Purging existing ChromaDB...")
    shutil.rmtree(CHROMA_DB_PATH, ignore_errors=True)  # ✅ Remove the directory properly
    os.makedirs(CHROMA_DB_PATH, exist_ok=True)  # ✅ Recreate empty directory
    print("✅ ChromaDB successfully reset.")

# ✅ Reinitialize ChromaDB Client (needed after purge)
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

# ✅ Recreate Collection If Purged
if purge_db:
    # rmtree above runs with ignore_errors=True, so files may survive a failed
    # delete; dropping the collection explicitly is a best-effort second pass.
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        # Expected on a clean purge: there is no collection left to delete.
        pass
    collection = client.create_collection(COLLECTION_NAME)  # ✅ Recreate the collection
    print(f"✅ Collection '{COLLECTION_NAME}' recreated after purge.")
else:
    # ✅ Load existing collection
    # NOTE(review): the exception class raised for a missing collection appears
    # to differ between chromadb releases, hence the broad catch — confirm
    # against the installed version before narrowing it.
    try:
        collection = client.get_collection(COLLECTION_NAME)
        print(f"✅ Collection '{COLLECTION_NAME}' found.")
    except Exception:
        print(f"⚠️ Collection '{COLLECTION_NAME}' not found. Creating a new one...")
        collection = client.create_collection(COLLECTION_NAME)

print("✅ ChromaDB is ready to use.")

# ✅ Load articles from news.json
with open("news.json", "r", encoding="utf-8") as f:
    news_articles = json.load(f)

# ✅ Get existing document IDs in ChromaDB (only if NOT purging)
# include=[] asks for IDs only, so documents and metadata are not loaded
# just to build the lookup set.
existing_ids = set() if purge_db else set(collection.get(include=[])["ids"])
print(f"📊 Existing articles in ChromaDB: {len(existing_ids)}")

# ✅ Insert articles (skip duplicates)
# `seen_ids` also tracks IDs queued in this run: a URL repeated inside
# news.json would otherwise put the same ID twice into one add() call.
seen_ids = set(existing_ids)
new_entries = []
repeated_in_file = 0
for article in news_articles:
    article_id = article["url"]  # URL doubles as the unique document ID
    if article_id in seen_ids:
        if article_id not in existing_ids:
            repeated_in_file += 1  # Duplicate within the JSON file itself
        continue
    seen_ids.add(article_id)
    new_entries.append({
        "document": article["content"],
        "metadata": {"title": article["title"], "url": article["url"]},
        "id": article_id,
    })

if repeated_in_file:
    print(f"⚠️ Skipped {repeated_in_file} duplicate article(s) repeated within news.json.")

# ✅ Add new articles to ChromaDB
if new_entries:
    collection.add(
        documents=[entry["document"] for entry in new_entries],
        metadatas=[entry["metadata"] for entry in new_entries],
        ids=[entry["id"] for entry in new_entries],
    )
    print(f"✅ {len(new_entries)} new articles added to ChromaDB.")
else:
    print("🔹 No new articles added. Database is up to date.")


✅ ChromaDB instance found. Using existing database.
⚠️ Purging existing ChromaDB...
✅ ChromaDB successfully reset.
✅ Collection 'news_articles' recreated after purge.
✅ ChromaDB is ready to use.
📊 Existing articles in ChromaDB: 0
✅ 49 new articles added to ChromaDB.


### Checking for new articles to add

In [None]:

# Purpose of this cell: open (or create) the persistent collection and add
# only those articles from the JSON file whose URL is not already stored.

# ✅ Initialize ChromaDB Client
CHROMA_DB_PATH = "./chroma_db"
COLLECTION_NAME = "news_articles"

client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
collection = client.get_or_create_collection(COLLECTION_NAME)

# ✅ Load existing document IDs in ChromaDB
# include=[] asks for IDs only, so documents and metadata are not loaded
# just to build the lookup set.
existing_ids = set(collection.get(include=[])["ids"])  # Set for fast lookups
print(f"📊 Total existing articles in ChromaDB: {len(existing_ids)}")

# ✅ Load new articles from JSON
# NOTE(review): the setup cell reads "news.json" while this cell reads
# "RAG/news.json" — confirm which working directory the notebook runs from.
with open("RAG/news.json", "r", encoding="utf-8") as f:
    news_articles = json.load(f)

# ✅ Add only NEW articles to ChromaDB
# `seen_ids` also tracks IDs queued in this run: a URL repeated inside the
# JSON file would otherwise put the same ID twice into one add() call.
seen_ids = set(existing_ids)
new_entries = []
repeated_in_file = 0
for article in news_articles:
    article_id = article["url"]  # Using URL as a unique ID
    if article_id in seen_ids:
        if article_id not in existing_ids:
            repeated_in_file += 1  # Duplicate within the JSON file itself
        continue
    seen_ids.add(article_id)
    new_entries.append({
        "document": article["content"],
        "metadata": {"title": article["title"], "url": article["url"]},
        "id": article_id,
    })

if repeated_in_file:
    print(f"⚠️ Skipped {repeated_in_file} duplicate article(s) repeated within the JSON file.")

# ✅ Insert new entries if any
if new_entries:
    collection.add(
        documents=[entry["document"] for entry in new_entries],
        metadatas=[entry["metadata"] for entry in new_entries],
        ids=[entry["id"] for entry in new_entries],
    )
    print(f"✅ {len(new_entries)} new articles added to ChromaDB.")
else:
    print("🔹 No new articles to add. Database is up to date.")


📊 Total existing articles in ChromaDB: 49
🔹 No new articles to add. Database is up to date.


### Testing queries in the ChromaDB

In [6]:
from chromadb import Client

# Free-text question to run against the collection created in the earlier cells.
query = "Is democracy going well?"

# Request the three best matches; the response holds one result group per
# submitted query text.
results = collection.query(
    query_texts=[query],
    n_results=3,
)

# A single query was sent, so its matches are the first (and only) group.
top_matches = results["documents"][0]
for res in top_matches:
    print("🔹 Retrieved:", res)


🔹 Retrieved: Five years after the first wave of the COVID-19 pandemic, we’re starting to see how that event has accelerated mistrust toward some traditional news outlets. Yet, As trust in national news erodes, platforms like local radio appear to be holding onto credibility.

A new Pew Research Center report found 54% of Americans believe the media exaggerated COVID risks.

Pew reports this view was overwhelmingly held by Republicans, with 80% saying coverage overstated the dangers, compared to just 30% of Democrats. The pandemic also reinforced a partisan divide in confidence toward accessing reliable health information. While 74% of Democrats express confidence in their ability to find accurate information during a future health crisis, only 46% of Republicans share that view.

Overall, just 60% of Americans feel prepared to navigate a future health emergency, reflecting ongoing skepticism about official sources.

The study also highlights a broader shift in media trust, particularly

### Doing Analysis on the stored values in ChromaDB