## Explanation of the Code

1️⃣ **Fetch news from Mediastack**  
   - Fetches articles based on your API key and fetch limit (e.g., 10 articles).

2️⃣ **Check for paywalled articles**  
   - Skips articles from known paywalled domains (e.g., New York Times).

3️⃣ **Extract full article text**  
   - Attempts to extract text using `newspaper3k`, `Unstructured`, and `BeautifulSoup`.

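4️⃣ **Store articles in JSON**  
   - Saves the articles in a JSON file. The fetch cell writes a numbered, dated file under `NEWS_FILES/` (e.g., `news_001_2025-02-26.json`); the embedding cell reads `news.json`.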

5️⃣ **Convert text to embeddings**  
   - Uses the `SentenceTransformer` to generate embeddings for each article's text.

6️⃣ **Store embeddings in ChromaDB**  
   - Adds the generated embeddings into ChromaDB for semantic search.


---

## NOTE

The `chroma_db` directory currently lives one level above this notebook. If the directory structure changes, the `CHROMA_DB_PATH` setting in the ChromaDB setup cell below must be updated to match.

### To install

pip install fake-useragent && pip install newspaper3k && pip install lxml_html_clean

In [6]:
import os
import requests
import json
import time
from datetime import datetime
from unstructured.partition.html import partition_html
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from newspaper import Article

# CONFIGURATION: Mediastack credentials and the number of articles to fetch.
# The key is taken from the MEDIASTACK_API_KEY environment variable when it is
# set, so the secret does not have to live in the notebook.
# NOTE(review): the literal fallback below is a credential committed to source;
# rotate it and drop the fallback once the environment variable is in use.
API_KEY = os.environ.get("MEDIASTACK_API_KEY", "356bb7cd80f02083d604ba6ba1dfadd8")
MAX_ARTICLES = 50  # Change this to limit results

# Mediastack request URL: US news only, capped at MAX_ARTICLES results.
# NOTE(review): the key travels over plain HTTP here — confirm whether the
# account's plan allows the https:// endpoint and switch if it does.
BASE_URL = f"http://api.mediastack.com/v1/news?access_key={API_KEY}&countries=us&limit={MAX_ARTICLES}"

# Domains whose articles are skipped because they sit behind a paywall.
paywalled_domains = ["nytimes.com", "washingtonpost.com", "theatlantic.com", "bloomberg.com"]

# Supplies a random User-Agent header for each page request.
ua = UserAgent()

# Output directory for the saved JSON files; created on first run.
NEWS_DIR = "NEWS_FILES"
os.makedirs(NEWS_DIR, exist_ok=True)

def is_paywalled(url):
    """Return True when *url* is hosted on one of the ``paywalled_domains``.

    The match is made against the URL's hostname (the domain itself or any
    subdomain of it), so a paywalled domain that merely appears in the path
    or query string of an unrelated site no longer triggers a skip.

    Args:
        url: Article URL as a string, with or without a scheme.

    Returns:
        bool: True if the host is a listed domain or a subdomain of one.
    """
    from urllib.parse import urlparse  # local import keeps the cell's import list untouched

    candidate = url.strip().lower()
    if "://" not in candidate:
        # Without a scheme urlparse would treat the host as part of the path.
        candidate = "//" + candidate

    try:
        host = urlparse(candidate).hostname or ""
    except ValueError:
        host = ""

    if not host:
        # Hostname could not be determined: fall back to the plain substring test.
        return any(domain in url for domain in paywalled_domains)

    return any(
        host == domain or host.endswith("." + domain)
        for domain in paywalled_domains
    )

def extract_full_text(url):
    """Extract the full article text from *url*.

    The page is fetched once with a random User-Agent and the same HTML is
    handed to three extractors in turn: newspaper3k, Unstructured, and a plain
    BeautifulSoup ``<p>`` scrape. The first result longer than 500 characters
    wins. A failure in one extractor no longer prevents the later ones from
    being tried.

    Args:
        url: Address of the article page.

    Returns:
        str: The extracted text, or a human-readable message starting with
        "Error" / "Content could not be extracted." when nothing usable was
        obtained. This function does not raise.
    """
    try:
        headers = {'User-Agent': ua.random}
        page = requests.get(url, headers=headers, timeout=10)

        if page.status_code != 200:
            return f"Error: Page returned status code {page.status_code}"

        html = page.text
        last_error = None  # most recent extractor failure, reported if nothing succeeds

        # Attempt 1: newspaper3k, fed the HTML already downloaded above so the
        # page is not requested a second time.
        try:
            article = Article(url)
            article.download(input_html=html)
            article.parse()
            if len(article.text) > 500:
                return article.text
        except Exception as e:  # best-effort: fall through to the next extractor
            last_error = e

        # Attempt 2: Unstructured (fallback)
        try:
            elements = partition_html(text=html)
            extracted_text = "\n".join([el.text for el in elements if el.text.strip()])
            if len(extracted_text) > 500:
                return extracted_text
        except Exception as e:  # best-effort: fall through to the last resort
            last_error = e

        # Attempt 3: BeautifulSoup (last resort) — concatenate every <p> element.
        soup = BeautifulSoup(html, "html.parser")
        paragraphs = soup.find_all("p")
        extracted_text = "\n".join([p.get_text() for p in paragraphs])
        if len(extracted_text) > 500:
            return extracted_text

        if last_error is not None:
            return f"Error extracting content: {str(last_error)}"
        return "Content could not be extracted."

    except Exception as e:
        # Network failures and anything unexpected are reported as text so the
        # calling loop can keep going.
        return f"Error extracting content: {str(e)}"

def _string_field(record, key, default):
    """Return ``record[key]`` when it is a string, otherwise *default*."""
    value = record.get(key)
    return value if isinstance(value, str) else default


def _source_name(record):
    """Return the article's source name from either a string or a ``{"name": ...}`` dict."""
    source = record.get("source")
    if isinstance(source, dict):
        source = source.get("name")
    # NOTE(review): Mediastack's response appears to give `source` as a plain
    # string — confirm against the API documentation.
    return source if isinstance(source, str) and source else "Unknown source"


# Fetch news from Mediastack. The timeout stops the cell from hanging forever
# on a stalled connection.
response = requests.get(BASE_URL, timeout=30)
payload = response.json()
if "error" in payload:
    # Surface API-level failures (bad key, quota, ...) instead of silently
    # saving an empty file.
    print(f"Mediastack API error: {payload['error']}")
news_data = (payload.get("data") or [])[:MAX_ARTICLES]  # Limit articles

articles_list = []

# Process each article: skip unusable ones, extract the text, normalise fields.
for i, article in enumerate(news_data):
    url = article.get("url", "")

    if not url:
        print("Skipping article with no URL.")
        continue

    if is_paywalled(url):
        print(f"üö´ Skipping paywalled article: {url}")
        continue

    print(f"üîç [{i+1}/{MAX_ARTICLES}] Processing: {url}")
    full_text = extract_full_text(url)

    # Every field is forced to a string so that null values from the API never
    # reach the JSON file (or the metadata built from it later).
    articles_list.append({
        "title": _string_field(article, "title", "Unknown title"),
        "url": url,
        "published_date": _string_field(article, "published_at", "Unknown date"),
        "source_name": _source_name(article),
        "author": _string_field(article, "author", "Unknown author"),
        "category": _string_field(article, "category", "Unknown category"),
        "content": full_text
    })

    time.sleep(2)  # Avoid being blocked by rate limits

# Generate filename with sequential numbering and date.
date_str = datetime.now().strftime("%Y-%m-%d")
existing_files = [f for f in os.listdir(NEWS_DIR) if f.startswith("news_") and f.endswith(".json")]
# Use the highest number already present (not just the file count) so that a
# deleted file cannot cause an existing number to be reused.
used_numbers = [
    int(parts[1])
    for parts in (f.split("_") for f in existing_files)
    if len(parts) > 1 and parts[1].isdigit()
]
next_number = max(len(existing_files), max(used_numbers, default=0)) + 1
filename = f"news_{next_number:03d}_{date_str}.json"
filepath = os.path.join(NEWS_DIR, filename)

# Save articles in JSON format.
with open(filepath, "w", encoding="utf-8") as f:
    json.dump(articles_list, f, indent=4)

print(f"‚úÖ {len(articles_list)} articles saved in '{filepath}'.")


üîç [1/50] Processing: https://www.sportingnews.com/us/mlb/new-york-yankees/news/yankees-325-million-slugger-sends-concerning-2-word-message-after-brutal-injury-news/c617bb400e40a6142b4f9e45
üîç [2/50] Processing: https://www.theyeshivaworld.com/news/liveblogs/live-blog/2365660/burchett-dc-paper-trail-to-reveal-corruption.html
üîç [4/50] Processing: https://www.ndtvprofit.com/ipos/innovatiview-india-plans-to-raise-rs-2000-crore
üîç [5/50] Processing: https://myedmondsnews.com/2025/02/sponsor-spotlight-maximize-the-child-and-dependent-care-credit/
üîç [6/50] Processing: https://www.theyeshivaworld.com/news/liveblogs/live-blog/2365656/emily-damari-prays-for-abducted-friends-at-kosel.html
üîç [7/50] Processing: https://sentinelcolorado.com/uncategorized/colorado-senate-dems-take-major-step-to-ease-unionization-process/
üîç [8/50] Processing: https://sentinelcolorado.com/metro/colorado-senate-dems-take-major-step-to-ease-unionization-process/
üîç [9/50] Processing: https://journalr

In [6]:
import os
import chromadb

# Location of the persistent ChromaDB store (one directory above this notebook).
CHROMA_DB_PATH = "../chroma_db"

# Report whether a database folder already exists, creating it when missing.
if not os.path.exists(CHROMA_DB_PATH):
    print("‚ö†Ô∏è ChromaDB instance not found. Creating a new one...")
    os.makedirs(CHROMA_DB_PATH, exist_ok=True)  # Create the folder if missing
else:
    print("‚úÖ ChromaDB instance found. Using existing database.")

# Open (or initialise) the on-disk database.
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

collection_name = "news_articles"

# Decide which status line to print by looking the name up explicitly rather
# than treating *any* exception from get_collection() as "not found".
# list_collections() yields Collection objects or bare names depending on the
# installed chromadb version; getattr() covers both shapes.
known_collections = {getattr(entry, "name", entry) for entry in client.list_collections()}
if collection_name in known_collections:
    print(f"‚úÖ Collection '{collection_name}' exists.")
else:
    print(f"‚ö†Ô∏è Collection '{collection_name}' not found. Creating a new one...")

# Returns the existing collection or creates it; genuine errors still raise.
collection = client.get_or_create_collection(collection_name)

print("‚úÖ ChromaDB is ready to use.")


‚úÖ ChromaDB instance found. Using existing database.
‚úÖ Collection 'news_articles' exists.
‚úÖ ChromaDB is ready to use.


In [7]:
import json
from sentence_transformers import SentenceTransformer

# Load the articles to embed.
# NOTE(review): the fetch cell in this notebook writes
# NEWS_FILES/news_<NNN>_<date>.json, not news.json — confirm that this is the
# file that should be embedded.
with open("news.json", "r", encoding="utf-8") as f:
    articles_list = json.load(f)

# Convert articles to embeddings and store them in ChromaDB.
print("üîÑ Converting articles to embeddings and storing them in ChromaDB...")

# Sentence-embedding model used for every article.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Only the ids are needed for the duplicate check, so ask ChromaDB not to load
# documents or metadata along with them.
existing_ids = set(collection.get(include=[])["ids"])

# Keep the first occurrence of each URL that is not already stored. The URL is
# the document id, and a repeated id within one add() call would be rejected.
pending_articles = {}
for article in articles_list:
    article_id = article["url"]
    if article_id not in existing_ids and article_id not in pending_articles:
        pending_articles[article_id] = article

# Encode all new texts in one batched call instead of one call per article.
texts = [f"{article['title']} {article['content']}" for article in pending_articles.values()]
embeddings = embedding_model.encode(texts).tolist() if texts else []

new_entries = [
    {
        "document": text,
        "metadata": {
            "title": article["title"],
            "url": article["url"],
            "published_date": article["published_date"],
            "source_name": article["source_name"],
            "author": article["author"],
            "category": article["category"]
        },
        "id": article_id,
        "embedding": embedding
    }
    for (article_id, article), text, embedding in zip(pending_articles.items(), texts, embeddings)
]

# Add new articles to ChromaDB.
if new_entries:
    collection.add(
        documents=[entry["document"] for entry in new_entries],
        metadatas=[entry["metadata"] for entry in new_entries],
        ids=[entry["id"] for entry in new_entries],
        embeddings=[entry["embedding"] for entry in new_entries]
    )
    print(f"‚úÖ {len(new_entries)} new articles added to ChromaDB.")
else:
    print("üîπ No new articles added. Database is up to date.")


üîÑ Converting articles to embeddings and storing them in ChromaDB...
‚úÖ Articles converted to embeddings and stored in ChromaDB.


# This cell calls shutil.rmtree and os.makedirs; import both here so it also
# runs in a fresh kernel (shutil is not imported by any other executable cell).
import os
import shutil

# CONFIG: Set `purge_db = True` to DELETE & RESET the database.
# NOTE(review): the flag is currently True, so running this cell wipes the
# directory below every time — confirm that is intended before "Run All".
purge_db = True  # Set to True to delete existing DB and start fresh

# Directory that is checked and, when purge_db is set, wiped.
# NOTE(review): the ChromaDB setup cell in this notebook opens "../chroma_db",
# whereas this cell targets "./chroma_db" — confirm which directory should be purged.
CHROMA_DB_PATH = "./chroma_db"
COLLECTION_NAME = "news_articles"

# Report whether the directory exists, creating it when missing.
if not os.path.exists(CHROMA_DB_PATH):
    print("‚ö†Ô∏è ChromaDB instance not found. Creating a new one...")
    os.makedirs(CHROMA_DB_PATH)
else:
    print("‚úÖ ChromaDB instance found. Using existing database.")

# Purge the database if `purge_db` is enabled.
if purge_db:
    print("‚ö†Ô∏è Purging existing ChromaDB...")
    shutil.rmtree(CHROMA_DB_PATH, ignore_errors=True)  # Remove the whole directory tree
    os.makedirs(CHROMA_DB_PATH, exist_ok=True)  # Recreate it empty
    print("‚úÖ ChromaDB successfully reset.")


In [1]:
# Disabled copy of the purge cell: the entire cell body is one triple-quoted
# string literal, so none of the statements inside it are executed.
'''

import shutil

# üîπ CONFIG: Set `purge_db = True` to DELETE & RESET the database
purge_db = True  # Set to True to delete existing DB and start fresh

# ‚úÖ Define ChromaDB path
CHROMA_DB_PATH = "./chroma_db"
COLLECTION_NAME = "news_articles"

# üîç Check if ChromaDB exists
if not os.path.exists(CHROMA_DB_PATH):
    print("‚ö†Ô∏è ChromaDB instance not found. Creating a new one...")
    os.makedirs(CHROMA_DB_PATH)
else:
    print("‚úÖ ChromaDB instance found. Using existing database.")

# üî• Purge Database If `purge_db` is Enabled
if purge_db:
    print("‚ö†Ô∏è Purging existing ChromaDB...")
    shutil.rmtree(CHROMA_DB_PATH, ignore_errors=True)  # ‚úÖ Remove the directory properly
    os.makedirs(CHROMA_DB_PATH, exist_ok=True)  # ‚úÖ Recreate empty directory
    print("‚úÖ ChromaDB successfully reset.")
    
'''

'\n\nimport shutil\n\n# üîπ CONFIG: Set `purge_db = True` to DELETE & RESET the database\npurge_db = True  # Set to True to delete existing DB and start fresh\n\n# ‚úÖ Define ChromaDB path\nCHROMA_DB_PATH = "./chroma_db"\nCOLLECTION_NAME = "news_articles"\n\n# üîç Check if ChromaDB exists\nif not os.path.exists(CHROMA_DB_PATH):\n    print("‚ö†Ô∏è ChromaDB instance not found. Creating a new one...")\n    os.makedirs(CHROMA_DB_PATH)\nelse:\n    print("‚úÖ ChromaDB instance found. Using existing database.")\n\n# üî• Purge Database If `purge_db` is Enabled\nif purge_db:\n    print("‚ö†Ô∏è Purging existing ChromaDB...")\n    shutil.rmtree(CHROMA_DB_PATH, ignore_errors=True)  # ‚úÖ Remove the directory properly\n    os.makedirs(CHROMA_DB_PATH, exist_ok=True)  # ‚úÖ Recreate empty directory\n    print("‚úÖ ChromaDB successfully reset.")\n    \n'