## Explanation of the Code

1️⃣ **Fetch news from Mediastack**  
   - Fetches articles based on your API key and fetch limit (e.g., 10 articles).

2️⃣ **Check for paywalled articles**  
   - Skips articles from known paywalled domains (e.g., New York Times).

3️⃣ **Extract full article text**  
   - Attempts to extract text using `newspaper3k`, `Unstructured`, and `BeautifulSoup`.

4️⃣ **Store articles in JSON**  
   - Saves the articles to a dated JSON file in the `NEWS_FILES/` directory (e.g., `news_2025-02-18_1.json`).

5️⃣ **Convert text to embeddings**  
   - Uses the `SentenceTransformer` to generate embeddings for each article's text.

6️⃣ **Store embeddings in ChromaDB**  
   - Adds the generated embeddings into ChromaDB for semantic search.


---

## NOTE
The `chroma_db` directory currently lives one level above this notebook. If the directory structure changes, update `CHROMA_DB_PATH` to match (it is defined in the "Checking the ChromaDB" cell below).

### To install

```
pip install fake-useragent newspaper3k lxml_html_clean requests unstructured bs4 chromadb sentence-transformers
```

In [5]:
import os
import requests
import json
import time
from datetime import datetime
from unstructured.partition.html import partition_html
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from newspaper import Article

In [20]:
# CONFIGURATION

MAX_ARTICLES = 5  # Maximum number of articles requested from the API per run

# Prefer the MEDIASTACK_API_KEY environment variable so the secret does not have
# to live in the notebook; the literal fallback keeps existing runs working.
# NOTE(review): this key is committed in plain text - rotate it and remove the fallback.
API_KEY = os.environ.get("MEDIASTACK_API_KEY", "356bb7cd80f02083d604ba6ba1dfadd8")

# Request URL for the Mediastack news endpoint: US sources, capped at MAX_ARTICLES.
BASE_URL = f"http://api.mediastack.com/v1/news?access_key={API_KEY}&countries=us&limit={MAX_ARTICLES}"


## Run this if you want to check that the API key works

# print("🔍 Checking API Response:")
# print(json.dumps(response.json(), indent=4))  # Pretty-print the JSON response


In [21]:
# ✅ User-Agent source: `ua.random` supplies the header value for page requests
ua = UserAgent()

# ✅ Archive directory for the fetched JSON files (created if it is missing)
NEWS_DIR = "NEWS_FILES"
os.makedirs(NEWS_DIR, exist_ok=True)
# Domains whose articles are skipped because they sit behind a paywall.
paywalled_domains = {"nytimes.com", "washingtonpost.com", "theatlantic.com", "bloomberg.com"}

def is_paywalled(url):
    """Return True when *url* is hosted on one of the ``paywalled_domains``.

    Only the hostname is inspected, so a paywalled domain that merely appears
    in the path or query string (``?ref=nytimes.com``) or as part of a longer,
    unrelated host (``notbloomberg.com``) does not match. Subdomains such as
    ``www.nytimes.com`` do match, and the comparison is case-insensitive.

    Args:
        url: Article URL; a scheme-less value such as ``nytimes.com/x`` is accepted.

    Returns:
        bool: True for a paywalled host, False otherwise (including empty or
        unparsable input).
    """
    from urllib.parse import urlparse  # local import keeps this cell self-contained

    if not url:
        return False

    try:
        # Retry as a network-path reference so "nytimes.com/x" still yields a host.
        host = urlparse(url).hostname or urlparse(f"//{url}").hostname or ""
    except ValueError:
        # Malformed URLs (e.g. a broken IPv6 literal) cannot be classified.
        return False

    return any(host == domain or host.endswith(f".{domain}") for domain in paywalled_domains)

def extract_full_text(url, *, min_length=500):
    """Fetch *url* once and return the text from the first extractor that succeeds.

    The downloaded HTML is handed to newspaper3k, then Unstructured, then a
    BeautifulSoup ``<p>`` scrape. Each extractor runs in isolation so that a
    failure in one no longer prevents the remaining fallbacks from being tried.

    Args:
        url: Address of the article page.
        min_length: An extraction is accepted only when it is longer than this
            many characters.

    Returns:
        str: The extracted text, or a human-readable message when the page could
        not be fetched ("Error extracting content: ...", "Error: Page returned
        status code ...") or when no extractor produced enough text
        ("Content could not be extracted."). Exceptions are not propagated.
    """
    try:
        page = requests.get(url, headers={"User-Agent": ua.random}, timeout=10)
        if page.status_code != 200:
            return f"Error: Page returned status code {page.status_code}"
        html = page.text
    except Exception as e:  # boundary: callers expect a message, never an exception
        return f"Error extracting content: {str(e)}"

    def _via_newspaper():
        article = Article(url)
        # Reuse the HTML fetched above instead of downloading the page a second time.
        article.download(input_html=html)
        article.parse()
        return article.text

    def _via_unstructured():
        elements = partition_html(text=html)
        return "\n".join(el.text for el in elements if el.text.strip())

    def _via_paragraph_tags():
        soup = BeautifulSoup(html, "html.parser")
        return "\n".join(p.get_text() for p in soup.find_all("p"))

    extractors = (
        ("newspaper3k", _via_newspaper),
        ("Unstructured", _via_unstructured),
        ("BeautifulSoup", _via_paragraph_tags),
    )
    for name, extract in extractors:
        try:
            text = extract()
        except Exception as e:  # the parsers raise assorted exception types
            print(f"⚠️ {name} failed for {url}: {e}")
            continue
        if len(text) > min_length:
            return text

    return "Content could not be extracted."

# ✅ Fetch & Process News
# A timeout keeps the cell from hanging indefinitely if the API is unreachable.
response = requests.get(BASE_URL, timeout=10)
payload = response.json()
if "error" in payload:
    # Mediastack reports failures (invalid key, quota, ...) in an "error" object.
    print(f"⚠️ Mediastack API error: {payload['error']}")
news_data = (payload.get("data") or [])[:MAX_ARTICLES]

articles_list = []
total = len(news_data)
for position, article in enumerate(news_data, start=1):
    url = article.get("url", "")

    if not url:
        print("🚫 Skipping article without a URL.")
        continue
    if is_paywalled(url):
        print(f"🚫 Skipping paywalled article: {url}")
        continue

    print(f"🔍 [{position}/{total}] Processing: {url}")
    full_text = extract_full_text(url)

    # Mediastack delivers "source" as a plain string; a dict with a "name"
    # key is still accepted for backward compatibility.
    source = article.get("source")
    if isinstance(source, dict):
        source_name = source.get("name") or "Unknown source"
    elif isinstance(source, str) and source:
        source_name = source
    else:
        source_name = "Unknown source"

    articles_list.append({
        # `or` also replaces explicit nulls, which `.get(key, default)` lets through.
        "title": article.get("title") or "Unknown title",
        "url": url,
        "published_date": article.get("published_at") or "Unknown date",
        "source_name": source_name,
        "author": article["author"] if isinstance(article.get("author"), str) else "Unknown author",
        "category": article["category"] if isinstance(article.get("category"), str) else "Unknown category",
        "content": full_text
    })
    time.sleep(2)  # Throttle consecutive page downloads

# ✅ Save Archived JSON File
date_str = datetime.now().strftime("%Y-%m-%d")
json_files = [f for f in os.listdir(NEWS_DIR) if f.startswith(f"news_{date_str}")]
file_number = len(json_files) + 1
filename = f"news_{date_str}_{file_number}.json"
filepath = os.path.join(NEWS_DIR, filename)

# The file count is only a first guess: if an earlier archive was deleted, that
# number may already be taken, so advance until the name is free instead of
# overwriting an existing archive.
while os.path.exists(filepath):
    file_number += 1
    filename = f"news_{date_str}_{file_number}.json"
    filepath = os.path.join(NEWS_DIR, filename)

with open(filepath, "w", encoding="utf-8") as f:
    json.dump(articles_list, f, indent=4)

print(f"✅ {len(articles_list)} articles saved in '{filepath}'.")


🔍 [1/5] Processing: https://www.mymotherlode.com/news/europe/3536936/greece-says-over-100-migrants-are-detained-as-officials-link-the-surge-in-arrivals-to-mideast-wars.html
🔍 [2/5] Processing: https://www.naharnet.com/stories/en/311070-un-says-delay-in-israel-s-withdrawal-from-south-lebanon-violation-of-1701
🔍 [3/5] Processing: https://orissadiary.com/former-uk-pm-rishi-sunak-visits-parliament-house-with-family/
🔍 [4/5] Processing: https://nesn.com/2025/02/vladimir-guerrero-jr-rumors-these-teams-favorites-to-sign-slugger/
🔍 [5/5] Processing: https://www.denverpost.com/2025/02/18/colorado-weather-forecast-arctic-blast-snow-mountains-freezing-temperatures/
✅ 5 articles saved in 'NEWS_FILES\news_2025-02-18_3.json'.


### Checking the ChromaDB

In [14]:
import os
import chromadb

# ✅ Location of the persistent ChromaDB store and the collection used for articles
CHROMA_DB_PATH = "../chroma_db"
collection_name = "news_articles"

# ✅ Report whether the store directory is already there; create it otherwise
if os.path.exists(CHROMA_DB_PATH):
    print("✅ ChromaDB instance found. Using existing database.")
else:
    print("⚠️ ChromaDB instance not found. Creating a new one...")
    os.makedirs(CHROMA_DB_PATH)

# ✅ Open the client, then fetch the collection or create it when the lookup fails
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

try:
    collection = client.get_collection(collection_name)
except Exception:
    print(f"⚠️ Collection '{collection_name}' not found. Creating a new one...")
    collection = client.create_collection(collection_name)
else:
    print(f"✅ Collection '{collection_name}' exists.")

print("✅ ChromaDB is ready to use.")


✅ ChromaDB instance found. Using existing database.
✅ Collection 'news_articles' exists.
✅ ChromaDB is ready to use.


### Adding Entries to ChromaDB from the newly generated news.json file and archiving it

By default, only the most recent file in the `NEWS_FILES` directory is processed.
- Set `check_all_files = True` to process every file in the directory instead.

In [18]:
import os
import json
from sentence_transformers import SentenceTransformer

# ✅ CONFIGURATION
NEWS_DIR = "NEWS_FILES"
check_all_files = False  # Toggle: True to check all files, False to only process the latest file


def _archive_sort_key(name):
    """Order archive names by date, then by numeric run suffix.

    Archives are named ``news_<date>.json`` or ``news_<date>_<n>.json``. A plain
    string sort would rank ``_10`` before ``_9``, so the suffix is compared as
    an integer; names without a numeric suffix sort first for their date.
    """
    parts = name[:-len(".json")].split("_")
    date_part = parts[1] if len(parts) > 1 else ""
    run_number = int(parts[2]) if len(parts) > 2 and parts[2].isdigit() else 0
    return (date_part, run_number, name)


# ✅ Get Files to Process
# Only JSON archives are candidates; stray files in the directory are ignored.
archived_files = sorted(
    (f for f in os.listdir(NEWS_DIR) if f.endswith(".json")),
    key=_archive_sort_key,
)

if not archived_files:
    # Nothing to ingest: leave the list empty rather than failing on [-1].
    news_files = []
    print(f"⚠️ No news files found in {NEWS_DIR}")
elif check_all_files:
    news_files = archived_files  # Get all files
    print(f"🔍 Processing ALL {len(news_files)} news files in {NEWS_DIR}")
else:
    news_files = [archived_files[-1]]  # Get only the most recent file
    print(f"🔍 Processing LATEST file: {news_files[0]}")

# ✅ Load Embedding Model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# ✅ Get Existing Entries in ChromaDB to Avoid Duplicates
# Only the ids are needed, so ask Chroma not to load every stored document
# and metadata dict along with them.
existing_ids = set(collection.get(include=[])["ids"])
new_entries = []  # Entries queued for insertion into the collection
seen_ids = set()  # Local set to track duplicates within the batch

# ✅ Process Each File
# Article fields copied verbatim into the ChromaDB metadata of each entry.
metadata_fields = ("title", "url", "published_date", "source_name", "author", "category")

for news_file in news_files:
    news_filepath = os.path.join(NEWS_DIR, news_file)

    try:
        with open(news_filepath, encoding="utf-8") as handle:
            articles_list = json.load(handle)

        print(f"✅ Loaded {len(articles_list)} articles from {news_file}.")

        # ✅ Queue every article that is neither stored nor already queued
        for article in articles_list:
            article_id = article["url"]  # The URL doubles as the ChromaDB id

            already_known = article_id in existing_ids or article_id in seen_ids
            if already_known:
                print(f"🚫 Skipping duplicate: {article_id}")
                continue

            # Title and body are embedded together as a single document
            document = f"{article['title']} {article['content']}"
            vector = embedding_model.encode(document).tolist()
            metadata = {field: article[field] for field in metadata_fields}

            new_entries.append({
                "document": document,
                "metadata": metadata,
                "id": article_id,
                "embedding": vector
            })

            # ✅ Remember the id so a repeat later in this batch is skipped
            seen_ids.add(article_id)

    except Exception as e:
        # Any failure is reported and the loop moves on to the next file
        print(f"⚠️ Error processing {news_file}: {str(e)}")

# ✅ Store in ChromaDB
if not new_entries:
    print("🔹 No new articles added. Database is up to date.")
else:
    # Split the queued entries into the parallel lists that `collection.add` takes
    documents = [item["document"] for item in new_entries]
    metadatas = [item["metadata"] for item in new_entries]
    ids = [item["id"] for item in new_entries]
    embeddings = [item["embedding"] for item in new_entries]

    collection.add(documents=documents, metadatas=metadatas, ids=ids, embeddings=embeddings)
    print(f"✅ {len(new_entries)} new articles added to ChromaDB.")


🔍 Processing LATEST file: news_2025-02-18.json
✅ Loaded 10 articles from news_2025-02-18.json.
✅ 10 new articles added to ChromaDB.
