## Explanation of the Code

1️⃣ **Fetch news from Mediastack**  
   - Fetches articles based on your API key and fetch limit (e.g., 10 articles).

2️⃣ **Check for paywalled articles**  
   - Skips articles from known paywalled domains (e.g., New York Times).

3️⃣ **Extract full article text**  
   - Attempts to extract text with `newspaper3k`, then `Unstructured`, then `BeautifulSoup`, falling back to the next one whenever the previous yields too little text.

4️⃣ **Store articles in JSON**  
   - Saves the articles in a JSON file (`news.json`).

5️⃣ **Convert text to embeddings**  
   - Uses the `SentenceTransformer` to generate embeddings for each article's text.

6️⃣ **Store embeddings in ChromaDB**  
   - Adds the generated embeddings into ChromaDB for semantic search.


---

## NOTE: The `chroma_db` directory currently lives one level up from this notebook. If the project structure changes, the ChromaDB path used in the code needs to change too — find it below.

### to install

pip install fake-useragent && pip install newspaper3k && pip install lxml_html_clean

In [5]:
import requests
import json
import time
import chromadb
from sentence_transformers import SentenceTransformer
from unstructured.partition.html import partition_html
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from newspaper import Article

import os

# 🔹 CONFIGURATION: Define the Mediastack API and the number of articles to fetch
# SECURITY: a key committed inside a notebook is effectively public. Supply it
# through the MEDIASTACK_API_KEY environment variable instead; the literal below
# is kept only as a backward-compatible fallback and should be rotated/removed.
API_KEY = os.environ.get("MEDIASTACK_API_KEY") or "356bb7cd80f02083d604ba6ba1dfadd8"
MAX_ARTICLES = 50  # You can change this to 5, 10, etc.

# Mediastack Base URL (US news only, capped at MAX_ARTICLES results)
BASE_URL = f"http://api.mediastack.com/v1/news?access_key={API_KEY}&countries=us&limit={MAX_ARTICLES}"

# Domains known to sit behind a paywall; their articles are skipped, not scraped
paywalled_domains = [
    "nytimes.com",
    "washingtonpost.com",
    "theatlantic.com",
    "bloomberg.com",
]

# Supplies a random User-Agent header value for each page request
ua = UserAgent()

def is_paywalled(url):
    """Check if the article is hosted on a known paywalled domain.

    Only the URL's hostname is compared, so a paywalled domain that merely
    appears in the path or query string (e.g. ``?ref=nytimes.com``) or inside
    a longer host name (e.g. ``notbloomberg.com``) does not trigger a skip.
    Subdomains such as ``www.nytimes.com`` still match.

    Args:
        url: Article URL as a string.

    Returns:
        True if the host equals, or is a subdomain of, an entry in
        ``paywalled_domains``; False otherwise (including for an empty URL).
    """
    from urllib.parse import urlparse

    if not url:
        return False

    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # Scheme-less input ("nytimes.com/story") only gets a netloc
            # when it is parsed as a network-path reference.
            parsed = urlparse(f"//{url}")
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Unparseable URL: fall back to the plain substring check.
        return any(domain in url for domain in paywalled_domains)

    return any(
        host == domain or host.endswith(f".{domain}")
        for domain in paywalled_domains
    )

def extract_full_text(url):
    """Extract full article text using newspaper3k, Unstructured, and BeautifulSoup.

    The page is fetched once with a random User-Agent and the same HTML is
    handed to each extractor in turn. Every extractor runs in its own ``try``
    so that a failure in one falls through to the next instead of aborting
    the whole extraction.

    Args:
        url: Address of the article page.

    Returns:
        The extracted text from the first extractor that yields more than
        500 characters. Otherwise one of the sentinel strings:
        ``"Error: Page returned status code <n>"`` for a non-200 response,
        ``"Error extracting content: <reason>"`` when the fetch or an
        extractor raised, or ``"Content could not be extracted."``.
    """
    min_chars = 500  # anything shorter is treated as a failed extraction

    try:
        headers = {'User-Agent': ua.random}
        page = requests.get(url, headers=headers, timeout=10)
    except Exception as e:
        return f"Error extracting content: {str(e)}"

    if page.status_code != 200:
        return f"Error: Page returned status code {page.status_code}"

    html = page.text

    def _with_newspaper():
        # Attempt 1: newspaper3k (best for full-text extraction).
        # Reuse the HTML already downloaded rather than fetching the page again.
        article = Article(url)
        article.download(input_html=html)
        article.parse()
        return article.text

    def _with_unstructured():
        # Attempt 2: Unstructured (fallback)
        elements = partition_html(text=html)
        return "\n".join(el.text for el in elements if el.text.strip())

    def _with_beautifulsoup():
        # Attempt 3: BeautifulSoup (last resort) - concatenate all <p> tags
        soup = BeautifulSoup(html, "html.parser")
        return "\n".join(p.get_text() for p in soup.find_all("p"))

    last_error = None
    for extractor in (_with_newspaper, _with_unstructured, _with_beautifulsoup):
        try:
            text = extractor()
        except Exception as e:
            # Best-effort: remember the failure and try the next extractor.
            last_error = e
            continue
        if len(text) > min_chars:
            return text

    if last_error is not None:
        return f"Error extracting content: {str(last_error)}"
    return "Content could not be extracted."

# 🔹 Fetch news from Mediastack
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops the run on an HTTP error instead of silently
# overwriting news.json with an empty list.
response = requests.get(BASE_URL, timeout=30)
response.raise_for_status()
# `or []` also covers a payload where "data" is present but null
news_data = (response.json().get("data") or [])[:MAX_ARTICLES]  # Limit articles

articles_list = []
total_articles = len(news_data)  # may be fewer than MAX_ARTICLES

# 🔹 Process each article
for i, article in enumerate(news_data, start=1):
    url = article.get("url", "")

    if not url:
        print(f"🚫 [{i}/{total_articles}] Skipping article without a URL.")
        continue

    if is_paywalled(url):
        print(f"🚫 Skipping paywalled article: {url}")
        continue

    print(f"🔍 [{i}/{total_articles}] Processing: {url}")
    full_text = extract_full_text(url)

    articles_list.append({
        # `or` also replaces an explicit null title, which would otherwise
        # break the later title + content concatenation
        "title": article.get("title") or "Unknown title",
        "url": url,
        "content": full_text
    })

    time.sleep(2)  # Avoid being blocked by rate limits

# 🔹 Save articles in JSON format
with open("news.json", "w", encoding="utf-8") as f:
    json.dump(articles_list, f, indent=4)

print(f"✅ Articles saved in 'news.json'.")



🔍 [1/50] Processing: https://www.marketbeat.com/instant-alerts/deere-company-nysede-shares-sold-by-mathes-company-inc-2025-02-17/
🔍 [2/50] Processing: https://www.marketbeat.com/instant-alerts/mathes-company-inc-has-416-million-position-in-the-procter-gamble-company-nysepg-2025-02-17/
🔍 [3/50] Processing: https://www.marketbeat.com/instant-alerts/servicenow-inc-nysenow-shares-sold-by-mathes-company-inc-2025-02-17/
🔍 [4/50] Processing: https://www.marketbeat.com/instant-alerts/mathes-company-inc-sells-421-shares-of-parker-hannifin-co-nyseph-2025-02-17/
🔍 [5/50] Processing: https://www.marketbeat.com/instant-alerts/mathes-company-inc-sells-1277-shares-of-general-electric-nysege-2025-02-17/
🔍 [6/50] Processing: https://www.marketbeat.com/instant-alerts/mathes-company-inc-has-746-million-stock-position-in-ge-vernova-inc-nysegev-2025-02-17/
🔍 [7/50] Processing: https://www.marketbeat.com/instant-alerts/woodard-co-asset-management-group-inc-adv-invests-712000-in-vertiv-holdings-co-nysevrt-20

In [6]:
import os
import chromadb

## NOTE: the chroma_db directory currently lives one level up; if the project
## structure changes, CHROMA_DB_PATH (right below) needs to change too.

# Location of the persistent ChromaDB store, relative to this notebook
CHROMA_DB_PATH = "../chroma_db"

# 🔍 Make sure the storage folder is there before opening the database
if os.path.exists(CHROMA_DB_PATH):
    print("✅ ChromaDB instance found. Using existing database.")
else:
    print("⚠️ ChromaDB instance not found. Creating a new one...")
    os.makedirs(CHROMA_DB_PATH)  # Create the folder if missing

# ✅ Open the on-disk ChromaDB client
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

# ✅ Reuse the collection when it is already there, otherwise create it
collection_name = "news_articles"

try:
    collection = client.get_collection(collection_name)
except Exception:
    # Any lookup failure is treated as "collection missing"
    print(f"⚠️ Collection '{collection_name}' not found. Creating a new one...")
    collection = client.create_collection(collection_name)
else:
    print(f"✅ Collection '{collection_name}' exists.")

print(f"✅ ChromaDB is ready to use.")

✅ ChromaDB instance found. Using existing database.
✅ Collection 'news_articles' exists.
✅ ChromaDB is ready to use.


In [7]:
# 🔹 INTEGRATION WITH CHROMADB (Embeddings)
print("🔄 Converting articles to embeddings and storing them in ChromaDB...")

# Load the embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Sentinel strings that extract_full_text() returns when scraping failed.
# Indexing them would fill the semantic search with error messages.
FAILED_EXTRACTION_PREFIXES = (
    "Error: Page returned status code",
    "Error extracting content:",
    "Content could not be extracted.",
)

# Keep one successfully scraped entry per URL: the URL is the ChromaDB id,
# and ids must be unique within a single write.
indexable_articles = {}
for article in articles_list:
    if article["content"].startswith(FAILED_EXTRACTION_PREFIXES):
        continue
    indexable_articles[article["url"]] = article

if indexable_articles:
    article_ids = list(indexable_articles)
    # Guard against a missing title so concatenation and metadata stay valid
    article_titles = [a["title"] or "Unknown title" for a in indexable_articles.values()]
    article_texts = [
        title + " " + a["content"]
        for title, a in zip(article_titles, indexable_articles.values())
    ]

    # Encode every article in one batch rather than one call per article
    article_embeddings = embedding_model.encode(article_texts).tolist()

    # upsert (not add) so re-running the cell refreshes URLs that are already
    # stored instead of colliding with their existing ids
    collection.upsert(
        ids=article_ids,
        embeddings=article_embeddings,
        metadatas=[
            {"title": title, "url": url}
            for title, url in zip(article_titles, article_ids)
        ],
        documents=article_texts,
    )

skipped_count = len(articles_list) - len(indexable_articles)
if skipped_count:
    print(f"⚠️ Skipped {skipped_count} article(s) with failed extraction or duplicate URLs.")

print("✅ Articles converted to embeddings and stored in ChromaDB.")

🔄 Converting articles to embeddings and storing them in ChromaDB...
✅ Articles converted to embeddings and stored in ChromaDB.


import os
import shutil  # required by the purge branch; it was never imported before

# 🔹 CONFIG: Set `purge_db = True` to DELETE & RESET the database
# Destructive, so it stays off unless explicitly switched on.
purge_db = False  # Set to True to delete existing DB and start fresh

# ✅ Define ChromaDB path (one level up, the same store the client opens)
CHROMA_DB_PATH = "../chroma_db"
COLLECTION_NAME = "news_articles"

if purge_db:
    # 🔥 Purge Database If `purge_db` is Enabled
    # NOTE(review): a PersistentClient opened before this point presumably
    # needs to be re-created after the purge - confirm against chromadb docs.
    print("⚠️ Purging existing ChromaDB...")
    shutil.rmtree(CHROMA_DB_PATH, ignore_errors=True)  # ✅ Remove the directory properly
    os.makedirs(CHROMA_DB_PATH, exist_ok=True)  # ✅ Recreate empty directory
    print("✅ ChromaDB successfully reset.")
elif not os.path.exists(CHROMA_DB_PATH):
    # 🔍 Nothing to purge and nothing on disk yet: create the folder
    print("⚠️ ChromaDB instance not found. Creating a new one...")
    os.makedirs(CHROMA_DB_PATH)
else:
    print("✅ ChromaDB instance found. Using existing database.")


In [1]:
# NOTE: everything between the triple quotes is a plain string literal, not
# executable code, so none of the purge logic below runs. Executing this cell
# only echoes the string back as the cell's output. Presumably it was parked
# here to keep the database reset from running by accident - confirm before
# re-enabling it.
'''

import shutil

# 🔹 CONFIG: Set `purge_db = True` to DELETE & RESET the database
purge_db = True  # Set to True to delete existing DB and start fresh

# ✅ Define ChromaDB path
CHROMA_DB_PATH = "./chroma_db"
COLLECTION_NAME = "news_articles"

# 🔍 Check if ChromaDB exists
if not os.path.exists(CHROMA_DB_PATH):
    print("⚠️ ChromaDB instance not found. Creating a new one...")
    os.makedirs(CHROMA_DB_PATH)
else:
    print("✅ ChromaDB instance found. Using existing database.")

# 🔥 Purge Database If `purge_db` is Enabled
if purge_db:
    print("⚠️ Purging existing ChromaDB...")
    shutil.rmtree(CHROMA_DB_PATH, ignore_errors=True)  # ✅ Remove the directory properly
    os.makedirs(CHROMA_DB_PATH, exist_ok=True)  # ✅ Recreate empty directory
    print("✅ ChromaDB successfully reset.")
    
'''

'\n\nimport shutil\n\n# 🔹 CONFIG: Set `purge_db = True` to DELETE & RESET the database\npurge_db = True  # Set to True to delete existing DB and start fresh\n\n# ✅ Define ChromaDB path\nCHROMA_DB_PATH = "./chroma_db"\nCOLLECTION_NAME = "news_articles"\n\n# 🔍 Check if ChromaDB exists\nif not os.path.exists(CHROMA_DB_PATH):\n    print("⚠️ ChromaDB instance not found. Creating a new one...")\n    os.makedirs(CHROMA_DB_PATH)\nelse:\n    print("✅ ChromaDB instance found. Using existing database.")\n\n# 🔥 Purge Database If `purge_db` is Enabled\nif purge_db:\n    print("⚠️ Purging existing ChromaDB...")\n    shutil.rmtree(CHROMA_DB_PATH, ignore_errors=True)  # ✅ Remove the directory properly\n    os.makedirs(CHROMA_DB_PATH, exist_ok=True)  # ✅ Recreate empty directory\n    print("✅ ChromaDB successfully reset.")\n    \n'