# RAG Final Test

### Run these to install

pip install chromadb wikipedia-api requests spacy textblob scikit-learn nltk         \
python -m spacy download en_core_web_sm

In [20]:
import chromadb
import spacy
import wikipediaapi
import requests
import json
import numpy as np
import random
import pandas as pd
from tqdm import tqdm
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

## Setup & Load ChromaDB + Check Google Fact API

In [2]:
# ‚úÖ Initialize NLP Models
nltk.download("punkt")
nlp = spacy.load("en_core_web_sm")

# ‚úÖ Initialize ChromaDB
CHROMA_DB_PATH = "../chroma_db"
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
collection = client.get_collection("news_articles")

# ‚úÖ Define user agent properly for Wikipedia API
WIKI_USER_AGENT = "FakeBuster/1.0 (contact: maxwellcranston@gmail.com)"

# ‚úÖ Initialize Wikipedia API with User-Agent
wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent=WIKI_USER_AGENT  # Pass user-agent correctly
)

# ‚úÖ Test Wikipedia Query
page = wiki.page("COVID-19")

print(f"üîç Wikipedia summary for 'COVID-19':\n{page.summary[:50]}...")  # Print first 500 chars

print("‚úÖ Wikipedia API initialized successfully!")


print("‚úÖ Wikipedia API initialized successfully!")

# ‚úÖ Google Fact Check API Key (Get from Google Fact Check Tools API)
GOOGLE_FACTCHECK_API_KEY = "AIzaSyAP0d1Ma_yn4TVDXuuGljtJZSdC08P1Y_U"

print("‚úÖ ChromaDB & External APIs initialized!")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\newpc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


üîç Wikipedia summary for 'COVID-19':
Coronavirus disease 2019 (COVID-19) is a contagiou...
‚úÖ Wikipedia API initialized successfully!
‚úÖ Wikipedia API initialized successfully!
‚úÖ ChromaDB & External APIs initialized!


## Named Entity Recognition (NER) for Context Retrieval

In [3]:
# ‚úÖ Function to extract named entities
def extract_entities(text):
    """Extract key named entities (names, locations, organizations) from text."""
    doc = nlp(text)
    entities = {ent.text for ent in doc.ents if ent.label_ in {"PERSON", "ORG", "GPE"}}
    return list(entities)

# ‚úÖ Fetch Articles from ChromaDB
articles = collection.get()
print(f"‚úÖ Retrieved {len(articles['documents'])} articles from ChromaDB.")

# ‚úÖ Extract Named Entities from each article
article_entities = {}
for i, doc in enumerate(articles["documents"]):
    entities = extract_entities(doc)
    article_entities[articles["ids"][i]] = entities

print("‚úÖ Named Entity Recognition (NER) completed!")


‚úÖ Retrieved 171 articles from ChromaDB.
‚úÖ Named Entity Recognition (NER) completed!


# Retrieve and Fetch steps:

- Retrieve: Related Articles
- Fetch Additional Fact-Checking Data (Google Fact Check API and Wikipedia )

Note: We are saving to memory and not committing to the ChromaDB yet, only after Ling analysis is done

In [6]:
# ‚úÖ Toggle for Quick Testing (Set to True for processing only 1 article)
DEBUG_MODE = True  # Change to False for full processing

# ‚úÖ Function to search ChromaDB using named entities
def search_related_articles(entity):
    """Retrieve related articles using entity search."""
    results = collection.query(query_texts=[entity], n_results=3)  # Adjust results as needed
    return results["documents"]

# ‚úÖ Function to fetch Google Fact Check API results with timeout & logging
def get_factcheck_results(query, timeout=5):
    """Retrieve fact-check results from Google's Fact Check API with timeout handling."""
    url = f"https://factchecktools.googleapis.com/v1alpha1/claims:search?query={query}&key={GOOGLE_FACTCHECK_API_KEY}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses
        data = response.json().get("claims", [])
        return [claim["text"] for claim in data] if data else ["No fact-checks found."]
    except requests.exceptions.Timeout:
        return ["‚ö†Ô∏è Fact-check API timed out."]
    except requests.exceptions.RequestException as e:
        return [f"‚ö†Ô∏è Fact-check API error: {str(e)}"]

# ‚úÖ Function to fetch Wikipedia summary with error handling
def get_wikipedia_summary(entity):
    """Retrieve a brief Wikipedia summary for an entity if available."""
    try:
        page = wiki.page(entity)
        return page.summary if page.exists() else "No Wikipedia data found."
    except Exception as e:
        return f"‚ö†Ô∏è Wikipedia fetch error: {str(e)}"



# ‚úÖ Expand Articles with Related Context + Fact-Checking
expanded_articles = {}

# ‚úÖ Limit processing to 1 article when in DEBUG mode
article_items = list(article_entities.items())  # Convert dict to list for indexing
if DEBUG_MODE:
    article_items = article_items[:1]  # Only process the first article

for article_id, entities in tqdm(article_items):  # Now uses `article_items`
    related_content = []
    fact_check_data = []

    for entity in entities:
        related_articles = search_related_articles(entity)
        fact_check_results = get_factcheck_results(entity)
        wikipedia_summary = get_wikipedia_summary(entity)

        related_content.extend(related_articles)  # This may contain lists
        fact_check_data.append(f"Fact-Check: {fact_check_results}\nWikipedia: {wikipedia_summary}")

    # ‚úÖ Fix: Convert all related content items to strings before joining
    expanded_articles[article_id] = (
        articles["documents"][articles["ids"].index(article_id)]
        + "\n\nRelated Content:\n" + "\n".join([str(item) for item in related_content])
        + "\n\nFact-Checking Data:\n" + "\n".join(fact_check_data)
    )

print(f"‚úÖ Processed {len(expanded_articles)} article(s) with related context & fact-checking data!")

'''
# ‚úÖ Expand Articles with Related Context + Fact-Checking
expanded_articles = {}
for article_id, entities in tqdm(article_entities.items()):
    related_content = []
    fact_check_data = []

    for entity in entities:
        related_articles = search_related_articles(entity)
        fact_check_results = get_factcheck_results(entity)
        wikipedia_summary = get_wikipedia_summary(entity)

        related_content.extend(related_articles)  # This may contain lists
        fact_check_data.append(f"Fact-Check: {fact_check_results}\nWikipedia: {wikipedia_summary}")

    # ‚úÖ Fix: Convert all related content items to strings before joining
    expanded_articles[article_id] = (
        articles["documents"][articles["ids"].index(article_id)]
        + "\n\nRelated Content:\n" + "\n".join([str(item) for item in related_content])
        + "\n\nFact-Checking Data:\n" + "\n".join(fact_check_data)
    )

print("‚úÖ Articles enriched with related context & fact-checking data!")
'''



100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:39<00:00, 39.98s/it]

‚úÖ Processed 1 article(s) with related context & fact-checking data!





'\n# ‚úÖ Expand Articles with Related Context + Fact-Checking\nexpanded_articles = {}\nfor article_id, entities in tqdm(article_entities.items()):\n    related_content = []\n    fact_check_data = []\n\n    for entity in entities:\n        related_articles = search_related_articles(entity)\n        fact_check_results = get_factcheck_results(entity)\n        wikipedia_summary = get_wikipedia_summary(entity)\n\n        related_content.extend(related_articles)  # This may contain lists\n        fact_check_data.append(f"Fact-Check: {fact_check_results}\nWikipedia: {wikipedia_summary}")\n\n    # ‚úÖ Fix: Convert all related content items to strings before joining\n    expanded_articles[article_id] = (\n        articles["documents"][articles["ids"].index(article_id)]\n        + "\n\nRelated Content:\n" + "\n".join([str(item) for item in related_content])\n        + "\n\nFact-Checking Data:\n" + "\n".join(fact_check_data)\n    )\n\nprint("‚úÖ Articles enriched with related context & fact-check

## Linguistic Analysis

In [None]:
# ‚úÖ TF-IDF Outlier Analysis
def tfidf_outliers(texts, top_n=5):
    """Finds top N high-TF-IDF words per article."""
    texts = [text if isinstance(text, str) else "" for text in texts]
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()

    outlier_words = []
    for row in tfidf_matrix:
        scores = row.toarray()[0]
        top_indices = scores.argsort()[-top_n:]
        outlier_words.append([feature_names[i] for i in top_indices])

    return outlier_words

# ‚úÖ Sentiment Analysis
def sentiment_analysis(text):
    """Detects sentiment polarity and emotional words."""
    analysis = TextBlob(text)
    return {"polarity": analysis.sentiment.polarity, "subjectivity": analysis.sentiment.subjectivity}

# ‚úÖ Grammar & Readability Analysis
def grammar_analysis(text):
    """Analyzes grammatical complexity and readability."""
    doc = nlp(text)
    errors = sum(1 for token in doc if token.is_oov)  # Out-of-vocabulary words
    sentences = len(list(doc.sents))
    return {"grammar_errors": errors, "sentence_count": sentences}

# ‚úÖ Named Entity Density Analysis
def named_entity_density(text):
    """Measures how many entities exist in a given text."""
    doc = nlp(text)
    return {"entity_count": len(doc.ents), "text_length": len(text.split())}

# ‚úÖ Processed Articles with Linguistic Analysis
final_articles = {}

for article_id, enriched_text in expanded_articles.items():
    # ‚úÖ Run Linguistic Analysis
    tfidf_outliers_list = tfidf_outliers([enriched_text])[0]
    sentiment = sentiment_analysis(enriched_text)
    grammar = grammar_analysis(enriched_text)
    entity_density = named_entity_density(enriched_text)

    # ‚úÖ Merge with previous enrichment (RAG context + Fact-checking + Linguistics)
    final_articles[article_id] = {
        "content": enriched_text,
        "tfidf_outliers": tfidf_outliers_list,
        "sentiment_polarity": sentiment["polarity"],
        "sentiment_subjectivity": sentiment["subjectivity"],
        "grammar_errors": grammar["grammar_errors"],
        "sentence_count": grammar["sentence_count"],
        "entity_count": entity_density["entity_count"],
        "text_length": entity_density["text_length"],
    }

    # ‚úÖ Print the first processed article for review (if in DEBUG mode)
    if DEBUG_MODE:
        print("\nüîç **Test Entry Preview Before DB Commit** üîç")
        print(f"üìå **Article ID:** {article_id}")
        print(f"üìú **Content (Snippet):** {enriched_text[:500]}...")  # Show first 500 chars
        print(f"üìä **Linguistic Analysis:**")
        print(f"   - TF-IDF Outliers: {tfidf_outliers_list}")
        print(f"   - Sentiment Polarity: {sentiment['polarity']}")
        print(f"   - Sentiment Subjectivity: {sentiment['subjectivity']}")
        print(f"   - Grammar Errors: {grammar['grammar_errors']}")
        print(f"   - Sentence Count: {grammar['sentence_count']}")
        print(f"   - Entity Count: {entity_density['entity_count']}")
        print(f"   - Text Length: {entity_density['text_length']}")
        print("\nüõ†Ô∏è **Confirm before committing to ChromaDB!**")
        break  # Only process and print 1 entry in DEBUG mode

print(f"‚úÖ Linguistic analysis completed for {len(final_articles)} articles!")



üîç **Test Entry Preview Before DB Commit** üîç
üìå **Article ID:** https://newsone.com/5939034/presidents-are-judged-by-history-through-the-lens-of-morality/
üìú **Content (Snippet):** A statue of Abraham Lincoln, the 16th president of the United States, sits in the Lincoln Memorial in Washington. Historians consistently have given Lincoln, the Great Emancipator, their highest rating because of his leadership during the Civil War. Jakub Porzycki/NurPhoto via Getty Images

What will be former President Joe Biden‚Äôs legacy? How will Americans in the future consider his four years in office?

Every American president lands in the history books. And historians‚Äô assessments of their...
üìä **Linguistic Analysis:**
   - TF-IDF Outliers: ['2021', 'surveys', 'historians', 'president', 'presidents']
   - Sentiment Polarity: 0.11306499148476427
   - Sentiment Subjectivity: 0.4317448386062364
   - Grammar Errors: 128707
   - Sentence Count: 3856
   - Entity Count: 12364
   - Text Length

## Testing the Entries before putting them into DB

In [None]:
# ‚úÖ CONFIGURE SETTINGS
NUM_SAMPLES = 5  # Change this to inspect X articles
RANDOM_SAMPLES = True  # True = Random X articles, False = First X

print(f"üîç Previewing {NUM_SAMPLES} articles before committing to ChromaDB...")

üîç Previewing 5 articles before committing to ChromaDB...


### Sample & Display Enriched Articles

In [11]:
import json

# ‚úÖ Check if `expanded_articles` contains data
if not expanded_articles:
    print("‚ö†Ô∏è No enriched articles found in memory. Ensure RAG processing has completed.")
else:
    # ‚úÖ Select a subset of articles
    article_keys = list(expanded_articles.keys())

    if RANDOM_SAMPLES and len(article_keys) >= NUM_SAMPLES:
        sample_ids = random.sample(article_keys, NUM_SAMPLES)
    else:
        sample_ids = article_keys[:NUM_SAMPLES]  # Get first X articles if not enough for random

    # ‚úÖ Display selected enriched articles
    for idx, article_id in enumerate(sample_ids, start=1):
        enriched_text = expanded_articles.get(article_id, "[No content found]")  # Safe retrieval
        print(f"\nüîπ **Sample {idx}/{len(sample_ids)} - Article ID:** {article_id}")
        print(f"üìú **Enriched Content (Snippet):**\n{enriched_text[:1000]}...")  # First 1000 chars
        print("=" * 100)

    print(f"\n‚úÖ Displayed {len(sample_ids)} enriched articles.")



üîπ **Sample 1/1 - Article ID:** https://newsone.com/5939034/presidents-are-judged-by-history-through-the-lens-of-morality/
üìú **Enriched Content (Snippet):**
A statue of Abraham Lincoln, the 16th president of the United States, sits in the Lincoln Memorial in Washington. Historians consistently have given Lincoln, the Great Emancipator, their highest rating because of his leadership during the Civil War. Jakub Porzycki/NurPhoto via Getty Images

What will be former President Joe Biden‚Äôs legacy? How will Americans in the future consider his four years in office?

Every American president lands in the history books. And historians‚Äô assessments of their performance have been generally consistent over time. But some presidents‚Äô rankings have changed as the nation ‚Äì and historians themselves ‚Äì reassessed the country‚Äôs values and priorities.

Historians have been ranking presidents in surveys since Arthur Schlesinger Sr.‚Äôs first such study appeared in Life magazine in 1948

### DF - Display whole entry for an article

Create a Dataframe to see the article details in separate sections:
- Fact-Checking & Wikipedia Summary
- Linguistic Analysis

In [23]:
import pandas as pd
import json

# ‚úÖ Ensure there's an article to process
if not final_articles:
    print("‚ö†Ô∏è No processed articles found. Ensure the pipeline has run.")
else:
    # ‚úÖ Get the single article (since DEBUG_MODE is True)
    article_id, enriched_data = list(final_articles.items())[0]

    # ‚úÖ Extract sections
    fact_checking_section = enriched_data["content"].split("\n\nFact-Checking Data:\n")[-1]

    # ‚úÖ Fetch full article details from ChromaDB
    article_data = collection.get([article_id])
    
    if not article_data["documents"]:
        print(f"‚ö†Ô∏è Article ID {article_id} not found in ChromaDB!")
    else:
        full_article = {
            "article_id": article_id,
            "title": article_data["metadatas"][0].get("title", "Unknown Title"),
            "url": article_data["metadatas"][0].get("url", "Unknown URL"),
            "published_date": article_data["metadatas"][0].get("published_date", "Unknown Date"),
            "source": article_data["metadatas"][0].get("source", "Unknown Source"),
            "author": article_data["metadatas"][0].get("author", "Unknown Author"),
            "category": article_data["metadatas"][0].get("category", "Unknown Category"),
            "content": article_data["documents"][0],  # Full article text
        }

        # ‚úÖ Append linguistic analysis
        linguistic_analysis = {
            "TF-IDF Outliers": enriched_data["tfidf_outliers"],
            "Sentiment Polarity": enriched_data["sentiment_polarity"],
            "Sentiment Subjectivity": enriched_data["sentiment_subjectivity"],
            "Grammar Errors": enriched_data["grammar_errors"],
            "Sentence Count": enriched_data["sentence_count"],
            "Entity Count": enriched_data["entity_count"],
            "Text Length": enriched_data["text_length"],
        }

        # ‚úÖ Combine all data into one DataFrame
        full_data = {**full_article, **linguistic_analysis, "fact_checking_summary": fact_checking_section, "status": "ready"}

        # ‚úÖ Convert to DataFrame
        df = pd.DataFrame([full_data])

        # ‚úÖ Display the full enriched article
        print("\nüîç **DEBUG MODE: FULLY ENRICHED ARTICLE READY FOR CHROMA** üîç")
        display(df)

        print("\n‚úÖ **Article is fully processed and marked as 'ready'!**")



üîç **DEBUG MODE: FULLY ENRICHED ARTICLE READY FOR CHROMA** üîç


Unnamed: 0,article_id,title,url,published_date,source,author,category,content,TF-IDF Outliers,Sentiment Polarity,Sentiment Subjectivity,Grammar Errors,Sentence Count,Entity Count,Text Length,fact_checking_summary,status
0,https://newsone.com/5939034/presidents-are-jud...,Presidents Are Often Judged By History Through...,https://newsone.com/5939034/presidents-are-jud...,2025-02-17T14:33:46+00:00,Unknown source,"George R. Goethals, University of Richmond",general,"A statue of Abraham Lincoln, the 16th presiden...","[2021, surveys, historians, president, preside...",0.113065,0.431745,128707,3856,12364,108308,Fact-Check: ['‚ö†Ô∏è Fact-check API error: 403 Cli...,ready



‚úÖ **Article is fully processed and marked as 'ready'!**


## Save Enriched Articles Back to ChromaDB

In [26]:
import json  # ‚úÖ Import JSON to convert lists

# ‚úÖ Store enriched articles after RAG in ChromaDB as a new indexed entry
new_entries = []
for index, (article_id, enriched_data) in enumerate(final_articles.items(), start=1):
    # ‚úÖ Create a unique new ID for the enriched version
    new_entry_id = f"rag_{index}"  # Example: rag_1, rag_2, rag_3...

    # ‚úÖ Fetch full article details from ChromaDB
    article_data = collection.get([article_id])
    
    if not article_data["documents"]:
        print(f"‚ö†Ô∏è Article ID {article_id} not found in ChromaDB. Skipping...")
        continue
    
    full_article = {
        "original_article_id": article_id,  # ‚úÖ Keeps reference to original entry
        "title": article_data["metadatas"][0].get("title", "Unknown Title"),
        "url": article_data["metadatas"][0].get("url", "Unknown URL"),
        "published_date": article_data["metadatas"][0].get("published_date", "Unknown Date"),
        "source": article_data["metadatas"][0].get("source", "Unknown Source"),
        "author": article_data["metadatas"][0].get("author", "Unknown Author"),
        "category": article_data["metadatas"][0].get("category", "Unknown Category"),
    }

    # ‚úÖ Convert TF-IDF Outliers list to a JSON string
    tfidf_outliers_str = json.dumps(enriched_data["tfidf_outliers"])  # ‚úÖ Converts list to a string

    # ‚úÖ Merge with linguistic analysis + status = "ready"
    enriched_entry = {
        **full_article,  
        "enriched_content": enriched_data["content"],  # Full RAG-enriched text
        "TF-IDF Outliers": tfidf_outliers_str,  # ‚úÖ Now stored as a JSON string
        "Sentiment Polarity": enriched_data["sentiment_polarity"],
        "Sentiment Subjectivity": enriched_data["sentiment_subjectivity"],
        "Grammar Errors": enriched_data["grammar_errors"],
        "Sentence Count": enriched_data["sentence_count"],
        "Entity Count": enriched_data["entity_count"],
        "Text Length": enriched_data["text_length"],
        "fact_checking_summary": enriched_data["content"].split("\n\nFact-Checking Data:\n")[-1],
        "status": "ready",  # ‚úÖ Marks the enriched article as complete
    }

    # ‚úÖ Store in ChromaDB as a new indexed entry
    collection.add(
        ids=[new_entry_id],  # Unique indexed ID (rag_1, rag_2, etc.)
        documents=[enriched_data["content"]],
        metadatas=[enriched_entry]
    )

    new_entries.append(new_entry_id)

print(f"‚úÖ Successfully stored {len(new_entries)} enriched articles in ChromaDB!")


‚úÖ Successfully stored 1 enriched articles in ChromaDB!


### Create a PD Dataframe from RAG entries in ChromaDB
An entry gets "status = ready" once it goes through RAG.
* Retrieves all entries from ChromaDB where status = "ready".
* Allows toggling between fetching all available entries or a limited number.
* Stores the results in a pandas.DataFrame for analysis.

In [27]:
import pandas as pd
import json

# ‚úÖ Toggle: Fetch all or limit the number of entries
FETCH_ALL_READY = True  # Set to False to limit the number of entries
LIMIT_ENTRIES = 10  # Only used if FETCH_ALL_READY = False

# ‚úÖ Fetch all entries from ChromaDB
all_entries = collection.get()

# ‚úÖ Extract metadata and documents for only "ready" entries
filtered_entries = []
for i, metadata in enumerate(all_entries["metadatas"]):
    if metadata.get("status") == "ready":
        entry = {
            "article_id": metadata.get("original_article_id", "Unknown"),
            "title": metadata.get("title", "Unknown Title"),
            "url": metadata.get("url", "Unknown URL"),
            "published_date": metadata.get("published_date", "Unknown Date"),
            "source": metadata.get("source", "Unknown Source"),
            "author": metadata.get("author", "Unknown Author"),
            "category": metadata.get("category", "Unknown Category"),
            "enriched_content": all_entries["documents"][i],  # Full enriched text
            "TF-IDF Outliers": json.loads(metadata.get("TF-IDF Outliers", "[]")),  # Convert JSON string back to list
            "Sentiment Polarity": metadata.get("Sentiment Polarity"),
            "Sentiment Subjectivity": metadata.get("Sentiment Subjectivity"),
            "Grammar Errors": metadata.get("Grammar Errors"),
            "Sentence Count": metadata.get("Sentence Count"),
            "Entity Count": metadata.get("Entity Count"),
            "Text Length": metadata.get("Text Length"),
            "fact_checking_summary": metadata.get("fact_checking_summary", ""),
            "status": metadata.get("status"),
        }
        filtered_entries.append(entry)

# ‚úÖ Apply limit if not fetching all entries
if not FETCH_ALL_READY:
    filtered_entries = filtered_entries[:LIMIT_ENTRIES]

# ‚úÖ Convert to Pandas DataFrame
df_ready = pd.DataFrame(filtered_entries)

# ‚úÖ Display DataFrame in Jupyter Notebook
display(df_ready)

print(f"\n‚úÖ Retrieved {len(df_ready)} articles with status = 'ready' from ChromaDB.")


Unnamed: 0,article_id,title,url,published_date,source,author,category,enriched_content,TF-IDF Outliers,Sentiment Polarity,Sentiment Subjectivity,Grammar Errors,Sentence Count,Entity Count,Text Length,fact_checking_summary,status
0,https://newsone.com/5939034/presidents-are-jud...,Presidents Are Often Judged By History Through...,https://newsone.com/5939034/presidents-are-jud...,2025-02-17T14:33:46+00:00,Unknown source,"George R. Goethals, University of Richmond",general,"A statue of Abraham Lincoln, the 16th presiden...","[2021, surveys, historians, president, preside...",0.113065,0.431745,128707,3856,12364,108308,Fact-Check: ['‚ö†Ô∏è Fact-check API error: 403 Cli...,ready



‚úÖ Retrieved 1 articles with status = 'ready' from ChromaDB.
