#### FACT-CHECKING PROMPT

In [None]:
# import json

# def construct_fact_checking_prompt(enriched_entry):
#     """
#     Constructs a well-formatted prompt for fact-checking using DeepSeek LLM.
    
#     Args:
#         enriched_entry (dict): A dictionary containing article metadata, enriched content, and linguistic analysis.
    
#     Returns:
#         str: A formatted string prompt for the LLM.
#     """
    
#     # Extract relevant fields from enriched_entry
#     title = enriched_entry.get("title", "Unknown Title")
#     url = enriched_entry.get("url", "Unknown URL")
#     published_date = enriched_entry.get("published_date", "Unknown Date")
#     source_name = enriched_entry.get("source", "Unknown Source")
#     author = enriched_entry.get("author", "Unknown Author")
#     category = enriched_entry.get("category", "Unknown Category")
#     summary = enriched_entry.get("enriched_content", "No summary available")[:500]  # Truncate if too long
    
#     tfidf_outliers = json.loads(enriched_entry.get("TF-IDF Outliers", "[]"))  # Convert back to list
#     tfidf_outliers_str = ", ".join(tfidf_outliers) if tfidf_outliers else "None"
    
#     grammar_errors = enriched_entry.get("Grammar Errors", 0)
#     sentence_count = enriched_entry.get("Sentence Count", 0)
#     sentiment_polarity = enriched_entry.get("Sentiment Polarity", "Unknown")
#     sentiment_subjectivity = enriched_entry.get("Sentiment Subjectivity", "Unknown")
#     fact_checking_summary = enriched_entry.get("fact_checking_summary", "No fact-checking data available.")
    
#     # Construct the prompt
#     prompt = f"""
#     You are a fact-checking AI analyzing the credibility of a news article. Below is the structured information:
    
#     📰 **Article Information:**
#     - **Title:** {title}
#     - **URL:** {url}
#     - **Published Date:** {published_date}
#     - **Source:** {source_name}
#     - **Author:** {author}
#     - **Category:** {category}

#     🔹 **Article Summary (Extracted via AI):**
#     "{summary}"
    
#     📊 **Linguistic Analysis:**
#     - **TF-IDF Outlier Keywords (Unique/Unusual Words):** {tfidf_outliers_str}
#     - **Grammar Issues:** {grammar_errors} errors
#     - **Sentence Count:** {sentence_count}
#     - **Sentiment Analysis:** 
#         - **Polarity (Scale -1 to 1):** {sentiment_polarity}
#         - **Subjectivity (Scale 0 to 1, higher = opinionated):** {sentiment_subjectivity}

#     🎯 **Your Task:**
#     1️⃣ **Assess the credibility of this article** based on the provided content and linguistic analysis.  
#     2️⃣ **Use the TF-IDF outlier words** to determine if the article contains **unusual phrasing or misleading language.**  
#     3️⃣ **Analyze sentiment:** Does the emotional tone suggest bias, fear-mongering, or objectivity?  
#     4️⃣ **Evaluate readability & grammar:** Is the article professionally written, or does it contain errors typical of misinformation?  
#     5️⃣ **Compare against reliable sources** if possible, to determine factual accuracy.  
    
#     🏆 **Final Response Format:**
#     - **Credibility Score:** (Scale 0-100, where 100 = totally credible, 0 = completely false)
#     - **Verdict:** (Choose one: "True", "False", or "Misleading")
#     - **Explanation:** (2-3 sentences summarizing why you assigned this rating)
#     """
    
#     return prompt

# # Example Usage
# example_entry = {
#     "title": "Breaking News: AI Solves World Hunger",
#     "url": "https://news.example.com/ai-hunger",
#     "published_date": "2025-02-18",
#     "source": "Example News",
#     "author": "John Doe",
#     "category": "Technology",
#     "enriched_content": "AI has made significant advancements... (summary content here)...\n\nFact-Checking Data:\n- Verified by multiple sources",
#     "TF-IDF Outliers": json.dumps(["AI", "breakthrough", "hunger crisis"]),
#     "Grammar Errors": 2,
#     "Sentence Count": 25,
#     "Sentiment Polarity": 0.7,
#     "Sentiment Subjectivity": 0.4,
#     "fact_checking_summary": "Verified by multiple sources."
# }

# # Generate the prompt
# prompt_text = construct_fact_checking_prompt(example_entry)
# print(prompt_text)


In [9]:
import json
import logging

# Configure logging (good practice for debugging and monitoring)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def _format_tfidf_outliers(raw_outliers):
    """Render the "TF-IDF Outliers" value as a comma-separated keyword string.

    Args:
        raw_outliers: Either a JSON-encoded value (str/bytes) or an
                      already-decoded collection of keywords. May be None.

    Returns:
        str: The keywords joined with ", ", or the literal "None" when the
             value is missing, empty, or not valid JSON.
    """
    if raw_outliers is None:
        logging.warning("No TF-IDF Outliers available in enriched_entry.")
        return "None"

    decoded = raw_outliers
    if isinstance(raw_outliers, (str, bytes, bytearray)):
        try:
            decoded = json.loads(raw_outliers)
        except ValueError:  # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors
            logging.error("Error decoding TF-IDF Outliers JSON.")
            return "None"  # Handle JSON decoding errors gracefully

    if not decoded:
        return "None"

    # A decoded scalar (e.g. a JSON string) is a single keyword; iterating a
    # string here would join its individual characters instead.
    if isinstance(decoded, (str, int, float)):
        keywords = [decoded]
    else:
        keywords = decoded

    # str() keeps non-string items (e.g. numbers) from breaking the join.
    return ", ".join(str(keyword) for keyword in keywords)


def construct_fact_checking_prompt(enriched_entry):
    """Constructs a well-formatted prompt for fact-checking using an LLM.

    Args:
        enriched_entry (dict): A dictionary containing article metadata,
                               enriched content, and linguistic analysis.
                               "TF-IDF Outliers" may be a JSON-encoded string
                               or an already-decoded list of keywords.

    Returns:
        str: A formatted string prompt for the LLM, or None if an error occurs.
    """
    try:
        title = enriched_entry.get("title", "Unknown Title")
        url = enriched_entry.get("url", "Unknown URL")
        published_date = enriched_entry.get("published_date", "Unknown Date")
        source_name = enriched_entry.get("source", "Unknown Source")
        author = enriched_entry.get("author", "Unknown Author")
        category = enriched_entry.get("category", "Unknown Category")

        summary = enriched_entry.get("enriched_content")
        if summary is None:
            logging.warning("No summary available in enriched_entry.")
            summary = "No summary available"
        summary = summary[:500]  # Truncate if too long

        tfidf_outliers_str = _format_tfidf_outliers(enriched_entry.get("TF-IDF Outliers"))

        grammar_errors = enriched_entry.get("Grammar Errors", 0)
        sentence_count = enriched_entry.get("Sentence Count", 0)
        sentiment_polarity = enriched_entry.get("Sentiment Polarity", "Unknown")
        sentiment_subjectivity = enriched_entry.get("Sentiment Subjectivity", "Unknown")

        prompt = f"""
        You are a fact-checking AI analyzing the credibility of a news article. Below is the structured information:

        📰 **Article Information:**
        - **Title:** {title}
        - **URL:** {url}
        - **Published Date:** {published_date}
        - **Source:** {source_name}
        - **Author:** {author}
        - **Category:** {category}

        🔹 **Article Summary (Extracted via AI):**
        "{summary}"

        📊 **Linguistic Analysis:**
        - **TF-IDF Outlier Keywords (Unique/Unusual Words):** {tfidf_outliers_str}
        - **Grammar Issues:** {grammar_errors} errors
        - **Sentence Count:** {sentence_count}
        - **Sentiment Analysis:** 
            - **Polarity (Scale -1 to 1):** {sentiment_polarity}
            - **Subjectivity (Scale 0 to 1, higher = opinionated):** {sentiment_subjectivity}

        🎯 **Your Task:**
        1️⃣ **Assess the credibility of this article** based on the provided content and linguistic analysis. 
        2️⃣ **Use the TF-IDF outlier words** to determine if the article contains **unusual phrasing or misleading language.** 
        3️⃣ **Analyze sentiment:** Does the emotional tone suggest bias, fear-mongering, or objectivity? 
        4️⃣ **Evaluate readability & grammar:** Is the article professionally written, or does it contain errors typical of misinformation? 
        5️⃣ **Compare against reliable sources** if possible, to determine factual accuracy. 

        🏆 **Final Response Format:**
        - **Credibility Score:** (Scale 0-100, where 100 = totally credible, 0 = completely false)
        - **Verdict:** (Choose one: "True", "False", or "Misleading")
        - **Explanation:** (2-3 sentences summarizing why you assigned this rating)
        """

        return prompt

    except Exception as e:
        # Callers check for None, so any unexpected input shape is reported
        # through the log instead of propagating.
        logging.error("Error constructing prompt: %s", e)
        return None  # Indicate failure by returning None



# Example usage: a hand-built entry carrying the keys that
# construct_fact_checking_prompt reads.
example_entry = {
    "title": "Breaking News: AI Solves World Hunger",
    "url": "https://news.example.com/ai-hunger",
    "published_date": "2025-02-18",
    "source": "Example News",
    "author": "John Doe",
    "category": "Technology",
    "enriched_content": "AI has made significant advancements... (summary content here)...\n\nFact-Checking Data:\n- Verified by multiple sources",
    # The outliers are passed as a JSON string, which the prompt builder decodes.
    "TF-IDF Outliers": json.dumps(["AI", "breakthrough", "hunger crisis"]),
    "Grammar Errors": 2,
    "Sentence Count": 25,
    "Sentiment Polarity": 0.7,
    "Sentiment Subjectivity": 0.4,
    "fact_checking_summary": "Verified by multiple sources."
}

prompt_text = construct_fact_checking_prompt(example_entry)

# A falsy result means the builder failed, so show a short notice instead.
print(prompt_text if prompt_text else "Failed to generate prompt.")


        You are a fact-checking AI analyzing the credibility of a news article. Below is the structured information:

        📰 **Article Information:**
        - **Title:** Breaking News: AI Solves World Hunger
        - **URL:** https://news.example.com/ai-hunger
        - **Published Date:** 2025-02-18
        - **Source:** Example News
        - **Author:** John Doe
        - **Category:** Technology

        🔹 **Article Summary (Extracted via AI):**
        "AI has made significant advancements... (summary content here)...

Fact-Checking Data:
- Verified by multiple sources"

        📊 **Linguistic Analysis:**
        - **TF-IDF Outlier Keywords (Unique/Unusual Words):** AI, breakthrough, hunger crisis
        - **Grammar Issues:** 2 errors
        - **Sentence Count:** 25
        - **Sentiment Analysis:** 
            - **Polarity (Scale -1 to 1):** 0.7
            - **Subjectivity (Scale 0 to 1, higher = opinionated):** 0.4

        🎯 **Your Task:**
        1️⃣ **Assess the cre



In [2]:
import ollama

# Generate the fact-checking prompt
prompt_text = construct_fact_checking_prompt(example_entry)

if prompt_text:
    # Send the prompt to DeepSeek LLM using Ollama
    response = ollama.chat(
        model="deepseek-r1",
        messages=[{"role": "user", "content": prompt_text}]
    )

    # Print the response
    print(response['message']['content'])
else:
    # construct_fact_checking_prompt returns None on failure; do not forward
    # that to the model as message content.
    print("Failed to generate prompt.")


<think>
Alright, I'm looking at this article about AI solving world hunger. The title is pretty catchy and promising, which makes me cautious because breakthroughs in such a critical area aren't common. The URL points to an example site from 2025, so it's up-to-date but not tied to any real source. The author is John Doe, who isn't well-known, so that’s another red flag.

The summary says AI has made significant advancements, which makes sense given the technology trend of recent years. However, the article is from 2025, and while AI has advanced, solving world hunger on its own seems a bit far-fetched. It's more likely an announcement or a hypothetical scenario rather than a proven solution.

The linguistic analysis shows some grammar errors, which might indicate it wasn't proofread. The TF-IDF outlier words like "AI," "breakthrough," and "hunger crisis" suggest the article is trying to emphasize these key points but could be overusing them to create drama or sensationalism. 

The sen

In [6]:
import json
import ollama
import chromadb
import os
from datetime import datetime

def fetch_entries_from_chromadb(collection_name="news_articles", max_entries=5, db_path="./chroma_db"):
    """Load stored articles from a persistent ChromaDB collection.

    Args:
        collection_name (str): Name of the collection to read.
        max_entries (int): Passed to the collection query as its ``limit``.
        db_path (str): Directory of the persistent ChromaDB store.

    Returns:
        list[dict]: One dict per stored document, using the keys that
                    construct_fact_checking_prompt reads. Missing metadata
                    fields fall back to the placeholder defaults below.
    """
    client = chromadb.PersistentClient(path=db_path)
    collection = client.get_collection(name=collection_name)

    results = collection.get(include=["metadatas", "documents"], limit=max_entries)

    entries = []
    for document, metadata in zip(results["documents"], results["metadatas"]):
        # NOTE(review): a record stored without metadata may come back as None
        # rather than {} — confirm against the installed Chroma version.
        metadata = metadata or {}
        entries.append({
            "title": metadata.get("title", "Unknown Title"),
            "url": metadata.get("url", "Unknown URL"),
            "published_date": metadata.get("published_date", "Unknown Date"),
            "source": metadata.get("source", "Unknown Source"),
            "author": metadata.get("author", "Unknown Author"),
            "category": metadata.get("category", "Unknown Category"),
            "enriched_content": document,
            # Re-encoded as JSON because the prompt builder decodes this field.
            "TF-IDF Outliers": json.dumps(metadata.get("tfidf_outliers", [])),
            "Grammar Errors": metadata.get("grammar_errors", 0),
            "Sentence Count": metadata.get("sentence_count", 0),
            "Sentiment Polarity": metadata.get("sentiment_polarity", "Unknown"),
            "Sentiment Subjectivity": metadata.get("sentiment_subjectivity", "Unknown"),
            "fact_checking_summary": metadata.get("fact_checking_summary", "No fact-checking data available."),
        })

    return entries

def analyze_multiple_articles(chroma_entries, model="deepseek-r1", output_dir="PROMT_RESULTS"):
    """Fact-check each entry with an Ollama model and save the replies as JSON.

    Args:
        chroma_entries (list[dict]): Entries in the shape accepted by
                                     construct_fact_checking_prompt.
        model (str): Name of the Ollama model to query.
        output_dir (str): Directory that receives the result file; it is
                          created if it does not exist.

    Returns:
        list[dict]: One record per analysed entry holding the article metadata
                    and the model reply under "fact_check_result". Entries for
                    which no prompt could be built are skipped.
    """
    responses = []

    for entry in chroma_entries:
        prompt_text = construct_fact_checking_prompt(entry)
        if not prompt_text:
            # The prompt builder signals failure with None; sending that to the
            # model would only produce an error or a meaningless reply.
            logging.warning("Skipping entry without a prompt: %s", entry.get("title", "Unknown Title"))
            continue

        response = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": prompt_text}]
        )
        responses.append({
            "title": entry.get("title", "Unknown Title"),
            "url": entry.get("url", "Unknown URL"),
            "published_date": entry.get("published_date", "Unknown Date"),
            "source": entry.get("source", "Unknown Source"),
            "author": entry.get("author", "Unknown Author"),
            "fact_check_result": response['message']['content']
        })

    # Create the directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # File names look like Prompt_Return_<YYYY-MM-DD>_<n>.json
    date_str = datetime.now().strftime("%Y-%m-%d")
    prefix = f"Prompt_Return_{date_str}"
    file_number = sum(1 for name in os.listdir(output_dir) if name.startswith(prefix)) + 1
    filepath = os.path.join(output_dir, f"{prefix}_{file_number}.json")

    # Counting existing files is only a first guess: if an earlier file was
    # deleted, that number may already be taken, so advance until it is free.
    while os.path.exists(filepath):
        file_number += 1
        filepath = os.path.join(output_dir, f"{prefix}_{file_number}.json")

    # Save responses to a JSON file
    with open(filepath, "w", encoding="utf-8") as file:
        json.dump(responses, file, indent=4, ensure_ascii=False)

    return responses


# Pull a small batch of stored articles from ChromaDB and fact-check each one.
chroma_entries = fetch_entries_from_chromadb(max_entries=5)
results = analyze_multiple_articles(chroma_entries)

# Echo the collected results; analyze_multiple_articles has already saved them to a file.
print(json.dumps(results, ensure_ascii=False, indent=4))

[
    {
        "title": "Presidents Are Often Judged By History Through The Lens Of Morality",
        "url": "https://newsone.com/5939034/presidents-are-judged-by-history-through-the-lens-of-morality/",
        "published_date": "2025-02-17T14:33:46+00:00",
        "source": "Unknown source",
        "author": "George R. Goethals, University of Richmond",
        "fact_check_result": "<think>\nAlright, let's try to figure out the credibility of this article. The title is about presidents being judged by history through morality, which sounds pretty heavy-handed. I'm not sure if that's a common way historians look at things.\n\nThe URL points to newsone.com with an ID, but I don't know much about them. The author is George R. Goethals from the University of Richmond, so maybe he has some academic background. But the category is just general, which doesn't give me much info.\n\nLooking at the summary, it's cut off mid-sentence and only mentions Lincoln and Biden. It seems like the arti