In [None]:
import json

def _normalize_tfidf_outliers(raw_outliers):
    """
    Coerces the "TF-IDF Outliers" field into a list of keyword strings.

    Args:
        raw_outliers: A list/tuple/set of keywords, a JSON-encoded list, a JSON
            list that was encoded twice, a plain (non-JSON) string, or None.

    Returns:
        list: Keyword strings; empty when there is nothing to show.
    """
    decoded = raw_outliers

    # Peel off up to two layers of JSON string encoding. A second layer appears
    # when an already-serialized list is passed through json.dumps again; without
    # this, the later ", ".join would iterate over the characters of a string.
    for _ in range(2):
        if not isinstance(decoded, str):
            break
        try:
            decoded = json.loads(decoded)
        except ValueError:
            # Not JSON at all: keep the raw text as a single keyword entry.
            break

    if decoded is None:
        return []
    if isinstance(decoded, str):
        return [decoded] if decoded.strip() else []
    if isinstance(decoded, (list, tuple, set)):
        # str() so that non-string items (e.g. numbers) cannot break the join.
        return [str(keyword) for keyword in decoded]
    return [str(decoded)]


def construct_fact_checking_prompt(enriched_entry):
    """
    Constructs a well-formatted prompt for fact-checking using DeepSeek LLM.

    Note: the entry's "fact_checking_summary" field is not rendered into the
    prompt; only the fields listed below are used.

    Args:
        enriched_entry (dict): A dictionary containing article metadata, enriched content, and linguistic analysis.
            Keys read: "title", "url", "published_date", "source", "author", "category",
            "enriched_content", "TF-IDF Outliers", "Grammar Errors", "Sentence Count",
            "Sentiment Polarity", "Sentiment Subjectivity". Missing keys fall back to
            placeholder values. "TF-IDF Outliers" may be a list or a JSON-encoded list.

    Returns:
        str: A formatted string prompt for the LLM.
    """

    # Extract relevant fields from enriched_entry
    title = enriched_entry.get("title", "Unknown Title")
    url = enriched_entry.get("url", "Unknown URL")
    published_date = enriched_entry.get("published_date", "Unknown Date")
    source_name = enriched_entry.get("source", "Unknown Source")
    author = enriched_entry.get("author", "Unknown Author")
    category = enriched_entry.get("category", "Unknown Category")

    # The content may be present but None/empty (e.g. a record stored without a
    # document); fall back to the placeholder instead of failing on the slice.
    content = enriched_entry.get("enriched_content")
    if not content:
        content = "No summary available"
    summary = str(content)[:500]  # Truncate if too long

    tfidf_outliers = _normalize_tfidf_outliers(enriched_entry.get("TF-IDF Outliers"))
    tfidf_outliers_str = ", ".join(tfidf_outliers) if tfidf_outliers else "None"

    grammar_errors = enriched_entry.get("Grammar Errors", 0)
    sentence_count = enriched_entry.get("Sentence Count", 0)
    sentiment_polarity = enriched_entry.get("Sentiment Polarity", "Unknown")
    sentiment_subjectivity = enriched_entry.get("Sentiment Subjectivity", "Unknown")

    # Construct the prompt
    prompt = f"""
    You are a fact-checking AI analyzing the credibility of a news article. Below is the structured information:
    
    📰 **Article Information:**
    - **Title:** {title}
    - **URL:** {url}
    - **Published Date:** {published_date}
    - **Source:** {source_name}
    - **Author:** {author}
    - **Category:** {category}

    🔹 **Article Summary (Extracted via AI):**
    "{summary}"
    
    📊 **Linguistic Analysis:**
    - **TF-IDF Outlier Keywords (Unique/Unusual Words):** {tfidf_outliers_str}
    - **Grammar Issues:** {grammar_errors} errors
    - **Sentence Count:** {sentence_count}
    - **Sentiment Analysis:** 
        - **Polarity (Scale -1 to 1):** {sentiment_polarity}
        - **Subjectivity (Scale 0 to 1, higher = opinionated):** {sentiment_subjectivity}

    🎯 **Your Task:**
    1️⃣ **Assess the credibility of this article** based on the provided content and linguistic analysis.  
    2️⃣ **Use the TF-IDF outlier words** to determine if the article contains **unusual phrasing or misleading language.**  
    3️⃣ **Analyze sentiment:** Does the emotional tone suggest bias, fear-mongering, or objectivity?  
    4️⃣ **Evaluate readability & grammar:** Is the article professionally written, or does it contain errors typical of misinformation?  
    5️⃣ **Compare against reliable sources** if possible, to determine factual accuracy.  
    
    🏆 **Final Response Format:**
    - **Credibility Score:** (Scale 0-100, where 100 = totally credible, 0 = completely false)
    - **Verdict:** (Choose one: "True", "False", or "Misleading")
    - **Explanation:** (2-3 sentences summarizing why you assigned this rating)
    """

    return prompt

# Example Usage
# A hand-written record using the same keys that construct_fact_checking_prompt reads.
example_entry = dict(
    [
        ("title", "Breaking News: AI Solves World Hunger"),
        ("url", "https://news.example.com/ai-hunger"),
        ("published_date", "2025-02-18"),
        ("source", "Example News"),
        ("author", "John Doe"),
        ("category", "Technology"),
        (
            "enriched_content",
            "AI has made significant advancements... (summary content here)...\n\nFact-Checking Data:\n- Verified by multiple sources",
        ),
        # Stored as a JSON string; the prompt builder decodes it back into a list
        ("TF-IDF Outliers", json.dumps(["AI", "breakthrough", "hunger crisis"])),
        ("Grammar Errors", 2),
        ("Sentence Count", 25),
        ("Sentiment Polarity", 0.7),
        ("Sentiment Subjectivity", 0.4),
        ("fact_checking_summary", "Verified by multiple sources."),
    ]
)

# Build the prompt for the sample record and show it
prompt_text = construct_fact_checking_prompt(example_entry)
print(prompt_text)


In [3]:
import ollama

# Rebuild the prompt from the sample entry so this cell has its own input
prompt_text = construct_fact_checking_prompt(example_entry)

# Ask the DeepSeek model (served through Ollama) for a verdict in a single user turn
response = ollama.chat(model="deepseek-r1", messages=[dict(role="user", content=prompt_text)])

# Show only the text of the model's reply
print(response["message"]["content"])


<think>
Alright, let's tackle this fact-checking task. I'm looking at the provided article and its analysis. The article claims that AI has solved world hunger, which is a pretty bold statement. First, I should assess the credibility based on the content.

The summary says AI has made significant advancements, but without specific details or sources backing up these claims, it's hard to take it at face value. News outlets often exaggerate for drama, so this could be a case of sensationalism.

Looking at the linguistic analysis: there are TF-IDF outlier keywords like "AI," "breakthrough," and "hunger crisis." These seem common in tech articles but don't necessarily indicate anything unusual on their own. The grammar has 2 errors—maybe they were missed by an initial check but possible for a human proofreader.

The article has 25 sentences, which seems lengthy, but without citations or context, it's just a string of statements. The sentiment is positive with a polarity of 0.7 and subjecti

In [None]:
import json
import ollama
import chromadb  # Ensure chromadb is installed and properly set up

def _outliers_to_json(stored_outliers):
    """
    Returns the stored TF-IDF outliers as a JSON-encoded list string.

    A value that is already a JSON list string is passed through untouched;
    encoding it again would make the consumer decode it to a plain string.
    """
    if stored_outliers is None:
        return "[]"
    if isinstance(stored_outliers, str):
        try:
            decoded = json.loads(stored_outliers)
        except ValueError:
            decoded = stored_outliers  # plain text, not JSON
        if isinstance(decoded, list):
            return stored_outliers
        if decoded is None or not str(decoded).strip():
            return "[]"
        # A single scalar/plain-text value becomes a one-element list.
        return json.dumps([str(decoded)])
    return json.dumps(stored_outliers)


def fetch_entries_from_chromadb(collection_name="news_articles", max_entries=5, db_path="./chroma_db"):
    """
    Fetches enriched entries from ChromaDB and reshapes them for prompt construction.

    NOTE(review): `collection.get(limit=...)` is called without any ordering
    argument, so these are not necessarily the *latest* entries — confirm the
    ordering guarantees against the ChromaDB documentation.

    Args:
        collection_name (str): The ChromaDB collection to query.
        max_entries (int): The maximum number of entries to fetch.
        db_path (str): Directory of the persistent ChromaDB store.

    Returns:
        list: A list of dicts, one per retrieved record, keyed the way
            construct_fact_checking_prompt expects ("title", "url", ...,
            "TF-IDF Outliers" as a JSON list string, "fact_checking_summary").
    """
    client = chromadb.PersistentClient(path=db_path)
    collection = client.get_collection(name=collection_name)

    # Retrieve up to max_entries records
    results = collection.get(include=["metadatas", "documents"], limit=max_entries)
    documents = results["documents"] or []
    metadatas = results["metadatas"] or []

    # Process entries into required format
    entries = []
    for index, document in enumerate(documents):
        # A record stored without metadata comes back as None; treat it as empty
        # so every field falls back to its placeholder.
        metadata = (metadatas[index] if index < len(metadatas) else None) or {}
        entries.append({
            "title": metadata.get("title", "Unknown Title"),
            "url": metadata.get("url", "Unknown URL"),
            "published_date": metadata.get("published_date", "Unknown Date"),
            "source": metadata.get("source", "Unknown Source"),
            "author": metadata.get("author", "Unknown Author"),
            "category": metadata.get("category", "Unknown Category"),
            # Same placeholder the prompt builder uses when content is absent
            "enriched_content": document if document is not None else "No summary available",
            "TF-IDF Outliers": _outliers_to_json(metadata.get("tfidf_outliers")),
            "Grammar Errors": metadata.get("grammar_errors", 0),
            "Sentence Count": metadata.get("sentence_count", 0),
            "Sentiment Polarity": metadata.get("sentiment_polarity", "Unknown"),
            "Sentiment Subjectivity": metadata.get("sentiment_subjectivity", "Unknown"),
            "fact_checking_summary": metadata.get("fact_checking_summary", "No fact-checking data available."),
        })

    return entries

def analyze_multiple_articles(chroma_entries, model="deepseek-r1", output_file="fact_check_results.json"):
    """
    Runs each article through the LLM fact-checker and writes the collected results to disk.

    Args:
        chroma_entries (list): Enriched article entries (dicts) to analyze, in order.
        model (str): Name of the model passed to `ollama.chat`.
        output_file (str): Destination path of the JSON file holding all results.

    Returns:
        list: One dict per article with its "title", "url", "published_date",
            "source", "author" and the raw model reply under "fact_check_result".
    """
    # Metadata copied from each entry into its result, with the fallback used when absent
    carried_fields = (
        ("title", "Unknown Title"),
        ("url", "Unknown URL"),
        ("published_date", "Unknown Date"),
        ("source", "Unknown Source"),
        ("author", "Unknown Author"),
    )

    def review(article):
        # One single-turn chat per article; the prompt is built from the entry itself
        reply = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": construct_fact_checking_prompt(article)}],
        )
        verdict_text = reply['message']['content']
        record = {field: article.get(field, fallback) for field, fallback in carried_fields}
        record["fact_check_result"] = verdict_text
        return record

    responses = [review(article) for article in chroma_entries]

    # Persist everything in one JSON document, keeping non-ASCII text readable
    with open(output_file, "w", encoding="utf-8") as sink:
        json.dump(responses, sink, indent=4, ensure_ascii=False)

    return responses

# Pull a small batch of stored articles from ChromaDB, then have the LLM review each one
chroma_entries = fetch_entries_from_chromadb(max_entries=5)
results = analyze_multiple_articles(chroma_entries)

# Show the collected verdicts as indented JSON, leaving non-ASCII characters unescaped
results_as_json = json.dumps(results, indent=4, ensure_ascii=False)
print(results_as_json)
