In [None]:
!pip install -q transformers accelerate bitsandbytes sentencepiece einops
!pip install -q python-telegram-bot
!pip install -q google-api-python-client
!pip install -q requests beautifulsoup4
!pip install -q huggingface_hub
!pip install -q trafilatura

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m112.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import re
import json
import time
import torch
import requests
import asyncio
import nest_asyncio
from bs4 import BeautifulSoup
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from googleapiclient.discovery import build
from telegram import Update
from telegram.ext import Application, CommandHandler, MessageHandler, filters, ContextTypes
from huggingface_hub import login

# Enable nested asyncio for the Colab environment.
# nest_asyncio.apply() patches asyncio so that an event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel already runs its own
# event loop when the Telegram Application is started - confirm against the run cell.
nest_asyncio.apply()

# Credentials are read from environment variables so that secrets never have to be
# typed into (and accidentally saved or shared with) the notebook itself.
# Each value falls back to an empty string when the variable is not set, which is the
# same placeholder value the notebook used before.
TELEGRAM_TOKEN = os.environ.get("TELEGRAM_TOKEN", "")   # Telegram bot token
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")   # developer key passed to the Custom Search client
GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID", "")     # Custom Search Engine id (the `cx` parameter)
HF_TOKEN = os.environ.get("HF_TOKEN", "")               # Hugging Face token used for login and model download

# Model configuration
# MODEL_ID is the Hugging Face repository loaded by the model/tokenizer cell.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
# Default generation budget used by generate_text().
MAX_NEW_TOKENS = 512
# NOTE(review): not referenced by any code visible in this notebook section;
# prompts are currently not truncated to this length - confirm intended use.
MAX_INPUT_TOKEN_LENGTH = 4096

# Search settings
# Default number of results requested by google_search().
NUM_SEARCH_RESULTS = 5
# Character cap applied to the text scraped from each page.
MAX_SCRAPE_CONTENT_LENGTH = 1500

# Telegram message character limit
# Used as the default chunk size of split_message() and for reply length management.
TELEGRAM_MESSAGE_LIMIT = 4096

In [None]:
# Cell 3: Model Loading and Tokenizer Setup
#
# Logs in to Hugging Face, loads MODEL_ID as a 4-bit quantized causal LM together with
# its tokenizer, makes sure the tokenizer has a usable PAD token, and prints the
# resulting configuration. Defines the module-level names `model` and `tokenizer`
# that generate_text() relies on.


# Authenticate with Hugging Face
print("Authenticating with Hugging Face...")
login(token=HF_TOKEN)
print("Authentication successful!")

# Configure 4-bit quantization (NF4 quantization type, double quantization enabled,
# float16 as the compute dtype).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

print(f"Loading model: {MODEL_ID}...")
# device_map="auto" leaves device placement to the library; the same HF token is
# passed explicitly for the download.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto", 
    quantization_config=quantization_config,
    token=HF_TOKEN,
)
print("Model loaded.")

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
print("Tokenizer loaded.")

# --- Explicitly set pad_token_id ---
# Case 1: no PAD id at all -> reuse the EOS id as PAD and pad on the left.
if tokenizer.pad_token_id is None:
    print("Tokenizer does not have a pad_token_id defined. Setting it to eos_token_id.")
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "left" 
# Case 2: a PAD id exists but the PAD token string is missing.
elif tokenizer.pad_token is None:
     # If ID exists but token string doesn't, try setting the string
     pad_token_id = tokenizer.pad_token_id
     print(f"Tokenizer has pad_token_id ({pad_token_id}) but no pad_token string. Setting pad_token.")
     # Find the token string corresponding to the pad_token_id
     tokenizer.pad_token = tokenizer.decode(pad_token_id)
     # If decode fails or gives empty, fallback to EOS
     if not tokenizer.pad_token:
          print("Could not decode pad_token_id. Falling back to using EOS token as PAD token.")
          tokenizer.pad_token = tokenizer.eos_token
          tokenizer.pad_token_id = tokenizer.eos_token_id # Ensure ID matches token
          tokenizer.padding_side = "left" # Reset padding side if falling back
# NOTE(review): padding_side is only forced to "left" in the fallback paths above;
# when the tokenizer already ships a PAD token its own padding side is kept.

# Report the final tokenizer settings so they can be checked in the cell output.
print(f"Final Tokenizer Configuration:")
print(f" - PAD Token: '{tokenizer.pad_token}', ID: {tokenizer.pad_token_id}")
print(f" - EOS Token: '{tokenizer.eos_token}', ID: {tokenizer.eos_token_id}")
print(f" - Padding Side: {tokenizer.padding_side}")
print(f" - Vocab Size: {tokenizer.vocab_size}")
print(f"Model Config Vocab Size: {model.config.vocab_size}")

# --- Verification ---
# Only warns on a mismatch; execution continues either way.
if tokenizer.vocab_size != model.config.vocab_size:
     print("[WARNING] Tokenizer vocab size mismatch with model config vocab size!")

print("Model and Tokenizer setup complete!")

In [None]:
def split_message(text, chunk_size=TELEGRAM_MESSAGE_LIMIT):
    """
    Cut `text` into consecutive pieces of at most `chunk_size` characters.

    The pieces are returned in order; joining them reproduces `text`.
    An empty `text` yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

In [None]:
# CELL 5 Code (Unchanged from previous step)
def generate_text(prompt, max_new_tokens=MAX_NEW_TOKENS):
    """
    Generate text for `prompt` with the loaded LLM (module-level `model`/`tokenizer`).

    The prompt is wrapped as a single user turn and rendered with the tokenizer's
    chat template before being tokenized and passed to `model.generate` with
    sampling enabled (temperature 0.6, top_p 0.9).

    Args:
        prompt: Plain-text instruction for the model.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded text of the WHOLE output sequence with special tokens removed.
        Because the output sequence starts with the input tokens, the returned
        string contains the prompt echo followed by the completion; callers are
        expected to strip the echo themselves.
    """
    messages = [{"role": "user", "content": prompt}]
    # add_generation_prompt=True is important for instruct/chat models
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # The rendered chat template already contains the special tokens it needs
    # (e.g. BOS). Tokenizing it again with the default add_special_tokens=True would
    # prepend a second BOS token, so special-token insertion is disabled here.
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        padding=True,
        add_special_tokens=False,
    ).to(model.device)
    input_token_count = inputs.input_ids.shape[1]
    print(f"[LOG] Input tokens to model: {input_token_count}")

    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.6,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # The output sequence is input + newly generated tokens, hence the subtraction.
    output_token_count = output_tokens.shape[1]
    generated_token_count = output_token_count - input_token_count
    print(f"[LOG] Output tokens from model: {output_token_count} (Generated: {generated_token_count})")

    # Decode ALL tokens of the first sequence (prompt echo + completion).
    raw_response = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    print(f"[LOG] Raw model output length (incl. echo): {len(raw_response)} chars")
    return raw_response # Return the potentially messy raw output for cleaning

In [None]:
# CELL 6 Code (Modified - Added duplicate filter)
def google_search(query, num_results=NUM_SEARCH_RESULTS):
    """
    Search with the Google Custom Search API (duplicate filtering on), scrape each
    hit, and build a context string for the LLM.

    Args:
        query: Search query string.
        num_results: Number of results requested from the API.

    Returns:
        A tuple `(results_summary, source_urls)`:
        - results_summary: one "Source N:" block per result (URL, title, snippet and
          scraped content), stripped of surrounding whitespace; "" when nothing
          was found or an error occurred.
        - source_urls: the result links, de-duplicated but kept in result order, so
          that `source_urls[:k]` lines up with "Source 1..k" whenever the links are
          distinct and non-empty. Empty list on no results / error.

    Any exception (API or scraping) is logged and turned into `("", [])`.
    """
    print(f"[LOG] Performing Google search for: '{query}'")
    source_urls = []
    summary_blocks = []
    try:
        service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)
        # filter='1' turns on the API's duplicate content filter.
        result = service.cse().list(q=query, cx=GOOGLE_CSE_ID, num=num_results, filter='1').execute()

        if "items" not in result:
            print("[LOG] Google API returned no results.")
            return "", []

        items_to_process = result.get("items", [])
        print(f"[LOG] Google API returned {len(items_to_process)} results (with filtering).")
        for idx, item in enumerate(items_to_process):
            title = item.get("title", "No Title")
            snippet = item.get("snippet", "No Snippet")
            link = item.get("link", "")
            print(f"[LOG] Result {idx+1}: Title: {title}, Link: {link}")

            scraped_text = ""
            if link:
                # Every non-empty link counts as a consulted source, scraped or not.
                source_urls.append(link)
                if link.lower().endswith(".pdf"):
                    # Skip PDF links for scraping as Trafilatura/BS4 usually fail
                    print("[LOG] Skipping scraping for PDF link.")
                    scraped_text = "[PDF Document - Content not scraped]"
                else:
                    scraped_text = scrape_webpage(link, max_length=MAX_SCRAPE_CONTENT_LENGTH) # Calls Cell 7

            summary_blocks.append(
                f"Source {idx+1}:\n"
                f"URL: {link}\n"
                f"Title: {title}\n"
                f"Snippet: {snippet}\n"
                f"Scraped Content (Max {MAX_SCRAPE_CONTENT_LENGTH} chars):\n{scraped_text}\n\n"
            )

        # dict.fromkeys() removes duplicates while preserving first-seen order;
        # a set() would return the URLs in arbitrary order and break callers that
        # slice the list to match the first N sources.
        unique_urls = list(dict.fromkeys(source_urls))
        return "".join(summary_blocks).strip(), unique_urls
    except Exception as e:
        print(f"[ERROR] Google Search Error: {e}")
        return "", []

In [None]:
# CELL 7 Code (Modified - Fixed Trafilatura format error)
import trafilatura
from bs4 import BeautifulSoup
import requests

def scrape_webpage(url, max_length=MAX_SCRAPE_CONTENT_LENGTH):
    """
    Download `url` and return its main text content.

    Extraction is attempted with trafilatura first; if that yields nothing, a
    BeautifulSoup fallback strips boilerplate elements and takes the text of the
    <article>, <main> or <body> element (first one found, in that order).

    Args:
        url: Page to fetch.
        max_length: Character budget for the returned text.

    Returns:
        The extracted text. When it is longer than `max_length` it is cut to
        `max_length` characters (or back to the last period, if one occurs within
        the final 100 characters of the cut) and "..." is appended, so the result
        can exceed `max_length` by the length of that suffix.
        Returns "" when the request fails, extraction raises, or no text is found.
    """
    print(f"[LOG] Scraping webpage: {url}")
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    text = ""

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        html_content = response.content

        # Trafilatura extraction (default output format).
        extracted_text = trafilatura.extract(
            html_content,
            include_images=False,
            include_comments=False,
            include_tables=True,
            url=url,
        )

        if extracted_text:
            text = extracted_text
            print(f"[LOG] Successfully extracted text with trafilatura. Length: {len(text)} chars.")
        else:
            print("[LOG] Trafilatura returned no main content, falling back to BeautifulSoup filtering.")
            soup = BeautifulSoup(html_content, "html.parser")
            # Drop elements that normally carry navigation/boilerplate rather than content.
            for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'img', 'figure', 'figcaption', 'iframe']):
                element.decompose()
            main_content = soup.find('article') or soup.find('main') or soup.find('body')
            if main_content:
                # get_text() visits every text node exactly once. Collecting the text
                # of nested tags (div > p > span ...) separately would repeat the same
                # sentences once per nesting level and waste the max_length budget.
                raw_text = main_content.get_text(separator=' ', strip=True)
                text = ' '.join(raw_text.split())  # collapse all whitespace runs
            if text:
                print(f"[LOG] Extracted text with BeautifulSoup fallback. Length: {len(text)} chars.")
            else:
                print("[LOG] BeautifulSoup fallback also found no significant text.")

    except requests.exceptions.RequestException as e:
        print(f"[ERROR] Failed to retrieve page {url}: {e}")
        return ""
    except Exception as e:
        print(f"[ERROR] Exception during scraping/extraction for {url}: {e}")
        return ""

    if not text:
        return ""
    if len(text) <= max_length:
        return text

    # Truncation logic: prefer ending on a sentence boundary close to the limit.
    print(f"[LOG] Truncating extracted text from {len(text)} to {max_length} characters.")
    truncated_text = text[:max_length]
    last_period = truncated_text.rfind('.')
    if last_period > max_length - 100:
        return truncated_text[:last_period+1] + "..."
    return truncated_text + "..."

In [None]:
def scrape_content(url):
    """
    Fetch `url` and return the text of its paragraphs and h1-h3 headings.

    The pieces are joined with single spaces and the result is cut to
    MAX_SCRAPE_CONTENT_LENGTH characters. A non-200 response or any exception
    (which is printed) yields an empty string.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        page = requests.get(url, headers=request_headers, timeout=15)
        if page.status_code != 200:
            return ""

        soup = BeautifulSoup(page.text, 'html.parser')

        # Strip boilerplate elements before collecting text.
        for unwanted in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
            unwanted.decompose()

        # Gather paragraph and heading text in document order.
        fragments = []
        for node in soup.find_all(['p', 'h1', 'h2', 'h3']):
            fragments.append(node.get_text().strip())
        joined = ' '.join(fragments)
        return joined[:MAX_SCRAPE_CONTENT_LENGTH]
    except Exception as e:
        print(f"Scraping Error: {e}")
        return ""

In [None]:
# CELL 9 Code (prompt and parsing)
import traceback
import re

def generate_search_query(claim_text):
    """
    Ask the LLM for a concise Google search query that can verify `claim_text`.

    The prompt ends with the literal line "Query:" and generate_text() returns the
    prompt echo followed by the completion. The echo is therefore removed first
    (everything up to and including the prompt's final "Query:" line); only then is
    the answer inspected. This prevents the prompt's own "Query:" from being
    taken as the answer marker, which would leave a repeated "Query:" prefix
    inside the search query.

    Args:
        claim_text: The user's claim or headline.

    Returns:
        The cleaned single-line query, or `claim_text` itself as a fallback when
        no usable query could be extracted, the query contains a disallowed
        operator ("site:" or a leading "-"), or generation raised an exception.
    """
    print(f"[LOG] Generating search query for claim: '{claim_text[:100]}...'")

    prompt = f"""Analyze the user's claim below. Generate the single BEST, concise Google search query (typically 3-7 keywords) to find news articles or discussions verifying this claim.

Claim: "{claim_text}"

Instructions:
- Focus on the main entities and action.
- Be concise.
- AVOID restrictive operators like "site:" or "-".

Return ONLY the query itself, prefixed EXACTLY like this:
Query:"""

    # Tail of the prompt above; used to locate the end of the echoed prompt.
    prompt_tail = "prefixed EXACTLY like this:\nQuery:"
    query_prefix = "Query:"

    generated_query = ""
    extracted_query_raw = "" # For logging
    try:
        llm_response = generate_text(prompt, max_new_tokens=50)

        # Drop the echoed prompt, if present, so only the model's answer remains.
        echo_end = llm_response.rfind(prompt_tail)
        echo_removed = echo_end != -1
        answer = llm_response[echo_end + len(prompt_tail):] if echo_removed else llm_response

        # Find the prefix case-insensitively, allowing for space/newline after
        match = re.search(rf'{re.escape(query_prefix)}\s*(.*)', answer, re.IGNORECASE | re.DOTALL)

        if match:
            extracted_query_raw = match.group(1).strip()
        elif echo_removed:
            # The model continued directly after the prompt's "Query:" line
            # without repeating the prefix.
            extracted_query_raw = answer.strip()

        if extracted_query_raw:
            print(f"[LOG] Raw extracted text after '{query_prefix}': '{extracted_query_raw}'") # Log what was extracted
            # Clean aggressively: keep only the first line and remove quotes.
            first_line = extracted_query_raw.splitlines()[0]
            generated_query = first_line.strip('\'" ')

        # Validate the extracted query
        if generated_query and 3 < len(generated_query) < 150: # Adjusted length validation
            # Final check for bad operators (safety net)
            if "site:" in generated_query.lower() or generated_query.strip().startswith("-"):
                print(f"[WARN] Generated query '{generated_query}' still contains disallowed operator after cleaning. Falling back.")
                return claim_text
            print(f"[LOG] Successfully extracted and validated query: '{generated_query}'")
            return generated_query
        else:
            print(f"[WARN] Could not extract valid query from LLM response (Raw after prefix: '{extracted_query_raw}'). Response: '{llm_response}'. Falling back.")
            return claim_text # Fallback

    except Exception as e:
        print(f"[ERROR] Failed to generate/parse search query using LLM: {e}")
        # print(traceback.format_exc()) # Uncomment for detailed traceback
        return claim_text # Fallback

In [None]:
def fact_check_headline(headline):
    """
    Main fact-checking workflow for a single headline (non-Telegram entry point).

    Steps:
      1. Ask the LLM for a search query (generate_search_query).
      2. Run google_search, which returns a `(summary_text, source_urls)` tuple whose
         summary already contains URL, title, snippet and scraped content for each
         source.
      3. Ask the LLM for a verdict based on that summary.

    Args:
        headline: The claim or headline to verify.

    Returns:
        The raw generate_text() output for the fact-check prompt, or the fixed
        message "No reliable sources found for verification." when the search
        produced no context.
    """
    # Generate search query
    search_query = generate_search_query(headline)
    print(f"Searching for: {search_query}")

    # google_search returns a tuple, not a list of result dicts; unpack it. The tuple
    # itself is always truthy, so emptiness has to be checked on the summary text.
    sources_text, _source_urls = google_search(search_query, NUM_SEARCH_RESULTS)
    if not sources_text:
        return "No reliable sources found for verification."

    # Prepare fact-check prompt
    prompt = f"""Analyze these sources to fact check this claim. Respond with:
    - Verdict: True/False/Unverified
    - Confidence: High/Medium/Low
    - Summary: Brief explanation
    - Key Evidence: Bullet points

    Claim: "{headline}"

    Sources:
    {sources_text}"""

    return generate_text(prompt, MAX_NEW_TOKENS)

In [None]:
# Cell 11: process_response function 
import re

def process_response(response_text):
    """
    Remove context markers and leaked instruction phrases from the model's output.

    Processing steps:
      1. Delete every "[CONTEXT] ... [END_CONTEXT]" block (markers included).
      2. Delete known instruction phrases and stray markers (case-insensitive),
         replacing each, together with its surrounding whitespace, by one space.
      3. Collapse runs of spaces/tabs to a single space and runs of blank lines to
         exactly one blank line, so paragraph breaks survive.

    Args:
        response_text: Raw text produced by the model.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace.
    """
    # Pattern to remove text between [CONTEXT] and [END_CONTEXT] (inclusive).
    # The brackets are escaped once: in a raw string, r"\[" matches a literal "[".
    pattern_context = r"\[CONTEXT\].*?\[END_CONTEXT\]"
    cleaned_text = re.sub(pattern_context, "", response_text, flags=re.DOTALL)

    # Remove specific instruction phrases that might leak (case-insensitive)
    # Add more phrases here if other leaks are observed.
    # These are plain strings (escaped below with re.escape), so the markers are
    # written without backslashes.
    leaked_phrases = [
        "Fact check the following news fact:",
        "Below, between , there is additional context gathered from the web.",
        "DO NOT include any of the text between these markers in your final answer.",
        "Also, ignore any outdated information; consider only the most current and reliable data.",
        "Based on the above, please provide a concise final analysis stating whether the fact is accurate.",
        "Include only your final conclusion and any necessary reasoning, without showing the raw context.",
        "[CONTEXT]", # Remove stray markers if they appear alone
        "[END_CONTEXT]"
    ]
    for phrase in leaked_phrases:
        # Use regex for case-insensitive replacement and ignore potential extra spaces
        cleaned_text = re.sub(r'\s*' + re.escape(phrase) + r'\s*', ' ', cleaned_text, flags=re.IGNORECASE)

    # Collapse horizontal whitespace only ([^\S\n] = any whitespace except newline);
    # collapsing "\s" here would also flatten every paragraph break.
    cleaned_text = re.sub(r'[^\S\n]{2,}', ' ', cleaned_text)
    # Reduce runs of blank lines to a single blank line.
    cleaned_text = re.sub(r'(\n\s*){2,}', '\n\n', cleaned_text)

    return cleaned_text.strip()

In [None]:
# New Helper Cell (Cell 11.5)
import re

def split_search_context(full_summary_string, num_first_pass):
    """
    Divide the concatenated search summary at the start of source number
    `num_first_pass + 1`, using the line-initial "Source N:" markers.

    Returns a tuple (context_pass_1, context_pass_2):
      - no markers at all: the input unchanged and "";
      - at most `num_first_pass` markers: the stripped input and "";
      - otherwise: the stripped text before the split marker and the stripped
        text from that marker onwards.
    """
    # Offsets of every "Source X:" marker that begins a line.
    marker_offsets = [
        found.start()
        for found in re.finditer(r'^Source \d+:', full_summary_string, re.MULTILINE)
    ]

    if len(marker_offsets) == 0:
        return full_summary_string, ""

    # Not enough sources for a second pass: everything belongs to pass 1.
    if not len(marker_offsets) > num_first_pass:
        return full_summary_string.strip(), ""

    cut_at = marker_offsets[num_first_pass]
    head = full_summary_string[:cut_at]
    tail = full_summary_string[cut_at:]
    return head.strip(), tail.strip()

In [None]:
# CELL 12 Code (Complete - Tiered Analysis & Formatting)
import string
import re
import traceback


def normalize_message(msg):
    """
    Return `msg` with all ASCII punctuation characters removed, leading and
    trailing whitespace stripped, and letters lower-cased.
    """
    punctuation_chars = string.punctuation
    kept_chars = [ch for ch in msg if ch not in punctuation_chars]
    without_punctuation = "".join(kept_chars)
    return without_punctuation.strip().lower()

# Helper to parse verdict (can be moved outside if preferred)
def parse_verdict(analysis_text):
    """
    Return the value that follows the first 'Verdict:' label (matched
    case-insensitively) in `analysis_text`.

    The value is the remainder of the label's line, stripped of surrounding
    whitespace and then of leading/trailing '.,!?;:' characters.
    Returns None when the label does not occur.
    """
    label = "Verdict:"
    label_pos = analysis_text.lower().find(label.lower())
    if label_pos == -1:
        return None  # Verdict not found

    # Everything after the label, cut at the first newline (if there is one).
    after_label = analysis_text[label_pos + len(label):]
    same_line, _, _ = after_label.partition('\n')

    # Basic cleaning of the verdict value.
    return same_line.strip().strip('.,!?;:')

async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handles messages, tiered analysis (3->5 sources), formatting."""
    user_message = update.message.text.strip()
    user_id = update.message.chat_id
    print(f"[LOG] Received message from {user_id}: '{user_message}'")

    if not user_message or len(user_message) < 5:
         await update.message.reply_text("Please provide a more specific statement or question to fact-check.")
         return

    normalized = normalize_message(user_message)
    simple_greetings = {"hello", "hi", "hey", "good morning", "good evening", "how are you", "what can you do"}

    if normalized in simple_greetings:
        response_text = (
            "Hello! I'm a fact-checking bot powered by Google search and AI. "
            "Please provide a news headline or statement you'd like me to verify."
        )
        print("[LOG] Detected greeting. Sending info message.")
        await update.message.reply_text(response_text)
        return

    # --- Fact-Checking Flow ---
    feedback_message = await update.message.reply_text("🤔 Thinking of the best way to search...")
    print("[LOG] Detected fact-check query. Generating search query...")

    try:
        # Generate Search Query (Calls Cell 9)
        search_query = generate_search_query(user_message)

        # Using NUM_SEARCH_RESULTS which should be 5 from Cell 2 config
        await feedback_message.edit_text(f"🔍 Searching (up to {NUM_SEARCH_RESULTS} sources) for: \"{search_query[:100]}{'...' if len(search_query)>100 else ''}\"")
        print(f"[LOG] Using search query: '{search_query}'")

        # Fetch all results (Calls Cell 6 -> Cell 7)
        search_summary_all, source_urls_all = google_search(search_query, num_results=NUM_SEARCH_RESULTS)

        if not search_summary_all:
            await feedback_message.edit_text(f"Sorry, I couldn't find enough relevant information online for '{search_query[:100]}...'. Try rephrasing your claim?")
            print("[LOG] No search results found; informed user.")
            return

        # Split context and URLs for Pass 1 (first 3)
        num_pass_1 = 3
        context_pass_1, context_pass_2 = split_search_context(search_summary_all, num_pass_1)
        urls_pass_1 = source_urls_all[:num_pass_1]
        # urls_pass_2 = source_urls_all[num_pass_1:] # Not directly needed unless logging

        if not context_pass_1:
             await feedback_message.edit_text("Error processing search results.")
             print("[ERROR] context_pass_1 is empty despite search_summary_all having content.")
             return

        await feedback_message.edit_text("🧠 Analyzing initial sources...")
        print(f"[LOG] Search complete. Performing Analysis Pass 1 (Sources 1-{min(num_pass_1, len(urls_pass_1))}).")

        # --- Analysis Pass 1 Prompt ---
        llm_prompt_1 = f"""You are a meticulous fact-checking assistant. Your task is to analyze the user's claim strictly based on the provided web search context. Do NOT use any external knowledge.

User's Claim: "{user_message}"

Context from Web Search (Sources 1-{min(num_pass_1, len(urls_pass_1))}):
--- START CONTEXT ---
{context_pass_1}
--- END CONTEXT ---

Analysis Instructions:
1. Carefully read the provided context.
2. Determine if the context supports, contradicts, or is insufficient to verify the User's Claim.
3. **First**, write a concise Summary explaining your findings based *only* on the context. Cite specific sources (e.g., "Source 1 indicates...") where possible.
4. **Second**, based *strictly* on the Summary you just wrote and the context, provide a final Verdict: Accurate, Inaccurate, Partially Accurate, or Unverifiable (if context is insufficient or contradictory without resolution).
5. **Crucially:** Your Verdict *must* logically follow from your Summary.

Required Output Format:
Respond ONLY with the Summary and Verdict, structured EXACTLY like this (Summary first):

Summary: [Your concise explanation based ONLY on the context.]
Verdict: [Your Verdict Here - e.g., Inaccurate]

Do not add any other conversational text, greetings, or repeat the claim/context unless essential within the summary."""

        # --- Generate Text and Clean Echo for Pass 1 ---
        raw_llm_output_1 = generate_text(llm_prompt_1, max_new_tokens=MAX_NEW_TOKENS)
        cleaned_analysis_1 = ""
        instruction_end_marker = "Do not add any other conversational text, greetings, or repeat the claim/context unless essential within the summary." # Last line of instructions
        summary_marker = "Summary:"
        context_end_marker = "--- END CONTEXT ---"

        marker_pos = raw_llm_output_1.rfind(instruction_end_marker)
        if marker_pos != -1:
            potential_analysis = raw_llm_output_1[marker_pos + len(instruction_end_marker):].strip()
            summary_pos = potential_analysis.find(summary_marker)
            if summary_pos != -1: cleaned_analysis_1 = potential_analysis[summary_pos:].strip(); print("[LOG] Cleaned echo Pass 1 via instruction marker + finding 'Summary:'.")
            else: marker_pos = -1; print("[WARN] P1: No 'Summary:' after instruction marker.")
        else: print("[WARN] P1: Instruction marker not found.")

        if marker_pos == -1: # Fallback if instruction marker method failed
             marker_pos_ctx = raw_llm_output_1.rfind(context_end_marker)
             if marker_pos_ctx != -1:
                 potential_output = raw_llm_output_1[marker_pos_ctx + len(context_end_marker):]
                 summary_pos = potential_output.find(summary_marker)
                 if summary_pos != -1: cleaned_analysis_1 = potential_output[summary_pos:].strip(); print("[LOG] Cleaned echo Pass 1 via context marker + finding 'Summary:'.")
                 else: fallback_chars=min(len(raw_llm_output_1),MAX_NEW_TOKENS*5); cleaned_analysis_1=raw_llm_output_1[-fallback_chars:].strip(); print("[WARN] P1: No 'Summary:' after context marker. Using raw slice.")
             else: fallback_chars=min(len(raw_llm_output_1),MAX_NEW_TOKENS*5); cleaned_analysis_1=raw_llm_output_1[-fallback_chars:].strip(); print("[WARN] P1: No markers found. Using raw slice.")
        print(f"[LOG] Cleaned Pass 1 analysis length: {len(cleaned_analysis_1)} chars.")


        if not cleaned_analysis_1 or not cleaned_analysis_1.lower().startswith("summary:"):
             print(f"[WARN] Pass 1 analysis cleaning failed or invalid. ('{cleaned_analysis_1[:100]}...'). Sending error.")
             await feedback_message.edit_text("Sorry, there was an issue processing the initial analysis.")
             return

        # Parse Verdict from Pass 1
        verdict_1 = parse_verdict(cleaned_analysis_1)
        print(f"[LOG] Analysis Pass 1 Verdict: {verdict_1}")

        # Define ambiguous verdicts (case-insensitive)
        ambiguous_verdicts = {"unverifiable", "needs more context", "insufficient information", "ambiguous", "unclear", "partially accurate"} # Added partially accurate

        final_analysis_to_use = cleaned_analysis_1
        final_urls_to_use = urls_pass_1
        analysis_source_note = f"(Based on first {len(urls_pass_1)} sources)"

        # Check if Second Pass is Needed
        if verdict_1 and verdict_1.lower() in ambiguous_verdicts and context_pass_2:
            print("[LOG] Pass 1 verdict is ambiguous and more context exists. Performing Analysis Pass 2.")
            await feedback_message.edit_text(f"Finding seems unclear ({verdict_1}), looking deeper with more sources...")

            # --- Analysis Pass 2 Prompt (using ALL context) ---
            llm_prompt_2 = f"""You are a meticulous fact-checking assistant... (Your full prompt)

User's Claim: "{user_message}"

Context from Web Search (ALL Sources 1-{len(source_urls_all)}):
--- START CONTEXT ---
{search_summary_all}
--- END CONTEXT ---

Analysis Instructions: ... (Your full instructions)

Required Output Format: ... (Summary: ..., Verdict: ...)"""

            # --- Generate Text and Clean Echo for Pass 2 ---
            raw_llm_output_2 = generate_text(llm_prompt_2, max_new_tokens=MAX_NEW_TOKENS)
            cleaned_analysis_2 = ""
            # (Use the same echo stripping logic for Pass 2)
            marker_pos_2 = raw_llm_output_2.rfind(instruction_end_marker)
            if marker_pos_2 != -1:
                potential_analysis_2 = raw_llm_output_2[marker_pos_2 + len(instruction_end_marker):].strip()
                summary_pos_2 = potential_analysis_2.find(summary_marker)
                if summary_pos_2 != -1: cleaned_analysis_2 = potential_analysis_2[summary_pos_2:].strip(); print("[LOG] Cleaned echo Pass 2 via instruction marker + finding 'Summary:'.")
                else: marker_pos_2 = -1; print("[WARN] P2: No 'Summary:' after instruction marker.")
            else: print("[WARN] P2: Instruction marker not found.")

            if marker_pos_2 == -1: # Fallback
                 marker_pos_ctx_2 = raw_llm_output_2.rfind(context_end_marker)
                 if marker_pos_ctx_2 != -1:
                     potential_output_2 = raw_llm_output_2[marker_pos_ctx_2 + len(context_end_marker):]
                     summary_pos_2 = potential_output_2.find(summary_marker)
                     if summary_pos_2 != -1: cleaned_analysis_2 = potential_output_2[summary_pos_2:].strip(); print("[LOG] Cleaned echo Pass 2 via context marker + finding 'Summary:'.")
                     else: fallback_chars=min(len(raw_llm_output_2),MAX_NEW_TOKENS*5); cleaned_analysis_2=raw_llm_output_2[-fallback_chars:].strip(); print("[WARN] P2: No 'Summary:' after context marker. Using raw slice.")
                 else: fallback_chars=min(len(raw_llm_output_2),MAX_NEW_TOKENS*5); cleaned_analysis_2=raw_llm_output_2[-fallback_chars:].strip(); print("[WARN] P2: No markers found. Using raw slice.")
            print(f"[LOG] Cleaned Pass 2 analysis length: {len(cleaned_analysis_2)} chars.")


            if cleaned_analysis_2 and cleaned_analysis_2.lower().startswith("summary:"):
                 # Parse verdict from Pass 2 to ensure it's valid before using
                 verdict_2 = parse_verdict(cleaned_analysis_2)
                 if verdict_2: # Only use Pass 2 result if verdict parsing worked
                     final_analysis_to_use = cleaned_analysis_2
                     final_urls_to_use = source_urls_all # Use all URLs
                     analysis_source_note = f"(Based on all {len(source_urls_all)} sources)"
                     print(f"[LOG] Using Pass 2 analysis result. Verdict: {verdict_2}")
                 else:
                      print("[WARN] Pass 2 analysis seemed invalid after cleaning/parsing. Reverting to Pass 1 result.")
                      # Stick with Pass 1 results if Pass 2 format is broken
            else:
                 print("[WARN] Pass 2 analysis cleaning failed or result invalid. Using Pass 1 result despite ambiguity.")
                 # Stick with Pass 1 results if Pass 2 failed processing

        # --- Construct Final User Response (using selected Pass result) ---
        formatted_analysis = ""
        summary_text = ""
        verdict_value = "Error parsing analysis" # Default

        # Parse and format the final selected analysis
        summary_label = "Summary:"
        verdict_label = "Verdict:"
        summary_start_index = final_analysis_to_use.lower().find(summary_label.lower())
        verdict_start_index = final_analysis_to_use.lower().find(verdict_label.lower())

        if summary_start_index != -1 and verdict_start_index != -1:
             summary_text = final_analysis_to_use[summary_start_index + len(summary_label) : verdict_start_index].strip()
             verdict_value = parse_verdict(final_analysis_to_use) 
             if verdict_value is None: verdict_value = "[Parsing Error]" # Fallback if parsing fails here

             formatted_analysis = f"Summary:\n{summary_text}\n\nVerdict: <b>{verdict_value}</b> {analysis_source_note}"
             print("[LOG] Successfully parsed and reformatted final Summary/Verdict.")
        else:
             formatted_analysis = final_analysis_to_use + f"\n{analysis_source_note}" # Append note if parsing fails
             print("[WARN] Could not properly parse Summary/Verdict labels in final analysis. Using raw cleaned analysis.")

        analysis_header = "📊 Analysis Result:\n"
        sources_header = "\n\n📚 Sources Consulted:\n"
        sources_list_str = "\n".join([f"• <a href='{url}'>{url}</a>" for url in final_urls_to_use]) if final_urls_to_use else "No specific source links retrieved."
        sources_note_long = "\n\n_(Sources consulted but may not be fully listed due to message length limits.)_"

        # (Length management logic remains the same)
        final_response_text = ""
        full_analysis_part = f"{analysis_header}{formatted_analysis}"
        full_analysis_len = len(full_analysis_part)
        sources_section_len = len(sources_header) + len(sources_list_str)
        note_len = len(sources_note_long)

        if full_analysis_len + sources_section_len <= TELEGRAM_MESSAGE_LIMIT:
            final_response_text = f"{full_analysis_part}{sources_header}{sources_list_str}"
            print("[LOG] Appending sources to the response.")
        elif full_analysis_len + note_len <= TELEGRAM_MESSAGE_LIMIT:
            final_response_text = f"{full_analysis_part}{sources_note_long}"
            print("[LOG] Analysis + note fits. Omitting full source list.")
        else:
            available_chars_for_analysis = TELEGRAM_MESSAGE_LIMIT - len(analysis_header) - len("...") - 10
            if available_chars_for_analysis > 0:
                 truncated_analysis = formatted_analysis[:available_chars_for_analysis]
                 verdict_start_display = truncated_analysis.rfind('\n\nVerdict:') # Find last occurrence
                 if verdict_start_display > 0: truncated_analysis = truncated_analysis[:verdict_start_display] # Truncate before verdict
                 final_response_text = f"{analysis_header}{truncated_analysis}..."
                 print("[WARN] Analysis too long even alone. Truncating analysis.")
            else:
                 final_response_text = "Analysis result is too long to display."
                 print("[ERROR] Analysis header seems too long for Telegram.")

        print(f"[LOG] Final formatted response length: {len(final_response_text)} chars")

        # Edit the feedback message or send new ones if split
        if len(final_response_text) <= TELEGRAM_MESSAGE_LIMIT:
             await feedback_message.edit_text(final_response_text, parse_mode="HTML", disable_web_page_preview=True)
        else:
             print("[LOG] Response still exceeds limit after checks, splitting.")
             await feedback_message.delete()
             chunks = split_message(final_response_text, chunk_size=TELEGRAM_MESSAGE_LIMIT)
             for i, chunk in enumerate(chunks):
                 parse_mode = "HTML" if i == 0 else None
                 await update.message.reply_text(chunk, parse_mode=parse_mode, disable_web_page_preview=True)

    except Exception as e:
        print(f"[ERROR] Error during message handling: {e}")
        print(traceback.format_exc()) # Print full traceback for debugging
        error_message = "😔 Sorry, an unexpected error occurred while processing your request. Please try again later."
        try: await feedback_message.edit_text(error_message)
        except Exception as inner_e: print(f"[ERROR] Could not edit feedback message: {inner_e}"); await update.message.reply_text(error_message)

In [None]:
def test_apis():
    """Smoke-test the search/scrape path and the text-generation path.

    Sends one fixed query through ``google_search`` and prints whatever it
    returns, then sends one fixed prompt through ``generate_text`` and prints
    the first 100 characters of the reply. Nothing is asserted or returned;
    the output is meant to be inspected by eye.
    """
    sample_query = "test news fact"
    sample_prompt = "Hello, how are you?"

    # Stage 1: search + scrape round trip.
    print("[TEST] Testing Google Search API and webpage scraping...")
    scraped_summary = google_search(sample_query)
    print(f"[TEST] Combined search and scrape summary:\n{scraped_summary}")

    # Stage 2: model round trip; only a short preview of the reply is shown.
    print("[TEST] Testing Hugging Face Model API...")
    model_reply = generate_text(sample_prompt)
    reply_preview = model_reply[:100]
    print(f"[TEST] Model API Test Response (first 100 chars): {reply_preview}...")

# Run the smoke tests before starting the bot.
# NOTE: this executes every time the cell runs and makes one real
# google_search() call and one real generate_text() call.
test_apis()


In [None]:
async def _reply_start(update, context):
    """Answer the /start command with a one-line usage hint."""
    # CommandHandler also fires for *edited* messages, where update.message
    # is None; effective_message covers both the original and the edit.
    message = update.effective_message
    if message is not None:
        await message.reply_text("Send me any claim to fact check!")


async def _reply_help(update, context):
    """Answer the /help command with a one-line usage hint."""
    message = update.effective_message
    if message is not None:
        await message.reply_text("Just send me any news headline or claim to verify!")


def run_bot():
    """Start Telegram bot.

    Builds an ``Application`` from ``TELEGRAM_TOKEN``, registers the /start
    and /help command handlers plus ``handle_message`` for every non-command
    text message, then polls for updates. The call blocks until polling is
    stopped.
    """
    application = Application.builder().token(TELEGRAM_TOKEN).build()

    # Command handlers (named coroutines instead of lambdas so that they are
    # real coroutine functions and can guard against a missing message).
    application.add_handler(CommandHandler("start", _reply_start))
    application.add_handler(CommandHandler("help", _reply_help))

    # Message handler: plain text that is not a command goes to the fact checker.
    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))

    print("Bot is running...")
    # The notebook applies nest_asyncio, so polling runs on the notebook's own
    # event loop. run_polling() closes the loop on shutdown by default; keep it
    # open so stopping the bot does not break the loop for later cells/re-runs.
    application.run_polling(close_loop=False)

In [None]:
# Launch the bot. Any Exception escaping run_bot() is reported as a single
# "Bot error: ..." line instead of propagating out of the cell.
try:
    run_bot()
except Exception as bot_error:
    print(f"Bot error: {bot_error}")