In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import time
import random
import warnings
import logging
import re
from nltk.tokenize import sent_tokenize
from collections import Counter

In [2]:
# Ensure NLTK data is available
import nltk
nltk.download('punkt', quiet=True)

# Set up logging for debugging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Suppress warnings (optional)
warnings.filterwarnings("ignore", category=FutureWarning)

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Binary classification
model.eval()  # Set model to evaluation mode

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Step 1: Function to summarize content using generative AI ---
def summarize_content(statement, content, max_sentences=5, max_length=2000):
    """Summarize content using ChatGroq, focusing on information relevant to the statement."""
    try:
        # Initialize ChatGroq with the provided API key
        api_key = "Your_Groq_API_Key"  # Replace with your actual Groq API key
        llm = ChatGroq(api_key=api_key, model="llama3-8b-8192")

        # Construct a prompt for summarization
        prompt_template = PromptTemplate.from_template(
            """Given the following statement: '{statement}'
        And the following content: '{content}'
        Summarize the content in up to {max_sentences} sentences, including only information directly relevant to the statement. 
        Exclude irrelevant details and ensure the summary is coherent, concise, and avoids gibberish. 
        Limit the summary to {max_length} characters. If no relevant information is found, return 'No relevant summary found'."""
        )

        # Format the prompt with truncated content to avoid excessive length
        prompt = prompt_template.format(
            statement=statement,
            content=content[:10000],  # Truncate content to 10,000 characters as in original
            max_sentences=max_sentences,
            max_length=max_length
        )

        # Call ChatGroq to generate the summary
        try:
            response = llm.invoke(input=prompt)
            summary = response.content.strip()

            # Validate and truncate summary if necessary
            if len(summary) > max_length:
                sentences = sent_tokenize(summary)
                truncated = ""
                current_length = 0
                for sentence in sentences:
                    if current_length + len(sentence) <= max_length:
                        truncated += sentence + " "
                        current_length += len(sentence) + 1
                    else:
                        break
                summary = truncated.strip() or "No relevant summary found"

            if not summary or summary == "No relevant summary found":
                logger.warning("ChatGroq returned empty or irrelevant summary")
                return fallback_summarize_content(statement, content, max_sentences, max_length)

            logger.info(f"Generated summary for statement '{statement[:60]}...': {summary[:100]}...")
            return summary

        except Exception as api_error:
            logger.error(f"ChatGroq API error: {api_error}")
            # Fallback to heuristic-based summarization
            return fallback_summarize_content(statement, content, max_sentences, max_length)

    except Exception as e:
        logger.error(f"Error summarizing content: {e}")
        return fallback_summarize_content(statement, content, max_sentences, max_length)

# --- Fallback Heuristic-Based Summarization ---
def fallback_summarize_content(statement, content, max_sentences=5, max_length=2000):
    """Fallback heuristic-based summarization if generative AI fails."""
    try:
        # Tokenize content into sentences
        sentences = sent_tokenize(content)
        if not sentences:
            logger.warning("No sentences found in content")
            return "No relevant summary found"

        # Extract keywords from statement, excluding stop words
        stop_words = {'a', 'an', 'the', 'is', 'are', 'was', 'were', 'in', 'on', 'at', 'to', 'and', 'or', 'for', 'with', 'by', 'from', 'of'}
        statement_words = set(re.findall(r'\b\w+\b', statement.lower())) - stop_words
        if not statement_words:
            logger.warning("No meaningful keywords extracted from statement")
            return "No relevant summary found"

        # Score sentences based on keyword overlap and relevance
        scored_sentences = []
        for sentence in sentences:
            sentence_words = set(re.findall(r'\b\w+\b', sentence.lower()))
            if not sentence_words:
                continue
            overlap = len(statement_words.intersection(sentence_words))
            word_counts = Counter(sentence_words)
            repetition_score = sum(count > 2 for count in word_counts.values())
            sentence_length = len(sentence_words)
            is_valid = sentence_length > 3 and repetition_score == 0
            if overlap > 0 and is_valid:
                scored_sentences.append((sentence, overlap))

        # Sort by overlap score and select top sentences
        scored_sentences = sorted(scored_sentences, key=lambda x: x[1], reverse=True)[:max_sentences]
        summary_sentences = [s[0] for s in scored_sentences]

        if not summary_sentences:
            logger.warning("No relevant sentences found for summary")
            return "No relevant summary found"

        summary = " ".join(summary_sentences).strip()
        if len(summary) > max_length:
            truncated = ""
            current_length = 0
            for sentence in summary_sentences:
                if current_length + len(sentence) <= max_length:
                    truncated += sentence + " "
                    current_length += len(sentence) + 1
                else:
                    break
            summary = truncated.strip()

        if not summary:
            logger.warning("Summary is empty after processing")
            return "No relevant summary found"

        logger.info(f"Fallback summary for statement '{statement[:60]}...': {summary[:100]}...")
        return summary

    except Exception as e:
        logger.error(f"Error in fallback summarization: {e}")
        return "No relevant summary found"

In [5]:
# --- Step 2: Function to fetch content from a URL ---
def get_page_content(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        text_elements = soup.find_all(["p", "div", "article", "span"])
        text = " ".join([elem.get_text(strip=True) for elem in text_elements if elem.get_text(strip=True)])
        if not text:
            logger.warning(f"No content extracted from {url}")
            return "No content found"
        return text[:10000]  # Trim to 10000 chars
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return f"Error fetching content: {e}"
    except Exception as e:
        logger.error(f"Unexpected error for {url}: {e}")
        return f"Error fetching content: {e}"

In [6]:
# --- Step 3: Use googlesearch to get evidence URLs and content ---
def get_search_results(query, max_results=10):
    try:
        from googlesearch import search
        urls = list(search(query, num_results=max_results * 2))
        results = []
        skipped = 0
        for url in urls:
            if len(results) >= max_results:
                break
            if not url.startswith(("http://", "https://")):
                logger.warning(f"Skipping invalid URL: {url}")
                skipped += 1
                continue
            logger.info(f"Fetching content from {url}")
            content = get_page_content(url)
            if content.startswith("Error") or content == "No content found":
                logger.info(f"Skipping {url} due to {'error' if content.startswith('Error') else 'empty content'}")
                skipped += 1
            else:
                results.append({"url": url, "content": content})
            time.sleep(5)
        logger.info(f"Skipped {skipped} URLs, fetched {len(results)} valid results for query: {query[:60]}...")
        return results
    except Exception as e:
        logger.error(f"Search error for query '{query[:60]}...': {e}")
        return [{"url": None, "content": f"Search error: {e}"}]

In [7]:
# --- Step 4: Adjust prediction using BERT ---
def adjust_prediction_with_external_evidence(statement, evidence_data):
    """Use BERT to predict truthfulness based on statement and summarized evidence"""
    summaries = [
        summarize_content(statement, item["content"])
        for item in evidence_data
        if not item["content"].startswith("Error") and item["content"] != "No content found"
    ]

    if not summaries:
        logger.warning(f"No valid summaries found for statement: {statement[:60]}...")
        return 0.0, []  # Return 0.0 probability and empty scores list

    # Combine statement with each summarized evidence text
    probabilities = []
    summaries_lst = []
    for summary in summaries:
        if summary == "No relevant summary found":
            continue
        input_text = f"[CLS] {statement} [SEP] {summary} [SEP]"
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1).detach().numpy()[0]
            true_prob = float(probs[1])  # Probability of "true" class
            probabilities.append(true_prob)
            summaries_lst.append(summary)
    
    # Use max probability to avoid diluting strong signals
    final_prob = max(probabilities) if probabilities else 0.0
    logger.info(f"Adjusted probability for statement '{statement[:60]}...': {final_prob:.4f}")
    logger.info(f"Evidence probabilities: {probabilities}")
    return final_prob, probabilities, summaries_lst

In [8]:
# --- Step 5: Apply adjustment over DataFrame ---
TEST_PATH = "liar_dataset/test.tsv"
try:
    test_df = pd.read_csv(TEST_PATH, sep='\t')
except FileNotFoundError:
    logger.error(f"File {TEST_PATH} not found.")
    exit(1)

# Verify column for statements
statement_col = 'statement' if 'statement' in test_df.columns else 2

# Sample 5 random rows
random_indices = random.sample(range(len(test_df)), 1)
sample_df = test_df.iloc[random_indices].copy()

# Initialize lists to store results
adjusted_probs = []
web_data = []

In [9]:
# # Process sample
# for counter, (i, row) in enumerate(sample_df.iterrows(), 1):
#     statement = row[statement_col] if isinstance(statement_col, str) else row.iloc[statement_col]
#     logger.info(f"Processing statement {counter}/{len(sample_df)}: {statement[:60]}...")
    
#     # Fetch search results
#     search_results = get_search_results(statement)
    
#     # Calculate adjusted probability
#     if search_results:
#         adjusted_prob, evidence_probs = adjust_prediction_with_external_evidence(statement, search_results)
#         print(f"Statement {counter}: {statement[:60]}...")
#         print(f"Evidence probabilities: {evidence_probs}")
#         adjusted_probs.append(adjusted_prob)
#     else:
#         logger.warning(f"No search results for statement: {statement[:60]}...")
#         adjusted_probs.append(0.0)
#         evidence_probs = []

#     # Store web data with summaries and probabilities
#     valid_result_index = 0
#     for result in search_results:
#         original_content = result["content"]
#         summary = summarize_content(statement, original_content) if not original_content.startswith("Error") and original_content != "No content found" else original_content
#         prob = evidence_probs[valid_result_index] if valid_result_index < len(evidence_probs) and summary != "No relevant summary found" else None
#         web_data.append({
#             "statement": statement,
#             "url": result["url"],
#             "content": original_content,
#             "content_summary": summary,
#             "probability": prob
#         })
#         if summary != "No relevant summary found" and not original_content.startswith("Error") and original_content != "No content found":
#             valid_result_index += 1

# # Create DataFrame for web data
# web_df = pd.DataFrame(web_data)

# # Define threshold
# BEST_THRESHOLD = 0.6

# # Update sample DataFrame with adjusted results
# sample_df["Adjusted Final Probability"] = [f"{p:.4f}" for p in adjusted_probs]
# sample_df["Adjusted Final Class"] = ['true' if p > BEST_THRESHOLD else 'false' for p in adjusted_probs]

In [None]:
for counter, (i, row) in enumerate(sample_df.iterrows(), 1):
    statement = row[statement_col] if isinstance(statement_col, str) else row.iloc[statement_col]
    logger.info(f"Processing statement {counter}/{len(sample_df)}: {statement[:60]}...")
    
    # Fetch search results
    search_results = get_search_results(statement)
    
    # Calculate adjusted probability
    if search_results:
        adjusted_prob, evidence_probs, summaries_lst = adjust_prediction_with_external_evidence(statement, search_results)
        print(f"Statement {counter}: {statement[:60]}...")
        print(f"Evidence probabilities: {evidence_probs}")
        adjusted_probs.append(adjusted_prob)
    else:
        logger.warning(f"No search results for statement: {statement[:60]}...")
        adjusted_probs.append(0.0)
        evidence_probs = []
        summaries_lst = []  # Ensure summaries_lst is defined even if no search results

    # Store web data with summaries and probabilities
    valid_result_index = 0
    for idx, result in enumerate(search_results):
        original_content = result["content"]
        # Use summary from summaries_lst if available, otherwise use original_content or fallback
        summary = summaries_lst[idx] if idx < len(summaries_lst) and summaries_lst[idx] not in ["Error", "No content found", "No relevant summary found"] else original_content
        prob = evidence_probs[valid_result_index] if valid_result_index < len(evidence_probs) and summary != "No relevant summary found" else None
        web_data.append({
            "statement": statement,
            "url": result["url"],
            "content": original_content,
            "content_summary": summary,
            "probability": prob
        })
        if summary != "No relevant summary found" and not original_content.startswith("Error") and original_content != "No content found":
            valid_result_index += 1

# Create DataFrame for web data
web_df = pd.DataFrame(web_data)

# Define threshold
BEST_THRESHOLD = 0.6

# Update sample DataFrame with adjusted results
sample_df["Adjusted Final Probability"] = [f"{p:.4f}" for p in adjusted_probs]
sample_df["Adjusted Final Class"] = ['true' if p > BEST_THRESHOLD else 'false' for p in adjusted_probs]

2025-07-23 16:36:06,400 - INFO - Processing statement 1/1: One man sacrificed for his country. One man opposed a flawed...
2025-07-23 16:36:07,328 - INFO - Fetching content from https://www.politifact.com/factchecks/2008/feb/07/john-mccain/one-man-found-his-winning-message/
2025-07-23 16:36:12,531 - INFO - Fetching content from https://www.cbsnews.com/news/mccain-stresses-message-in-sc-ad/
2025-07-23 16:36:18,197 - INFO - Fetching content from https://www.nytimes.com/2007/09/28/us/politics/28adbox.html
2025-07-23 16:36:18,270 - ERROR - Error fetching https://www.nytimes.com/2007/09/28/us/politics/28adbox.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/2007/09/28/us/politics/28adbox.html
2025-07-23 16:36:18,271 - INFO - Skipping https://www.nytimes.com/2007/09/28/us/politics/28adbox.html due to error
2025-07-23 16:36:23,278 - INFO - Fetching content from https://www.politifact.com/factchecks/2007/oct/02/john-mccain/he-told-you-so-/
2025-07-23 16:36:28,569 - INFO - Fet

In [None]:
sample_df

In [None]:
web_df

In [None]:
web_df['content'][1]

In [None]:
web_df['content_summary'][1]

In [None]:
web_df['statement'][1]