## Install relevant packages

In [None]:
!pip install bitsandbytes faiss-cpu sentence-transformers datasets

## Import all relevant libraries

In [None]:
import os
from huggingface_hub import login
import datasets
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import re

In [None]:
# Never hard-code the access token in the notebook: read it from the
# HUGGINGFACE_TOKEN environment variable and only prompt for it (without
# echoing) when the variable is not set.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
if not hf_token:
    from getpass import getpass

    hf_token = getpass("Hugging Face access token: ")
    # Keep the variable populated for the rest of the session, as before.
    os.environ["HUGGINGFACE_TOKEN"] = hf_token

login(hf_token)

In [None]:
# Load the "chengxuphd/liar2" dataset through datasets.load_dataset
dataset = datasets.load_dataset("chengxuphd/liar2")

In [None]:
def convert_labels(data):
    """Attach a binary ``true_label`` field derived from ``data["label"]``.

    Labels 0, 1, 2 and 3 map to 1 (fake); every other label maps to 0 (real).

    Args:
        data: Mapping with a ``"label"`` entry. It is modified in place.

    Returns:
        The same ``data`` object, now carrying ``"true_label"``.
    """
    is_fake = data["label"] in (0, 1, 2, 3)
    data["true_label"] = 1 if is_fake else 0
    return data

In [None]:
# Apply convert_labels to the loaded dataset so each record gains a binary "true_label"
dataset = dataset.map(convert_labels)
# Keep the "test" split; it is used below both as the evaluation set and as the
# source of the justifications that populate the retrieval index.
test_dataset = dataset["test"]

In [None]:
# Load embedding model
# The same model embeds the justifications (to build the FAISS index) and each
# claim at query time, so both live in the same vector space.
embedding_model = SentenceTransformer("all-mpnet-base-v2")

### Compute embeddings for justifications

In [None]:
# Gather the justification text of every test sample. The position of a text in
# this list is the id that the FAISS index hands back at search time.
# NOTE(review): this corpus comes from the same test split that evaluate()
# iterates over, so a claim's own justification can be retrieved as context —
# confirm this is intended.
justifications = [row["justification"] for row in test_dataset]

# Embed all justifications at once, asking for a NumPy array as output
justification_embeddings = embedding_model.encode(
    justifications,
    convert_to_numpy=True,
)

# Build a flat L2 (Euclidean distance) index sized to the embedding width,
# then store every justification embedding in it
dimension = justification_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(justification_embeddings)

In [None]:
def retrieve_top_k_justifications(claim, k=2):
    """Retrieve the justifications closest to ``claim`` in the FAISS index.

    Args:
        claim: Text of the statement to look up.
        k: Maximum number of justifications to return. It is capped at the
            number of vectors stored in the index.

    Returns:
        tuple: ``(texts, indices)`` where ``texts`` is a list of justification
        strings and ``indices`` is a NumPy array with their positions in
        ``justifications``. Both are empty when nothing can be retrieved.
    """
    # FAISS pads the result with -1 when fewer than k vectors are available;
    # a -1 would silently select justifications[-1], so never ask for more
    # neighbours than the index holds.
    k = min(int(k), faiss_index.ntotal)
    if k <= 0:
        return [], np.empty(0, dtype=np.int64)

    # Convert claim into embedding
    claim_embedding = embedding_model.encode([claim], convert_to_numpy=True)

    # Retrieve top-k justifications (distances are not needed by callers)
    _, indices = faiss_index.search(claim_embedding, k)

    # Single query -> first row; drop any -1 padding defensively
    top_indices = indices[0]
    top_indices = top_indices[top_indices >= 0]

    # Return retrieved justifications
    return [justifications[i] for i in top_indices], top_indices

In [None]:
def format_chat_prompt_v1(claim, speaker, speaker_description):
    """Build the zero-shot chat prompt for one claim, with retrieved context.

    The two justifications closest to ``claim`` are fetched with
    ``retrieve_top_k_justifications`` and placed in the user message ahead of
    the classification instructions.

    Args:
        claim: Statement to classify.
        speaker: Name of the speaker, inserted verbatim into the prompt.
        speaker_description: Description of the speaker, inserted verbatim.

    Returns:
        tuple: ``(chat_prompt, justification_text, indices)`` — the prompt
        string rendered by ``tokenizer.apply_chat_template``, the retrieved
        justifications joined by blank lines, and their indices as a list.
    """
    # Look up the context and merge it into a single text block
    top_justifications, top_indices = retrieve_top_k_justifications(claim, k=2)
    justification_text = "\n\n".join(top_justifications)

    system_message = {"role": "system", "content": "You are an AI that detects fake news using both linguistic analysis and contextual information retrieved from a knowledge base."}

    # The user turn carries the retrieved justifications followed by the task
    user_message = {"role": "user", "content": f"""Here is the relevant information from the knowledge base:\n\n"{justification_text}"

Classify the following statement as FAKE or REAL, and highlight only the top five single keywords contributing to the classification.

Focus on both linguistic features and the given contextual evidence.

Statement: "{claim}" 

Speaker: "{speaker}"

Speaker Description: "{speaker_description}"
            
Output the result in the following structured format:
    
Classification: (FAKE or REAL)
    
Keywords: [list of ONLY five single keywords]
    
Reasoning: (Explain based on both linguistic features and the given contextual evidence)"""}

    # Render to text (not token ids) and append the generation prompt
    chat_prompt = tokenizer.apply_chat_template(
        [system_message, user_message],
        tokenize=False,
        add_generation_prompt=True,
    )

    return chat_prompt, justification_text, top_indices.tolist()

In [None]:
def format_chat_prompt_v2(claim, speaker, speaker_description):
    """Build the two-shot chat prompt for one claim, with retrieved context.

    The conversation consists of a system message, two worked examples (one
    FAKE, one REAL) given as user/assistant turns, and a final user turn that
    holds the justifications retrieved for ``claim`` plus the task
    instructions.

    Args:
        claim: Statement to classify.
        speaker: Name of the speaker, inserted verbatim into the prompt.
        speaker_description: Description of the speaker, inserted verbatim.

    Returns:
        tuple: ``(chat_prompt, justification_text, indices)`` — the prompt
        string rendered by ``tokenizer.apply_chat_template``, the retrieved
        justifications joined by blank lines, and their indices as a list.
    """

    # Retrieve relevant justifications
    retrieved_justifications, indices = retrieve_top_k_justifications(claim, k=2)
    justification_text = "\n\n".join(retrieved_justifications)  # Combine into one text block
    indices = indices.tolist()

    # The example turns are static text, so they are plain (non-f) strings:
    # nothing is interpolated and a stray brace can never break them.
    chat_prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "You are an AI that detects fake news using both linguistic analysis and contextual information retrieved from a knowledge base."},

            # Example 1 (FAKE)
            {"role": "user", "content": """Here is the relevant information from the knowledge base:

"According to CBS News White House reporter Mark Knoller, who keeps records of presidential travel, President Obama took 29 vacation trips over his eight years in the White House. The trips spanned all or part of 235 days (not 692). He also made 39 visits to Camp David, totaling around 93 days. Even though the original story was published with a disclaimer, versions of the account were copied and pasted in social media posts with no such explanation and circulated as real news.  We rate this !"

Classify the following statement as FAKE or REAL, and highlight only the top five single keywords contributing to the classification.

Focus on both linguistic features and the given contextual evidence.

Statement: "President Obama used government funds to pay for his personal vacations for the next 20 years and Trump sent him a bill." 

Speaker: "facebook posts"

Speaker Description: "Posters on Facebook and other social media networks."
            
Output the result in the following structured format:
    
Classification: (FAKE or REAL)
    
Keywords: [list of ONLY five single keywords]
    
Reasoning: (Explain based on both linguistic features and the given contextual evidence)"""},

            {"role": "assistant", "content": """Classification: FAKE
            
Keywords: Obama, vacations, government, bill, Trump
            
Reasoning: Linguistically, the statement uses sensational and emotionally charged language (“used government funds,” “for the next 20 years,” “Trump sent him a bill”) common in fake news. The exaggerated future timeline and improbable scenario (a former president receiving a bill from a sitting or later president) lacks credibility and factual basis.

From the contextual evidence, it is confirmed that President Obama took 29 vacation trips over eight years, totaling 235 days, and not 692 as claimed in viral posts. There is no verified report of misuse of government funds for future vacations or any bill sent by Trump. The original false claim was widely circulated on social media without context, further supporting that this is misinformation."""},

            # Example 2 (REAL)
            # The quoted evidence previously began with a second copy of the
            # "Here is the relevant information..." header (copy-paste slip);
            # it now has the same layout as example 1 and the real query.
            {"role": "user", "content": """Here is the relevant information from the knowledge base:

"In sum, Graves’ numbers are correct. The VA’s budget has increased by more than 40 percent, and the number of pending claims has skyrocketed from some 391,000 to 890,000 under the Obama administration. Graves also was accurate in noting that the increase took place under the president’s watch. While the VA’s benefits system has been troubled since well before Obama took office, efforts during his administration have not prepared the department for its current challenges. We rate Graves’ statement ."

Classify the following statement as FAKE or REAL, and highlight only the top five single keywords contributing to the classification.

Focus on both linguistic features and the given contextual evidence.

Statement: "Despite having their budget increased by over 40 percent since 2009 … pending claims for benefits with the (Department of Veterans Affairs) have increased from 391,000 to 890,000 under the Obama Administration." 

Speaker: "tom graves"

Speaker Description: "Tom Graves represents Georgia's 9th Congressional District."
            
Output the result in the following structured format:
    
Classification: (FAKE or REAL)
    
Keywords: [list of ONLY five single keywords]
    
Reasoning: (Explain based on both linguistic features and the given contextual evidence)"""},

            {"role": "assistant", "content": """Classification: REAL
            
Keywords: budget, increased, claims, benefits, Obama
            
Reasoning: Linguistically, the statement is factual in tone, presenting specific numerical data without exaggeration or emotionally charged language. The use of figures and administrative references (e.g., "since 2009," "Department of Veterans Affairs") suggests a grounded, data-driven claim.

From the contextual evidence, the knowledge base confirms that the VA’s budget increased by over 40 percent under the Obama administration and that pending claims rose from approximately 391,000 to 890,000 during the same period. It also notes that although the issues existed before Obama’s tenure, the surge in claims and budget increase indeed occurred “under the president’s watch.” Therefore, the statement made by Tom Graves is consistent with the verified information."""},

            # Pass retrieved justifications as context
            {"role": "user", "content": f"""Here is the relevant information from the knowledge base:\n\n"{justification_text}"

Classify the following statement as FAKE or REAL, and highlight only the top five single keywords contributing to the classification.

Focus on both linguistic features and the given contextual evidence.

Statement: "{claim}" 

Speaker: "{speaker}"

Speaker Description: "{speaker_description}"
            
Output the result in the following structured format:
    
Classification: (FAKE or REAL)
    
Keywords: [list of ONLY five single keywords]
    
Reasoning: (Explain based on both linguistic features and the given contextual evidence)"""}
        ],
        tokenize=False,
        add_generation_prompt=True,
    )

    return chat_prompt, justification_text, indices

In [None]:
# Quantization settings handed to from_pretrained: 4-bit loading with the
# "nf4" quant type and float16 as the compute dtype
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the instruction-tuned causal LM with that config, leaving device placement to device_map="auto"
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    device_map="auto",
    quantization_config=quantization_config,
)

# Tokenizer from the same checkpoint; it also provides the chat template used by the prompt builders
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

In [None]:
def extract_prediction(response):
    """Parse the structured model answer into its three fields.

    The expected layout is::

        Classification: FAKE | REAL
        Keywords: kw1, kw2, ...      (optionally wrapped in [ ])
        Reasoning: free text to the end of the response

    Parsing is tolerant of label casing (``Fake``), markdown bold around the
    field names (``**Classification:** REAL``), a bracketed keyword list and
    commas without a following space.

    Args:
        response: Text generated by the model.

    Returns:
        tuple: ``(classification, keywords, reasoning)`` where
        ``classification`` is ``"FAKE"``, ``"REAL"`` or ``"UNKNOWN"``,
        ``keywords`` is a list of strings (empty when the field is missing)
        and ``reasoning`` is the text after ``Reasoning:`` or
        ``"No reasoning provided."``.
    """
    # Extract classification (FAKE or REAL). "(" is deliberately not skipped so
    # an echoed template line "Classification: (FAKE or REAL)" stays UNKNOWN.
    classification_match = re.search(
        r'Classification:[\s*]*(FAKE|REAL)', response, re.IGNORECASE
    )
    classification = (
        classification_match.group(1).upper() if classification_match else "UNKNOWN"
    )

    # Extract keywords as a list (rest of the line following "Keywords:")
    keywords = []
    keywords_match = re.search(r'Keywords:[\s*]*(.*)', response)
    if keywords_match:
        # The prompt asks for "[list of ...]", so the model often answers with
        # a bracketed list; remove the brackets before splitting.
        keywords_text = keywords_match.group(1).strip().strip("[]")

        # Split on commas (with or without a trailing space), remove stray
        # quotes/spaces and skip empty entries
        for raw_keyword in keywords_text.split(","):
            keyword = raw_keyword.strip().strip('"').strip("'").strip()
            if keyword:
                keywords.append(keyword)

    # Extract reasoning: everything after "Reasoning:", across line breaks
    reasoning_match = re.search(r'Reasoning:[\s*]*(.*)', response, re.DOTALL)
    reasoning = reasoning_match.group(1).strip() if reasoning_match else "No reasoning provided."

    return classification, keywords, reasoning

In [None]:
def evaluate(shot=0, output_path="results_zeroshot_rag.csv"):
    """Run the model over the test split and save one result row per sample.

    For every sample the prompt is built (zero-shot when ``shot == 0``,
    otherwise the two-shot variant), a response is generated, the structured
    fields are parsed with ``extract_prediction`` and a row is collected. All
    rows are written to ``output_path`` as CSV and a short summary of
    unparseable answers is printed.

    Args:
        shot: ``0`` selects ``format_chat_prompt_v1``; any other value selects
            ``format_chat_prompt_v2``.
        output_path: Destination CSV file. The default keeps the historical
            file name; pass another path (e.g. for few-shot runs) to avoid
            overwriting earlier results.
    """
    results = []
    unknown_count = 0

    # Pick the prompt builder once; it does not change between samples
    build_prompt = format_chat_prompt_v1 if shot == 0 else format_chat_prompt_v2

    for sample in test_dataset:
        unique_id = sample["id"]
        claim = sample["statement"]
        speaker = sample["speaker"]
        speaker_description = sample["speaker_description"]
        true_label = sample["true_label"]
        orig_label = sample["label"]

        # Format prompt using chat template
        formatted_prompt, justification_text, indices = build_prompt(claim, speaker, speaker_description)

        # The templated text already contains the special tokens written by
        # the chat template, so the tokenizer must not add them a second time
        # (otherwise the sequence starts with a duplicated BOS token).
        # Inputs go to the device the model was placed on, not a fixed "cuda".
        inputs = tokenizer(
            formatted_prompt,
            return_tensors="pt",
            add_special_tokens=False,
        ).to(model.device)
        output_tokens = model.generate(**inputs, max_new_tokens=250)

        # generate() returns prompt + continuation; decode only the newly
        # generated tokens instead of searching the text for "[/INST]".
        prompt_length = inputs["input_ids"].shape[1]
        cleaned_response = tokenizer.decode(
            output_tokens[0][prompt_length:], skip_special_tokens=True
        ).strip()

        # Extract structured outputs
        predicted_label, keywords, reasoning = extract_prediction(cleaned_response)

        # Convert predicted label to binary (FAKE=1, REAL=0)
        if predicted_label == "FAKE":
            pred_label = 1
        elif predicted_label == "REAL":
            pred_label = 0
        else: # Unknown label predicted
            print(predicted_label)
            pred_label = -1
            unknown_count += 1

        results.append({
                "id": unique_id,
                "Claim": claim,
                "Retrieved Justification": justification_text,
                "Retrieved Index": indices,
                "Prediction": pred_label,
                "Keywords": ", ".join(keywords),
                "Reasoning": reasoning,
                "True Label": true_label,
                "Original Label": orig_label
            })

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False)

    total = len(test_dataset)
    # Guard the percentage against an empty test split
    answered_pct = (total - unknown_count) / total * 100 if total else 0.0
    print(f"Number of test instances where model outputs an unknown classification: {unknown_count}")
    print(f"% of test instances where model outputs either FAKE or REAL: {answered_pct:.2f}")

In [None]:
# Run the evaluation with the zero-shot prompt over the whole test split.
# evaluate() writes its rows to "results_zeroshot_rag.csv", which the metrics cell reads.
evaluate(shot=0) # zero-shot

In [None]:
# Evaluation Metrics
#
# Rows with an unparseable prediction are dropped, the larger class is
# under-sampled to the size of the smaller one, and accuracy / precision /
# recall / F1 are averaged over several such balanced samples.

SEED = 42    # base seed: run i samples with SEED + i, so re-running the cell gives the same numbers
N_RUNS = 5   # number of balanced re-samplings to average over

df = pd.read_csv('results_zeroshot_rag.csv')

# Filter out rows with Prediction == -1
df = df[df["Prediction"] != -1]

df_class_0 = df[df["True Label"] == 0]
df_class_1 = df[df["True Label"] == 1]

# Decide once which class is under-sampled (on a tie class 0 is the one sampled)
if len(df_class_0) < len(df_class_1):
    minority_df, majority_df = df_class_0, df_class_1
else:
    minority_df, majority_df = df_class_1, df_class_0

minority_class_size = len(minority_df)

metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': []
}

# Run the sampling and evaluation N_RUNS times, each with its own fixed seed
for run in range(N_RUNS):
    run_seed = SEED + run
    sampled_majority = majority_df.sample(n=minority_class_size, random_state=run_seed)
    balanced_df = pd.concat([minority_df, sampled_majority])

    # Shuffle the balanced dataset
    balanced_df = balanced_df.sample(frac=1, random_state=run_seed).reset_index(drop=True)

    y_true = balanced_df["True Label"]
    y_pred = balanced_df["Prediction"]

    # Compute metrics
    metrics['accuracy'].append(accuracy_score(y_true, y_pred))
    metrics['precision'].append(precision_score(y_true, y_pred))
    metrics['recall'].append(recall_score(y_true, y_pred))
    metrics['f1'].append(f1_score(y_true, y_pred))

# Compute and print average metrics
avg_metrics = {k: np.mean(v) for k, v in metrics.items()}
print(f"Averaged Metrics over {N_RUNS} runs:")
for metric, score in avg_metrics.items():
    print(f"{metric.capitalize()}: {score:.2f}")

## Debugging Purposes

In [None]:
# for i in range(10):
#     test_claim = test_dataset["statement"][i]
#     test_speaker = test_dataset["speaker"][i]
#     test_desc = test_dataset["speaker_description"][i]
#     test_label = test_dataset["true_label"][i]
#     # print(f"Claim:\n{test_claim}")
#     formatted_prompt, _, _ = format_chat_prompt_v2(test_claim, test_speaker, test_desc)
#     # print(f"Prompt:\n{formatted_prompt}")
    
#     inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda")
#     output_tokens = model.generate(**inputs, max_new_tokens=250)
#     model_response = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    
#     cleaned_response = model_response.split("[/INST]")[-1].strip()
#     print(f"Model Response:\n{cleaned_response}")
#     print(f"True Label:\n{test_label}")

#     print(extract_prediction(cleaned_response))
#     print("\n#########################")

In [None]:
# for i in range(10):
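#     train_claim = dataset["train"]["statement"][i]
#     train_speaker = dataset["train"]["speaker"][i]
#     train_desc = dataset["train"]["speaker_description"][i]
#     train_label = dataset["train"]["label"][i]
#     print(f"Claim:\n{train_claim}")
#     formatted_prompt, _, _ = format_chat_prompt_v2(train_claim, train_speaker, train_desc)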
    
#     inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda")
#     output_tokens = model.generate(**inputs, max_new_tokens=150)
#     model_response = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    
#     cleaned_response = model_response.split("[/INST]")[-1].strip()
#     print(f"Model Response:\n{cleaned_response}")
#     print(f"True Label:\n{train_label}")
    
#     print(extract_prediction(cleaned_response))
#     print("\n#########################")