In [1]:
# ISOT Fake News Detection Model

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
import re
import time
from lime.lime_text import LimeTextExplainer
from sklearn.model_selection import train_test_split

In [2]:
# function to perform minimal text preprocessing for transformer models
# arguments:
# text - input text to preprocess (string)
# max_length - maximum number of words to keep (default: 512)
#
# returns preprocessed text string
def preprocess_text(text, max_length=512):
    # step 1: handle edge cases
    if not isinstance(text, str):
        return ""
    
    # step 2: clean encoding issues
    # remove non-ascii characters that could cause problems
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # step 3: normalize text format
    # replace multiple whitespace characters with single space
    text = re.sub(r'\s+', ' ', text).strip()
    
    # step 4: truncate oversized inputs
    # transformer models have context length limits
    words = text.split()
    if len(words) > max_length:
        text = ' '.join(words[:max_length])
        
    return text

In [3]:
# function to load and preprocess WELFake dataset
# arguments:
# welfake_path - path to WELFake_Dataset.csv
# return - tuple of (train_df, valid_df, test_df) with preprocessed data
def load_welfake_dataset(welfake_path="data/welfake_dataset/WELFake_Dataset.csv"):
    # step 1: load the raw dataset
    print(f"Loading WELFake dataset from {welfake_path}...")
    welfake_df = pd.read_csv(welfake_path)

    # step 2: rename/reshape columns
    welfake_df['statement'] = welfake_df['title'] + " " + welfake_df['text'].fillna("")
    welfake_df['label'] = welfake_df['label']  # Already binary (1=real, 0=fake)
    
    # step 3: keep only needed columns
    welfake_df = welfake_df[['statement', 'label']]
    
    # step 4: apply preprocessing
    print("Applying text preprocessing...")
    welfake_df['statement'] = welfake_df['statement'].apply(preprocess_text)

    # step 5: do train/valid/test split
    train_df, temp_df = train_test_split(welfake_df, test_size=0.2, random_state=42)
    valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    
    # step 6: print dataset statistics
    print(f"WELFake dataset statistics:")
    print(f"  Total: {len(welfake_df)} samples")
    print(f"  Train: {len(train_df)} samples")
    print(f"  Valid: {len(valid_df)} samples")
    print(f"  Test: {len(test_df)} samples")
    
    # step 7: check class balance
    print(f"\nClass distribution:")
    print(f"  Overall: {welfake_df['label'].value_counts().to_dict()}")
    print(f"  Train: {train_df['label'].value_counts().to_dict()}")
    print(f"  Valid: {valid_df['label'].value_counts().to_dict()}")
    print(f"  Test: {test_df['label'].value_counts().to_dict()}")

    return train_df, valid_df, test_df

In [4]:
# dataset class for ISOT fake news detection
class FakeNewsDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        # step 1: store dataframe and extract necessary columns
        self.df = df
        self.texts = df['statement'].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        # step 1: get text and label for the index
        text = self.texts[idx]
        label = self.labels[idx]
        
        # step 2: tokenize the text
        encodings = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        # step 3: return the encodings and label
        return {
            'input_ids': encodings['input_ids'].squeeze(),
            'attention_mask': encodings['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [5]:
# compute metrics for model evaluation
# arguments:
# eval_pred - tuple of (predictions, labels)
# return - dictionary with evaluation metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    return {
        'accuracy': accuracy_score(labels, predictions),
        'precision': precision_score(labels, predictions, zero_division=0),
        'recall': recall_score(labels, predictions),
        'f1': f1_score(labels, predictions),
        'roc_auc': roc_auc_score(labels, predictions) if len(np.unique(labels)) > 1 else 0
    }

In [6]:
# function to load and preprocess ISOT dataset
# arguments:
# isot_paths - tuple of paths to (fake_news_path, true_news_path)
# return - tuple of (train_df, valid_df, test_df) with preprocessed data
def load_isot_dataset(isot_paths=("data/isot_dataset/Fake.csv", "data/isot_dataset/True.csv")):
    # step 1: load the raw dataset
    fake_path, true_path = isot_paths
    print(f"Loading ISOT dataset from {fake_path} and {true_path}...")
    
    # Load fake news
    fake_df = pd.read_csv(fake_path)
    fake_df['label'] = 0  # 0 for fake
    
    # Load true news
    true_df = pd.read_csv(true_path)
    true_df['label'] = 1  # 1 for real
    
    # step 2: combine datasets
    isot_df = pd.concat([fake_df, true_df], ignore_index=True)
    
    # step 3: create statement column (title + text)
    isot_df['statement'] = isot_df['title'] + " " + isot_df['text'].fillna("")
    
    # step 4: keep only needed columns
    isot_df = isot_df[['statement', 'label']]
    
    # step 5: apply preprocessing
    print("Applying text preprocessing...")
    isot_df['statement'] = isot_df['statement'].apply(preprocess_text)

    # step 6: do train/valid/test split
    train_df, temp_df = train_test_split(isot_df, test_size=0.2, random_state=42)
    valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    
    # step 7: print dataset statistics
    print(f"ISOT dataset statistics:")
    print(f"  Total: {len(isot_df)} samples")
    print(f"  Train: {len(train_df)} samples")
    print(f"  Valid: {len(valid_df)} samples")
    print(f"  Test: {len(test_df)} samples")
    
    # step 8: check class balance
    print(f"\nClass distribution:")
    print(f"  Overall: {isot_df['label'].value_counts().to_dict()}")
    print(f"  Train: {train_df['label'].value_counts().to_dict()}")
    print(f"  Valid: {valid_df['label'].value_counts().to_dict()}")
    print(f"  Test: {test_df['label'].value_counts().to_dict()}")

    return train_df, valid_df, test_df

In [7]:
# function to implement curriculum learning for fake news detection
# arguments:
# isot_paths - tuple of paths to (fake_news_path, true_news_path) for ISOT dataset
# welfake_path - path to WELFake dataset
# phase1_epochs - number of epochs for initial ISOT training
# phase2_epochs - number of epochs for WELFake fine-tuning
#
# returns tuple of (trainer, model, tokenizer, evaluation_results)
def curriculum_learning_pipeline(
    isot_paths=("data/isot_dataset/Fake.csv", "data/isot_dataset/True.csv"),
    welfake_path="data/welfake_dataset/WELFake_Dataset.csv",
    phase1_epochs=3,
    phase2_epochs=2,
    model_name="distilbert-base-uncased"
):
    # step 1: load ISOT dataset for phase 1 training
    print("Loading ISOT dataset for phase 1 (initial training)...")
    isot_train_df, isot_valid_df, isot_test_df = load_isot_dataset(isot_paths)
    
    # step 2: initialize model and tokenizer
    print(f"Initializing {model_name} model and tokenizer...")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, 
        num_labels=2
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # step 3: create ISOT datasets
    print("Creating ISOT PyTorch datasets...")
    isot_train_dataset = FakeNewsDataset(isot_train_df, tokenizer, max_length=128)
    isot_valid_dataset = FakeNewsDataset(isot_valid_df, tokenizer, max_length=128)
    
    # step 4: Phase 1 - Train on ISOT dataset
    print("\n===== PHASE 1: Training on ISOT dataset =====")
    phase1_training_args = TrainingArguments(
        output_dir="./isot_pretrained",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=phase1_epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        push_to_hub=False,
        report_to="none",
        fp16=True,  # Use mixed precision for faster training
    )
    
    phase1_trainer = Trainer(
        model=model,
        args=phase1_training_args,
        train_dataset=isot_train_dataset,
        eval_dataset=isot_valid_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )
    
    # Train phase 1
    phase1_trainer.train()
    
    # Evaluate on ISOT after phase 1
    print("\nEvaluating Phase 1 model on ISOT test set...")
    isot_test_dataset = FakeNewsDataset(isot_test_df, tokenizer, max_length=128)
    phase1_isot_results = phase1_trainer.evaluate(isot_test_dataset)
    print("ISOT Results after Phase 1:")
    for key, value in phase1_isot_results.items():
        if key.startswith("eval_"):
            print(f"  {key[5:]}: {value:.4f}")
    
    # step 5: load WELFake dataset for phase 2 training
    print("\nLoading WELFake dataset for phase 2 (fine-tuning)...")
    welfake_train_df, welfake_valid_df, welfake_test_df = load_welfake_dataset(welfake_path)
    
    # step 6: create WELFake datasets
    print("Creating WELFake PyTorch datasets...")
    welfake_train_dataset = FakeNewsDataset(welfake_train_df, tokenizer, max_length=128)
    welfake_valid_dataset = FakeNewsDataset(welfake_valid_df, tokenizer, max_length=128)
    welfake_test_dataset = FakeNewsDataset(welfake_test_df, tokenizer, max_length=128)
    
    # Evaluate on WELFake before phase 2
    print("\nEvaluating Phase 1 model on WELFake test set (baseline)...")
    phase1_welfake_results = phase1_trainer.evaluate(welfake_test_dataset)
    print("WELFake Results before Phase 2:")
    for key, value in phase1_welfake_results.items():
        if key.startswith("eval_"):
            print(f"  {key[5:]}: {value:.4f}")
    
    # step 7: Phase 2 - Fine-tune on WELFake dataset with smaller learning rate
    print("\n===== PHASE 2: Fine-tuning on WELFake dataset =====")
    phase2_training_args = TrainingArguments(
        output_dir="./welfake_finetuned",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-6,  # Smaller learning rate to avoid catastrophic forgetting
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=phase2_epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        push_to_hub=False,
        report_to="none"
    )
    
    phase2_trainer = Trainer(
        model=model,  # Continue with the same model from phase 1
        args=phase2_training_args,
        train_dataset=welfake_train_dataset,
        eval_dataset=welfake_valid_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )
    
    # Train phase 2
    phase2_trainer.train()
    
    # step 8: Evaluate on both datasets after phase 2
    print("\nEvaluating final model on both datasets...")
    
    # Evaluate on ISOT
    final_isot_results = phase2_trainer.evaluate(isot_test_dataset)
    print("\nISOT Results after Phase 2:")
    for key, value in final_isot_results.items():
        if key.startswith("eval_"):
            print(f"  {key[5:]}: {value:.4f}")
    
    # Evaluate on WELFake
    final_welfake_results = phase2_trainer.evaluate(welfake_test_dataset)
    print("\nWELFake Results after Phase 2:")
    for key, value in final_welfake_results.items():
        if key.startswith("eval_"):
            print(f"  {key[5:]}: {value:.4f}")
    
    # step 9: save the final model
    model_path = "curriculum_fake_news_model"
    phase2_trainer.save_model(model_path)
    tokenizer.save_pretrained(model_path)
    print(f"\nFinal model and tokenizer saved to {model_path}")
    
    # Prepare evaluation results
    evaluation_results = {
        "phase1_isot": {k.replace('eval_', ''): v for k, v in phase1_isot_results.items() if k.startswith('eval_')},
        "phase1_welfake": {k.replace('eval_', ''): v for k, v in phase1_welfake_results.items() if k.startswith('eval_')},
        "phase2_isot": {k.replace('eval_', ''): v for k, v in final_isot_results.items() if k.startswith('eval_')},
        "phase2_welfake": {k.replace('eval_', ''): v for k, v in final_welfake_results.items() if k.startswith('eval_')}
    }
    
    return phase2_trainer, model, tokenizer, evaluation_results

In [8]:
trainer, model, tokenizer, results = curriculum_learning_pipeline()

Loading ISOT dataset for phase 1 (initial training)...
Loading ISOT dataset from data/isot_dataset/Fake.csv and data/isot_dataset/True.csv...
Applying text preprocessing...
ISOT dataset statistics:
  Total: 44898 samples
  Train: 35918 samples
  Valid: 4490 samples
  Test: 4490 samples

Class distribution:
  Overall: {0: 23481, 1: 21417}
  Train: {0: 18748, 1: 17170}
  Valid: {0: 2348, 1: 2142}
  Test: {0: 2385, 1: 2105}
Initializing distilbert-base-uncased model and tokenizer...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Creating ISOT PyTorch datasets...

===== PHASE 1: Training on ISOT dataset =====


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0042,9.7e-05,1.0,1.0,1.0,1.0,1.0
2,0.0013,2.3e-05,1.0,1.0,1.0,1.0,1.0
3,0.0,3.8e-05,1.0,1.0,1.0,1.0,1.0



Evaluating Phase 1 model on ISOT test set...


ISOT Results after Phase 1:
  loss: 0.0016
  accuracy: 0.9998
  precision: 1.0000
  recall: 0.9995
  f1: 0.9998
  roc_auc: 0.9998
  runtime: 12.1712
  samples_per_second: 368.9030
  steps_per_second: 23.0870

Loading WELFake dataset for phase 2 (fine-tuning)...
Loading WELFake dataset from data/welfake_dataset/WELFake_Dataset.csv...
Applying text preprocessing...
WELFake dataset statistics:
  Total: 72134 samples
  Train: 57707 samples
  Valid: 7213 samples
  Test: 7214 samples

Class distribution:
  Overall: {1: 37106, 0: 35028}
  Train: {1: 29768, 0: 27939}
  Valid: {1: 3667, 0: 3546}
  Test: {1: 3671, 0: 3543}
Creating WELFake PyTorch datasets...

Evaluating Phase 1 model on WELFake test set (baseline)...
WELFake Results before Phase 2:
  loss: 7.7914
  accuracy: 0.1497
  precision: 0.0080
  recall: 0.0054
  f1: 0.0065
  roc_auc: 0.1523
  runtime: 21.3556
  samples_per_second: 337.8040
  steps_per_second: 21.1190

===== PHASE 2: Fine-tuning on WELFake dataset =====


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0604,0.060215,0.983641,0.978689,0.989365,0.983998,0.983543
2,0.0429,0.068791,0.982393,0.985997,0.979275,0.982624,0.982446



Evaluating final model on both datasets...



ISOT Results after Phase 2:
  loss: 8.5565
  accuracy: 0.0013
  precision: 0.0004
  recall: 0.0005
  f1: 0.0004
  roc_auc: 0.0013
  runtime: 12.9066
  samples_per_second: 347.8840
  steps_per_second: 21.7720

WELFake Results after Phase 2:
  loss: 0.0491
  accuracy: 0.9864
  precision: 0.9824
  recall: 0.9910
  f1: 0.9867
  roc_auc: 0.9863
  runtime: 22.2845
  samples_per_second: 323.7230
  steps_per_second: 20.2380

Final model and tokenizer saved to curriculum_fake_news_model


In [9]:
def predict_text(text, model, tokenizer, threshold=0.5):
    """
    Predict if a given news article is real or fake
    """
    # Preprocess text
    processed_text = preprocess_text(text)
    
    # Tokenize
    inputs = tokenizer(
        processed_text,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    # Move inputs to the same device as the model
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Get prediction
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Convert to probabilities
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    fake_prob = probabilities[0, 0].item()
    real_prob = probabilities[0, 1].item()
    
    # Get prediction
    predicted_class = 1 if real_prob > threshold else 0
    label = "REAL" if predicted_class == 1 else "FAKE"
    
    return {
        "prediction": label,
        "fake_probability": fake_prob,
        "real_probability": real_prob,
        "confidence": max(fake_prob, real_prob)
    }