<a href="https://colab.research.google.com/github/fawasafsal/BERT-for-Fake-News-Detection/blob/main/fake_vs_real_news_articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers



In [None]:
# Install Required Libraries
# Install transformers library for BERT implementation
# Install datasets for data handling utilities
# Install evaluate for model evaluation metrics
# Install scikit-learn for data splitting and metrics
# Install pandas for data manipulation

!pip install transformers datasets evaluate scikit-learn pandas -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import os

# Seed the PyTorch and NumPy global RNGs with one shared value for reproducibility
SEED = 42
for seed_rng in (torch.manual_seed, np.random.seed):
    seed_rng(SEED)

In [None]:
# Upload and Load Dataset
# Fake.csv contains fake news articles
# True.csv contains real news articles
REQUIRED_FILES = ["Fake.csv", "True.csv"]

# Only ask for an upload when a file is actually missing, so re-running the
# notebook does not block on the upload widget every time.
missing_files = [name for name in REQUIRED_FILES if not os.path.exists(name)]
if missing_files:
    try:
        # Imported lazily: the module only exists inside Google Colab
        from google.colab import files
    except ImportError as exc:
        raise FileNotFoundError(
            f"Missing dataset files {missing_files}; place them next to this notebook."
        ) from exc
    print("Please upload your Fake.csv and True.csv files:")
    uploaded = files.upload()
else:
    # Keep the name defined even when no upload was needed
    uploaded = {}

# Load the fake and real news datasets
print("Loading datasets...")
df_fake = pd.read_csv("Fake.csv")
df_real = pd.read_csv("True.csv")

# Display information about the datasets
print(f"Fake news articles: {len(df_fake)}")
print(f"Real news articles: {len(df_real)}")
print(f"Fake news columns: {df_fake.columns.tolist()}")
print(f"Real news columns: {df_real.columns.tolist()}")


Please upload your Fake.csv and True.csv files:


Saving Fake.csv to Fake.csv
Saving True.csv to True.csv
Loading datasets...
Fake news articles: 23481
Real news articles: 21417
Fake news columns: ['title', 'text', 'subject', 'date']
Real news columns: ['title', 'text', 'subject', 'date']


In [None]:
# Data Preprocessing
# Add labels: 0 for fake news, 1 for real news.
# `assign` returns labelled copies, so the frames loaded earlier stay unmodified.
labeled_frames = [df_fake.assign(label=0), df_real.assign(label=1)]

# Combine both datasets and shuffle them (fixed seed) to ensure random distribution
df = (
    pd.concat(labeled_frames, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Count rows with a missing title or text BEFORE filling them, so the report
# below still reflects the raw data.
missing_count = int(df[["title", "text"]].isnull().any(axis=1).sum())

# Create content column by combining title and text
# This gives the model more context for classification.
# Missing pieces are replaced by "" because a NaN content value is not a string
# and could not be passed to the tokenizer later on.
df["content"] = df["title"].fillna("") + " " + df["text"].fillna("")

# Keep only the content and label columns
df = df[["content", "label"]]

# Display dataset statistics
print(f"Total articles: {len(df)}")
print("Label distribution:")
print(df["label"].value_counts())

# Report missing values found in the raw title/text columns
print(f"Missing values: {missing_count}")

# Display sample articles
print("\nSample fake news (first 200 chars):")
print(df[df["label"] == 0]["content"].iloc[0][:200])
print("\nSample real news (first 200 chars):")
print(df[df["label"] == 1]["content"].iloc[0][:200])

Total articles: 44898
Label distribution:
label
0    23481
1    21417
Name: count, dtype: int64
Missing values: 0

Sample fake news (first 200 chars):
Ben Stein Calls Out 9th Circuit Court: Committed a ‘Coup d’état’ Against the Constitution 21st Century Wire says Ben Stein, reputable professor from, Pepperdine University (also of some Hollywood fame

Sample real news (first 200 chars):
Trump drops Steve Bannon from National Security Council WASHINGTON (Reuters) - U.S. President Donald Trump removed his chief strategist Steve Bannon from the National Security Council on Wednesday, re


In [None]:
# Data Splitting
# Split data into train, validation, and test sets
# Using smaller subsets for faster training

# First split: separate training data from temp data (80/20 split)
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["content"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df["label"]  # Keep the fake/real ratio the same in both parts
)

# Reduce dataset size for faster training and experimentation.
# train_test_split shuffles, so taking the first N items is a random subset.
TRAIN_SIZE = 1000
VAL_TEST_SIZE = 500

train_texts = train_texts[:TRAIN_SIZE]
train_labels = train_labels[:TRAIN_SIZE]

val_test_texts = temp_texts[:VAL_TEST_SIZE]
val_test_labels = temp_labels[:VAL_TEST_SIZE]

# Second split: divide temp data into validation and test sets (50/50 split).
# Stratified like the first split so that both small evaluation sets keep the
# same class ratio.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    val_test_texts,
    val_test_labels,
    test_size=0.5,
    random_state=42,
    stratify=val_test_labels
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")
print(f"Test samples: {len(test_texts)}")

Training samples: 1000
Validation samples: 250
Test samples: 250


In [None]:
# Initialize BERT Tokenizer
# The tokenizer turns raw text into the token ids the model consumes.
# Checkpoint bert-base-uncased: lowercase, 12-layer, 768-hidden, 12-heads, 110M parameters
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)

# Report the basic tokenizer limits
print(
    f"Tokenizer vocabulary size: {tokenizer.vocab_size}",
    f"Max model input length: {tokenizer.model_max_length}",
    sep="\n",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer vocabulary size: 30522
Max model input length: 512


In [None]:
# Tokenize Text Data
# Convert text to token ids that BERT can understand
# truncation=True: cut text that is longer than max_length
# padding=True: pad shorter sequences to the longest sequence of the call
#               (never more than max_length, because of the truncation)
# max_length=128: reduced from 512 for faster training

MAX_LENGTH = 128


def encode_texts(texts, max_length=MAX_LENGTH):
    """
    Tokenize a list of texts with the notebook's shared settings

    Args:
        texts: List of article strings to tokenize
        max_length: Maximum number of tokens kept per article

    Returns:
        The tokenizer output with PyTorch tensors (return_tensors="pt")
    """
    return tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"
    )


print("Tokenizing training data...")
train_encodings = encode_texts(train_texts)

print("Tokenizing validation data...")
val_encodings = encode_texts(val_texts)

print("Tokenizing test data...")
test_encodings = encode_texts(test_texts)

print(f"Training encoding shape: {train_encodings['input_ids'].shape}")
print(f"Validation encoding shape: {val_encodings['input_ids'].shape}")
print(f"Test encoding shape: {test_encodings['input_ids'].shape}")

Tokenizing training data...
Tokenizing validation data...
Tokenizing test data...
Training encoding shape: torch.Size([1000, 128])
Validation encoding shape: torch.Size([250, 128])
Test encoding shape: torch.Size([250, 128])


In [None]:
# Create PyTorch Dataset Class
# Custom dataset class to handle tokenized data for PyTorch training
class NewsDataset(torch.utils.data.Dataset):
    """
    Custom PyTorch Dataset for handling tokenized news articles
    """
    def __init__(self, encodings, labels):
        """
        Initialize dataset with tokenized encodings and labels

        Args:
            encodings: Mapping from field name (e.g. "input_ids") to per-sample
                data, either tensors or nested lists
            labels: List of binary labels (0 for fake, 1 for real)
        """
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """
        Return an independent tensor copy of one encoded sample

        Copying an existing tensor with torch.tensor() emits a UserWarning,
        so tensors are copied with detach().clone() instead.
        """
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """
        Get a single item from the dataset

        Args:
            idx: Index of the item to retrieve

        Returns:
            Dictionary containing input tensors and labels
        """
        # Extract tokenized data for the given index
        item = {
            key: self._as_tensor(values[idx])
            for key, values in self.encodings.items()
        }
        # Add the corresponding label
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """
        Return the total number of samples in the dataset
        """
        return len(self.labels)

# Create dataset instances: one NewsDataset per split (train, validation, test)
train_dataset, val_dataset, test_dataset = (
    NewsDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

# Report how many samples each split holds
print(
    f"Training dataset size: {len(train_dataset)}",
    f"Validation dataset size: {len(val_dataset)}",
    f"Test dataset size: {len(test_dataset)}",
    sep="\n",
)

Training dataset size: 1000
Validation dataset size: 250
Test dataset size: 250


In [None]:
# Load Pre-trained BERT Model
# Load BERT model for sequence classification
# num_labels=2: Binary classification (fake vs real)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2  # Binary classification
)

# Display model information
print(f"Model type: {type(model)}")
print(f"Number of parameters: {model.num_parameters():,}")

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Assign the result instead of ending the cell with a bare expression:
# otherwise the notebook prints the whole module tree as the cell output.
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model type: <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'>
Number of parameters: 109,483,778
Using device: cpu


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Define Evaluation Metrics
def compute_metrics(eval_pred):
    """
    Compute evaluation metrics for the model

    Args:
        eval_pred: Tuple containing predictions (logits) and true labels

    Returns:
        Dictionary with 'accuracy', 'f1', 'precision' and 'recall'
        (precision/recall/F1 are weighted averages over the classes)
    """
    logits, labels = eval_pred
    # Get predicted class (argmax of logits)
    predicted_classes = np.argmax(logits, axis=1)

    # Calculate accuracy
    accuracy = accuracy_score(labels, predicted_classes)

    # Calculate precision, recall, and F1-score.
    # zero_division=0 scores a never-predicted class as 0 without emitting an
    # UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted_classes, average='weighted', zero_division=0
    )

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Configure Training Arguments
# Define training hyperparameters
training_args = TrainingArguments(
    output_dir="./results-news",              # Directory to save model checkpoints
    per_device_train_batch_size=8,           # Batch size for training
    per_device_eval_batch_size=8,            # Batch size for evaluation
    num_train_epochs=3,                      # Number of training epochs
    learning_rate=2e-5,                      # Learning rate (typical for BERT)
    warmup_steps=100,                        # Number of warmup steps
    weight_decay=0.01,                       # Weight decay for regularization
    logging_dir="./logs",                    # Directory for storing logs
    logging_steps=50,                        # Log every 50 steps
    save_steps=200,                          # Save every 200 steps
    seed=42,                                 # Set seed for reproducibility
    # Disable wandb (and other reporting integrations) to avoid authentication
    # prompts. This replaces the deprecated WANDB_DISABLED environment variable.
    report_to="none"
)

print("Training arguments configured:")
print(f"- Batch size: {training_args.per_device_train_batch_size}")
print(f"- Learning rate: {training_args.learning_rate}")
print(f"- Epochs: {training_args.num_train_epochs}")
print(f"- Warmup steps: {training_args.warmup_steps}")

# Note: no evaluation strategy is configured here, so the validation set is
# evaluated once after training instead of during it.

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training arguments configured:
- Batch size: 8
- Learning rate: 2e-05
- Epochs: 3
- Warmup steps: 100


In [None]:
# Initialize Trainer and Start Training
# Create Trainer instance with model, training arguments, and datasets
trainer = Trainer(
    model=model,                            # The pre-trained BERT model
    args=training_args,                     # Training configuration
    train_dataset=train_dataset,            # Training data
    eval_dataset=val_dataset,               # Validation data
    compute_metrics=compute_metrics,        # Evaluation metrics function
    # `processing_class` replaces the deprecated `tokenizer` argument; it is
    # available because the first cell upgrades transformers to the latest release.
    processing_class=tokenizer              # Tokenizer for text processing
)

print("Starting training...")
print("=" * 50)

# Start the training process
training_results = trainer.train()

print("Training completed!")
print(f"Final training loss: {training_results.training_loss:.4f}")

# Evaluate on validation set after training
print("\nEvaluating on validation set...")
val_results = trainer.evaluate(val_dataset)

print("Validation Results:")
print("=" * 30)
# Floats are rounded to four decimals; other values are printed unchanged
for key, value in val_results.items():
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")

  trainer = Trainer(


Starting training...


  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}


Step,Training Loss
50,0.6468
100,0.2095
150,0.0144
200,0.0038
250,0.0011
300,0.0009


  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}


Step,Training Loss
50,0.6468
100,0.2095
150,0.0144
200,0.0038
250,0.0011
300,0.0009
350,0.0008


Training completed!
Final training loss: 0.1171

Evaluating on validation set...


  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}


Validation Results:
eval_loss: 0.0006
eval_accuracy: 1.0000
eval_f1: 1.0000
eval_precision: 1.0000
eval_recall: 1.0000
eval_runtime: 26.2169
eval_samples_per_second: 9.5360
eval_steps_per_second: 1.2210
epoch: 3.0000


In [None]:
# Evaluate on Test Set
# Run the fine-tuned model on the held-out test split and list every metric.
print("\nEvaluating on test set...")
test_results = trainer.evaluate(test_dataset)

print("Test Results:")
print("=" * 30)
for metric_name, metric_value in test_results.items():
    # Non-float entries are printed unchanged
    if not isinstance(metric_value, float):
        print(f"{metric_name}: {metric_value}")
        continue
    # Float metrics are rounded to four decimals
    print(f"{metric_name}: {metric_value:.4f}")


Evaluating on test set...


  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}


Test Results:
eval_loss: 0.0368
eval_accuracy: 0.9920
eval_f1: 0.9920
eval_precision: 0.9920
eval_recall: 0.9920
eval_runtime: 27.8325
eval_samples_per_second: 8.9820
eval_steps_per_second: 1.1500
epoch: 3.0000


In [None]:
# Make Predictions on Sample Data
def predict_news(text, model, tokenizer, device, max_length=128):
    """
    Predict whether a single news article is fake or real

    Args:
        text: News article text (one string)
        model: Trained BERT model
        tokenizer: BERT tokenizer
        device: Computing device (CPU/GPU) the inputs are moved to
        max_length: Maximum number of tokens kept from the text; should match
            the length used during training (default 128)

    Returns:
        Tuple of (prediction, confidence_score), where prediction is the index
        of the most probable class and confidence_score its softmax probability
    """
    # Tokenize the input text
    encoding = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"
    )

    # Move to device
    encoding = {name: tensor.to(device) for name, tensor in encoding.items()}

    # Make prediction (eval mode, and no gradient tracking during inference)
    model.eval()
    with torch.no_grad():
        logits = model(**encoding).logits

    # Turn logits into class probabilities and pick the most likely class
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    predicted_class = torch.argmax(probabilities, dim=-1).item()
    confidence = probabilities[0][predicted_class].item()

    return predicted_class, confidence

# Test with sample articles
# Two hand-written headlines used as a quick sanity check of the classifier.
sample_texts = [
    "Breaking: Scientists discover new species of dinosaur in remote jungle expedition.",
    "SHOCKING: Aliens have been secretly controlling world governments for decades, insider reveals!"
]

print("\nSample Predictions:")
print("=" * 40)
for position, text in enumerate(sample_texts, start=1):
    prediction, confidence = predict_news(text, model, tokenizer, device)
    # Class 1 means real news; anything else is reported as fake
    if prediction == 1:
        label = "REAL"
    else:
        label = "FAKE"
    print(f"Text {position}: {text[:100]}...")
    print(f"Prediction: {label} (Confidence: {confidence:.4f})")
    print("-" * 40)



Sample Predictions:
Text 1: Breaking: Scientists discover new species of dinosaur in remote jungle expedition....
Prediction: FAKE (Confidence: 0.9981)
----------------------------------------
Text 2: SHOCKING: Aliens have been secretly controlling world governments for decades, insider reveals!...
Prediction: FAKE (Confidence: 0.9985)
----------------------------------------


In [None]:
# Save the Fine-tuned Model
# Write the model first and then the tokenizer into one directory for future use
model_save_path = "./fine-tuned-bert-news-classifier"
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(model_save_path)

# Tell the reader where the files are and how to load them again
print(
    f"\nModel saved to: {model_save_path}",
    "You can load this model later using:",
    f"model = AutoModelForSequenceClassification.from_pretrained('{model_save_path}')",
    f"tokenizer = AutoTokenizer.from_pretrained('{model_save_path}')",
    sep="\n",
)



Model saved to: ./fine-tuned-bert-news-classifier
You can load this model later using:
model = AutoModelForSequenceClassification.from_pretrained('./fine-tuned-bert-news-classifier')
tokenizer = AutoTokenizer.from_pretrained('./fine-tuned-bert-news-classifier')


In [None]:
# Training Summary
# Collect the key facts of the run and print them as one report.

summary_rule = "=" * 60
summary_lines = [
    "\n" + summary_rule,
    "TRAINING SUMMARY",
    summary_rule,
    "✓ Model: BERT-base-uncased",
    "✓ Task: Binary classification (Fake vs Real news)",
    f"✓ Training samples: {len(train_dataset)}",
    f"✓ Validation samples: {len(val_dataset)}",
    f"✓ Test samples: {len(test_dataset)}",
    f"✓ Training epochs: {training_args.num_train_epochs}",
    f"✓ Final test accuracy: {test_results['eval_accuracy']:.4f}",
    f"✓ Final test F1-score: {test_results['eval_f1']:.4f}",
]
print("\n".join(summary_lines))


TRAINING SUMMARY
✓ Model: BERT-base-uncased
✓ Task: Binary classification (Fake vs Real news)
✓ Training samples: 1000
✓ Validation samples: 250
✓ Test samples: 250
✓ Training epochs: 3
✓ Final test accuracy: 0.9920
✓ Final test F1-score: 0.9920
