In [1]:
## pip install statements

# %pip install transformers 
# %pip install pandas 
# %pip install numpy 
# %pip install scikit-learn 
# %pip install matplotlib 
# %pip install shap
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
# %pip install tf-keras
# %pip install lime
# %pip install tokeniser

import os
import pandas as pd
import torch
from IPython.display import display
import tokeniser

In [2]:
# load dataset to pandas DataFrame

import pandas as pd

# load train, test, validation datasets
# for the purposes of this demo, we'll be using LIAR dataset :D
train_ds = "liar_dataset/train.tsv"
test_ds = "liar_dataset/test.tsv"
valid_ds = "liar_dataset/valid.tsv"

# now, i'll use pandas to read TSV files :D
# columns are as according to the README in liar_dataset directory :D

columns = [
    "id", "label", "statement", "subject", "speaker", "speaker_job_title",
    "state_info", "party_affiliation", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"
]

train_df = pd.read_csv(train_ds, sep='\t', names=columns)
test_df = pd.read_csv(test_ds, sep='\t', names=columns)
valid_df = pd.read_csv(valid_ds, sep='\t', names=columns)

# print statement to check the dataset has been loaded properly! T^T
print(train_df.head())

           id        label                                          statement  \
0   2635.json        false  Says the Annies List political group supports ...   
1  10540.json    half-true  When did the decline of coal start? It started...   
2    324.json  mostly-true  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json        false  Health care reform legislation is likely to ma...   
4   9028.json    half-true  The economic turnaround started at the end of ...   

                              subject         speaker     speaker_job_title  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting                   NaN   
4                        economy,jobs   charlie-crist                   NaN   

  state_info party_affiliation  barely

In [3]:
# binarising labels! 

# since the labels have multiple classes, 
# for the sake of this feature prototype,
# i'll just simplify them to binary true/fake labels :)

# map labels to binary classes! :D
# 'pants-fire', 'false', 'barely-true' -> fake (0)
# others -> real (1)

def binarise(df):
    # validate expected labels exist before applying transformation!
    expected_labels = ["pants-fire", "false", "barely-true", "half-true", "mostly-true", "true"]
    unexpected_labels = set(df['label']) - set(expected_labels)
    if unexpected_labels:
        raise ValueError(f"Unexpected labels found: {unexpected_labels}")
    df['label'] = df['label'].apply(lambda x: 0 if x in ['pants-fire', 'false', 'barely-true'] else 1)
    return df


train_df = binarise(train_df)
test_df = binarise(test_df)
valid_df = binarise(valid_df)

# print statement to check df structure!
print(train_df.head())
print(test_df.head())
print(valid_df.head())

# checking that all labels in dataset are valid
print("Unique labels in training data:", train_df['label'].unique())
assert set(train_df['label'].unique()) == {0, 1}, "Labels must be binary (0 or 1)"

           id  label                                          statement  \
0   2635.json      0  Says the Annies List political group supports ...   
1  10540.json      1  When did the decline of coal start? It started...   
2    324.json      1  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json      0  Health care reform legislation is likely to ma...   
4   9028.json      1  The economic turnaround started at the end of ...   

                              subject         speaker     speaker_job_title  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting                   NaN   
4                        economy,jobs   charlie-crist                   NaN   

  state_info party_affiliation  barely_true_counts  false_counts  \
0     

In [4]:
# tokenise statements
# i'll tokenise statements using Hugging Face's tokeniser! 

# import autotokeniser
from transformers import AutoTokenizer

# load tokeniser
tokeniser = AutoTokenizer.from_pretrained("google/mobilebert-uncased")

# tokenise data
def tokenise(df, tokeniser, max_length=128):
    return tokeniser(
        df['statement'].tolist(),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"
    )

train_encodings = tokenise(train_df, tokeniser)
test_encodings = tokenise(test_df, tokeniser)
valid_encodings = tokenise(valid_df, tokeniser)

# for debug
print("Max token length:", max([len(ids) for ids in train_encodings['input_ids']]))
print("Sample tokenized input IDs:", train_encodings['input_ids'][0])
print("Sample attention mask:", train_encodings['attention_mask'][0])
print("Number of tokens per sequence:", train_encodings['attention_mask'].sum(dim=1))


Max token length: 128
Sample tokenized input IDs: tensor([  101,  2758,  1996,  8194,  2015,  2862,  2576,  2177,  6753,  2353,
         1011, 12241, 20367, 11324,  2015,  2006,  5157,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,

In [None]:
# Import necessary libraries for Trainer API
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define a dataset class compatible with the Trainer API
class LIARDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.texts = df['statement'].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encodings = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encodings['input_ids'].squeeze(),
            'attention_mask': encodings['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create dataset objects that perform tokenization on the fly
train_dataset = LIARDataset(train_df, tokeniser)
valid_dataset = LIARDataset(valid_df, tokeniser)
test_dataset = LIARDataset(test_df, tokeniser)

# Define metrics computation function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    return {
        'accuracy': accuracy_score(labels, predictions),
        'precision': precision_score(labels, predictions, zero_division=0),
        'recall': recall_score(labels, predictions),
        'f1': f1_score(labels, predictions),
        # Only compute ROC-AUC when we have both classes in the batch
        'roc_auc': roc_auc_score(labels, predictions) if len(np.unique(labels)) > 1 else 0
    }

# Load model as before
model = AutoModelForSequenceClassification.from_pretrained("google/mobilebert-uncased", num_labels=2)

# Configure training arguments
training_args = TrainingArguments(
    output_dir="mobilebert_fake_news",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,  # Start with fewer epochs to prevent overfitting
    weight_decay=0.01,   # Add regularization to help with overfitting
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
    report_to="none"     # Disable wandb or other reporting to simplify
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the final model and tokenizer
model_path = "mobilebert_fake_news_final"
trainer.save_model(model_path)
tokeniser.save_pretrained(model_path)
print(f"Model and tokenizer saved to {model_path}")

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,39373.7438,0.652544,0.629283,0.615942,0.763473,0.681818,0.62362


In [None]:
# Evaluation on test set using the trainer
import pandas as pd
from IPython.display import display

# Get test results
test_results = trainer.evaluate(test_dataset)

# Convert results to a more readable format
readable_results = {}
for key, value in test_results.items():
    # Extract the actual metric name from keys like 'eval_accuracy'
    if key.startswith('eval_'):
        metric_name = key[5:]  # Remove 'eval_' prefix
        readable_results[metric_name] = round(value, 4)  # Round to 4 decimal places
    else:
        readable_results[key] = value

# Create a DataFrame for better display
# First, prepare data as two columns
metrics = pd.DataFrame({
    'Metric': list(readable_results.keys()),
    'Value': list(readable_results.values())
})

# Display the table with improved formatting
print("\n===== FAKE NEWS DETECTION MODEL EVALUATION =====\n")
display(metrics.set_index('Metric').transpose())

# If you prefer a standard print output instead of the DataFrame:
print("\nDetailed Performance Metrics:")
for metric, value in readable_results.items():
    if isinstance(value, float):
        print(f"{metric.capitalize():20} : {value:.4f}")
    else:
        print(f"{metric.capitalize():20} : {value}")

# Get predictions for confusion matrix
test_predictions = trainer.predict(test_dataset)
predictions = np.argmax(test_predictions.predictions, axis=1)
true_labels = test_predictions.label_ids

# Create confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
cm = confusion_matrix(true_labels, predictions)
labels = train_df['label'].unique()
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Fake", "Real"])
plt.figure(figsize=(8, 6))
disp.plot(cmap="Blues")
plt.title("Fake News Detection Confusion Matrix")
plt.tight_layout()
plt.show()

In [None]:
from lime.lime_text import LimeTextExplainer
import numpy as np
import torch
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# Define a wrapper for model predictions
class ModelWrapper:
    def __init__(self, model, tokenizer, max_length=128):
        self.model = model
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Get the device from the model
        self.device = next(model.parameters()).device
    
    def predict_proba(self, texts):
        # Tokenize the input texts
        encodings = self.tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        input_ids = encodings['input_ids'].to(self.device)
        attention_mask = encodings['attention_mask'].to(self.device)
        
        # Get model predictions
        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
        
        # Convert logits to probabilities
        probs = torch.softmax(logits, dim=-1).cpu().numpy()
        return probs

# Initialize the LIME explainer with class names matching your model's output
explainer = LimeTextExplainer(class_names=["Fake", "Real"])

# Wrap model and tokenizer
wrapper = ModelWrapper(model, tokeniser)

# Select a sample from test data to explain
sample_index = 0  # You can change this to explain different examples
sample_text = test_df['statement'].iloc[sample_index]
sample_label = test_df['label'].iloc[sample_index]

# Get the model's prediction for this sample (for logging purposes)
prediction = wrapper.predict_proba([sample_text])[0]
predicted_label = np.argmax(prediction)
predicted_class = "Real" if predicted_label == 1 else "Fake"

# Print basic information
print(f"Original Text: {sample_text}")
print(f"True Label: {'Real' if sample_label == 1 else 'Fake'}")
print(f"Predicted Label: {predicted_class} (confidence: {prediction[predicted_label]:.4f})")

# Generate explanation
explanation = explainer.explain_instance(
    sample_text, 
    wrapper.predict_proba, 
    num_features=10,
    top_labels=1  # Include both classes
)

# Display the visualization in the notebook
# This will generate the visualization similar to the one in the image
plt.figure(figsize=(10, 6))
explanation.show_in_notebook(text=True)

# If you want to customize the visualization colors, you can add:
# explanation.as_pyplot_figure(label=predicted_label)