In [1]:
## pip install statements

# %pip install transformers 
# %pip install pandas 
# %pip install numpy 
# %pip install scikit-learn 
# %pip install matplotlib 
# %pip install shap
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

In [2]:
# load dataset to pandas DataFrame

# import pandas \o/ 
import pandas as pd

# load train, test, validation datasets
# for the purposes of this demo, we'll be using LIAR dataset :D
import csv  # stdlib; only needed for the QUOTE_NONE constant used below

train_ds = "liar_dataset/train.tsv"
test_ds = "liar_dataset/test.tsv"
valid_ds = "liar_dataset/valid.tsv"

# now, i'll use pandas to read TSV files :D
# columns are as according to the README in liar_dataset directory :D
# (the files have no header row, so the names are supplied explicitly)

columns = [
    "id", "label", "statement", "subject", "speaker", "speaker_job_title",
    "state_info", "party_affiliation", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"
]

# statements contain literal double quotes. with pandas' default quoting, a field
# that *starts* with `"` is parsed as a quoted field and keeps consuming tabs and
# newlines until the closing quote, which can silently merge several rows into one.
# QUOTE_NONE makes every tab a separator and every line exactly one record.
# NOTE(review): assumes the LIAR files are raw TSV written without CSV-style
# quoting - confirm by comparing len(train_df) with the line count of train.tsv.
read_options = dict(sep='\t', names=columns, quoting=csv.QUOTE_NONE)

train_df = pd.read_csv(train_ds, **read_options)
test_df = pd.read_csv(test_ds, **read_options)
valid_df = pd.read_csv(valid_ds, **read_options)

# print statement to check the dataset has been loaded properly! T^T
print(train_df.head())

           id        label                                          statement  \
0   2635.json        false  Says the Annies List political group supports ...   
1  10540.json    half-true  When did the decline of coal start? It started...   
2    324.json  mostly-true  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json        false  Health care reform legislation is likely to ma...   
4   9028.json    half-true  The economic turnaround started at the end of ...   

                              subject         speaker     speaker_job_title  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting                   NaN   
4                        economy,jobs   charlie-crist                   NaN   

  state_info party_affiliation  barely

In [3]:
# binarising labels! 

# since the labels have multiple classes, 
# for the sake of this feature prototype,
# i'll just simplify them to binary true/fake labels :)

# map labels to binary classes! :D
# 'pants-fire', 'false', 'barely-true' -> fake (0)
# others -> real (1)

def binarise(df):
    """Return a copy of `df` whose `label` column is collapsed to binary classes.

    'pants-fire', 'false' and 'barely-true' become 0 (fake); 'half-true',
    'mostly-true' and 'true' become 1 (real). The frame passed in is left
    untouched, so the caller keeps the original labels.

    Args:
        df: DataFrame with a `label` column holding the six LIAR label strings.

    Returns:
        A new DataFrame with the same rows/columns and an integer 0/1 `label`.

    Raises:
        ValueError: if `label` contains any value outside the six expected labels.
    """
    fake_labels = {"pants-fire", "false", "barely-true"}
    real_labels = {"half-true", "mostly-true", "true"}

    # validate expected labels exist before applying transformation!
    unexpected_labels = set(df['label']) - (fake_labels | real_labels)
    if unexpected_labels:
        raise ValueError(f"Unexpected labels found: {unexpected_labels}")

    # work on a copy: mutating the caller's frame would destroy the raw labels
    binarised = df.copy()
    # after validation every label is in exactly one of the two sets,
    # so "not fake" is equivalent to "real"
    binarised['label'] = (~binarised['label'].isin(fake_labels)).astype("int64")
    return binarised


# replace each split with its binarised version (0 = fake, 1 = real).
# NOTE: this rebinding is not re-runnable - calling binarise on frames whose
# labels are already 0/1 raises ValueError; reload the data cell first.
train_df = binarise(train_df)
test_df = binarise(test_df)
valid_df = binarise(valid_df)

# print statement to check df structure!
print(train_df.head())
print(test_df.head())
print(valid_df.head())

# checking that all labels in every split are valid (not just the training data)
print("Unique labels in training data:", train_df['label'].unique())
for split_name, split_df in (("train", train_df), ("test", test_df), ("valid", valid_df)):
    assert set(split_df['label'].unique()) == {0, 1}, f"Labels must be binary (0 or 1) in {split_name} split"

           id  label                                          statement  \
0   2635.json      0  Says the Annies List political group supports ...   
1  10540.json      1  When did the decline of coal start? It started...   
2    324.json      1  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json      0  Health care reform legislation is likely to ma...   
4   9028.json      1  The economic turnaround started at the end of ...   

                              subject         speaker     speaker_job_title  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting                   NaN   
4                        economy,jobs   charlie-crist                   NaN   

  state_info party_affiliation  barely_true_counts  false_counts  \
0     

In [4]:
# tokenise statements

# i'll tokenise statements using Hugging Face's tokeniser! 

# import autotokeniser
from transformers import AutoTokenizer

# load tokeniser
# uses the same "google/mobilebert-uncased" checkpoint name that the model cell
# passes to AutoModelForSequenceClassification, so tokeniser and model match.
tokeniser = AutoTokenizer.from_pretrained("google/mobilebert-uncased")

# tokenise data
def tokenise(df, tokeniser, max_length=256):
    """Run `tokeniser` over the `statement` column of `df`.

    The statements are handed over as a plain Python list, with truncation and
    padding switched on, `max_length` as the length cap, and "pt" requested as
    the tensor format. Whatever the tokeniser returns is passed straight back.
    """
    statements = df['statement'].tolist()
    tokeniser_options = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    return tokeniser(statements, **tokeniser_options)

# run the shared tokeniser over each split, in train -> test -> valid order
train_encodings, test_encodings, valid_encodings = [
    tokenise(split_df, tokeniser) for split_df in (train_df, test_df, valid_df)
]

# measure every row of the training `input_ids` and report the largest length
row_lengths = (len(ids) for ids in train_encodings['input_ids'])
print("Max token length:", max(row_lengths))


Max token length: 256


In [None]:
# fine-tune MobileBERT

# now that our dataframes are tokenised,
# let's load the pre-trained MobileBERT checkpoint.

from transformers import AutoModelForSequenceClassification

# load our model :D
# num_labels=2 matches the binary 0/1 labels produced by the binarising step.
# the "Some weights of the model checkpoint ... were not used" notice printed by
# this cell refers to the checkpoint's pre-training heads (cls.predictions.*,
# cls.seq_relationship.*), which the sequence-classification model does not load.
model = AutoModelForSequenceClassification.from_pretrained("google/mobilebert-uncased", num_labels=2)

# prepare our dataset for pytorch! 
import torch

class LIARDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokeniser output with one label per example.

    `encodings` is any mapping whose values can be indexed by example position;
    `labels` is a sequence of the same length holding the target of each example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # the label sequence defines how many examples there are
        return len(self.labels)

    def __getitem__(self, idx):
        # slice the idx-th entry out of every encoding field ...
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # ... and attach the target under the key 'labels'
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

train_dataset = LIARDataset(train_encodings, train_df['label'].tolist())
test_dataset = LIARDataset(test_encodings, test_df['label'].tolist())
valid_dataset = LIARDataset(valid_encodings, valid_df['label'].tolist())

# finally, we train the model!

# AdamW now comes from torch: the copy that used to live in `transformers` was
# deprecated and is no longer importable from recent transformers releases.
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_scheduler
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler  # for mixed-precision training
from torch.nn.utils import clip_grad_norm_

# this codeblock for our dataloader! :D
# (optional: pass num_workers=4, pin_memory=True to each loader to load batches
#  on multiple cpu cores and speed up host -> GPU copies)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
valid_loader = DataLoader(valid_dataset, batch_size=16)

# set training params in one place so the scheduler and the loop cannot disagree
max_epochs = 5
accumulation_steps = 2  # accumulate gradients over 2 micro-batches per update
patience = 3  # stop training after 3 epochs without improvement!

# this codeblock for optimiser!
# eps / weight_decay are spelled out to keep the defaults of the transformers
# AdamW this replaces (torch's own defaults are eps=1e-8, weight_decay=1e-2).
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# this codeblock for scheduler!
# the scheduler is stepped once per optimiser update, not once per micro-batch,
# so its horizon is the number of updates: ceil(batches / accumulation_steps)
# per epoch (the last, possibly partial, group of an epoch also triggers an update).
updates_per_epoch = (len(train_loader) + accumulation_steps - 1) // accumulation_steps
num_training_steps = updates_per_epoch * max_epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# this codeblock for device config!
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# mixed precision only makes sense on GPU; on CPU both helpers become no-ops
use_amp = device.type == "cuda"
scaler = GradScaler(enabled=use_amp)  # initialise the gradient scaler for mixed-precision training!

best_loss = float('inf')  # best (lowest) validation loss achieved so far
patience_counter = 0

# this is our training loop, with early stopping on the validation split.
for epoch in range(max_epochs):
    model.train()  # the validation pass below switches to eval mode, so switch back
    epoch_loss = 0  # sum of the per-batch training losses of this epoch
    loop = tqdm(train_loader, leave=True)
    optimizer.zero_grad()  # reset gradients at start of each epoch

    for i, batch in enumerate(loop):
        # move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward pass w/ mixed precision
        with autocast(enabled=use_amp):
            outputs = model(**batch)
            loss = outputs.loss / accumulation_steps  # normalise loss for gradient accumulation

        # backward pass with gradient scaling for mixed precision
        scaler.scale(loss).backward()

        # after accumulation steps, update model weights & optimiser state
        if (i + 1) % accumulation_steps == 0 or (i + 1) == len(train_loader):
            # gradients are still multiplied by the loss scale at this point;
            # unscale them first so max_norm=1.0 applies to the true gradients,
            # and clip once per update rather than once per micro-batch.
            scaler.unscale_(optimizer)
            clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)  # apply gradients (skipped by the scaler if they overflowed)
            scaler.update()  # update scaler for next iteration
            optimizer.zero_grad()  # after step, reset gradient
            lr_scheduler.step()  # this steps learning rate scheduler

        # accumulate epoch loss
        epoch_loss += loss.item() * accumulation_steps  # undo the accumulation normalisation for logging

        # progress bar logic
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

    # log epoch loss
    print(f"Epoch {epoch} Loss: {epoch_loss:.4f}")

    # measure the mean loss on the validation split: early stopping on the training
    # loss would almost never trigger, because that loss keeps falling while the
    # model overfits.
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for batch in valid_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with autocast(enabled=use_amp):
                valid_loss += model(**batch).loss.item()
    valid_loss /= max(len(valid_loader), 1)  # guard against an empty validation loader
    print(f"Epoch {epoch} Validation loss: {valid_loss:.4f}")

    # early stopping logic
    if valid_loss < best_loss:
        best_loss = valid_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("as improvements have ceased, triggering early stopping :D")
            break

  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification

In [None]:
# evaluating the model!

# evaluate the model on test data :D

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# switch the model to inference mode before scoring the test split
model.eval()
predictions, true_labels = [], []

# no gradients are needed for scoring
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # predicted class = index of the largest logit
        predictions.extend(torch.argmax(logits, dim=-1).tolist())
        true_labels.extend(batch['labels'].tolist())

# metrics
# zero_division=0 is passed to every ratio metric (it was only set on precision),
# so a model that never predicts class 1 is reported consistently as 0 everywhere.
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions, zero_division=0)
recall = recall_score(true_labels, predictions, zero_division=0)
f1 = f1_score(true_labels, predictions, zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# visualise results

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# pin the class order explicitly: confusion_matrix sorts classes, whereas
# Series.unique() returns them in order of first appearance, so taking the tick
# labels from train_df could label the rows/columns the wrong way round.
# It also keeps the matrix 2x2 when one class is missing from the predictions.
class_ids = [0, 1]
class_names = ["fake (0)", "real (1)"]  # mapping defined by the binarising step
cm = confusion_matrix(true_labels, predictions, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap="Blues")


In [None]:
import shap

# create SHAP explainer using a wrapper function
import torch  # also imported by the training cell; repeated so this cell states its own dependency


def shap_preprocessing_wrapper(texts):
    """Score a batch of raw texts with the fine-tuned model.

    Args:
        texts: iterable of strings. SHAP may hand over a numpy array of
            (partially masked) strings rather than a Python list.

    Returns:
        numpy array of shape (len(texts), 2) holding the logits for the
        classes 0 (fake) and 1 (real).
    """
    # the tokeniser only accepts str / list[str], so normalise array input first
    texts = [str(text) for text in texts]
    # same length cap as the tokenise() default used for training/evaluation,
    # so the explained input is truncated the same way as the scored input
    tokenized = tokeniser(
        texts,
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors="pt"
    )
    # move tokenized inputs to the same device as the model
    tokenized = {key: val.to(device) for key, val in tokenized.items()}
    # SHAP calls this function many times; without no_grad each call would
    # build an autograd graph that is never used
    with torch.no_grad():
        logits = model(**tokenized).logits
    return logits.cpu().numpy()  # SHAP expects a numpy array


# generate SHAP explanations (best effort: a failure is reported, not raised)
try:
    model.eval()  # make sure dropout is off even if the evaluation cell was skipped

    # the second argument of shap.Explainer is a *masker*, not background data:
    # a text masker built from the tokeniser tells SHAP how to split a statement
    # into tokens and how to hide them.
    text_masker = shap.maskers.Text(tokeniser)
    explainer = shap.Explainer(
        shap_preprocessing_wrapper,
        text_masker,
        output_names=["fake", "real"],
    )

    # compute SHAP values for the first 10 test samples
    shap_values = explainer(test_df['statement'].tolist()[:10])

    # visualize SHAP explanations; the text plot handles the variable number of
    # tokens per statement, which the tabular summary plot is not designed for
    shap.plots.text(shap_values)
except Exception as e:
    print(f"SHAP explainability failed: {e}")


In [None]:
import torch
# report the installed PyTorch build and whether a CUDA device is usable
cuda_ready = torch.cuda.is_available()
print("PyTorch version:", torch.__version__)
print("CUDA available:", cuda_ready)

# ask PyTorch's CUDA caching allocator to release its unused cached memory
torch.cuda.empty_cache()
# cuda 12.4