In [1]:
# THIS IS THE IMPORT STATEMENT FOR TORCH (if u want cuda)
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

In [2]:
##############################################
# main_pipeline.ipynb
# 
# A single notebook to demonstrate the entire
# pipeline, from preprocessing to training,
# evaluation, and a sample prediction.
##############################################

import os
import pandas as pd
import torch
from IPython.display import display

##############################################
# 1) DATA PREPROCESSING
##############################################

# For demonstration, we might just replicate or call 
# the critical steps from your preprocess_data.ipynb

# If your 'preprocess_data.ipynb' is short, you could copy 
# the relevant code here. For instance:
data_dir = os.path.join(os.getcwd(), "data/liar_dataset")  # Adjust if needed
train_path = os.path.join(data_dir, "train.tsv")
valid_path = os.path.join(data_dir, "valid.tsv")
test_path  = os.path.join(data_dir, "test.tsv")

def map_label_to_binary(label_str):
    # example of your mapping function
    fake_labels = ["pants-fire", "false", "barely-true"]
    if label_str in fake_labels:
        return 0
    else:
        return 1

# Load & preprocess
train_df = pd.read_csv(train_path, sep='\t', header=None, names=[
    "id", "label", "statement", "subject", "speaker", "speaker_jobtitle",
    "state_info", "party_affiliation", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"
])
valid_df = pd.read_csv(valid_path, sep='\t', header=None, names=train_df.columns)
test_df  = pd.read_csv(test_path,  sep='\t', header=None, names=train_df.columns)

train_df['binary_label'] = train_df['label'].apply(map_label_to_binary)
valid_df['binary_label'] = valid_df['label'].apply(map_label_to_binary)
test_df['binary_label']  = test_df['label'].apply(map_label_to_binary)

print("Data loaded & pre-processed.")
print(f"Train shape: {train_df.shape}")
print(f"Valid shape: {valid_df.shape}")
print(f"Test  shape: {test_df.shape}")

Data loaded & pre-processed.
Train shape: (10240, 15)
Valid shape: (1284, 15)
Test  shape: (1267, 15)


In [3]:
##############################################
# 2) MODEL TRAINING: FINE-TUNE MOBILEBERT
##############################################

# We'll replicate a simplified training approach here.
# If 'train_mobileBERT.ipynb' has many lines, you can adapt them below:

from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

model_name = "google/mobilebert-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class LIARDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.texts  = df['statement'].tolist()
        self.labels = df['binary_label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encodings = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encodings['input_ids'].squeeze(),
            'attention_mask': encodings['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create dataset objects
train_dataset = LIARDataset(train_df, tokenizer)
valid_dataset = LIARDataset(valid_df, tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="mobilebert_output",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,  # use 1 or 2 for a quick demo
    logging_steps=10,
    load_best_model_at_end=True,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

trainer.train()

# Save the final model
tokenizer.save_pretrained("mobilebert_output")
print("MobileBERT model and tokenizer saved to 'mobilebert_output'.")


Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6587,0.663376


MobileBERT model and tokenizer saved to 'mobilebert_output'.


In [4]:
##############################################
# 3) EVALUATION
##############################################

# We'll do a quick pass on the test set to see metrics.
# You can replicate your more detailed code from evaluation.ipynb as needed.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

test_dataset = LIARDataset(test_df, tokenizer)

# We'll have the trainer do predictions for convenience
preds_output = trainer.predict(test_dataset)
pred_labels = preds_output.predictions.argmax(axis=1)
true_labels = preds_output.label_ids

acc = accuracy_score(true_labels, pred_labels)
prec = precision_score(true_labels, pred_labels, average='binary')
rec = recall_score(true_labels, pred_labels, average='binary')
f1 = f1_score(true_labels, pred_labels, average='binary')
cm = confusion_matrix(true_labels, pred_labels)

print("\nTest Set Evaluation Metrics:")
print(f"Accuracy:  {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall:    {rec:.3f}")
print(f"F1 Score:  {f1:.3f}")
print("Confusion Matrix:")
print(cm)


Test Set Evaluation Metrics:
Accuracy:  0.627
Precision: 0.624
Recall:    0.847
F1 Score:  0.719
Confusion Matrix:
[[189 364]
 [109 605]]


In [5]:
##############################################
# 4) QUICK DEMO PREDICTION (CLI-Like)
##############################################

# Let's emulate your 'cli_predict.ipynb' with a sample statement
def predict_fake_news(statement, model_dir='mobilebert_output'):
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)
    loaded_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    loaded_model.eval()
    
    inputs = loaded_tokenizer(statement, return_tensors='pt',
                              max_length=128, padding='max_length', truncation=True)
    with torch.no_grad():
        out = loaded_model(**inputs)
    logits = out.logits
    probs = torch.softmax(logits, dim=1).squeeze()
    label_idx = torch.argmax(probs).item()
    
    label_str = "REAL" if label_idx == 1 else "FAKE"
    confidence = probs[label_idx].item()
    return label_str, confidence

sample_statements = [
    "Obama was born in Kenya.",
    "The sky is blue.",
    "Elvis Presley is still alive."
]

print("\nSample Predictions:")
for stmt in sample_statements:
    pred_label, conf = predict_fake_news(stmt)
    print(f"Statement: {stmt}")
    print(f"Prediction: {pred_label} (confidence: {conf:.2f})\n")


Sample Predictions:
Statement: Obama was born in Kenya.
Prediction: FAKE (confidence: 0.59)

Statement: The sky is blue.
Prediction: FAKE (confidence: 0.55)

Statement: Elvis Presley is still alive.
Prediction: FAKE (confidence: 0.58)

