In [None]:
# === Step 1: installing libaries ===
# Install necessary libraries
!pip install transformers[torch] datasets scikit-learn -q

In [None]:
# === Step 2: Load Dataset ===
from datasets import load_dataset
import pandas as pd # Import pandas for better display if needed

# Load the sst2 dataset
dataset = load_dataset("sst2")

# Use smaller subsets for quicker demo (optional, but recommended for speed)
train_sample_size = 1000
validation_sample_size = 200

# Shuffle with a fixed seed so the selected subsets are reproducible
small_train_dataset = dataset["train"].shuffle(seed=42).select(range(train_sample_size))

# The "validation" split serves two purposes in this notebook: choosing the best
# checkpoint during training and reporting the final score. Shuffle it once and
# cut it into two DISJOINT parts so that the rows used for model selection are
# never counted again in the final test accuracy.
shuffled_validation = dataset["validation"].shuffle(seed=42)
assert validation_sample_size < len(shuffled_validation), (
    "validation_sample_size must leave at least one example for the test set"
)
small_validation_dataset = shuffled_validation.select(range(validation_sample_size))
small_test_dataset = shuffled_validation.select(
    range(validation_sample_size, len(shuffled_validation))
)
# Derived from the data instead of hard-coded, so it stays correct if the
# split size or validation_sample_size changes.
test_sample_size = len(small_test_dataset)

print("\nDataset loaded and split:")
print(f"Training examples: {len(small_train_dataset)}")
print(f"Validation examples: {len(small_validation_dataset)}")
print(f"Test examples: {len(small_test_dataset)}")
print("\nExample Training Data Point:")
# Displaying using pandas for potentially nicer formatting
display(pd.DataFrame([small_train_dataset[0]]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



Dataset loaded and split:
Training examples: 1000
Validation examples: 200
Test examples: 872

Example Training Data Point:


Unnamed: 0,idx,sentence,label
0,32326,"klein , charming in comedies like american pie...",1


In [None]:
# === Step 3: Preprocess Data (Tokenization) ===
from transformers import AutoTokenizer

# Name of the pretrained base checkpoint. Later cells reuse this value when
# loading the classification model and when reporting which tokenizer is used.
model_checkpoint = "distilbert-base-uncased"

# Load the tokenizer associated with the base checkpoint.
# The module-level 'tokenizer' is read by preprocess_function below and is
# reused later when the prediction pipeline is built.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(f"\nTokenizer loaded from '{model_checkpoint}'.")

# Tokenization helper applied to every dataset split
def preprocess_function(examples):
    """Tokenize the ``"sentence"`` entry of a batch of examples.

    Args:
        examples: Mapping that provides the raw text under the key
            ``"sentence"``. It is called through ``Dataset.map(..., batched=True)``
            in this notebook.

    Returns:
        The result of calling the module-level ``tokenizer`` on those
        sentences with truncation and padding enabled.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(sentences, truncation=True, padding=True)
    return encoded

# Run the tokenization helper over the train, validation and test subsets
# (in that order), processing examples in batches.
print("Tokenizing datasets...")
(
    tokenized_train_dataset,
    tokenized_validation_dataset,
    tokenized_test_dataset,
) = [
    split.map(preprocess_function, batched=True)
    for split in (small_train_dataset, small_validation_dataset, small_test_dataset)
]
print("Tokenization complete.")

# Show which columns exist after tokenization
print("\nExample of tokenized data keys:")
first_tokenized_example = tokenized_train_dataset[0]
print(first_tokenized_example.keys())



Tokenizer loaded from 'distilbert-base-uncased'.
Tokenizing datasets...


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenization complete.

Example of tokenized data keys:
dict_keys(['idx', 'sentence', 'label', 'input_ids', 'attention_mask'])


In [None]:
# === Step 4: Load Pre-trained Model ===
from transformers import AutoModelForSequenceClassification
import torch # Ensure torch is imported

# Build a two-label sequence-classification model on top of the base checkpoint.
# The classification head is newly initialised, so it must be trained before
# its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)
print(f"\nPre-trained model '{model_checkpoint}' loaded for sequence classification.")

# Prefer the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print(f"Model moved to device: {device}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Pre-trained model 'distilbert-base-uncased' loaded for sequence classification.
Model moved to device: cuda


In [None]:
# === Step 5: Define Training Settings and Evaluation Metric ===
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score

# Directory that receives checkpoints and saved metrics
output_dir = "misleading-content-classifier"

# Training configuration, grouped by concern and passed on in one call.
training_config = {
    # --- where things are written ---
    "output_dir": output_dir,
    "logging_dir": './logs',
    "logging_steps": 50,
    "report_to": "none",                  # Disable wandb integration
    # --- optimisation schedule ---
    "num_train_epochs": 1,                # Keep low for demo
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    # NOTE(review): the recorded run saved 'checkpoint-63', i.e. fewer steps
    # than warmup_steps — confirm that never completing warmup is intended.
    "warmup_steps": 100,
    "weight_decay": 0.01,
    # --- evaluation / checkpointing ---
    "eval_strategy": "epoch",             # current name of the former evaluation_strategy
    "save_strategy": "epoch",             # Save checkpoint at the end of each epoch
    "load_best_model_at_end": True,       # Load the best checkpoint at the end
    "metric_for_best_model": "accuracy",  # Metric that decides which checkpoint is best
    "greater_is_better": True,            # Higher accuracy is better
}
training_args = TrainingArguments(**training_config)
print("\nTraining arguments defined.")

# Accuracy metric used by the Trainer during evaluation
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(predictions, labels)``. The
            predicted class is the arg-max of ``predictions`` along axis 1
            (presumably per-class scores, one row per example; confirm
            against the caller).

    Returns:
        A dict with the single key ``"accuracy"``.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(gold_labels, predicted_classes)}

print("Metrics function defined.")


Training arguments defined.
Metrics function defined.


In [None]:
# === Step 6: Train the Model (Fine-tuning) ===

# Create the Trainer instance (the deprecated 'tokenizer' argument is not passed).
# An explicit padding collator is supplied instead: the datasets were padded
# inside Dataset.map, i.e. only to the longest sentence of each map() batch, so
# rows can have different lengths once a split is larger than one map() batch.
# DataCollatorWithPadding pads every training/eval batch to a common length,
# and leaves batches that are already uniform unchanged.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print("\nTrainer initialized. Starting training...")

# Start training
train_result = trainer.train()

print("\nTRAINING COMPLETE!")
# Print the training metrics and persist them through the trainer
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)



Trainer initialized. Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6685,0.384761,0.85



TRAINING COMPLETE!
***** train metrics *****
  epoch                    =        1.0
  total_flos               =    12770GF
  train_loss               =     0.6244
  train_runtime            = 0:00:14.59
  train_samples_per_second =     68.527
  train_steps_per_second   =      4.317


In [None]:
# === Step 7: Evaluate the Trained Model ===

print("\nEVALUATING MODEL ON TEST DATA...")
# Score the model currently held by the trainer on the tokenized test dataset.
eval_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)

print("\n--- Evaluation Results on Test Set ---")
# The "accuracy" value produced by compute_metrics is reported by evaluate()
# under the key 'eval_accuracy'.
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
# Print the evaluation metrics and persist them through the trainer.
trainer.log_metrics("eval", eval_results)
trainer.save_metrics("eval", eval_results)
# NOTE(review): the model was fine-tuned on the "sst2" dataset loaded in Step 2;
# presumably its labels encode sentiment (0 = negative, 1 = positive) rather
# than credibility — confirm that this relabelling is intended for the demo.
print("(Remember: Label 0 -> 'Potentially Misleading', Label 1 -> 'Potentially Credible')")


# === Step 7.5: Find the Best Model Path ===
import os

# Find the path to the best model checkpoint recorded in the trainer state
best_model_checkpoint_path = trainer.state.best_model_checkpoint

if best_model_checkpoint_path and os.path.isdir(best_model_checkpoint_path):
    print(f"\nBest model checkpoint identified at: {best_model_checkpoint_path}")
    model_load_path = best_model_checkpoint_path
else:
    print(f"\nWarning: Could not find best model checkpoint path in trainer state: {best_model_checkpoint_path}")
    print(f"Attempting to load from base output directory: {output_dir}")
    # Nothing in this notebook saves a model directly into output_dir
    # (checkpoints live in 'checkpoint-*' subfolders), so loading from it would
    # fail. Write the model currently held by the trainer there first, so the
    # fallback path actually contains loadable model files.
    trainer.save_model(output_dir)
    try:
        print(f"Contents of '{output_dir}': {os.listdir(output_dir)}")
    except FileNotFoundError:
        print(f"Error: Output directory '{output_dir}' not found.")
    model_load_path = output_dir # Fallback

print(f"Using path for loading fine-tuned model: {model_load_path}")




EVALUATING MODEL ON TEST DATA...



--- Evaluation Results on Test Set ---
Accuracy: 0.8394
***** eval metrics *****
  epoch                   =        1.0
  eval_accuracy           =     0.8394
  eval_loss               =     0.4041
  eval_runtime            = 0:00:01.63
  eval_samples_per_second =    532.703
  eval_steps_per_second   =     33.599
(Remember: Label 0 -> 'Potentially Misleading', Label 1 -> 'Potentially Credible')

Best model checkpoint identified at: misleading-content-classifier/checkpoint-63
Using path for loading fine-tuned model: misleading-content-classifier/checkpoint-63


In [None]:
# === Step 8: Use the Trained Model for Predictions (Corrected) ===
from transformers import pipeline

print("\n--- Setting up Prediction Pipeline ---")

def _format_prediction(result, label_map):
    """Return the 'Prediction: ...' line for one pipeline result dict."""
    label_name = label_map.get(result['label'], result['label'])
    return f"Prediction: {label_name} (Score: {result['score']:.4f})"


def _read_user_sentence(prompt):
    """Ask for a sentence; return '' when no interactive input is possible."""
    try:
        return input(prompt).strip()
    except (EOFError, NotImplementedError):
        # No stdin available (e.g. a headless "Run All", where the kernel is
        # expected to raise a NotImplementedError subclass): treat as no input
        # instead of reporting it as a prediction failure.
        return ""


def main():
    """Load the fine-tuned model, classify sample sentences, then one user sentence.

    Reads the module-level names ``model_load_path``, ``device``, ``tokenizer``
    and ``model_checkpoint`` defined by earlier cells. Any exception raised
    while loading or predicting is caught and reported together with a short
    troubleshooting checklist, so the function itself does not raise. The
    end-of-script marker is printed on both the success and the error path.
    """
    try:
        # Load the fine-tuned MODEL explicitly from the best checkpoint path
        print(f"Loading fine-tuned model from: {model_load_path}")
        loaded_fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(model_load_path)
        loaded_fine_tuned_model.to(device)  # Ensure model is on the correct device
        print("Fine-tuned model loaded successfully.")

        # Reuse the tokenizer object created in the preprocessing step
        print(f"Reusing the original tokenizer from: '{model_checkpoint}'")
        prediction_tokenizer = tokenizer

        # Create the pipeline
        classifier_pipeline = pipeline(
            "text-classification",
            model=loaded_fine_tuned_model,
            tokenizer=prediction_tokenizer,
            device=0 if torch.cuda.is_available() else -1  # Use GPU if available
        )
        print("Prediction pipeline created successfully.")

        # Display names for the raw pipeline labels; unknown labels are shown as-is.
        # NOTE(review): the model was fine-tuned on the "sst2" dataset, so these
        # names presumably relabel a sentiment prediction — confirm this is the
        # intended framing for the demo.
        label_map = {
            "LABEL_0": "Potentially Misleading/Unreliable",
            "LABEL_1": "Potentially Credible/Reliable"
        }

        # Example sentences
        test_sentences = [
            "This scientific study shows clear evidence for the new treatment.",
            "Everyone knows that eating pizza cures the common cold!",
            "The report cites multiple anonymous sources with conflicting stories.",
            "The politician made vague promises during the speech.",
            "An amazing film, truly inspiring and well-acted.",
            "A terrible waste of time, completely boring and predictable."
        ]

        print("\n--- Making Predictions on New Sentences ---")
        for sentence in test_sentences:
            result = classifier_pipeline(sentence)[0]
            print(f"Sentence: \"{sentence}\"")
            print(_format_prediction(result, label_map))
            print("-" * 20)

        # User input for live testing (blank or whitespace-only input is skipped)
        print("\n=== Try your own sentence! ===")
        my_sentence = _read_user_sentence("Enter a sentence to classify: ")
        if my_sentence:
            my_result = classifier_pipeline(my_sentence)[0]
            print(_format_prediction(my_result, label_map))
        else:
            print("No input provided.")

    except Exception as e:
        # Deliberate best-effort: report the problem and a checklist rather
        # than surfacing a traceback in the demo cell.
        print("\n--- An error occurred during prediction setup or execution ---")
        print(f"Model load path attempted: {model_load_path}")
        print(f"Error: {e}")
        print("\nPlease check the following:")
        print("1. Training completed without errors.")
        print(f"2. The path '{model_load_path}' exists and contains model files (like config.json, model weights).")
        print(f"3. The original tokenizer '{model_checkpoint}' is valid.")
    finally:
        # Previously only printed on the error path; it marks the end of every run.
        print("\n--- End of Script ---")


if __name__ == "__main__":
    main()


--- Setting up Prediction Pipeline ---
Loading fine-tuned model from: misleading-content-classifier/checkpoint-63
Fine-tuned model loaded successfully.
Reusing the original tokenizer from: 'distilbert-base-uncased'


Device set to use cuda:0


Prediction pipeline created successfully.

--- Making Predictions on New Sentences ---
Sentence: "This scientific study shows clear evidence for the new treatment."
Prediction: Potentially Credible/Reliable (Score: 0.7573)
--------------------
Sentence: "Everyone knows that eating pizza cures the common cold!"
Prediction: Potentially Misleading/Unreliable (Score: 0.6811)
--------------------
Sentence: "The report cites multiple anonymous sources with conflicting stories."
Prediction: Potentially Misleading/Unreliable (Score: 0.7144)
--------------------
Sentence: "The politician made vague promises during the speech."
Prediction: Potentially Misleading/Unreliable (Score: 0.7779)
--------------------
Sentence: "An amazing film, truly inspiring and well-acted."
Prediction: Potentially Credible/Reliable (Score: 0.9028)
--------------------
Sentence: "A terrible waste of time, completely boring and predictable."
Prediction: Potentially Misleading/Unreliable (Score: 0.8255)
----------------