In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
)
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    EarlyStoppingCallback,
    EvalPrediction,
    Trainer,
    TrainingArguments,
)

In [5]:
# Load the IMDB dataset using Hugging Face datasets
dataset = load_dataset("imdb")

# Split dataset into train, validation, and test sets.
# Shuffle the training split exactly once and cut both subsets from that single
# permutation, so train and validation are disjoint by construction instead of
# relying on two separate shuffles happening to produce the same order.
shuffled_train = dataset["train"].shuffle(seed=42)
train_dataset = shuffled_train.select(range(20000))
valid_dataset = shuffled_train.select(range(20000, 25000))
test_dataset = dataset["test"]

print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(valid_dataset)}")
print(f"Testing examples: {len(test_dataset)}")

Training examples: 20000
Validation examples: 5000
Testing examples: 25000


In [6]:
# Initialize tokenizer: load the tokenizer files published for the
# "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
)

# Tokenize function
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw text(s) to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts, with every sequence
        padded to and truncated at 128 tokens.
    """
    # Fixed-length encoding: pad short inputs and cut long ones at 128 tokens
    encode_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(examples["text"], **encode_options)

# Tokenize datasets: apply the same batched tokenization to each split, in
# train / validation / test order
tokenized_train, tokenized_valid, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [7]:
# Load pre-trained model with a two-class classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Freeze BERT layers to prevent parameter updates (will save us some time);
# only parameters outside `model.bert` keep requires_grad=True
for encoder_param in model.bert.parameters():
    encoder_param.requires_grad = False

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define metrics computation function
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted label). Presumably a
            ``transformers.EvalPrediction`` as passed by ``Trainer`` -- confirm.

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'``, ``'recall'``;
        precision/recall/F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_hat, average='binary'
    )
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': f_score,
        'precision': prec,
        'recall': rec,
    }

# Set up training arguments
training_args = TrainingArguments(
    # Where checkpoints and logs go, and how often to log
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation schedule
    num_train_epochs=5,
    learning_rate=0.0005,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with
    # the best 'accuracy' (the key returned by compute_metrics)
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    use_cpu=True,  # important for Mac users with M1+ chips
)

In [None]:
# Initialize Trainer
# No optimizer is passed in, so Trainer builds its default one (AdamW)
stop_early = EarlyStoppingCallback(early_stopping_patience=3)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[stop_early],
)

# Train the model
print('Fine-tuning BERT model...')
trainer.train()

In [None]:
# Evaluate the model on the held-out test split (tokenized_test)
eval_results = trainer.evaluate(eval_dataset=tokenized_test)
print(f"Test results: {eval_results}")

In [None]:
# Generate predictions for the test set
def get_predictions(trainer, dataset):
    """Run ``trainer.predict`` on ``dataset`` and return hard labels.

    Args:
        trainer: Object with a ``predict(dataset)`` method whose result exposes
            ``predictions`` (per-class scores) and ``label_ids``.
        dataset: Dataset forwarded unchanged to ``trainer.predict``.

    Returns:
        Tuple ``(predictions, labels)``: the arg-max of the scores along
        axis 1, and the ``label_ids`` reported by the trainer.
    """
    output = trainer.predict(dataset)
    # Collapse per-class scores to the index of the highest-scoring class
    return np.argmax(output.predictions, axis=1), output.label_ids


y_pred, y_true = get_predictions(trainer, tokenized_test)

# Label order matches the class ids used throughout the notebook (0 -> Negative, 1 -> Positive)
class_names = ['Negative', 'Positive']

# Create and visualize the confusion matrix
cm = confusion_matrix(y_true, y_pred)
# Draw on an explicitly created axes: ConfusionMatrixDisplay.plot() opens its own
# figure when no `ax` is given, which would leave a blank 10x8 figure behind and
# render the matrix at the default size.
fig, ax = plt.subplots(figsize=(10, 8))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(ax=ax, cmap='Blues', values_format='d')
ax.set_title('Confusion Matrix for IMDB Sentiment Classification (Hugging Face)')
ax.grid(False)
plt.show()


# Create classification report: `report` keeps the dict form for programmatic
# access, the printed version is the human-readable table
report = classification_report(y_true, y_pred, target_names=class_names, output_dict=True)
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=class_names))

In [None]:
# Function to make predictions
def predict_sentiment_hf(text):
    """Predict the sentiment of one review or a list of reviews.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: A single string, or an iterable of strings.

    Returns:
        For a single string: a dict with keys ``'text'`` (the input, cut to
        100 characters plus ``'...'`` when longer), ``'predicted_sentiment'``
        (``'Positive'`` for class id 1, otherwise ``'Negative'``) and
        ``'confidence'`` (softmax probability of the predicted class).
        For an iterable of strings: a list with one such dict per input, in
        input order (empty list for empty input).
    """
    # Handle both single texts and lists
    is_single = isinstance(text, str)
    examples = [text] if is_single else list(text)
    if not examples:
        return []

    # Tokenize inputs with the same length limit used for training
    inputs = tokenizer(
        examples,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    # Keep the inputs on the same device as the model so this also works when
    # the model is not on the CPU
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Get predictions; eval() disables dropout regardless of which cells ran before
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
        # Convert to numpy for easier handling
        probabilities = torch.softmax(logits, dim=1).cpu().numpy()

    results = []
    for example, probs in zip(examples, probabilities):
        # Get the predicted category and confidence for this example
        predicted_class_id = int(np.argmax(probs))
        results.append({
            'text': (example[:100] + '...') if len(example) > 100 else example,
            'predicted_sentiment': 'Positive' if predicted_class_id == 1 else 'Negative',
            'confidence': float(probs[predicted_class_id]),
        })

    return results[0] if is_single else results

# Example review for a quick sanity check; defined here because no other cell
# in the notebook defines `sample_text`
sample_text = "This movie was a wonderful surprise: great acting, a gripping story, and a perfect ending."
predict_sentiment_hf(sample_text)

In [None]:
# Save the model and tokenizer into the same directory (model first, then tokenizer)
model_save_path = "./bert_sentiment_model_hf"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)
print(f"Model and tokenizer saved to {model_save_path}")

In [None]:
# Reload the model and tokenizer from model_save_path, rebinding the
# notebook-level `model` and `tokenizer` names
model = BertForSequenceClassification.from_pretrained(model_save_path)
tokenizer = BertTokenizer.from_pretrained(model_save_path)