In [1]:
!pip -q install "transformers>=4.44.0" "datasets>=2.20.0" "accelerate>=0.33.0" "evaluate>=0.4.2" scikit-learn

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m150.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m150.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m934.6 kB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
import torch, random, os, numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset, DatasetDict


print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU device:", torch.cuda.get_device_name(0))
else:
    print(" Enable GPU: Runtime → Change runtime type → Hardware accelerator → GPU")

# Reproducibility: same shuffles/initializations across runs
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Core libraries
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)



PyTorch version: 2.8.0+cu126
CUDA available: True
GPU device: Tesla T4


In [4]:
DATASET_NAME = "imdb"
TWEET_EVAL_SUBSET = "sentiment"  # used only if DATASET_NAME == "tweet_eval"

MODEL_CHECKPOINT = "bert-base-uncased"  # lowercase, 12-layer BERT

# Sequence length tradeoff: longer → more context, slower; shorter → faster, less context
MAX_LENGTH = 256

# Optional: use smaller subsets to make a quick demo (set to None for full data)
TRAIN_SUBSET = None  # e.g., 5000
EVAL_SUBSET  = None  # e.g., 2000



In [12]:
from datasets import load_dataset
if DATASET_NAME == "tweet_eval":
    raw = load_dataset("tweet_eval", TWEET_EVAL_SUBSET)
    dataset = raw  # already has train/validation/test
elif DATASET_NAME in ["imdb", "yelp_polarity"]:
    raw = load_dataset(DATASET_NAME)
    # Create validation split from train (90% train, 10% val)
    raw_split = raw["train"].train_test_split(test_size=0.1, seed=SEED)
    dataset = DatasetDict(
        train=raw_split["train"],
        validation=raw_split["test"],  # our "val"
        test=raw["test"]
    )
else:
    raise ValueError("Unsupported dataset. Use 'imdb', 'yelp_polarity', or 'tweet_eval'.")

for split in dataset:
    print(f"{split}: {len(dataset[split])} samples")

# Optional downsampling (useful for quick runs)
def maybe_select(ds, n):
    if n is None or n >= len(ds):
        return ds
    return ds.shuffle(seed=SEED).select(range(n))

dataset = DatasetDict({
    "train": maybe_select(dataset["train"], TRAIN_SUBSET),
    "validation": maybe_select(dataset["validation"], EVAL_SUBSET),
    "test": maybe_select(dataset["test"], EVAL_SUBSET),
})

print("\nAfter optional subsample:")
for split in dataset:
    print(f"{split}: {len(dataset[split])} samples")


train: 22500 samples
validation: 2500 samples
test: 25000 samples

After optional subsample:
train: 22500 samples
validation: 2500 samples
test: 25000 samples


In [13]:
labels_feature = dataset["train"].features.get("label", None)
if labels_feature and hasattr(labels_feature, "names") and labels_feature.names:
    id2label = {i: name for i, name in enumerate(labels_feature.names)}
    label2id = {name: i for i, name in id2label.items()}
else:
    unique_labels = sorted(list(set(dataset["train"]["label"])))
    id2label = {i: str(i) for i in unique_labels}
    label2id = {str(i): i for i in unique_labels}

num_labels = len(id2label)
print("Number of labels:", num_labels)
print("id2label:", id2label)


Number of labels: 2
id2label: {0: 'neg', 1: 'pos'}


In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)

TEXT_COLUMN = "text"

def tokenize_fn(batch):
    return tokenizer(
        batch[TEXT_COLUMN],
        truncation=True,
        max_length=MAX_LENGTH,
        # Padding will be done dynamically per batch by the DataCollator (next cell)
    )

# Remove everything except the label—tokenizer adds model inputs (input_ids, attention_mask, etc.)
encoded = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=[c for c in dataset["train"].column_names if c != "label"]
)
print(encoded)


Map:   0%|          | 0/22500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 22500
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
})


In [15]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


In [16]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import transformers
print("Transformers version:", transformers.__version__)
print(transformers.TrainingArguments.__init__.__code__.co_varnames)

Transformers version: 4.57.1
('self', 'output_dir', 'overwrite_output_dir', 'do_train', 'do_eval', 'do_predict', 'eval_strategy', 'prediction_loss_only', 'per_device_train_batch_size', 'per_device_eval_batch_size', 'per_gpu_train_batch_size', 'per_gpu_eval_batch_size', 'gradient_accumulation_steps', 'eval_accumulation_steps', 'eval_delay', 'torch_empty_cache_steps', 'learning_rate', 'weight_decay', 'adam_beta1', 'adam_beta2', 'adam_epsilon', 'max_grad_norm', 'num_train_epochs', 'max_steps', 'lr_scheduler_type', 'lr_scheduler_kwargs', 'warmup_ratio', 'warmup_steps', 'log_level', 'log_level_replica', 'log_on_each_node', 'logging_dir', 'logging_strategy', 'logging_first_step', 'logging_steps', 'logging_nan_inf_filter', 'save_strategy', 'save_steps', 'save_total_limit', 'save_safetensors', 'save_on_each_node', 'save_only_model', 'restore_callback_states_from_checkpoint', 'no_cuda', 'use_cpu', 'use_mps_device', 'seed', 'data_seed', 'jit_mode_eval', 'bf16', 'fp16', 'fp16_opt_level', 'half_pr

In [19]:
OUTPUT_DIR = "results-bert-sentiment"  # folder for checkpoints & logs

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",       # evaluate every epoch
    save_strategy="epoch",             # save checkpoint every epoch
    logging_strategy="steps",
    logging_steps=100,                 # print logs every 100 steps
    learning_rate=2e-5,                # standard for BERT FT
    per_device_train_batch_size=16,    # reduce to 8 if OOM
    per_device_eval_batch_size=32,
    num_train_epochs=3,                # good starting point
    weight_decay=0.01,                 # L2 regularization
    warmup_ratio=0.1,                  # helps stabilize early training
    load_best_model_at_end=True,       # back to best validation F1
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),    # mixed precision on GPU (faster)
    save_total_limit=2,                # keep last 2 checkpoints
    report_to="none",                  # disable wandb by default
)


In [20]:
callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=callbacks,
)


  trainer = Trainer(


In [21]:
train_result = trainer.train()
print(" Training finished.")


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2493,0.233009,0.9112,0.911297,0.9112,0.91119
2,0.1702,0.310378,0.9112,0.912447,0.9112,0.911148
3,0.0862,0.369542,0.9212,0.921219,0.9212,0.921197


 Training finished.


In [22]:
val_metrics = trainer.evaluate(encoded["validation"])
test_metrics = trainer.evaluate(encoded["test"])
print(" Validation metrics:", val_metrics)
print(" Test metrics:", test_metrics)


 Validation metrics: {'eval_loss': 0.36954179406166077, 'eval_accuracy': 0.9212, 'eval_precision': 0.9212186608493935, 'eval_recall': 0.9212, 'eval_f1': 0.921197389923083, 'eval_runtime': 15.3598, 'eval_samples_per_second': 162.762, 'eval_steps_per_second': 5.143, 'epoch': 3.0}
 Test metrics: {'eval_loss': 0.34331610798835754, 'eval_accuracy': 0.92092, 'eval_precision': 0.9210332411005263, 'eval_recall': 0.92092, 'eval_f1': 0.920914682303238, 'eval_runtime': 121.5044, 'eval_samples_per_second': 205.754, 'eval_steps_per_second': 6.436, 'epoch': 3.0}


In [23]:
from transformers import pipeline

# 1) Raw model → manual tokenization
def predict_label(text: str):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    if torch.cuda.is_available():
        model.cuda()
        inputs = {k: v.cuda() for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_id = int(torch.argmax(logits, dim=-1).cpu().item())
    return id2label[pred_id]

print("Raw-model prediction:", predict_label("The movie was absolutely fantastic!"))

# 2) Pipeline → simpler for batches / demos
clf = pipeline("text-classification", model=model, tokenizer=tokenizer,
               device=0 if torch.cuda.is_available() else -1)

examples = [
    "I loved the acting and the story!",
    "This was a boring and terrible film.",
    "It was okay, not great but not awful either."
    "Hero was handsome and loved his body",
]
print("Pipeline predictions:", clf(examples))


Device set to use cuda:0


Raw-model prediction: pos
Pipeline predictions: [{'label': 'pos', 'score': 0.9980083107948303}, {'label': 'neg', 'score': 0.9986646175384521}, {'label': 'neg', 'score': 0.6622096300125122}]


In [24]:
SAVE_DIR = "sentiment-model-final"
os.makedirs(SAVE_DIR, exist_ok=True)
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print(f" Saved to: {SAVE_DIR}")


 Saved to: sentiment-model-final
