In [4]:
# Core libs for dataset, model, tokenizer, trainer
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import accelerate
import torch
import os





In [5]:
# Disable Weights & Biases logging for now to reduce overhead
os.environ["WANDB_DISABLED"] = "true"

In [6]:
# Load the "emotion" dataset
ds = load_dataset("emotion")

# Use a smaller subset for quick speed tests
# e.g. 2000 train samples, 500 validation samples
train_small = ds["train"].select(range(2000))
eval_small = ds["validation"].select(range(500))

len(train_small), len(eval_small)


(2000, 500)

In [7]:
# Use DistilBERT: lighter and faster than bert-base
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Tokenization: text -> input_ids + attention_mask
def tok(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=64
    )

train_small_tok = train_small.map(tok, batched=True)
eval_small_tok = eval_small.map(tok, batched=True)


In [9]:
# Remove raw text, rename label -> labels, set torch format
train_small_tok = train_small_tok.remove_columns(["text"])
eval_small_tok = eval_small_tok.remove_columns(["text"])

train_small_tok = train_small_tok.rename_column("label", "labels")
eval_small_tok = eval_small_tok.rename_column("label", "labels")

train_small_tok.set_format(type="torch")
eval_small_tok.set_format(type="torch")

train_small_tok[0]


{'labels': tensor(0),
 'input_ids': tensor([  101,  1045,  2134,  2102,  2514, 26608,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

# Force Use CPU for Training:
``no_cuda=True``

In [40]:
# Small, quick run to test speed
args = TrainingArguments(
    output_dir="text_out_debug",          # where to save
    per_device_train_batch_size=16,       # same as before, but on small dataset
    num_train_epochs=1,                   # 1 epoch for now
    logging_steps=20,                     # log often to see step timing
    no_cuda=True                          # force CPU; keep it explicit for now
)

print("Using CUDA:", torch.cuda.is_available())


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Using CUDA: False


In [41]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_small_tok,
    eval_dataset=eval_small_tok
)

# Optional: Jupyter magic to time the cell
%time trainer.train()


Step,Training Loss
20,1.6256
40,1.5541
60,1.3574
80,1.215
100,1.0796
120,1.0888


CPU times: total: 17min 48s
Wall time: 5min


TrainOutput(global_step=125, training_loss=1.3074206581115724, metrics={'train_runtime': 299.9994, 'train_samples_per_second': 6.667, 'train_steps_per_second': 0.417, 'total_flos': 33119212032000.0, 'train_loss': 1.3074206581115724, 'epoch': 1.0})

# Use GPU if available:
``no_cuda`` removed from params

In [10]:
args = TrainingArguments(
    output_dir="text_out_gpu",
    per_device_train_batch_size=4,   # smaller because MX450 only 2GB
    num_train_epochs=1,
    logging_steps=20,
    report_to=[],
    # GPU allowed automatically if available
)


In [11]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_small_tok,
    eval_dataset=eval_small_tok
)

# Optional: Jupyter magic to time the cell
%time trainer.train()




Step,Training Loss
20,1.7245
40,1.6203
60,1.5397
80,1.5539
100,1.3741
120,1.223
140,1.2421
160,1.1279
180,1.1467
200,1.1783


CPU times: total: 24min 6s
Wall time: 7min 4s


TrainOutput(global_step=500, training_loss=0.9800766468048095, metrics={'train_runtime': 423.887, 'train_samples_per_second': 4.718, 'train_steps_per_second': 1.18, 'total_flos': 33119212032000.0, 'train_loss': 0.9800766468048095, 'epoch': 1.0})

# GPU took longer so let's verify if the GPU was used.

In [1]:
torch.cuda.is_available(), torch.version.cuda, torch.backends.cudnn.enabled

  import pynvml  # type: ignore[import]


(False, None, True)

In [12]:
pip install pynvml

Note: you may need to restart the kernel to use updated packages.


In [14]:
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)

mode = pynvml.nvmlDeviceGetComputeMode(handle)
print("Compute mode:", mode)

modes = {
    0: "WDDM (Default)",
    1: "Exclusive",
    2: "Prohibited",
    3: "Exclusive Process"
}

print("Mode:", modes.get(mode, "Unknown"))

Compute mode: 0
Mode: WDDM (Default)


The Current System is using WDDM which doesn't allow CUDA operations which is required. This restriction is speciifc to Windows, this same graphic crad can be used in Linux system.

# So, For Windows we can use WSL for training, via creating a virtual env there.