In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, entries in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, entry) for entry in entries]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import datasets
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import torch.cuda
import transformers
import wandb
from datasets import DatasetDict
import torch.nn.functional as F
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DataCollatorWithPadding, \
    TrainingArguments, Trainer
import evaluate

def getCorrectDevice():
    """Pick the torch device to run on.

    Returns:
        torch.device: ``cuda`` when ``torch.cuda.is_available()`` is true,
        otherwise ``mps`` when ``torch.backends.mps.is_available()`` is true,
        otherwise ``cpu``.
    """
    # Backends are probed in priority order; the first available one wins.
    if torch.cuda.is_available():
        device_kind = "cuda"  # NVIDIA GPUs
    elif torch.backends.mps.is_available():
        device_kind = "mps"  # Apple Silicon Macs
    else:
        device_kind = "cpu"  # nothing else available
    return torch.device(device_kind)

def dynamicBatchSize():
    """Choose a per-device batch size from the detected GPU model and its VRAM.

    Known GPU families are matched by substring on the lower-cased name of
    CUDA device 0; anything else falls back to VRAM-size tiers. Without CUDA
    a small batch size is returned.

    Returns:
        int: the batch size to use (4 when CUDA is unavailable).
    """
    if not torch.cuda.is_available():
        return 4  # CPU mode, keep it small

    gpu_name = torch.cuda.get_device_name(0).lower()
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    # The thresholds below are nominal card sizes (6/8/16/24/32). The capacity
    # reported by the driver is usually a little below the marketed size
    # (e.g. a 32 GB V100 reports about 31.75 GiB), so round to the nearest
    # whole GiB before comparing; otherwise the ">= 32" style checks never fire.
    vramGiB = round(total_bytes / (1024 ** 3))

    if "a100" in gpu_name:   # A100 (40GB+ VRAM)
        batch_size = 128
    elif "v100" in gpu_name:  # V100 (16GB/32GB VRAM)
        batch_size = 64 if vramGiB >= 32 else 32
    elif "p100" in gpu_name:  # P100 (16GB VRAM)
        batch_size = 32
    elif "t4" in gpu_name:    # Tesla T4 (16GB VRAM, common in Colab/Kaggle)
        batch_size = 32  # Maybe try 64 if no OOM
    elif "rtx 3090" in gpu_name or vramGiB >= 24:  # RTX 3090 / any 24GB+ GPU
        batch_size = 64
    elif vramGiB >= 16:   # Any other 16GB+ VRAM GPUs
        batch_size = 32
    elif vramGiB >= 8:    # 8GB VRAM GPUs (e.g., RTX 2080, 3060, etc.)
        batch_size = 16
    elif vramGiB >= 6:    # 6GB VRAM GPUs (e.g., RTX 2060)
        batch_size = 8
    else:
        batch_size = 4  # Safe fallback for smaller GPUs

    return batch_size

def getGpuName():
    """Return the lower-cased name of CUDA device 0.

    Returns:
        str: the device name in lower case, or an empty string when CUDA is
        not available. The empty string keeps substring checks such as
        ``"rtx 2060" in getGpuName()`` working (they evaluate to False)
        instead of raising on CPU-only or MPS machines.
    """
    if not torch.cuda.is_available():
        # torch.cuda.get_device_name() raises without a CUDA device.
        return ""
    return torch.cuda.get_device_name(0).lower()

if __name__ == '__main__':
    # Report which accelerator this session is running on.
    print(getGpuName())

    # ---- run configuration ----
    MODEL_NAME = "distilbert-base-uncased"
    DATASET_NAME = "imdb"
    # Batch size is derived from the detected hardware (Kaggle GPU vs. laptop).
    BATCH_SIZE = dynamicBatchSize()

    # Debug-level transformers logging while developing; wandb runs offline,
    # so its logs stay on the local disk.
    transformers.logging.set_verbosity_debug()
    wandb.init(mode="offline")

    # Fetch the IMDB dataset.
    imdb_datasets_dict = datasets.load_dataset(DATASET_NAME)

    # The laptop (RTX 2060) is only used for quick smoke tests of the code,
    # so that debugging does not have to happen on Kaggle.
    isMyLaptop = "nvidia geforce rtx 2060" in getGpuName()

    # Keep only the train and test splits; on the laptop each one is cut down
    # to its first 25 rows.
    train_split = imdb_datasets_dict["train"]
    test_split = imdb_datasets_dict["test"]
    if isMyLaptop:
        train_split = train_split.select(range(25))
        test_split = test_split.select(range(25))
    imdb_datasets_dict = DatasetDict({"train": train_split, "test": test_split})

    # Remember whether a CUDA GPU is present (used when placing the model).
    isGpuAvailable = torch.cuda.is_available()

    # Tokenizer matching the pretrained checkpoint.
    distilBertTokenizer = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)


    # preprocess / map data to tokenized_data
    def tokenization_function(entry):
        """Tokenize the ``text`` field of a dataset batch with the DistilBERT tokenizer.

        Args:
            entry: the mapping handed over by ``DatasetDict.map`` (this script
                calls it with ``batched=True``); only ``entry["text"]`` is read.

        Returns:
            The tokenizer output for ``entry["text"]``, truncated to the
            tokenizer's maximum length. Sequences are deliberately NOT padded
            here: the ``DataCollatorWithPadding`` given to the Trainer pads
            each batch dynamically, which padding to ``max_length`` up front
            would defeat.

        Raises:
            Exception: any tokenizer failure is logged and re-raised, so that
                ``map`` stops instead of silently continuing without token
                columns.
        """
        try:
            return distilBertTokenizer(text=entry["text"], truncation=True)
        except Exception as x:
            print(f"Tokenization function error: {x = }")
            raise


    # Tokenize every split, feeding the function whole batches at a time.
    tokenized_dataset_dict = imdb_datasets_dict.map(function=tokenization_function, batched=True)

    print("creating tokenized_dataset")
    # The raw text is no longer needed, and the target column has to be called
    # "labels" for the Hugging Face model; afterwards expose torch tensors.
    tokenized_dataset_dict = (
        tokenized_dataset_dict
        .remove_columns(["text"])
        .rename_column("label", "labels")
    )
    tokenized_dataset_dict.set_format("torch")

    print("creating DataCollatorWithPadding")
    # Collator that pads the samples of each batch dynamically.
    data_collator = DataCollatorWithPadding(tokenizer=distilBertTokenizer)

    # Pretrained DistilBERT with a 2-class classification head.
    bert_model = DistilBertForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=MODEL_NAME,
        num_labels=2,
    )
    if isGpuAvailable:
        bert_model = bert_model.to("cuda")

    print("init training_args")
    training_args = TrainingArguments(
        # bookkeeping
        run_name="exp-bert-2",
        output_dir="./bert-imdb",
        logging_dir="./logs",
        # evaluate and checkpoint once per epoch
        eval_strategy="epoch",
        save_strategy="epoch",
        # optimisation settings
        num_train_epochs=3,
        weight_decay=0.01,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
    )

    # Metric objects used by compute_metrics.
    accuracy_metric, f1_metric, roc_auc_metric = (
        evaluate.load(metric_name) for metric_name in ("accuracy", "f1", "roc_auc")
    )


    def compute_metrics(eval_pred):
        """Turn the Trainer's evaluation output into accuracy, F1 and ROC-AUC.

        Args:
            eval_pred: a pair that unpacks into ``(logits, labels)``; ``logits``
                is indexed as a 2-D array (one row per sample) and ``labels``
                is passed to the metrics as the references.

        Returns:
            dict: ``{"accuracy": ..., "f1": ..., "roc_auc": ...}`` where F1 is
            computed with ``average="weighted"``.
        """
        logits, labels = eval_pred

        # ROC-AUC needs a single score per sample, so take column 1 of the
        # logits (logits[0][1], logits[1][1], ..., logits[n][1]); passing the
        # full 2-D array against the 1-D labels is a dimension mismatch.
        positive_logits = logits[:, 1]
        # Predicted class = index of the largest logit in each row.
        predictions = np.argmax(logits, axis=1)

        print("----- debug start ----")
        print(f"{logits = }")  # 2-D array
        print(f"{labels = }")  # 1-D array
        print(f"{predictions = }")  # 1-D array
        print(f"{positive_logits = }")
        print("----- debug end ----")

        return {
            "accuracy": accuracy_metric.compute(
                predictions=predictions, references=labels)["accuracy"],
            "f1": f1_metric.compute(
                predictions=predictions, references=labels, average="weighted")["f1"],
            "roc_auc": roc_auc_metric.compute(
                prediction_scores=positive_logits, references=labels)["roc_auc"],
        }


    print("create trainer")
    # Wire model, arguments, data, collator and metric callback together.
    trainer = Trainer(
        model=bert_model,
        args=training_args,
        train_dataset=tokenized_dataset_dict["train"],  # data used for fine-tuning
        eval_dataset=tokenized_dataset_dict["test"],  # data used for per-epoch evaluation
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Fine-tune for the number of epochs set in training_args.
    print("trainer.train()!")
    trainer.train()

    # Final evaluation on eval_dataset. Keep and print the returned metrics;
    # inside this `if` block a bare call would discard them and nothing
    # would be displayed.
    print("trainer.evaluate()!")
    final_eval_metrics = trainer.evaluate()
    print(final_eval_metrics)


    # test_results = trainer.evaluate(test_dataset) # <-- this is the actual test

    def predict_sentiment(text):
        """Classify one review with the fine-tuned model and describe the result.

        Args:
            text: the review to classify. A single string is expected, because
                the predicted class is extracted with ``.item()``, which needs
                exactly one prediction.

        Returns:
            str: ``"Prediction: Positive|Negative, Probabilities: [[p0, p1]]"``
            where class 1 is reported as Positive.
        """
        # Put the inputs on the device the model weights actually live on,
        # rather than guessing it from the available hardware.
        device = bert_model.device
        # Make sure dropout is disabled, independent of whether an evaluation
        # run happened to leave the model in eval mode beforehand.
        bert_model.eval()

        tokenized_text = distilBertTokenizer(text, return_tensors="pt", padding=True, truncation=True)
        tokenized_text = {key: value.to(device) for key, value in tokenized_text.items()}

        # Inference only, so no gradients are needed.
        with torch.no_grad():
            outputs = bert_model(**tokenized_text)

        probabilities = F.softmax(outputs.logits, dim=1)  # logits -> class probabilities
        predicted_class = torch.argmax(probabilities, dim=1).item()  # most probable class

        return f"Prediction: {'Positive' if predicted_class == 1 else 'Negative'}, Probabilities: {probabilities.tolist()}"


    print("predict_statement")
    # Try the fine-tuned model on one clearly positive and one clearly negative review.
    for sample_review in (
        "I really loved this movie! It was fantastic.",
        "This was the worst movie I have ever seen.",
    ):
        print(predict_sentiment(sample_review))

    # new topic: explain the bert model, ie why it works / does not work



tesla t4


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/tokenizer_config.json
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/tokenizer.json
loading file chat_template.jinja from cache at None
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

creating tokenized_dataset
creating DataCollatorWithPadding


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.47.0",
  "vocab_size": 30522
}



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/model.safetensors
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification 

init training_args


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

create trainer
trainer.train()!


Currently training with a batch size of: 64
***** Running training *****
  Num examples = 25,000
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Training with DataParallel so batch size has been adjusted to: 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1,173
  Number of trainable parameters = 66,955,010
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc
1,No log,0.195391,0.92432,0.924291,0.977723
2,0.232200,0.191575,0.93144,0.931427,0.981022
3,0.096800,0.230199,0.93468,0.93468,0.981445



***** Running Evaluation *****
  Num examples = 25000
  Batch size = 64


----- debug start ----
logits = array([[ 2.7276287 , -2.574289  ],
       [ 1.5091671 , -1.5340017 ],
       [ 1.7302245 , -1.5589788 ],
       ...,
       [-0.19799697,  0.08877969],
       [-0.13899536, -0.06270194],
       [-1.2416326 ,  1.104221  ]], dtype=float32)
labels = array([0, 0, 0, ..., 1, 1, 1])
predictions = array([0, 0, 0, ..., 1, 1, 1])
positive_logits = array([-2.574289  , -1.5340017 , -1.5589788 , ...,  0.08877969,
       -0.06270194,  1.104221  ], dtype=float32)
----- debug end ----


Saving model checkpoint to ./bert-imdb/checkpoint-391
Configuration saved in ./bert-imdb/checkpoint-391/config.json
Model weights saved in ./bert-imdb/checkpoint-391/model.safetensors

***** Running Evaluation *****
  Num examples = 25000
  Batch size = 64


----- debug start ----
logits = array([[ 3.2685633 , -3.0231888 ],
       [ 2.1437113 , -2.0330033 ],
       [ 2.6006508 , -2.2614734 ],
       ...,
       [-0.6448241 ,  0.5076494 ],
       [-0.78823406,  0.5428132 ],
       [-1.8470358 ,  1.789934  ]], dtype=float32)
labels = array([0, 0, 0, ..., 1, 1, 1])
predictions = array([0, 0, 0, ..., 1, 1, 1])
positive_logits = array([-3.0231888, -2.0330033, -2.2614734, ...,  0.5076494,  0.5428132,
        1.789934 ], dtype=float32)
----- debug end ----


Saving model checkpoint to ./bert-imdb/checkpoint-782
Configuration saved in ./bert-imdb/checkpoint-782/config.json
Model weights saved in ./bert-imdb/checkpoint-782/model.safetensors
Saving model checkpoint to ./bert-imdb/checkpoint-1173
Configuration saved in ./bert-imdb/checkpoint-1173/config.json
Model weights saved in ./bert-imdb/checkpoint-1173/model.safetensors

***** Running Evaluation *****
  Num examples = 25000
  Batch size = 64


----- debug start ----
logits = array([[ 3.535918  , -3.2899764 ],
       [ 3.120323  , -2.93264   ],
       [ 3.1935651 , -2.8434386 ],
       ...,
       [-0.19418688,  0.19878528],
       [-1.7908183 ,  1.6664159 ],
       [-2.6256824 ,  2.661802  ]], dtype=float32)
labels = array([0, 0, 0, ..., 1, 1, 1])
predictions = array([0, 0, 0, ..., 1, 1, 1])
positive_logits = array([-3.2899764 , -2.93264   , -2.8434386 , ...,  0.19878528,
        1.6664159 ,  2.661802  ], dtype=float32)
----- debug end ----


Saving model checkpoint to ./bert-imdb/checkpoint-1173
Configuration saved in ./bert-imdb/checkpoint-1173/config.json
Model weights saved in ./bert-imdb/checkpoint-1173/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)



***** Running Evaluation *****
  Num examples = 25000
  Batch size = 64


trainer.evaluate()!
