In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


"""

Coding Task:

    Fine-tune BERT on a small dataset (e.g., IMDb movie reviews for sentiment classification).
    Use transformers and Trainer from Hugging Face.

Hint: Use the datasets library to load IMDb:
"""
import torch
import transformers
from datasets import load_dataset, DatasetDict
from transformers import DistilBertTokenizer, DataCollatorWithPadding, DistilBertForSequenceClassification, TrainingArguments, \
    Trainer
import wandb

MODEL_NAME = "distilbert-base-uncased" # "bert-base-uncased"

if __name__ == '__main__':
    isGpuAvailable = torch.cuda.is_available()
    print(f"{isGpuAvailable = }")  # Should print: True
    print(f"{torch.cuda.device_count() = }")  # Should print: 1 (if you have one GPU)
    if isGpuAvailable:
        print(f"{torch.cuda.get_device_name(0) = }")  # on my laptop, it should print: NVIDIA GeForce RTX 2060 Max-Q. on kaggle, it prints tesla t4
    else:
        print("No gpu found!")

    transformers.logging.set_verbosity_debug()  # Set to 'INFO' for fewer logs
    wandb.init(mode="offline")  # Logs only locally

    imdb_dataset = load_dataset("imdb")

    imdb_dataset = DatasetDict({        # keep only what I need
        "train": imdb_dataset["train"],
        "test": imdb_dataset["test"]
    })



    print(f"{imdb_dataset = }")
    # 2 columns, text, and label

    # load tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

    # we need a tokenizer function. for now follow tutorial. java styled code organization later
    def tokenize_function(example):
        try:
            return tokenizer(example["text"], padding="max_length", truncation=True)
        except Exception as e:
            print(f"Error in tokenization: {e}")
            return None  # Or handle differently

    tokenized_dataset = imdb_dataset.map(tokenize_function, batched=True)

    print(f"{tokenized_dataset = }")
    """
    tokenized_dataset = DatasetDict({
        train: Dataset({
            features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
            num_rows: 25000
        })
        test: Dataset({
            features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
            num_rows: 25000
        })
        unsupervised: Dataset({
            features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
            num_rows: 50000
        })
    })    
    """

    # convert data to pytorch format
    print("creating tokenized_dataset")
    tokenized_dataset = tokenized_dataset.remove_columns(["text"]) # we don't need text column
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels") # huggingface library wants y = labels
    tokenized_dataset.set_format("torch") # convert matrices into pytorch

    print("creating DataCollatorWithPadding")
    # Data collator for padding batches dynamically
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # Question: What is data collector? / what does it do?

    # load the model
    print("creating Model")
    model = (DistilBertForSequenceClassification
             .from_pretrained(MODEL_NAME, num_labels = 2))

    if isGpuAvailable:
        print("before setting model to cuda")
        model = model.to("cuda")  # <-- Ensure Trainer runs on GPU
        print("after setting model to cuda")

    # define training arguments, and trainer
    BATCH_SIZE = 32
    print("creating train_args")
    train_args = TrainingArguments(
        run_name="bert_imdb_experiment",  # Set a custom run name to repair wandDb warning
        output_dir="./bert-imdb",
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        # device="cuda" # looks like different version had this param
    )

    # question: what about optimizer, loss function?

    # initialize the trainer
    print("creating trainer")
    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        # tokenizer=tokenizer, # tokenizer is deprecated. use data_collector, or process_class instead :/
        data_collator=data_collator,
        # device="cuda" # looks like different version had this param
    )

    # the training!
    print("trainer.train()!")
    trainer.train() # this will fine tune the dataset for 3 epochs!
    print("trainer.evaluate()!")
    trainer.evaluate() # evaluate

    def predict_sentiment(text):
        tokenized_text = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():    # forgot the braces. 
            outputs = model(**tokenized_text)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=1).item()
        return "Positive" if prediction == 1 else "Negative"

    print("predict_statement")
    print(predict_sentiment("I really loved this movie! It was fantastic."))
    print(predict_sentiment("This was the worst movie I have ever seen."))

    pass


isGpuAvailable = True
torch.cuda.device_count() = 2
torch.cuda.get_device_name(0) = 'Tesla T4'


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

imdb_dataset = DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/tokenizer_config.json
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/tokenizer.json
loading file chat_template.jinja from cache at None


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.47.0",
  "vocab_size": 30522
}



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

tokenized_dataset = DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
})
creating tokenized_dataset
creating DataCollatorWithPadding
creating Model


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.47.0",
  "vocab_size": 30522
}



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/model.safetensors
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification 

before setting model to cuda


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


after setting model to cuda
creating train_args
creating trainer
trainer.train()!


Currently training with a batch size of: 64
***** Running training *****
  Num examples = 25,000
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Training with DataParallel so batch size has been adjusted to: 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1,173
  Number of trainable parameters = 66,955,010
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss
1,No log,0.203102
2,0.232600,0.184285
3,0.098400,0.235509



***** Running Evaluation *****
  Num examples = 25000
  Batch size = 64
Saving model checkpoint to ./bert-imdb/checkpoint-391
Configuration saved in ./bert-imdb/checkpoint-391/config.json
Model weights saved in ./bert-imdb/checkpoint-391/model.safetensors

***** Running Evaluation *****
  Num examples = 25000
  Batch size = 64
Saving model checkpoint to ./bert-imdb/checkpoint-782
Configuration saved in ./bert-imdb/checkpoint-782/config.json
Model weights saved in ./bert-imdb/checkpoint-782/model.safetensors
Saving model checkpoint to ./bert-imdb/checkpoint-1173
Configuration saved in ./bert-imdb/checkpoint-1173/config.json
Model weights saved in ./bert-imdb/checkpoint-1173/model.safetensors

***** Running Evaluation *****
  Num examples = 25000
  Batch size = 64
Saving model checkpoint to ./bert-imdb/checkpoint-1173
Configuration saved in ./bert-imdb/checkpoint-1173/config.json
Model weights saved in ./bert-imdb/checkpoint-1173/model.safetensors


Training completed. Do not forget to 

trainer.evaluate()!




predict_statement


AttributeError: __enter__