In [1]:
# %pip install transformers datasets

In [7]:
import numpy as np
import pandas as pd
import sklearn.metrics
import torch
import datasets
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline

# Ask PyTorch to release cached GPU memory it is no longer using; useful when
# this notebook is re-run inside an already-warm kernel.
torch.cuda.empty_cache()

In [3]:
# DATA
# Alternative corpus that was tried: datasets.load_dataset('emotion')
# Load the corpus and immediately cut every split down to the rows whose
# index is below 100, so a full train/eval round stays quick.
dataset = (
    datasets.load_dataset('tamilmixsentiment')
    .filter(lambda _example, row_idx: row_idx < 100, with_indices=True)
)

dataset

Found cached dataset tamilmixsentiment (/home/studio-lab-user/.cache/huggingface/datasets/tamilmixsentiment/default/0.0.0/887420eecaf868ac6c10990649e49d10467e4cd4dffb98a6f20e4fe7c58df390)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/tamilmixsentiment/default/0.0.0/887420eecaf868ac6c10990649e49d10467e4cd4dffb98a6f20e4fe7c58df390/cache-2064b39f38993a33.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/tamilmixsentiment/default/0.0.0/887420eecaf868ac6c10990649e49d10467e4cd4dffb98a6f20e4fe7c58df390/cache-bcf4a38db0654ddc.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/tamilmixsentiment/default/0.0.0/887420eecaf868ac6c10990649e49d10467e4cd4dffb98a6f20e4fe7c58df390/cache-13d615b43bcb9f85.arrow


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
})

In [4]:
# PARAMS
# Prefer the class count declared by the dataset schema. Counting the distinct
# label values of the sub-sampled train split under-counts whenever a rare
# class happens to be absent from the sample, which would size the classifier
# head too small for label ids that show up later.
label_feature = dataset['train'].features['label']
num_labels = getattr(label_feature, 'num_classes', None)
if num_labels is None:
    # The label column does not expose `num_classes`: fall back to counting
    # the distinct values actually present in the train split.
    num_labels = len(set(dataset['train']['label']))
print('num_labels:', num_labels)

# batch_size, model_ckpt = 64, 'distilbert-base-uncased'
batch_size, model_ckpt = 8, 'distilbert-base-uncased'  # sagemaker studio lab
# batch_size, model_ckpt = 32, 'bert-base-multilingual-cased'
# batch_size, model_ckpt = 16, 'bert-base-multilingual-cased'
# batch_size, model_ckpt = 32, 'xlm-roberta-large'


# DEVICE
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

num_labels: 5


device(type='cuda')

In [12]:
# TOKENS
# Load the tokenizer that belongs to the checkpoint selected in `model_ckpt`.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batch):
    """Encode the ``text`` entries of ``batch`` with the notebook-level tokenizer.

    Padding and truncation are both switched on, so the encoded sequences
    returned for one call share a common length.
    """
    texts = batch['text']
    return tokenizer(texts, truncation=True, padding=True)


# Run `tokenize` over every split; with batch_size=None each split is handed
# to the function as one single batch.
dataset_encoded = dataset.map(tokenize, batch_size=None, batched=True)

# Show the output format before and after switching the splits to torch tensors.
print(dataset_encoded['train'].format)
dataset_encoded.set_format(type='torch')
print(dataset_encoded['train'].format)


# MODEL
# Load the checkpoint with a classification head of `num_labels` outputs,
# then move it onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels)
model = model.to(device)

print('model.device', model.device)


# METRICS
def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with ``accuracy``, ``balanced`` (balanced accuracy) and the
        support-weighted ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 keeps the score of a never-predicted class at 0 (the
    # value the default setting already yields) without emitting an
    # undefined-metric warning on every evaluation.
    precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = sklearn.metrics.accuracy_score(labels, preds)
    bal = sklearn.metrics.balanced_accuracy_score(labels, preds)
    res = {
        'accuracy': acc,
        'balanced': bal,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
    # Progress marker printed once per evaluation pass.
    print('EVALUANDO', flush=True)
    return res


# TRAINER
training_args = TrainingArguments(
    # output location and reporting
    output_dir='resultados',
    report_to='none',
    disable_tqdm=False,
    # optimisation schedule
    learning_rate=2e-5,
    num_train_epochs=2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # step-based evaluation; the logging interval reuses the batch-size value
    evaluation_strategy='steps',
    logging_steps=batch_size,
)


# CLASS WEIGHTS
# Each class is weighted by log(1 / relative frequency) in the train split, so
# rarer classes contribute more to the loss.
# The counts are re-indexed to 0..num_labels-1 so the list always has exactly
# one entry per model output, even when a class is absent from the sample.
label_counts = (
    dataset['train'].to_pandas()['label']
    .value_counts()
    .reindex(range(num_labels), fill_value=0)
)
# A class with zero occurrences would get an infinite weight; treat it as if
# it had been seen once instead.
label_freqs = label_counts.clip(lower=1) / label_counts.sum()
my_weights = np.log(1 / label_freqs).tolist()
print(my_weights)


class WeightedTrainer(Trainer):
    """Trainer whose loss is a cross-entropy weighted per class.

    The per-class weights are read from the notebook-level ``my_weights``
    list, which must hold one value per model output class.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: mapping of model inputs; must contain ``labels``.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: accepted (and ignored) so the override keeps
                working with transformers releases that pass this extra
                argument to ``compute_loss``.
        """
        labels = inputs.get('labels')
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Build the weights on the logits' own device/dtype rather than on a
        # notebook-level device, so the loss works wherever the trainer
        # placed the model.
        class_weights = torch.tensor(my_weights, dtype=logits.dtype, device=logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


# Trainer with the class-weighted loss. To train without the class weights,
# build a plain `Trainer` with these same keyword arguments instead.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded['train'],
    eval_dataset=dataset_encoded['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print('trainer.args.device:', trainer.args.device, flush=True)

loading configuration file config.json from cache at /home/studio-lab-user/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/studio-lab-user/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /home/studio-lab-user/.cache/huggin

{'type': None, 'format_kwargs': {}, 'columns': ['text', 'label', 'input_ids', 'attention_mask'], 'output_all_columns': False}
{'type': 'torch', 'format_kwargs': {}, 'columns': ['text', 'label', 'input_ids', 'attention_mask'], 'output_all_columns': False}


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

model.device cuda:0
[0.4462871026284195, 2.120263536200091, 1.7719568419318752, 2.8134107167600364, 4.605170185988091]
trainer.args.device: cuda:0


In [13]:
# Fine-tune the model, then write the resulting model (and tokenizer files)
# to a separate directory.
trainer.train()
trainer.save_model(output_dir='resultados2')

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 100
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 26
  Number of trainable parameters = 66957317
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Balanced,F1,Precision,Recall
8,1.6113,1.571323,0.69,0.2,0.563432,0.4761,0.69
16,1.5289,1.530702,0.69,0.2,0.563432,0.4761,0.69
24,1.5387,1.519486,0.67,0.194203,0.553653,0.471735,0.67


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


EVALUANDO


  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


EVALUANDO


  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


EVALUANDO


  _warn_prf(average, modifier, msg_start, len(result))


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to resultados2
Configuration saved in resultados2/config.json
Model weights saved in resultados2/pytorch_model.bin
tokenizer config file saved in resultados2/tokenizer_config.json
Special tokens file saved in resultados2/special_tokens_map.json


In [None]:
# Run the fine-tuned model over the encoded validation split.
validation_split = dataset_encoded['validation']
preds = trainer.predict(validation_split)

In [None]:
# Metrics attached to the prediction result for the validation split.
preds.metrics

In [None]:
# Peek at the raw model outputs for the first ten validation examples.
preds.predictions[:10]

In [None]:
# Predicted class id per example: index of the highest score in each row.
preds.predictions.argmax(axis=1)

In [None]:
# Build an inference pipeline from the model directory saved after training.
pipe = pipeline(task="text-classification", model='resultados2')

In [None]:
# Classify the first ten raw validation texts with the saved model.
sample_texts = dataset['validation']['text'][:10]
res = pipe(sample_texts)

In [None]:
# Show the pipeline output for the ten sample texts.
res