In [22]:
import numpy as np
import torch
from datasets import list_datasets, load_dataset
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline

In [4]:
from sklearn.metrics import accuracy_score, f1_score


# DATA
# dataset = load_dataset('emotion')
dataset = load_dataset('tamilmixsentiment')
dataset


# PARAMS
num_labels = len(set(dataset['train']['label']))
print('num_labels:', num_labels)
model_ckpt = 'distilbert-base-uncased'


# DEVICE
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device


# TOKENS
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)


dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)
dataset_encoded


print(dataset_encoded['train'].format)
dataset_encoded.set_format('torch')
print(dataset_encoded['train'].format)


# MODEL
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
).to(device)

print('model.device', model.device)


def compute_metrics(pred):
    res = {'acc': 1.0, 'F1': 2.0}
    return res


batch_size = 64
training_args = TrainingArguments(
    output_dir='resultados',
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
    evaluation_strategy='steps',
    logging_steps=batch_size,
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_encoded['train'],
    eval_dataset=dataset_encoded['validation'],
    tokenizer=tokenizer,
)

print('trainer.args.device:', trainer.args.device)

Found cached dataset tamilmixsentiment (/home/studio-lab-user/.cache/huggingface/datasets/tamilmixsentiment/default/0.0.0/887420eecaf868ac6c10990649e49d10467e4cd4dffb98a6f20e4fe7c58df390)


  0%|          | 0/3 [00:00<?, ?it/s]

loading configuration file config.json from cache at /home/studio-lab-user/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/studio-lab-user/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /home/studio-lab-user/.cache/huggin

num_labels: 5
{'type': None, 'format_kwargs': {}, 'columns': ['text', 'label', 'input_ids', 'attention_mask'], 'output_all_columns': False}
{'type': 'torch', 'format_kwargs': {}, 'columns': ['text', 'label', 'input_ids', 'attention_mask'], 'output_all_columns': False}


loading configuration file config.json from cache at /home/studio-lab-user/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading weights file p

model.device cuda:0
trainer.args.device: cuda:0


In [5]:
trainer.train()
trainer.save_model('resultados2')

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 11335
  Num Epochs = 2
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 356
  Number of trainable parameters = 66957317
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Acc,F1
64,1.1105,0.972891,1.0,2.0
128,0.9752,0.931508,1.0,2.0
192,0.9089,0.920353,1.0,2.0
256,0.8886,0.896192,1.0,2.0
320,0.9108,0.889655,1.0,2.0


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1260
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1260
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 

In [9]:
preds = trainer.predict(dataset_encoded['validation'])

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1260
  Batch size = 64


In [10]:
preds

PredictionOutput(predictions=array([[ 2.445549  ,  0.06706481,  0.25771052, -0.44319278, -2.4854672 ],
       [ 1.1598071 ,  0.88811815,  0.4816053 , -0.60402685, -2.1378903 ],
       [ 2.0937824 ,  0.4869788 ,  0.3770932 , -0.6468798 , -2.522209  ],
       ...,
       [ 2.6462312 , -0.30929148,  0.18467437, -0.6333913 , -2.1159906 ],
       [ 2.1340406 ,  0.2322666 ,  0.33947524, -0.39600858, -2.4105506 ],
       [ 2.940393  , -0.24938315,  0.30048946, -0.78538334, -2.3407638 ]],
      dtype=float32), label_ids=array([0, 1, 2, ..., 1, 3, 0]), metrics={'test_loss': 0.8892676830291748, 'test_acc': 1.0, 'test_F1': 2.0, 'test_runtime': 3.6025, 'test_samples_per_second': 349.759, 'test_steps_per_second': 5.552})

In [11]:
preds.metrics

{'test_loss': 0.8892676830291748,
 'test_acc': 1.0,
 'test_F1': 2.0,
 'test_runtime': 3.6025,
 'test_samples_per_second': 349.759,
 'test_steps_per_second': 5.552}

In [12]:
preds.predictions

array([[ 2.445549  ,  0.06706481,  0.25771052, -0.44319278, -2.4854672 ],
       [ 1.1598071 ,  0.88811815,  0.4816053 , -0.60402685, -2.1378903 ],
       [ 2.0937824 ,  0.4869788 ,  0.3770932 , -0.6468798 , -2.522209  ],
       ...,
       [ 2.6462312 , -0.30929148,  0.18467437, -0.6333913 , -2.1159906 ],
       [ 2.1340406 ,  0.2322666 ,  0.33947524, -0.39600858, -2.4105506 ],
       [ 2.940393  , -0.24938315,  0.30048946, -0.78538334, -2.3407638 ]],
      dtype=float32)

In [20]:
np.argmax(preds.predictions, axis=1)

array([0, 0, 0, ..., 0, 0, 0])

In [28]:
pipe = pipeline("text-classification", 'resultados2')

loading configuration file resultados2/config.json
Model config DistilBertConfig {
  "_name_or_path": "resultados2",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading configuration file resultados2/config.json
Model config DistilBertConfig {
  

In [31]:
res = pipe(dataset['validation']['text'])

In [32]:
res

[{'label': 'LABEL_0', 'score': 0.788820743560791},
 {'label': 'LABEL_0', 'score': 0.40355509519577026},
 {'label': 'LABEL_0', 'score': 0.6874706745147705},
 {'label': 'LABEL_0', 'score': 0.5512889623641968},
 {'label': 'LABEL_0', 'score': 0.8393746614456177},
 {'label': 'LABEL_0', 'score': 0.8926364183425903},
 {'label': 'LABEL_0', 'score': 0.6351165175437927},
 {'label': 'LABEL_0', 'score': 0.8444382548332214},
 {'label': 'LABEL_0', 'score': 0.646977424621582},
 {'label': 'LABEL_0', 'score': 0.8845893144607544},
 {'label': 'LABEL_0', 'score': 0.591627836227417},
 {'label': 'LABEL_0', 'score': 0.6226977109909058},
 {'label': 'LABEL_0', 'score': 0.7120057940483093},
 {'label': 'LABEL_0', 'score': 0.6443173885345459},
 {'label': 'LABEL_0', 'score': 0.8968734741210938},
 {'label': 'LABEL_0', 'score': 0.8685049414634705},
 {'label': 'LABEL_0', 'score': 0.8806589245796204},
 {'label': 'LABEL_0', 'score': 0.8020860552787781},
 {'label': 'LABEL_0', 'score': 0.883712112903595},
 {'label': 'LAB