In [2]:
# %pip install transformers datasets evaluate emojis

In [3]:
# import os
import sys

import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import pipeline
from transformers import Trainer, TrainingArguments
from transformers import logging
import evaluate

import datasets
from datasets import load_metric

import emojis

from sklearn.metrics import precision_recall_fscore_support, accuracy_score, balanced_accuracy_score, classification_report

# Record which interpreter and module search path this kernel is using, so the
# run can be reproduced in the same environment.
print('sys.executable:', sys.executable)
print('sys.version   :', sys.version.replace('\n', ''))
print('sys.path      :')
for x in sys.path:
    print('               ', x)

# os.environ['WANDB_DISABLED'] = 'true'

# Prefer the GPU when one is visible to PyTorch; otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

if torch.cuda.is_available():
    print('cuda n:', torch.cuda.device_count())
    current = torch.cuda.current_device()
    # torch.cuda.device(...) is a context manager, so printing it only shows an
    # object repr; report the human-readable device name instead.
    print('current', current, torch.cuda.get_device_name(current))
else:
    print('CPU')

sys.executable: /home/ec2-user/anaconda3/envs/pytorch_p39/bin/python
sys.version   : 3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:56:21) [GCC 10.3.0]
sys.path      :
                /home/ec2-user/SageMaker
                /home/ec2-user/anaconda3/envs/pytorch_p39/lib/python39.zip
                /home/ec2-user/anaconda3/envs/pytorch_p39/lib/python3.9
                /home/ec2-user/anaconda3/envs/pytorch_p39/lib/python3.9/lib-dynload
                
                /home/ec2-user/anaconda3/envs/pytorch_p39/lib/python3.9/site-packages
                /home/ec2-user/anaconda3/envs/pytorch_p39/lib/python3.9/site-packages/IPython/extensions
                /home/ec2-user/.ipython
cpu
CPU


In [4]:
# Load the 'tamilmixsentiment' dataset from the Hugging Face hub (or its cache).
dataset = datasets.load_dataset('tamilmixsentiment')

# Hold the test split out of `dataset`; it is only used in the evaluation cell.
testset = dataset.pop('test')

# Collect every emoji reported for each training text into one flat list
# (one entry per text in which the emoji occurs) and show how often each appears.
my_emojis = [
    emoji
    for text in dataset['train']['text']
    for emoji in emojis.get(text)
]
print(pd.Series(my_emojis).value_counts())

Found cached dataset tamilmixsentiment (/home/ec2-user/.cache/huggingface/datasets/tamilmixsentiment/default/0.0.0/887420eecaf868ac6c10990649e49d10467e4cd4dffb98a6f20e4fe7c58df390)


  0%|          | 0/3 [00:00<?, ?it/s]

🤣    108
🤔     98
🤩     91
🤘     70
🤗     41
🥰     39
🤦     16
🤙     15
🤟      9
🦁      5
🧐      5
🤞      5
🥳      4
🤓      4
🤖      4
🧡      4
🤪      4
🥁      4
🤫      3
🤭      3
🤒      3
🤢      3
🤯      3
🤨      3
🤝      3
🤕      3
🤐      3
🤑      2
🦸      2
🥂      2
🤚      2
🤮      2
🤜      2
🤳      1
🤷      1
🥺      1
🤥      1
🦆      1
🤬      1
🤛      1
🤠      1
🤤      1
🧨      1
🦂      1
🥵      1
🤧      1
dtype: int64


In [5]:
# make subset of the data
def red(ds, p=0.1, seed=2023):
    """Return a reproducible random subset of ``ds``.

    Args:
        ds: Object exposing ``num_rows``, ``shuffle(seed=...)`` and
            ``select(indices)`` (used here with Hugging Face datasets).
        p: Fraction of rows to keep, between 0 and 1 inclusive. The row count
            is truncated towards zero.
        seed: Seed forwarded to ``ds.shuffle`` so the same rows are picked on
            every run.

    Returns:
        The result of ``ds.shuffle(seed=seed).select(range(n))`` where
        ``n = int(ds.num_rows * p)``.

    Raises:
        ValueError: If ``p`` is outside ``[0, 1]``.
    """
    if not 0 <= p <= 1:
        raise ValueError(f'p must be between 0 and 1, got {p!r}')
    n = int(ds.num_rows * p)
    return ds.shuffle(seed=seed).select(range(n))


# Replace `dataset` with a reduced copy of the train and validation splits.
# NOTE: this overwrites `dataset` with a subset of itself, so running the cell a
# second time without reloading shrinks the data again.
dataset = datasets.DatasetDict({
    split: red(dataset[split]) for split in ('train', 'validation')
})

# Print the relative label frequencies of each reduced split.
for split in ('train', 'validation'):
    label_share = dataset[split].to_pandas()['label'].value_counts(normalize=True)
    print(label_share)

Loading cached shuffled indices for dataset at /home/ec2-user/.cache/huggingface/datasets/tamilmixsentiment/default/0.0.0/887420eecaf868ac6c10990649e49d10467e4cd4dffb98a6f20e4fe7c58df390/cache-e0dc15cb3461e70b.arrow
Loading cached shuffled indices for dataset at /home/ec2-user/.cache/huggingface/datasets/tamilmixsentiment/default/0.0.0/887420eecaf868ac6c10990649e49d10467e4cd4dffb98a6f20e4fe7c58df390/cache-ef0be8f2bf001be5.arrow


0    0.688438
1    0.125331
2    0.106796
3    0.044131
4    0.035305
Name: label, dtype: float64
0    0.674603
1    0.142857
2    0.095238
3    0.063492
4    0.023810
Name: label, dtype: float64


In [6]:
model_name = 'bert-base-multilingual-cased'


tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
print(len(tokenizer))  # 119547
# Sanity check: before extending the vocabulary this emoji maps to [UNK].
assert tokenizer.tokenize('🧡') == ['[UNK]']

# Register every emoji seen in the training texts as its own token.
# Sorting makes the assigned token ids identical on every run; iterating a set
# of strings directly can yield a different order in each interpreter session.
tokenizer.add_tokens(sorted(set(my_emojis)))
print(len(tokenizer))  # 119593
assert tokenizer.tokenize('🧡') == ['🧡']


# Pads each batch at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Number of distinct label values present in the (reduced) training split.
num_labels = len(set(dataset['train']['label']))
print('num_labels:', num_labels)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)
# Resize the embedding matrix to the extended vocabulary size.
model.resize_token_embeddings(len(tokenizer))
print('model.device', model.device)
model.to(device)
print('model.device', model.device)


def tokenize_function(ds):
    """Tokenize the ``text`` field of a batch of examples.

    Padding is deliberately not applied here. The ``DataCollatorWithPadding``
    instance (``data_collator``) pads each training/evaluation batch to the
    longest sequence in that batch, whereas padding inside ``map`` would pad
    every example to the longest sequence of the whole map batch. Plain Python
    lists are returned rather than tensors, since sequences now differ in
    length.

    Args:
        ds: Batch mapping with a ``'text'`` entry holding the raw strings.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length.
    """
    return tokenizer(
        ds['text'],
        truncation=True,
        # max_length=512,
    )


# Apply the tokenizer to every split, processing examples in batches.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

# Display the tokenized training split as the cell's output.
tokenized_dataset['train']


119547
119593
num_labels: 5


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

model.device cpu
model.device cpu


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1133
})

In [7]:

# Accuracy metric object used by `compute_metrics` below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass and echo it to stdout.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    res = metric.compute(predictions=predicted, references=references)
    # Progress marker followed by the type and content of the metric result.
    for item in ('EVALUANDO', type(res), res):
        print(item, flush=True)
    return res


# Hyper-parameters for fine-tuning.
training_args = TrainingArguments(
    output_dir='./results_bert',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    save_strategy='epoch',  # 'no'
    # Evaluate once per epoch, in step with checkpointing. With 'steps' and no
    # explicit eval_steps the default interval is larger than this run's total
    # number of optimization steps, so evaluation would never be triggered.
    evaluation_strategy='epoch',  # 'no'
    report_to='none'
)


# Class weights for the loss: log of the inverse relative frequency of each
# label in the training split, so rarer labels get larger weights.
# value_counts() orders by frequency, not by label id; sort_index() guarantees
# that position i of the list is the weight of label i, as CrossEntropyLoss
# expects.
label_freq = (
    dataset['train'].to_pandas()['label']
    .value_counts(normalize=True)
    .sort_index()
)
my_weights = np.log(1 / label_freq).tolist()


class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted per class.

    The weights are read from the module-level ``my_weights`` list, one entry
    per label id.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and return the class-weighted cross-entropy loss.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping passed to the model as keyword arguments;
                must contain ``'labels'``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Ignored. Accepted so the override keeps working with
                Trainer versions that pass additional keyword arguments
                (e.g. ``num_items_in_batch``).

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.get('labels')
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Create the weights on the same device and dtype as the logits;
        # a CPU weight tensor fails as soon as the model runs on a GPU.
        class_weights = torch.tensor(
            my_weights, dtype=logits.dtype, device=logits.device
        )
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
    
# Assemble the weighted-loss trainer from the model, tokenizer, training
# arguments, collator, metric callback and the two tokenized splits.
trainer = CustomTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

In [8]:
# Show which device the trainer's arguments resolved to.
print('trainer.args.device:', trainer.args.device)

trainer.args.device: cpu


In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1133
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 213
  Number of trainable parameters = 177892613
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Save the trained model to the directory that the evaluation cell loads from.
trainer.save_model('./results_bert_final_trained_2')

In [None]:
# Evaluation
# Score the held-out test split with the checkpoint saved by the previous cell.
ts = testset.to_pandas()

pip = pipeline('sentiment-analysis', './results_bert_final_trained_2')

# truncation=True keeps over-long texts within the model's maximum sequence
# length instead of failing inside the forward pass.
res = pip(ts['text'].tolist(), truncation=True)

# Assign the two result fields by name so that each column keeps its own dtype
# (a float `prob`) and the code does not depend on the order of the dict keys.
res_df = pd.DataFrame(res)
ts['pred'] = res_df['label'].to_numpy()
ts['prob'] = res_df['score'].to_numpy()

# Distribution of predicted labels, absolute and relative.
print(ts['pred'].value_counts())
print(ts['pred'].value_counts(normalize=True))

# Predictions come back as 'LABEL_<id>'; strip the literal prefix to get the id.
ts['pred_n'] = ts['pred'].str.replace('LABEL_', '', regex=False).astype(int)

# Confusion matrix: true label (rows) against predicted label id (columns).
print(pd.crosstab(ts['label'], ts['pred_n']))

# Per-class precision / recall / F-score / support, one column per label.
per_class = pd.DataFrame(
    precision_recall_fscore_support(ts['label'], ts['pred_n']),
    index=['precision', 'recall', 'fscore', 'support'],
)
print(per_class)

# Support-weighted averages across classes.
print(precision_recall_fscore_support(ts['label'], ts['pred_n'], average='weighted'))

print(classification_report(ts['label'], ts['pred_n']))