# 

In this notebook 

http://archive.ics.uci.edu/ml/datasets/TamilSentiMix

In [1]:
%pip install matplotlib transformers datasets emojis

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import sklearn.metrics
import emojis
import torch
import transformers
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
import datasets

transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()

# DEVICE
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# PARAMETERS
dataset_name = 'tamilmixsentiment'

# batch_size, language_model_name = 64, 'distilbert-base-uncased'
# batch_size, language_model_name = 8, 'distilbert-base-uncased'  # good for sagemaker studio lab
# batch_size, language_model_name = 64, 'bert-base-multilingual-cased'
# batch_size, language_model_name = 32, 'bert-base-multilingual-cased'  # good for sagemaker notebook ml.p3.2xlarge
# batch_size, language_model_name = 16, 'bert-base-multilingual-cased'
batch_size, language_model_name = 16, 'xlm-roberta-large'  # good for sagemaker notebook ml.p3.2xlarge PROBAR CON ml.g5.2xlarge


# DATA
# dataset = datasets.load_dataset('emotion')
dataset = datasets.load_dataset(dataset_name)

# dataset = datasets.DatasetDict(
#     {k: v.shuffle(2023).select(range(int(v.num_rows * 1))) for k, v in dataset.items()}
# )

print(dataset)
dataset['train'].to_pandas()

  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 11335
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3149
    })
})


Unnamed: 0,text,label
0,Trailer late ah parthavanga like podunga,0
1,Move pathutu vanthu trailer pakurvnga yaru,0
2,Puthupetai dhanush ah yarellam pathinga,0
3,"Dhanush oda character ,puthu sa erukay , mass ta",0
4,vera level ippa pesungada mokka nu thalaivaaaaaa,0
...,...,...
11330,Yuvan shankar Raja anna fan's like here...,0
11331,A masterpiece best revenge film I’ve ever scene,0
11332,Enna pa thala ya kamiya than katringa,0
11333,R A A S H I K H A N N A,3


In [4]:
# TOKENS
tokenizer = AutoTokenizer.from_pretrained(language_model_name)

# the tokenizer is not prepared for emojis
print(tokenizer.tokenize('🤘'))

# here I add emojis as new tokens
my_emojis = [list(emojis.get(x)) for x in dataset['train']['text']]
my_emojis = [y for x in my_emojis for y in x]
print(pd.Series(my_emojis).value_counts())

tokenizer.add_tokens(list(set(my_emojis)))

print(tokenizer.tokenize('🤘'))

# now we can generate embeddings for our text
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)


dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)
dataset_encoded = dataset_encoded.remove_columns('text')  # this avoids some unnedesary warnings later
dataset_encoded.set_format('torch')
dataset_encoded

['▁', '🤘']
🤣    108
🤔     98
🤩     91
🤘     70
🤗     41
🥰     39
🤦     16
🤙     15
🤟      9
🦁      5
🧐      5
🤞      5
🥳      4
🤓      4
🤖      4
🧡      4
🤪      4
🥁      4
🤫      3
🤭      3
🤒      3
🤢      3
🤯      3
🤨      3
🤝      3
🤕      3
🤐      3
🤑      2
🦸      2
🥂      2
🤚      2
🤮      2
🤜      2
🤳      1
🤷      1
🥺      1
🤥      1
🦆      1
🤬      1
🤛      1
🤠      1
🤤      1
🧨      1
🦂      1
🥵      1
🤧      1
dtype: int64
['▁', '🤘']


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 11335
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3149
    })
})

In [5]:
# MODEL
num_labels = len(set(dataset['train']['label']))
print('num_labels:', num_labels)

model = AutoModelForSequenceClassification.from_pretrained(
    language_model_name, num_labels=num_labels
).to(device)
model.resize_token_embeddings(len(tokenizer))

print('model.device', model.device)


# METRICS
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(labels, preds, average='weighted')
    acc = sklearn.metrics.accuracy_score(labels, preds)
    bal = sklearn.metrics.balanced_accuracy_score(labels, preds)
    res = {
        'accuracy': acc,
        'balanced': bal,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
    return res


# TRAINER
training_args = TrainingArguments(
    output_dir='resultados',
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy='no',
    disable_tqdm=False,
    evaluation_strategy='steps',
    logging_steps=batch_size,
    # logging_steps=100,
    report_to='none',
)


my_weights = 1 / dataset['train'].to_pandas()['label'].value_counts(normalize=True).sort_index()
my_weights = np.log(my_weights)
my_weights = my_weights.tolist()
print(my_weights)


class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get('labels')
        # forward computation
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # compute weighted loss
        loss_fct = torch.nn.CrossEntropyLoss(weight=torch.tensor(my_weights).to(device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


trainer = WeightedTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset_encoded['train'],
    eval_dataset=dataset_encoded['validation'],
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print('trainer.args.device:', trainer.args.device, flush=True)

num_labels: 5
model.device cuda:0
[0.3962007007773908, 2.0577119899959677, 2.178694198325993, 2.9238322952317324, 3.4275676247726983]
trainer.args.device: cuda:0


In [6]:
# In[13]:
trainer.train()
trainer.save_model('resultados2')

The following columns in the training set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 11335
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2127
  Number of trainable parameters = 559920133
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Balanced,F1,Precision,Recall
16,1.5722,1.529967,0.680159,0.2,0.550681,0.462616,0.680159
32,1.5797,1.526597,0.269048,0.188733,0.282935,0.45601,0.269048
48,1.4868,1.525716,0.680159,0.2,0.550681,0.462616,0.680159
64,1.554,1.529951,0.130952,0.2,0.030326,0.017149,0.130952
80,1.5226,1.537261,0.130952,0.2,0.030326,0.017149,0.130952
96,1.5586,1.527071,0.130952,0.2,0.030326,0.017149,0.130952
112,1.5496,1.523909,0.680159,0.2,0.550681,0.462616,0.680159
128,1.645,1.522909,0.680159,0.2,0.550681,0.462616,0.680159
144,1.6021,1.530273,0.130952,0.2,0.030326,0.017149,0.130952
160,1.5421,1.523302,0.680159,0.2,0.550681,0.462616,0.680159


The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1260
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1260
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLMRobertaForSequence

In [7]:
# EVALUATION
df = dataset['test'].to_pandas()

pipe = pipeline('sentiment-analysis', 'resultados2')
predictions = pipe(dataset['test']['text'])

df[['pred', 'prob']] = pd.DataFrame(predictions).values
df['pred'] = df['pred'].str.replace('LABEL_', '').astype(int)

df

loading configuration file resultados2/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "resultados2",
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.

ValueError: Wrong index found for 🤜: should be 250009 but found 119547.

In [None]:
pd.crosstab(df['label'], df['pred'])

In [None]:
cm = sklearn.metrics.confusion_matrix(df['label'], df['pred'])
cm

In [None]:
%matplotlib inline

disp = sklearn.metrics.ConfusionMatrixDisplay(cm, display_labels=dataset['test'].features['label'].names)
disp.plot()


In [None]:
print(sklearn.metrics.classification_report(df['label'], df['pred'], target_names=dataset['test'].features['label'].names))