In [6]:
# %pip install transformers datasets evaluate emojis
# %pip install torch scikit-learn  # sagemaker studio LAB

Collecting scikit-learn
  Downloading scikit_learn-1.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.5 MB)
[K     |████████████████████████████████| 9.5 MB 7.4 MB/s eta 0:00:01
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[K     |████████████████████████████████| 297 kB 84.4 MB/s eta 0:00:01
[?25hCollecting scipy>=1.3.2
  Downloading scipy-1.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[K     |████████████████████████████████| 34.4 MB 108.2 MB/s eta 0:00:01
[?25hInstalling collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.0 scipy-1.10.0 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
# import os
import sys

import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import pipeline
from transformers import Trainer, TrainingArguments
from transformers import logging
import evaluate

import datasets
from datasets import load_metric

import emojis

from sklearn.metrics import precision_recall_fscore_support, accuracy_score, balanced_accuracy_score, classification_report

# Report which interpreter and search path this kernel is actually using,
# so environment mix-ups are visible in the saved output.
print('sys.executable:', sys.executable)
print('sys.version   :', sys.version.replace('\n', ''))
print('sys.path      :')
for path_entry in sys.path:
    print('               ', path_entry)

# os.environ['WANDB_DISABLED'] = 'true'

# Single device handle used later when moving the model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

if torch.cuda.is_available():
    print('cuda n:', torch.cuda.device_count())
    current = torch.cuda.current_device()
    # torch.cuda.device(...) is a context manager whose repr says nothing
    # about the hardware; print the device name instead.
    print('current', current, torch.cuda.get_device_name(current))
else:
    print('CPU')

sys.executable: /home/studio-lab-user/.conda/envs/default/bin/python
sys.version   : 3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:58:50) [GCC 10.3.0]
sys.path      :
                /home/studio-lab-user/tamil/tamil_mix_sentiment_analysis
                /home/studio-lab-user/.conda/envs/default/lib/python39.zip
                /home/studio-lab-user/.conda/envs/default/lib/python3.9
                /home/studio-lab-user/.conda/envs/default/lib/python3.9/lib-dynload
                
                /home/studio-lab-user/.conda/envs/default/lib/python3.9/site-packages
                /home/studio-lab-user/.conda/envs/default/lib/python3.9/site-packages/IPython/extensions
                /home/studio-lab-user/.ipython
cpu
CPU


In [None]:
# Fetch every split of the corpus.
dataset = datasets.load_dataset('tamilmixsentiment')

# Take the test split out of the dict; it is kept aside in `testset`.
testset = dataset.pop('test')

# Flat list with every emoji occurrence reported by `emojis.get` for each
# training text, followed by a frequency table.
my_emojis = [
    emoji
    for text in dataset['train']['text']
    for emoji in emojis.get(text)
]
emoji_counts = pd.Series(my_emojis).value_counts()
print(emoji_counts)

In [None]:
# make subset of the data
def red(ds, p=0.1, seed=2023):
    """Return a reproducible random subset of a dataset split.

    Args:
        ds: object exposing ``num_rows``, ``shuffle(seed=...)`` and
            ``select(indices)`` (e.g. a ``datasets.Dataset``).
        p: fraction of rows to keep, in the interval (0, 1]. Defaults to 0.1.
        seed: shuffle seed, so the same subset is drawn on every run.

    Returns:
        The first ``int(ds.num_rows * p)`` rows of the shuffled split.

    Raises:
        ValueError: if ``p`` is outside (0, 1].
    """
    if not 0 < p <= 1:
        raise ValueError(f'p must be in (0, 1], got {p!r}')
    n = int(ds.num_rows * p)
    return ds.shuffle(seed=seed).select(range(n))


# Replace each remaining split with its reduced version.
# NOTE: `dataset` is rebuilt from itself, so re-running this cell without
# reloading the data shrinks the splits again.
dataset = datasets.DatasetDict(
    {split: red(dataset[split]) for split in ('train', 'validation')}
)

# Relative label frequencies of each reduced split.
for split in ('train', 'validation'):
    label_share = dataset[split].to_pandas()['label'].value_counts(normalize=True)
    print(label_share)

In [None]:
model_name = 'bert-base-multilingual-cased'

tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
print(len(tokenizer))  # 119547
# Sanity check: before extension the emoji is unknown to the vocabulary.
assert tokenizer.tokenize('🧡') == ['[UNK]']

# Add every emoji seen in the training text as its own token.
# sorted(...) instead of list(set(...)): set iteration order of strings varies
# between interpreter runs, which would assign different token ids each time.
tokenizer.add_tokens(sorted(set(my_emojis)))
print(len(tokenizer))  # 119593
assert tokenizer.tokenize('🧡') == ['🧡']

# Pads each batch to its longest sequence at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Prefer the class count declared by the label feature: counting distinct
# values in the reduced training split under-counts if a rare class was not
# sampled. Fall back to counting when the feature exposes no `num_classes`.
label_feature = dataset['train'].features['label']
num_labels = (
    getattr(label_feature, 'num_classes', None)
    or len(set(dataset['train']['label']))
)
print('num_labels:', num_labels)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)
# The embedding matrix must grow to cover the newly added emoji tokens.
model.resize_token_embeddings(len(tokenizer))
print('model.device', model.device)
model.to(device)
print('model.device', model.device)


def tokenize_function(ds):
    """Tokenize the ``'text'`` column of a batch for ``Dataset.map``.

    Only truncation is applied here (to the tokenizer's ``model_max_length``).
    Padding is left to the ``DataCollatorWithPadding`` given to the trainer:
    padding inside a batched ``map`` pads to the longest text of each map
    batch, which is wasteful and gives different lengths across map batches.
    ``return_tensors`` is omitted because ``map`` stores plain lists in Arrow.

    Args:
        ds: mapping with a ``'text'`` entry holding the batch of strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(ds['text'], truncation=True)


# Tokenized copy of every split, produced batch-wise by `tokenize_function`.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Whole-split encodings returned as PyTorch tensors; each split is padded to
# its own longest text. The held-out test split is not encoded here.
encode_kwargs = dict(padding=True, truncation=True, return_tensors='pt')
tokenized_dataset_train = tokenizer(dataset['train']['text'], **encode_kwargs)
tokenized_dataset_validation = tokenizer(dataset['validation']['text'], **encode_kwargs)

# Have the mapped dataset return torch tensors when indexed.
tokenized_dataset.set_format(type='torch')

In [None]:
# Inspection cell. Only the last bare expression of a cell is rendered, so
# `tokenized_dataset` is what gets displayed here.
tokenized_dataset_train

tokenized_dataset

# Scratch checks kept for reference (not executed):
# tokenized_dataset.to('pytorch')
# type(tokenized_dataset['train']['attention_mask'])

# tokenized_dataset['train']['attention_mask']



In [None]:

# Accuracy metric object from the `evaluate` library.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn an evaluation output into the accuracy metric result.

    Args:
        eval_pred: pair ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    res = metric.compute(predictions=predicted_ids, references=references)
    # Progress trace so each evaluation pass is visible in the cell output.
    for item in ('EVALUANDO', type(res), res):
        print(item, flush=True)
    return res


# Fine-tuning hyper-parameters.
training_args = TrainingArguments(
    output_dir='./results_bert',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,  # keep only the most recent checkpoint on disk
    save_strategy='epoch',  # 'no'
    # Evaluate once per epoch, in step with checkpointing. With 'steps' and no
    # explicit eval_steps the evaluation interval falls back to the logging
    # interval, which on the reduced training split is likely larger than the
    # whole run, so compute_metrics would never be called during training.
    evaluation_strategy='epoch',  # 'no'
    report_to='none'
)


# Per-class loss weights: log(1 / relative frequency) on the training split.
# value_counts() is ordered by frequency, not by label id, so the result is
# re-indexed to 0..num_labels-1: CrossEntropyLoss expects weight[i] to belong
# to class i. A class absent from the (reduced) training split receives the
# largest observed weight, i.e. it is treated like the rarest seen class.
label_share = dataset['train'].to_pandas()['label'].value_counts(normalize=True)
observed_weights = np.log(1 / label_share)
my_weights = (
    observed_weights
    .reindex(range(num_labels), fill_value=observed_weights.max())
    .tolist()
)
my_weights


class CustomTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy.

    The per-class weights are read from the module-level ``my_weights`` list,
    whose i-th entry is the weight of class i.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: model to run the forward pass with.
            inputs: batch mapping; must contain ``'labels'``.
            return_outputs: when true, also return the model outputs.
            num_items_in_batch: accepted (and ignored) so the override stays
                callable by Trainer versions that pass this keyword.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        labels = inputs.get('labels')
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Build the weight tensor on the same device/dtype as the logits;
        # a CPU tensor here fails as soon as the model runs on a GPU.
        class_weights = torch.tensor(my_weights, dtype=logits.dtype, device=logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
    
# Wire model, hyper-parameters, data and metric into the weighted-loss trainer.
# The splits come from the mapped `tokenized_dataset`; the whole-split
# encodings (`tokenized_dataset_train` / `_validation`) are not used here.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset['validation'],
    train_dataset=tokenized_dataset['train'],
)

In [None]:
# Show the device reported by the trainer's arguments.
print('trainer.args.device:', trainer.args.device)

In [None]:

# Run fine-tuning with the CustomTrainer configured above.
trainer.train()

In [None]:
# Save the trained model; the evaluation cell loads it back from this path.
trainer.save_model('./results_bert_final_trained_2')

In [None]:
# Evaluation on the held-out test split, using the model saved after training.
ts = testset.to_pandas()

# NOTE: the name `pip` is kept for compatibility with any later cells.
pip = pipeline('sentiment-analysis', './results_bert_final_trained_2')

# truncation=True: cut over-long texts to the tokenizer's maximum length
# instead of failing inside the model on the first over-long input.
res = pip(ts['text'].tolist(), truncation=True)

# Select the pipeline output columns by name. Assigning `.values` of the
# mixed-type frame stored the scores as `object` and depended on column order.
preds = pd.DataFrame(res)
ts['pred'] = preds['label'].to_numpy()
ts['prob'] = preds['score'].to_numpy()

# 'LABEL_<k>' -> k, so predictions are comparable with the integer labels.
ts['pred_n'] = ts['pred'].str.replace('LABEL_', '', regex=False).astype(int)

# Everything is printed explicitly: a bare expression in the middle of a cell
# is evaluated but never displayed.
print(ts['pred'].value_counts())
print(ts['pred'].value_counts(normalize=True))

# Confusion table: rows are true labels, columns are predicted labels.
print(pd.crosstab(ts['label'], ts['pred_n']))

# Per-class scores, one column per class.
per_class = pd.DataFrame(
    precision_recall_fscore_support(ts['label'], ts['pred_n']),
    index=['precision', 'recall', 'fscore', 'support'],
)
print(per_class)

# Support-weighted averages across classes.
print(precision_recall_fscore_support(ts['label'], ts['pred_n'], average='weighted'))

print(classification_report(ts['label'], ts['pred_n']))