In [55]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/emotion-dataset/Emotion_classify_Data.csv


In [56]:
!pip install -qqq datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Emotions Classifier

## Objetivo

O objetivo deste artigo é criar um modelo de NLP baseado no modelo deberta-v3-small, o modelo a ser produzido tem como alvo classificar uma frase dentro de três emoções, Joy, Anger e Fear. Vou estar utilizando Transformers da biblioteca do Huggin Face, pandas para tratar os dados tabulares, e o numpy para calcular o coeficiente pearson.

## Data Frames

In [57]:
import pandas as pd
import numpy as np

Como eu não tenho um dado separado para validação e para testes, vou fazer um split do meu Data Frame para usa-lo para testar o modelo mais a frente.

In [58]:
df_base = pd.read_csv('/kaggle/input/emotion-dataset/Emotion_classify_Data.csv')
df = df_base.iloc[:-100, :]  # Todas as linhas, exceto as últimas 100
eval_df = df_base.iloc[-100:, :]  # As últimas 100 linhas
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [59]:
df

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear
...,...,...
5832,i feel the need to be distracted,anger
5833,i feel like im sinking and i feel helpless and...,fear
5834,i am worried that you might feel pressured or ...,fear
5835,i wouldn t make too big of a deal out of the s...,fear


In [60]:
df.describe(include='object')

Unnamed: 0,Comment,Emotion
count,5837,5837
unique,5835,3
top,i feel pretty tortured because i work a job an...,anger
freq,2,1971


In [61]:
df['Emotion'].value_counts()

Emotion
anger    1971
joy      1958
fear     1908
Name: count, dtype: int64

Como nosso modelo não reconhece palavras como labels, nós vamos normaliza-los como floats

In [62]:
mapping = {
    'anger': 0.0,
    'joy': 1.0,
    'fear': 2.0,
}
df['Emotion'] = df['Emotion'].map(mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Emotion'] = df['Emotion'].map(mapping)


In [63]:
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,2.0
1,im so full of life i feel appalled,0.0
2,i sit here to write i start to dig out my feel...,2.0
3,ive been really angry with r and i feel like a...,1.0
4,i feel suspicious if there is no one outside l...,2.0


In [64]:
from datasets import Dataset,DatasetDict

ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['Comment', 'Emotion'],
    num_rows: 5837
})

## Tokenização

Chegou a hora de tokenizarmos nosso modelo, para isso nós utilizaremos o deberta-v3-small

In [65]:
model_nm = 'microsoft/deberta-v3-small'

Um grande passo na construção de modelos NLP é a tokenização, onde nós dividimos nossas palavras em sub-palavras, ou melhor, tokens, que é como nosso modelo entende as palavras

In [66]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
tokz = AutoTokenizer.from_pretrained(model_nm)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [67]:
tokz.tokenize("G'day folks, I'm Jeremy from fast.ai!")

['▁G',
 "'",
 'day',
 '▁folks',
 ',',
 '▁I',
 "'",
 'm',
 '▁Jeremy',
 '▁from',
 '▁fast',
 '.',
 'ai',
 '!']

In [68]:
def tok_func(x): return tokz(x["Comment"])

In [69]:
tok_ds = ds.map(tok_func, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

In [70]:
row = tok_ds[0]
row['Comment'], row['input_ids'][:10]

('i seriously hate one subject to death but now i feel reluctant to drop it',
 [1, 584, 3218, 3254, 311, 1284, 264, 1142, 304, 394])

In [71]:
tokz.vocab['of']

1580

Na hora de treinar nosso modelo ele ira se basear na coluna labels, por isso vamos renomear Emotions no nosso DataSet

In [72]:
tok_ds = tok_ds.rename_columns({'Emotion':'labels'})

In [73]:
tok_ds

Dataset({
    features: ['Comment', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5837
})

In [74]:
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['Comment', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4377
    })
    test: Dataset({
        features: ['Comment', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1460
    })
})

In [75]:
def corr(x,y): return np.corrcoef(x,y)[0][1]
def corr_d(eval_pred): return {'pearson': corr(*eval_pred)}

## Treinamento do modelo

In [76]:
from transformers import TrainingArguments,Trainer

In [77]:
bs = 128
epochs = 5
lr = 8e-5

In [78]:
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')

In [79]:
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokz, compute_metrics=corr_d)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [80]:
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.665374,0.156086
2,No log,0.352589,0.708439
3,No log,0.159591,0.88037
4,No log,0.137179,0.905475
5,No log,0.137723,0.909077


TrainOutput(global_step=90, training_loss=0.41603622436523435, metrics={'train_runtime': 88.2175, 'train_samples_per_second': 248.08, 'train_steps_per_second': 1.02, 'total_flos': 346886542032720.0, 'train_loss': 0.41603622436523435, 'epoch': 5.0})

## Testando os resultados do modelo

In [81]:
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [82]:
preds = trainer.predict(eval_ds).predictions.astype(float)

In [83]:
preds[:10]

array([[ 1.18655729],
       [ 1.15956688],
       [-0.01406447],
       [ 1.84000731],
       [-0.06878754],
       [ 1.43245757],
       [ 1.08519363],
       [ 2.05148673],
       [ 1.16897416],
       [ 2.14890075]])

In [84]:
mapping_reverse = {
    0.0: 'anger',
    1.0: 'joy',
    2.0: 'fear'
}
rounded_preds = [round(num[0]) for num in preds.tolist()]
result = [mapping_reverse.get(val, 'unknown') for val in rounded_preds]
#result
#result = [round(num[0]) for num in preds.tolist()]
result[:10]

['joy', 'joy', 'anger', 'fear', 'anger', 'joy', 'joy', 'fear', 'joy', 'fear']

In [85]:
eval_df['Preds'] = result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df['Preds'] = result


In [86]:
eval_df

Unnamed: 0,Comment,Emotion,Preds
5837,i still feel like a kid eager to blow the cand...,joy,joy
5838,i will never forget that walk out of the docto...,joy,joy
5839,i feel that you couldnt be bothered anymore,anger,anger
5840,i get this strange feeling that even with peop...,joy,fear
5841,i think i m feeling dissatisfied with my life,anger,anger
...,...,...,...
5932,i begun to feel distressed for you,fear,fear
5933,i left feeling annoyed and angry thinking that...,anger,anger
5934,i were to ever get married i d have everything...,joy,fear
5935,i feel reluctant in applying there because i w...,fear,fear


## Deploy no HugginFace

In [42]:
!pip install -qqq huggingface_hub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [43]:
from huggingface_hub import login
access_token_write = '...'
login(token = access_token_write)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [44]:
trainer.push_to_hub("rafaelcarvalhoj/emotion-classifier")

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/568M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

'https://huggingface.co/rafaelcarvalhoj/outputs/tree/main/'

Aplicação do nosso modelo no HugginFace Spaces pra quem tiver interesse em testa-lo

https://huggingface.co/spaces/fastaioncampus/emotions-classifier