# Minicurso Processamento de Linguagem Natural - Prática 3

Autores:
* Fernando Sola Pereira
* Eduardo Soares de Paiva

In [1]:
!pip -q install transformers

In [2]:
##########################################
# libs python
##########################################
import os
import re
import time
import warnings

##########################################
# libs externas
##########################################
from IPython.display import display, HTML, Latex, Markdown
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Dense

from transformers import AutoTokenizer
from transformers import TFBertModel

##########################################
# settings
##########################################
# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings('ignore')
sns.set_style("whitegrid")
pd.options.display.max_rows = 2000
pd.options.display.max_colwidth = 200

##########################################
# global variables
##########################################
DEFAULT_RANDOM_STATE = 42

# Seed TensorFlow too, so randomly initialised weights (the classification
# head) start from the same state on every Restart & Run All.
tf.random.set_seed(DEFAULT_RANDOM_STATE)

# Location used to store data files and model checkpoints
# (change it to suit your needs).
# By default the notebook is assumed to be running on Google Colab with
# the user's Google Drive accessible.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/sbsi/data'

In [3]:
# Make sure DATA_PATH exists, mounting Google Drive first when running on Colab.
if not os.path.exists(DATA_PATH):
  try:
    from google.colab import drive
  except ImportError:
    # Only a missing `google.colab` package means "not on Colab"; failures
    # while mounting or creating the directory are real errors and propagate.
    print('Não está executando no ambiente Google Colab!')
  else:
    drive.mount('/content/drive')
    # The directory may already exist on the Drive that was just mounted.
    if not os.path.exists(DATA_PATH):
      os.makedirs(DATA_PATH, exist_ok=True)
      print('Diretório criado!')
else:
  print('Diretório existente!')

Diretório existente!


In [4]:
##########################################
# dataset
##########################################
# Download the CSV and normalise it to two columns: label, text.
df_lame = pd.read_csv('https://docs.google.com/uc?export=download&id=1_EKfnjomkWks4VqTMIpcEIb6nB5P0Xz2')
df_lame.columns = ['label', 'text']
# Binary target: 1 for 'positivo', 0 for every other value.
df_lame['label'] = df_lame['label'].apply(lambda value: int(value == 'positivo'))

# Down-sample to about SAMPLE_SIZE rows while keeping the class proportions
# of the full dataset (the original authors noted ~0.427427 for label 0 and
# ~0.572573 for label 1).
SAMPLE_SIZE = 2000
s_labels = df_lame['label'].value_counts(normalize=True).sort_index()
df_lame = pd.concat([
    df_lame[df_lame['label'] == label].sample(
        int(SAMPLE_SIZE * s_labels[label]),
        random_state=DEFAULT_RANDOM_STATE,
    )
    for label in (0, 1)
])

In [5]:
# Hold out 20% of the sample for testing, then 20% of the remainder for
# validation; both splits are stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    df_lame.drop(columns='label'),
    df_lame['label'],
    stratify=df_lame['label'],
    test_size=.2,
    random_state=DEFAULT_RANDOM_STATE,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    test_size=.2,
    random_state=DEFAULT_RANDOM_STATE,
)

tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")


def add_bert_inputs(frame, max_length=512):
    """Return a copy of ``frame`` with ``input_ids`` and ``attention_mask`` columns.

    The ``text`` column is encoded in a single tokenizer call, padded and
    truncated to ``max_length`` tokens. The input frame is not modified, which
    avoids assigning columns in place on the frames returned by
    ``train_test_split``.

    Args:
        frame: DataFrame with a ``text`` column of strings.
        max_length: fixed sequence length used for padding and truncation.

    Returns:
        A new DataFrame with the original columns followed by ``input_ids``
        and ``attention_mask`` (one Python list per row).
    """
    encoded = tokenizer(
        frame["text"].tolist(),
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    # Wrap in Series aligned on the frame's index so each row receives its own list.
    return frame.assign(
        input_ids=pd.Series(encoded["input_ids"], index=frame.index),
        attention_mask=pd.Series(encoded["attention_mask"], index=frame.index),
    )


X_train = add_bert_inputs(X_train)
X_val = add_bert_inputs(X_val)
X_test = add_bert_inputs(X_test)

In [6]:
# Preview the first training rows together with the tokenizer columns added above.
X_train.head()

Unnamed: 0,text,input_ids,attention_mask
9588,"Comprei o produto no dia 13/12/2017 para entrega no máximo em 09/01/2018, mais do que tempo suficiente para encomendar o produto, fabricar o produto e transportar até mim, o comprador. Até agora n...","[101, 2174, 8393, 146, 3576, 202, 644, 1492, 120, 1242, 120, 5096, 221, 9358, 202, 5882, 173, 17791, 120, 13778, 120, 6437, 117, 325, 171, 179, 596, 4974, 221, 19385, 22282, 146, 3576, 117, 7875, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
70073,"Produto de qualidade péssima, solta os pelos com facilidade, não é exato 1,50x100cm , é ralinho, diferente do da imagem.","[101, 8169, 183, 125, 3322, 20938, 699, 117, 969, 154, 259, 954, 170, 12143, 117, 346, 253, 20811, 205, 117, 3055, 22312, 9789, 22289, 22287, 117, 253, 646, 3552, 268, 117, 3575, 171, 180, 3294, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
25613,Chegou em ótimas condições e antes do esperado! Além de ser um ótimo album,"[101, 15400, 173, 18809, 22281, 2955, 122, 1075, 171, 9873, 106, 1629, 125, 333, 222, 20576, 313, 917, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
23148,"Aparelho de qualidade,vale cada centavo................................................................................................................................................................","[101, 8872, 22290, 268, 125, 3322, 117, 5488, 1078, 1070, 20383, 22280, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
78559,NÃO RECOMENDO TAPETE DE PÉSSIMA QUALIDADE BEM DIFERENTE DA FOTO SEM SE COMPARA.,"[101, 248, 16484, 257, 5476, 18178, 12547, 18504, 267, 7864, 9208, 22309, 10836, 212, 22352, 6236, 13270, 22301, 5226, 3168, 22327, 6392, 11836, 22309, 241, 13344, 250, 18152, 5054, 12547, 16017, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
# Show the full padded token-id sequence of the first training example.
print(X_train["input_ids"].iloc[0])

[101, 2174, 8393, 146, 3576, 202, 644, 1492, 120, 1242, 120, 5096, 221, 9358, 202, 5882, 173, 17791, 120, 13778, 120, 6437, 117, 325, 171, 179, 596, 4974, 221, 19385, 22282, 146, 3576, 117, 7875, 22282, 146, 3576, 122, 16129, 548, 9726, 117, 146, 16007, 22282, 119, 3998, 2535, 3874, 106, 8787, 22280, 7425, 123, 1589, 4299, 131, 107, 13030, 5835, 173, 5255, 107, 106, 106, 106, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [8]:
# Load the pre-trained Portuguese BERT encoder; from_pt=True builds the
# TensorFlow model from the PyTorch checkpoint.
bert_model = TFBertModel.from_pretrained("neuralmind/bert-base-portuguese-cased", from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

## Tokens Especiais

In [9]:
# Special tokens known to the tokenizer, listed with their vocabulary ids.
pd.DataFrame({'ID': tokenizer.all_special_ids, 'Token': tokenizer.convert_ids_to_tokens(tokenizer.all_special_ids)})

Unnamed: 0,ID,Token
0,100,[UNK]
1,102,[SEP]
2,0,[PAD]
3,101,[CLS]
4,103,[MASK]


In [10]:
# Number of entries in the tokenizer vocabulary.
len(tokenizer.get_vocab())

29794

In [11]:
# Vocabulary: the 500 entries with the lowest ids, as (token, id) pairs.
vocab_by_id = sorted(tokenizer.get_vocab().items(), key=lambda entry: entry[1])
dicionario = vocab_by_id[:500]
print(dicionario)

[('[PAD]', 0), ('[unused1]', 1), ('[unused2]', 2), ('[unused3]', 3), ('[unused4]', 4), ('[unused5]', 5), ('[unused6]', 6), ('[unused7]', 7), ('[unused8]', 8), ('[unused9]', 9), ('[unused10]', 10), ('[unused11]', 11), ('[unused12]', 12), ('[unused13]', 13), ('[unused14]', 14), ('[unused15]', 15), ('[unused16]', 16), ('[unused17]', 17), ('[unused18]', 18), ('[unused19]', 19), ('[unused20]', 20), ('[unused21]', 21), ('[unused22]', 22), ('[unused23]', 23), ('[unused24]', 24), ('[unused25]', 25), ('[unused26]', 26), ('[unused27]', 27), ('[unused28]', 28), ('[unused29]', 29), ('[unused30]', 30), ('[unused31]', 31), ('[unused32]', 32), ('[unused33]', 33), ('[unused34]', 34), ('[unused35]', 35), ('[unused36]', 36), ('[unused37]', 37), ('[unused38]', 38), ('[unused39]', 39), ('[unused40]', 40), ('[unused41]', 41), ('[unused42]', 42), ('[unused43]', 43), ('[unused44]', 44), ('[unused45]', 45), ('[unused46]', 46), ('[unused47]', 47), ('[unused48]', 48), ('[unused49]', 49), ('[unused50]', 50), ('[

In [12]:
# TODO: show an example of the BERT model's output

In [13]:
class BertClassifier(tf.keras.Model):
    """BERT encoder followed by a dense classification head.

    Args:
        bert: pre-trained ``TFBertModel`` used as the encoder.
        num_classes: number of units in the output layer; every unit applies
            a sigmoid activation.
    """

    def __init__(self, bert: TFBertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = Dense(num_classes, activation='sigmoid')

    @tf.function
    def call(self, input_ids, attention_mask=None, training=False):
        """Encode the inputs with BERT and score them with the dense head.

        Args:
            input_ids: batch of token ids.
            attention_mask: optional mask forwarded to the encoder.
            training: forwarded to the encoder so the caller decides whether
                it runs in training or inference mode. Defaults to ``False``
                (inference), so existing callers are unaffected.

        Returns:
            The sigmoid activations of the classification layer.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask, training=training)
        # Element 1 of the encoder output is used as the sentence representation
        # (the pooled [CLS] vector per the transformers docs -- confirm if the
        # encoder class is ever swapped).
        cls_output = outputs[1]
        return self.classifier(cls_output)

# One sigmoid output unit: the model emits a single score per input text.
model = BertClassifier(bert_model, num_classes=1)

In [14]:
NR_EPOCHS = 1
BATCH_SIZE = 16

# Number of full batches in the training and validation splits.
steps_per_epoch = X_train.shape[0] // BATCH_SIZE
# Validation runs on X_val, so its step count must come from X_val (not X_test).
validation_steps = X_val.shape[0] // BATCH_SIZE

# Loss function and running means of the per-batch losses
loss_object = tf.keras.losses.BinaryCrossentropy()
train_loss = tf.keras.metrics.Mean(name='train_loss')
validation_loss = tf.keras.metrics.Mean(name='val_loss')

# Optimizer
total_steps = steps_per_epoch * NR_EPOCHS
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Metrics (despite the "auc" in the names, both track binary accuracy)
train_auc_metrics = tf.metrics.BinaryAccuracy()
validation_auc_metrics = tf.metrics.BinaryAccuracy()

In [15]:
@tf.function
def train_step(model, token_ids, masks, labels):
  """Run one optimisation step on a batch and update the training metrics.

  Args:
    model: model called as ``model(token_ids, attention_mask=masks)``.
    token_ids: batch of token ids.
    masks: batch of attention masks.
    labels: batch of targets; cast to float32 before the loss is computed.
  """
  labels = tf.dtypes.cast(labels, tf.float32)

  # Only the forward pass and the loss need to be recorded by the tape.
  with tf.GradientTape() as tape:
    # NOTE(review): the forward pass is not called with training=True, so
    # dropout inside the encoder presumably stays disabled while fine-tuning
    # -- confirm whether that is intended.
    predictions = model(token_ids, attention_mask=masks)
    loss = loss_object(labels, predictions)

  # Differentiate and apply the update outside the recording context.
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(grads_and_vars=zip(gradients, model.trainable_variables))

  train_loss(loss)
  train_auc_metrics.update_state(labels, predictions)


@tf.function
def validation_step(model, token_ids, masks, labels):
    """Score one batch with ``training=False`` and accumulate the validation metrics.

    Args:
        model: model called as ``model(token_ids, attention_mask=masks, training=False)``.
        token_ids: batch of token ids.
        masks: batch of attention masks.
        labels: batch of targets; cast to float32 before the loss is computed.
    """
    targets = tf.cast(labels, tf.float32)
    scores = model(token_ids, attention_mask=masks, training=False)

    # Fold this batch into the running validation loss and accuracy.
    validation_loss(loss_object(targets, scores))
    validation_auc_metrics.update_state(targets, scores)


def train(model, train_dataset, val_dataset, train_steps_per_epoch, val_steps_per_epoch, epochs):
    """Run the custom fine-tuning loop, reporting progress with ``print``.

    Args:
        model: model forwarded to ``train_step`` and ``validation_step``.
        train_dataset: iterable of ``(token_ids, masks, labels)`` training batches.
        val_dataset: iterable of ``(token_ids, masks, labels)`` validation batches.
        train_steps_per_epoch: currently unused; kept for interface compatibility.
        val_steps_per_epoch: currently unused; kept for interface compatibility.
        epochs: number of passes over ``train_dataset``.
    """
    epoch_metrics = (train_loss, train_auc_metrics, validation_loss, validation_auc_metrics)

    for epoch in range(epochs):
        print('=' * 50, f"EPOCH {epoch + 1}", '=' * 50)

        # Start every epoch from clean accumulators; otherwise the reported
        # losses and validation accuracy would blend in earlier epochs.
        for metric in epoch_metrics:
            metric.reset_states()

        start = time.time()

        for i, (token_ids, masks, labels) in enumerate(train_dataset):
            train_step(model, token_ids, masks, labels)
            # Progress line every 50 batches.
            if i % 50 == 49:
                print(f'Train Step: {i+1}, Loss: {train_loss.result()}, Accuracy {train_auc_metrics.result()}, Rows: {(i+1) * BATCH_SIZE}')

        for token_ids, masks, labels in val_dataset:
            validation_step(model, token_ids, masks, labels)

        print(f'\nEpoch {epoch+1}, Validation Loss: {validation_loss.result()}, Validation Accuracy {validation_auc_metrics.result()}, Time: {time.time()-start}\n')

        print('\n')


# Pull the per-row token-id and attention-mask lists out of each split as
# plain Python lists (one inner list per example).
train_input_ids = X_train["input_ids"].tolist()
train_attention_mask = X_train["attention_mask"].tolist()

val_input_ids = X_val["input_ids"].tolist()
val_attention_mask = X_val["attention_mask"].tolist()

test_input_ids = X_test["input_ids"].tolist()
test_attention_mask = X_test["attention_mask"].tolist()

In [16]:
# Training batches. Dataset.shuffle returns a new dataset, so its result has
# to be kept for the shuffling to take effect; the buffer covers the whole
# split, which gives a full shuffle.
train_dataset = tf.data.Dataset.from_tensor_slices((train_input_ids, train_attention_mask, y_train))
train_dataset = train_dataset.shuffle(len(train_input_ids), seed=DEFAULT_RANDOM_STATE)
train_dataset = train_dataset.batch(BATCH_SIZE)

# Validation and test batches keep their original order: shuffling would not
# change the metrics, and the test predictions are later compared
# position-by-position against y_test.
validation_dataset = tf.data.Dataset.from_tensor_slices((val_input_ids, val_attention_mask, y_val))
validation_dataset = validation_dataset.batch(BATCH_SIZE)

test_dataset = tf.data.Dataset.from_tensor_slices((test_input_ids, test_attention_mask, y_test))
test_dataset = test_dataset.batch(BATCH_SIZE)

In [17]:
# Fine-tune for NR_EPOCHS epochs, evaluating on the validation batches after each epoch.
train(model, train_dataset, validation_dataset, train_steps_per_epoch=steps_per_epoch, val_steps_per_epoch=validation_steps, epochs=NR_EPOCHS)

Train Step: 50, Loss: 0.25402477383613586, Accuracy 0.8924999833106995, Rows: 800

Epoch 1, Validation Loss: 0.1464507132768631, Validation Accuracy 0.9437500238418579, Time: 107.40972948074341





In [18]:
# Collect the model's outputs for every test batch, in dataset order.
all_predictions = []
for token_ids, masks, _ in test_dataset:
  batch_scores = model(input_ids=token_ids, attention_mask=masks, training=False)
  all_predictions.extend(batch_scores)

In [19]:
# classification_report is already imported in the notebook's import cell.
# Threshold the sigmoid scores at 0.5 to obtain hard 0/1 labels, then compare
# them with the true test labels.
test_pred_labels = (np.array(all_predictions) >= 0.5).astype(int).reshape(-1)
print(classification_report(y_test, test_pred_labels))

              precision    recall  f1-score   support

           0       0.89      0.98      0.93       171
           1       0.98      0.91      0.95       229

    accuracy                           0.94       400
   macro avg       0.94      0.94      0.94       400
weighted avg       0.94      0.94      0.94       400



In [20]:
# exemplos de frases e predição
frases = [
  "O produto é de baixa qualidade e chegou atrasado.",
  "O produto é muito bom mas não parece atender as minhas necessidades.",
  "Parabéns, você é excelente em fazer péssimos produtos.",
]

t_frases = tokenizer(frases, padding="max_length", truncation=True, max_length=512)
t_input_ids = np.array(t_frases['input_ids']).reshape(-1, 512)
t_attention_mask = np.array(t_frases['attention_mask']).reshape(-1, 512)

preds = model(t_input_ids, attention_mask=t_attention_mask, training=False).numpy().reshape(-1)

for f, p in zip(frases, preds):
  display(Markdown(f'__{f}__: {p*100:.02f}%'))

__O produto é de baixa qualidade e chegou atrasado.__: 3.13%

__O produto é muito bom mas não parece atender as minhas necessidades.__: 4.95%

__Parabéns, você é excelente em fazer péssimos produtos.__: 93.44%