# Minicurso Processamento de Linguagem Natural - Prática 3

Autores:
* Fernando Sola Pereira
* Eduardo Soares de Paiva

In [None]:
# Install the Hugging Face Transformers package quietly (-q); no version is pinned here.
!pip -q install transformers

[K     |████████████████████████████████| 4.0 MB 33.3 MB/s 
[K     |████████████████████████████████| 6.6 MB 64.0 MB/s 
[K     |████████████████████████████████| 596 kB 65.9 MB/s 
[K     |████████████████████████████████| 77 kB 9.2 MB/s 
[K     |████████████████████████████████| 880 kB 63.3 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
##########################################
# libs python
##########################################
import re
import time
import warnings

##########################################
# libs externas
##########################################
from IPython.display import display, HTML, Latex, Markdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Dense

from transformers import AutoTokenizer
from transformers import TFBertModel

##########################################
# settings
##########################################
warnings.filterwarnings('ignore')  # silence every Python warning for the rest of the notebook
sns.set_style("whitegrid")
pd.options.display.max_rows = 2000  # allow pandas to render up to 2000 rows
pd.options.display.max_colwidth = 200  # displayed cell text is cut at 200 characters

##########################################
# global variables
##########################################
DEFAULT_RANDOM_STATE = 42  # seed handed to train_test_split so the splits are reproducible

In [None]:
##########################################
# dataset
##########################################
# Download the labelled reviews CSV and give its two columns fixed names.
df_lame = pd.read_csv(
    'https://docs.google.com/uc?export=download&id=1_EKfnjomkWks4VqTMIpcEIb6nB5P0Xz2'
)
df_lame.columns = ['label', 'text']

# Binary target: 1 when the raw label is 'positivo', 0 for anything else.
df_lame['label'] = df_lame['label'].eq('positivo').astype(int)

# Optional (disabled): stratified subsample of SAMPLE_SIZE rows that keeps the
# original class proportions. Uncomment to iterate faster on a smaller dataset.
# SAMPLE_SIZE = 10000
# s_labels = df_lame['label'].value_counts(normalize=True).sort_index()
# df_lame = pd.concat([
#     df_lame[df_lame['label']==0].sample(int(SAMPLE_SIZE * s_labels[0]), random_state=DEFAULT_RANDOM_STATE), # ~0.427427
#     df_lame[df_lame['label']==1].sample(int(SAMPLE_SIZE * s_labels[1]), random_state=DEFAULT_RANDOM_STATE), # ~0.572573
# ])

In [None]:
# Maximum number of tokens per review; shorter texts are padded and longer ones truncated.
MAX_SEQ_LENGTH = 512


def _add_bert_features(frame, tokenizer, max_length=MAX_SEQ_LENGTH):
    """Return a copy of ``frame`` with the tokenizer outputs added as columns.

    Args:
        frame: DataFrame with a ``text`` column holding the raw reviews.
        tokenizer: Hugging Face tokenizer used to encode the texts.
        max_length: Fixed sequence length used for padding and truncation.

    Returns:
        A new DataFrame with ``input_ids``, ``token_type_ids`` and
        ``attention_mask`` columns; each cell holds a list of ``max_length`` ints.
    """
    # Encode the whole column in one call rather than one tokenizer call per row.
    encoded = tokenizer(
        frame["text"].tolist(),
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    # Building explicit Series keeps each encoded list in a single cell and aligns
    # it with the shuffled index; `assign` returns a new frame, so the slice that
    # train_test_split produced is never written to.
    return frame.assign(**{
        column: pd.Series(encoded[column], index=frame.index)
        for column in ("input_ids", "token_type_ids", "attention_mask")
    })


# Hold out 30% of the data as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df_lame.drop(columns='label'),
    df_lame['label'],
    stratify=df_lame['label'],
    test_size=.3,
    random_state=DEFAULT_RANDOM_STATE,
)
# Carve the validation set out of the TRAINING portion only. Splitting the full
# dataset a second time would put test rows into the train and validation sets.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    test_size=.2,
    random_state=DEFAULT_RANDOM_STATE,
)

tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

X_train = _add_bert_features(X_train, tokenizer)
X_val = _add_bert_features(X_val, tokenizer)
X_test = _add_bert_features(X_test, tokenizer)

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/205k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Preview the first training rows together with the tokenizer columns.
X_train.head()

Unnamed: 0,text,input_ids,token_type_ids,attention_mask
42913,Excelente. Compro semprep este site. Entrega rápida. Parabéns....,"[101, 18116, 403, 119, 2174, 157, 1684, 22291, 860, 3834, 119, 1524, 421, 6740, 119, 959, 22295, 13986, 119, 119, 119, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
54296,"O produto não é bom nem ruim, apenas não é um item que se torne indispensável no meu dia a dia.","[101, 231, 3576, 346, 253, 4062, 2798, 16173, 117, 820, 346, 253, 222, 18685, 179, 176, 745, 22279, 21229, 1050, 202, 7343, 644, 123, 644, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
81183,Muito bom.. Ficou um pouco pequeno pro meu notebook Samsung 15.6 mas foi erro meu não ter ligo direito as especificações. Mesmo assim é um material que estica um pouco e coube perfeitamente. Muito...,"[101, 12925, 4062, 119, 119, 13639, 222, 1695, 3265, 258, 7343, 202, 185, 9081, 3021, 2515, 833, 997, 119, 888, 449, 262, 7441, 7343, 346, 370, 1692, 22280, 2368, 260, 17117, 315, 119, 4823, 1016,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
83273,Atendeu minhas expectativas... super recomendo ! Imagem e som excelentes !,"[101, 7354, 2071, 7122, 22281, 15686, 119, 119, 119, 1229, 9099, 22280, 106, 8737, 705, 122, 4081, 19710, 106, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
31241,"Ele ficou lindo na minha Sala, super funcional, Adorei.","[101, 787, 1767, 1863, 243, 229, 7122, 8088, 117, 1229, 8346, 117, 19183, 8393, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Load the pretrained Portuguese BERT encoder as a TensorFlow model;
# from_pt=True asks transformers to convert the PyTorch checkpoint weights.
bert_model = TFBertModel.from_pretrained("neuralmind/bert-base-portuguese-cased", from_pt=True)

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [None]:
# Inspect the loaded encoder's configuration. The original cell held an unfinished
# `bert_model.` expression, which is a SyntaxError and stops Restart & Run All.
bert_model.config

In [None]:
# TODO: show the BERT output

In [None]:
class BertClassifier(tf.keras.Model):
    """BERT encoder followed by one dense sigmoid layer.

    Args:
        bert: Pretrained ``TFBertModel`` used as the encoder (fine-tuned together
            with the head, since its variables belong to this model).
        num_classes: Number of output units of the dense head. With
            ``num_classes=1`` the output is a single sigmoid score per example.
    """

    def __init__(self, bert: TFBertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = Dense(num_classes, activation='sigmoid')

    @tf.function
    def call(self, input_ids, attention_mask=None, training=False):
        """Run the encoder and the classification head.

        Args:
            input_ids: Token ids produced by the tokenizer.
            attention_mask: Optional mask marking real tokens (1) vs padding (0).
            training: Whether the forward pass runs in training mode. It is
                forwarded to the encoder so that dropout is active only while
                training.

        Returns:
            The sigmoid output of the dense head.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask, training=training)
        # Index 1 of the encoder output is the pooled sequence representation
        # (see the Hugging Face TFBertModel documentation).
        pooled_output = outputs[1]
        return self.classifier(pooled_output)


model = BertClassifier(bert_model, num_classes=1)

In [None]:
NR_EPOCHS = 3
BATCH_SIZE = 16

# Number of full batches per pass over each split.
steps_per_epoch = X_train.shape[0] // BATCH_SIZE
# Derived from the validation split (it was previously computed from the test split).
validation_steps = X_val.shape[0] // BATCH_SIZE

# Loss function, plus running means used to report the loss during training/validation.
loss_object = tf.keras.losses.BinaryCrossentropy()
train_loss = tf.keras.metrics.Mean(name='train_loss')
validation_loss = tf.keras.metrics.Mean(name='test_loss')

# Optimizer
total_steps = steps_per_epoch * NR_EPOCHS  # total optimisation steps; not consumed by the optimizer below
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Metrics. NOTE: despite the "auc" in their names, both track binary accuracy.
train_auc_metrics = tf.metrics.BinaryAccuracy()
validation_auc_metrics = tf.metrics.BinaryAccuracy()

In [None]:
# def create_dataset(data_tuple, epochs=1, batch_size=1, buffer_size=100, train=True):
#     dataset = tf.data.Dataset.from_tensor_slices(data_tuple)
#     # if train:
#     #     dataset = dataset.shuffle(buffer_size=buffer_size)
#     # dataset = dataset.repeat(epochs)
#     dataset = dataset.batch(batch_size)
#     # if train:
#     #     dataset = dataset.prefetch(1)
#     return dataset


@tf.function
def train_step(model, token_ids, masks, labels):
    """Run one optimisation step on a single batch.

    Args:
        model: Classifier being trained.
        token_ids: Batch of token ids.
        masks: Batch of attention masks matching ``token_ids``.
        labels: Batch of binary targets; cast to float32 before the loss.

    Side effects:
        Updates the model weights through the module-level ``optimizer`` and
        accumulates the batch loss / accuracy into ``train_loss`` and
        ``train_auc_metrics``.
    """
    labels = tf.dtypes.cast(labels, tf.float32)

    with tf.GradientTape() as tape:
        # training=True so dropout is active during fine-tuning.
        predictions = model(token_ids, attention_mask=masks, training=True)
        loss = loss_object(labels, predictions)

    # Gradients and the weight update are computed outside the tape context so
    # the tape does not record these operations as well.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(grads_and_vars=zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_auc_metrics.update_state(labels, predictions)


@tf.function
def validation_step(model, token_ids, masks, labels):
    """Evaluate one batch in inference mode, without updating any weights.

    Args:
        model: Classifier being evaluated.
        token_ids: Batch of token ids.
        masks: Batch of attention masks matching ``token_ids``.
        labels: Batch of binary targets; cast to float32 before the loss.

    Side effects:
        Accumulates the batch loss / accuracy into the module-level
        ``validation_loss`` and ``validation_auc_metrics``.
    """
    targets = tf.cast(labels, tf.float32)
    scores = model(token_ids, attention_mask=masks, training=False)

    validation_loss(loss_object(targets, scores))
    validation_auc_metrics.update_state(targets, scores)


def train(model, train_dataset, val_dataset, train_steps_per_epoch, val_steps_per_epoch, epochs):
    """Custom training loop: one pass over the training data, then one over validation, per epoch.

    Args:
        model: Classifier to train.
        train_dataset: Iterable of ``(token_ids, masks, labels)`` training batches.
        val_dataset: Iterable of ``(token_ids, masks, labels)`` validation batches.
        train_steps_per_epoch: Accepted for interface compatibility; not used by the loop.
        val_steps_per_epoch: Accepted for interface compatibility; not used by the loop.
        epochs: Number of epochs to run.

    Progress is printed every 50 training steps and once after each validation pass.
    """
    for epoch in range(epochs):
        print('=' * 50, f"EPOCH {epoch + 1}", '=' * 50)

        start = time.time()

        # Reset the running means so the reported values cover this epoch only
        # (the loss metrics used to accumulate across every epoch).
        train_loss.reset_states()
        train_auc_metrics.reset_states()
        for i, (token_ids, masks, labels) in enumerate(train_dataset):
            train_step(model, token_ids, masks, labels)
            if i % 50 == 0:
                print(f'Train Step: {i+1}, Loss: {train_loss.result()}, Accuracy {train_auc_metrics.result()}, Rows: {(i+1) * BATCH_SIZE}')

        validation_loss.reset_states()
        validation_auc_metrics.reset_states()
        for token_ids, masks, labels in val_dataset:
            validation_step(model, token_ids, masks, labels)

        print(f'\nEpoch {epoch+1}, Validation Loss: {validation_loss.result()}, Validation Accuracy {validation_auc_metrics.result()}, Time: {time.time()-start}\n')

        print('\n')


# Turn the per-row token lists into plain Python lists of lists, one pair
# (ids, attention mask) per split, ready for tf.data.Dataset.from_tensor_slices.
train_input_ids = X_train["input_ids"].tolist()
train_attention_mask = X_train["attention_mask"].tolist()

val_input_ids = X_val["input_ids"].tolist()
val_attention_mask = X_val["attention_mask"].tolist()

test_input_ids = X_test["input_ids"].tolist()
test_attention_mask = X_test["attention_mask"].tolist()

In [None]:
# Training data. `Dataset.shuffle` returns a NEW dataset, so its result has to be
# reassigned (the original call discarded it and the data was never shuffled).
# Shuffling happens before batching so that individual rows are mixed.
train_dataset = tf.data.Dataset.from_tensor_slices((train_input_ids, train_attention_mask, y_train))
train_dataset = train_dataset.shuffle(len(train_dataset), seed=DEFAULT_RANDOM_STATE)
train_dataset = train_dataset.batch(BATCH_SIZE)

# Validation data: evaluation does not depend on row order, so it is not shuffled.
validation_dataset = tf.data.Dataset.from_tensor_slices((val_input_ids, val_attention_mask, y_val))
validation_dataset = validation_dataset.batch(BATCH_SIZE)

# Test data: must stay in its original order, because the predictions are later
# compared positionally against y_test.
test_dataset = tf.data.Dataset.from_tensor_slices((test_input_ids, test_attention_mask, y_test))
test_dataset = test_dataset.batch(BATCH_SIZE)

# train_dataset = create_dataset((train_input_ids, train_attention_mask, y_train), epochs=NR_EPOCHS, batch_size=BATCH_SIZE)
# validation_dataset = create_dataset((test_input_ids, test_attention_mask, y_test), epochs=NR_EPOCHS, batch_size=BATCH_SIZE)



In [None]:
# Device-placement debugging switches (disabled); uncomment to log where ops run.
# tf.config.set_soft_device_placement(True)
# tf.debugging.set_log_device_placement(True)

# Fine-tune the classifier for NR_EPOCHS epochs, validating after each one.
train(model, train_dataset, validation_dataset, train_steps_per_epoch=steps_per_epoch, val_steps_per_epoch=validation_steps, epochs=NR_EPOCHS)

Train Step: 1, Loss: 0.7177930474281311, Accuracy 0.4375, Rows: 16
Train Step: 51, Loss: 0.256509929895401, Accuracy 0.8982843160629272, Rows: 816
Train Step: 101, Loss: 0.1849445104598999, Accuracy 0.9319307208061218, Rows: 1616
Train Step: 151, Loss: 0.17170076072216034, Accuracy 0.9387417435646057, Rows: 2416
Train Step: 201, Loss: 0.16438493132591248, Accuracy 0.9430969953536987, Rows: 3216
Train Step: 251, Loss: 0.15565107762813568, Accuracy 0.9454681277275085, Rows: 4016
Train Step: 301, Loss: 0.1541113555431366, Accuracy 0.947051465511322, Rows: 4816
Train Step: 351, Loss: 0.14687561988830566, Accuracy 0.9496082663536072, Rows: 5616
Train Step: 401, Loss: 0.143682062625885, Accuracy 0.9505922794342041, Rows: 6416
Train Step: 451, Loss: 0.14266610145568848, Accuracy 0.9513580799102783, Rows: 7216
Train Step: 501, Loss: 0.13896913826465607, Accuracy 0.9525948166847229, Rows: 8016
Train Step: 551, Loss: 0.13810397684574127, Accuracy 0.9530399441719055, Rows: 8816
Train Step: 601, L

In [None]:
# Scratch check of Dataset.shuffle on a tiny dataset: the shuffled dataset is the
# value RETURNED by shuffle(), so it is reassigned to `ds` here.
ds = tf.data.Dataset.from_tensor_slices([1,2,3])
ds = ds.shuffle(len(ds))

In [None]:
# Materialise the shuffled toy dataset to inspect the element order.
list(ds)

[<tf.Tensor: shape=(), dtype=int32, numpy=2>,
 <tf.Tensor: shape=(), dtype=int32, numpy=1>,
 <tf.Tensor: shape=(), dtype=int32, numpy=3>]

In [None]:
# Score every test batch in inference mode and collect the per-row outputs.
all_predictions = []
for token_ids, masks, _ in test_dataset:
    batch_scores = model(token_ids, attention_mask=masks, training=False)
    # Extending with a tensor appends one entry per row of the batch.
    all_predictions.extend(batch_scores)

In [None]:
# `classification_report` is already imported in the notebook's import cell,
# so it is not re-imported here.

# Sigmoid scores at or above this threshold are labelled positive (1).
DECISION_THRESHOLD = 0.5

# Flatten the collected (n_rows, 1) scores into a 1-D vector of 0/1 predictions,
# in the same order as y_test.
y_pred = (np.array(all_predictions) >= DECISION_THRESHOLD).astype(int).reshape(-1)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     10727
           1       0.99      0.97      0.98     14387

    accuracy                           0.97     25114
   macro avg       0.97      0.98      0.97     25114
weighted avg       0.98      0.97      0.97     25114



In [None]:
# Score one hand-written review with the fine-tuned classifier.
token_data = tokenizer(
    "O produto é interessante mas não parece atender as minhas necessidades.",
    padding="max_length",
    truncation=True,
    max_length=512,
)

# The tokenizer returns flat lists of 512 ints; add a leading batch axis -> (1, 512).
t_input_ids = np.asarray(token_data['input_ids'])[np.newaxis, :]
t_attention_mask = np.asarray(token_data['attention_mask'])[np.newaxis, :]

predictions = model(t_input_ids, attention_mask=t_attention_mask, training=False)
predictions.numpy()

array([[0.01512252]], dtype=float32)