# Minicurso Processamento de Linguagem Natural - Prática 2

Autores:
* Fernando Sola Pereira
* Eduardo Soares de Paiva

In [1]:
import os
import warnings
import zipfile

import numpy as np
import pandas as pd
import requests
import tensorflow as tf

from gensim.models import KeyedVectors
from IPython.display import Markdown, HTML, display, Latex
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import (Dense, Input, LSTM, Embedding, Dropout,
  Activation, Flatten, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D,
  Bidirectional, GlobalMaxPool1D)
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

##########################################
# Settings
##########################################
# Silence all Python warnings and widen the pandas display limits.
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_colwidth', 200)

################################################################################
# Constants used throughout the notebook
################################################################################

# Folder used to store data files and model checkpoints
# (change it to suit your environment).
# By default the notebook is assumed to run on Google Colab with the
# user's Google Drive accessible.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/sbsi/data'

# Fixed seed for reproducibility.
DEFAULT_RANDOM_SEED = 42

# Dimension of each word-embedding vector.
EMBED_SIZE = 50

# Number of unique words kept in the vocabulary.
MAX_FEATURES = 10000

# Maximum number of words kept per document of the corpus.
MAX_LEN = 300

In [2]:
# Make sure DATA_PATH exists, mounting Google Drive first when running on Colab.
if not os.path.exists(DATA_PATH):
  try:
    from google.colab import drive
  except ImportError:
    # Only a missing google.colab package means "not on Colab"; any other
    # failure (mount error, permission problem) must surface instead of
    # being reported with this message.
    print('Não está executando no ambiente Google Colab!')
  else:
    drive.mount('/content/drive')

  # The existence check above runs before the mount, so the folder may
  # already exist once the drive is mounted; exist_ok avoids failing on that.
  os.makedirs(DATA_PATH, exist_ok=True)
else:
  print('Diretório existente!')

Diretório existente!


In [3]:
################################################################################
# Fetch the pre-trained embeddings from the NILC repository and cache them
# locally.
# http://nilc.icmc.usp.br/embeddings
# Example using the simplest embedding.
################################################################################
file_model = f'http://143.107.183.175:22980/download.php?file=embeddings/word2vec/cbow_s{EMBED_SIZE}.zip'
# The cached archive name carries the embedding size so that changing
# EMBED_SIZE never reuses an archive downloaded for another size.
vec_file = os.path.join(DATA_PATH, f'cbow_s{EMBED_SIZE}.zip')
# Text file that the embedding-loading cell later reads from DATA_PATH.
vec_txt = os.path.join(DATA_PATH, f'cbow_s{EMBED_SIZE}.txt')

if not os.path.exists(vec_txt):
  if not os.path.exists(vec_file):
    # Download to a temporary name and rename only on success, so an HTTP
    # error or an interrupted transfer never leaves a broken archive cached.
    vec_part = vec_file + '.part'
    with requests.get(file_model, stream=True, timeout=60) as response:
      response.raise_for_status()
      with open(vec_part, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
          f.write(chunk)
    os.replace(vec_part, vec_file)

  # Extraction is retried whenever the text file is missing, even if the
  # archive was already downloaded by a previous run.
  with zipfile.ZipFile(vec_file, 'r') as z:
    z.extractall(DATA_PATH)

In [4]:
################################################################################
# Product-review dataset from the Lojas Americanas website - B2W-Reviews01.
# A slightly reduced and balanced version of the original.
################################################################################
reviews_url = 'https://docs.google.com/uc?export=download&id=1_EKfnjomkWks4VqTMIpcEIb6nB5P0Xz2'
df_review = pd.read_csv(reviews_url)
df_review.columns = ['label','text']

# Binary target: 1 for 'positivo', 0 for anything else.
df_review['label'] = df_review['label'].eq('positivo').astype('int64')

###############################################################################
# Uncomment if a smaller sample is needed.
###############################################################################
# SAMPLE_SIZE = 1000
# s_labels = df_review['label'].value_counts(normalize=True).sort_index()
# df_review = pd.concat([
#     df_review[df_review['label']==0].sample(int(SAMPLE_SIZE * s_labels[0]), random_state=DEFAULT_RANDOM_SEED), # ~0.427427
#     df_review[df_review['label']==1].sample(int(SAMPLE_SIZE * s_labels[1]), random_state=DEFAULT_RANDOM_SEED), # ~0.572573
# ])

df_review.head()

Unnamed: 0,label,text
0,1,"A entrega foi no prazo, as americanas estão de parabéns. A smart tv é muito boa, a navegação na internete e pelos aplicativos e excelente, não trava, sem falar da imagem que é de surpreender. reco..."
1,1,"Excelente produto, por fora em material acrílico super resistente e por dentro em adamantio, faz milagre com qualquer bebida. Sugiro aproveitarem a promoção antes que acabe."
2,1,"produto mto bom, com essa garrafinha vc pode até servir água pro megazord. To pensando em vender minha tv pra comprar 1 garrafa dessa. RECOMENDO"
3,1,O barulho e minimo e o vento é bem forte na velocidade 2
4,0,MEU PRODUTO NAO FOI ENTREGUE E A AMERICANAS ESTA DESCONTANDO NA FATURA DO MEU CARTÃO


In [5]:
################################################################################
# Split the dataset into training and test sets, then turn the texts into
# fixed-length integer sequences.
################################################################################

X_train_o, X_test_o, y_train, y_test = train_test_split(
    df_review[['text']],
    df_review['label'],
    test_size=0.2,
    random_state=DEFAULT_RANDOM_SEED,
)

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(X_train_o['text'].tolist())

# Pad and truncate at the end of each sequence so every row has MAX_LEN ids.
pad_options = dict(maxlen=MAX_LEN, padding='post', truncating='post')
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train_o['text']), **pad_options)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test_o['text']), **pad_options)

In [6]:
# Print the raw text of the first training review, then display its padded
# sequence of token ids as the cell output.
first_review_text = X_train_o['text'].iloc[0]
print(first_review_text)
X_train[0]

Demoram um mês para dizer que não tem o produto. Minha reclamação não é sobre o produto. É sim a empresa que fornece


array([2676,   13,  152,   12,  489,    7,    5,   41,    2,    6,   46,
        405,    5,   10,  181,    2,    6,   10,  347,    4,  118,    7,
       3067,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [7]:
# Map the first training sequence back to words, skipping the padding id 0.
decoded_words = []
for token_id in X_train[0]:
  if token_id != 0:
    decoded_words.append(tokenizer.index_word[token_id])
print(decoded_words)

['demoram', 'um', 'mês', 'para', 'dizer', 'que', 'não', 'tem', 'o', 'produto', 'minha', 'reclamação', 'não', 'é', 'sobre', 'o', 'produto', 'é', 'sim', 'a', 'empresa', 'que', 'fornece']


In [8]:
################################################################################
# Pre-trained word vectors
################################################################################

w2vec_pretrained = os.path.join(DATA_PATH, f'cbow_s{EMBED_SIZE}.txt')
w2vec_model = KeyedVectors.load_word2vec_format(w2vec_pretrained)

# NOTE: despite the name (kept unchanged), this is the tokenizer's
# index -> word mapping.
word_index = tokenizer.index_word

# Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of N words
# needs N + 1 rows; the matrix is still capped at MAX_FEATURES rows.
vocabulary_size = min(len(word_index) + 1, MAX_FEATURES)

# Seeded generator so the random vectors given to words missing from the
# pre-trained model are the same on every run.
rng = np.random.default_rng(DEFAULT_RANDOM_SEED)

# Row i holds the vector of the word with tokenizer id i; row 0 stays zero.
embedding_matrix = np.zeros((vocabulary_size, EMBED_SIZE))
for i, word in word_index.items():
    if i >= vocabulary_size:
        continue
    try:
        embedding_matrix[i] = w2vec_model[word]
    except KeyError:
        # Word absent from the pre-trained vectors: random initialisation.
        embedding_matrix[i] = rng.normal(0, np.sqrt(0.25), EMBED_SIZE)

In [9]:
# Look up the pre-trained vector for the word 'bom' directly in the word2vec model.
w2vec_model['bom']

array([ 0.153647, -0.465788,  0.251488,  0.318377, -0.480387,  0.19145 ,
        0.00732 ,  0.006801,  0.02494 , -0.205871,  0.034185,  0.067732,
       -0.192845,  0.024924,  0.214222, -0.316504, -0.107316, -0.102628,
       -0.04959 ,  0.242558,  0.269601, -0.386686, -0.300041,  0.2861  ,
        0.212669,  0.171129,  0.048528, -0.088312,  0.033313,  0.245244,
       -0.233488,  0.006428,  0.109928,  0.313075,  0.224555, -0.103989,
        0.560228, -0.382119, -0.008523,  0.100352,  0.230134,  0.214044,
       -0.333335, -0.155709, -0.37962 , -0.134065,  0.032078,  0.181226,
       -0.337817, -0.110887], dtype=float32)

In [10]:
# Integer id that the fitted tokenizer assigned to the word 'bom'.
tokenizer.word_index['bom']

22

In [11]:
# Row of the embedding matrix for the word 'bom'. The id is looked up in the
# tokenizer rather than hard-coded, so the check stays valid if the data or
# the tokenizer changes.
embedding_matrix[tokenizer.word_index['bom']]

array([ 0.15364701, -0.46578801,  0.251488  ,  0.31837699, -0.480387  ,
        0.19145   ,  0.00732   ,  0.006801  ,  0.02494   , -0.205871  ,
        0.034185  ,  0.067732  , -0.192845  ,  0.024924  ,  0.214222  ,
       -0.316504  , -0.107316  , -0.102628  , -0.04959   ,  0.242558  ,
        0.26960099, -0.386686  , -0.30004099,  0.2861    ,  0.212669  ,
        0.171129  ,  0.048528  , -0.088312  ,  0.033313  ,  0.245244  ,
       -0.23348799,  0.006428  ,  0.109928  ,  0.31307501,  0.224555  ,
       -0.103989  ,  0.56022799, -0.382119  , -0.008523  ,  0.100352  ,
        0.230134  ,  0.214044  , -0.33333501, -0.155709  , -0.37961999,
       -0.134065  ,  0.032078  ,  0.181226  , -0.33781701, -0.110887  ])

In [12]:
################################################################################
# Example of an LSTM network using the pre-trained embeddings
################################################################################

# Seed TensorFlow's global random generator so its random operations
# (e.g. weight initialisation, dropout) are repeatable across runs.
tf.random.set_seed(DEFAULT_RANDOM_SEED)

model = Sequential()
# Embedding layer initialised with the pre-trained matrix and fine-tuned
# during training; mask_zero makes the LSTM ignore the padding id 0.
model.add(Embedding(input_dim=vocabulary_size, 
                           output_dim=EMBED_SIZE, 
                           input_length=MAX_LEN,
                           weights=[embedding_matrix],
                           mask_zero=True,
                           trainable=True))
model.add(LSTM(EMBED_SIZE))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Keep only the weights of the epoch with the best validation accuracy.
checkpoint_filepath = os.path.join(DATA_PATH, 'modelo_keras_lstm.checkpoint')
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    monitor="val_binary_accuracy",
    verbose=0,
    save_best_only=True,
    save_weights_only=True,
    mode="max",
    save_freq="epoch",
)

metrics = [
    tf.keras.metrics.BinaryAccuracy(),
    # tf.keras.metrics.AUC(name='auc', num_thresholds=200, curve='ROC', summation_method='interpolation'),
]

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
model.summary()
# Keep the training history in a variable instead of leaving its repr as the
# cell output.
history_lstm = model.fit(X_train, y_train, batch_size=16, epochs=10, validation_split=0.2, shuffle=True, verbose=1, callbacks=[model_checkpoint_callback])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 50)           500000    
                                                                 
 lstm (LSTM)                 (None, 50)                20200     
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 520,251
Trainable params: 520,251
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff2083aea10>

In [13]:
# Restore the checkpointed weights and score the LSTM on the test set.
model.load_weights(checkpoint_filepath)
y_pred = model.predict(X_test)

# Threshold the sigmoid outputs at 0.5 to obtain a flat vector of 0/1 labels.
y_pred_labels = (np.asarray(y_pred) >= 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred_labels))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94      7170
           1       0.97      0.95      0.96      9573

    accuracy                           0.95     16743
   macro avg       0.95      0.95      0.95     16743
weighted avg       0.95      0.95      0.95     16743



In [14]:
# Example sentences and the LSTM's predicted score for each one.
frases = [
  "O produto é de baixa qualidade e chegou atrasado.",
  "O produto é muito bom mas não parece atender as minhas necessidades.",
  "Parabéns, você é excelente em fazer péssimos produtos.",
]

# Same preprocessing as the training data: token ids, padded/truncated at the end.
frases_ids = tokenizer.texts_to_sequences(frases)
t_frases = pad_sequences(frases_ids, maxlen=MAX_LEN, padding='post', truncating='post')
preds = model.predict(t_frases).reshape(-1)

for frase, prob in zip(frases, preds):
  display(Markdown(f'__{frase}__: {prob*100:.02f}%'))

__O produto é de baixa qualidade e chegou atrasado.__: 0.73%

__O produto é muito bom mas não parece atender as minhas necessidades.__: 37.63%

__Parabéns, você é excelente em fazer péssimos produtos.__: 99.81%

In [15]:
################################################################################
# Example of a CNN using the pre-trained embeddings
################################################################################

# Seed TensorFlow's global random generator so its random operations
# (e.g. weight initialisation) are repeatable across runs.
tf.random.set_seed(DEFAULT_RANDOM_SEED)

model1 = Sequential()
# Embedding layer initialised with the pre-trained matrix and fine-tuned
# during training.
model1.add(Embedding(input_dim=vocabulary_size, 
                           output_dim=EMBED_SIZE, 
                           input_length=MAX_LEN,
                           weights=[embedding_matrix],
                           trainable=True))
# 8 filters over windows of 5 consecutive tokens, then max over the sequence.
model1.add(Conv1D(8, 5, activation='relu'))
model1.add(GlobalMaxPooling1D())
model1.add(Dense(1, activation='sigmoid'))

# Keep only the weights of the epoch with the best validation accuracy.
checkpoint_filepath = os.path.join(DATA_PATH, 'modelo_keras_cnn.checkpoint')
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    monitor="val_binary_accuracy",
    verbose=0,
    save_best_only=True,
    save_weights_only=True,
    mode="max",
    save_freq="epoch",
)


# Optional extra metrics are left commented out. NOTE(review): the `tfa`
# entries reference a name this notebook does not import.
metrics = [
    tf.keras.metrics.BinaryAccuracy(),
    # tf.keras.metrics.AUC(name='auc', num_thresholds=200, curve='ROC', summation_method='interpolation'),
    # tf.keras.metrics.Recall(name='recall'),
    # tf.keras.metrics.Precision(name='precision'),
    # tfa.metrics.F1Score(name='f1', num_classes=1, threshold=0.5),
    # tfa.metrics.MatthewsCorrelationCoefficient(name='mcc', num_classes=1),
    # tf.keras.metrics.TruePositives(name='tp'),
    # tf.keras.metrics.TrueNegatives(name='tn'),
    # tf.keras.metrics.FalsePositives(name='fp'),
    # tf.keras.metrics.FalseNegatives(name='fn'),
]

model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
model1.summary()
history = model1.fit(X_train, y_train, batch_size=16, epochs=10, 
                     validation_split=.2, shuffle=True, verbose=1, 
                     callbacks=[model_checkpoint_callback])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 300, 50)           500000    
                                                                 
 conv1d (Conv1D)             (None, 296, 8)            2008      
                                                                 
 global_max_pooling1d (Globa  (None, 8)                0         
 lMaxPooling1D)                                                  
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 502,017
Trainable params: 502,017
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# Restore the checkpointed weights and score the CNN on the test set.
model1.load_weights(checkpoint_filepath)
y_pred = model1.predict(X_test)

# Use the metric through the already-imported `tf` namespace (the same
# spelling as in the model cells) instead of importing it mid-notebook.
bacc = tf.keras.metrics.BinaryAccuracy()
bacc.update_state(y_test, y_pred)
bacc.result().numpy()

0.9514424

In [17]:
# Example sentences and the CNN's predicted score for each one.
frases = [
  "O produto é de baixa qualidade e chegou atrasado.",
  "O produto é muito bom mas não parece atender as minhas necessidades.",
  "Parabéns, você é excelente em fazer péssimos produtos.",
]

# Same preprocessing as the training data: token ids, padded/truncated at the end.
frases_ids = tokenizer.texts_to_sequences(frases)
t_frases = pad_sequences(frases_ids, maxlen=MAX_LEN, padding='post', truncating='post')
preds = model1.predict(t_frases).reshape(-1)

for frase, prob in zip(frases, preds):
  display(Markdown(f'__{frase}__: {prob*100:.02f}%'))

__O produto é de baixa qualidade e chegou atrasado.__: 0.01%

__O produto é muito bom mas não parece atender as minhas necessidades.__: 62.92%

__Parabéns, você é excelente em fazer péssimos produtos.__: 99.44%