## FNetEncoder e TransformerEncoder

### Import de módulos

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
from keras import layers
import re
import string

import keras_nlp
import random
import os

from tensorflow import keras
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

keras.utils.set_random_seed(42)

### Principais parametros

In [2]:
BATCH_SIZE = 64
EPOCHS = 3
MAX_SEQUENCE_LENGTH = 512 # Somente considerar as 512 primeiras palavras de cada review
VOCAB_SIZE = 15000 # Somente considerar 15000 palavras
EMBED_DIM = 128
INTERMEDIATE_DIM = 32

### Carregando dados

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/prof-renato/data/main/yelp.csv')[['text', 'stars']]
df.head(10)

Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,5
1,I have no idea why some people give bad review...,5
2,love the gyro plate. Rice is so good and I als...,4
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5
4,General Manager Scott Petello is a good egg!!!...,5
5,"Quiessence is, simply put, beautiful. Full wi...",4
6,Drop what you're doing and drive here. After I...,5
7,"Luckily, I didn't have to travel far to make m...",4
8,Definitely come for Happy hour! Prices are ama...,4
9,Nobuo shows his unique talents with everything...,5


In [4]:
df['stars'].value_counts().unique

<bound method Series.unique of 4    3526
5    3337
3    1461
2     927
1     749
Name: stars, dtype: int64>

### Separando inputs e outputs

In [5]:
X = df['text'].values
y = df['stars'].values

In [6]:
y = pd.get_dummies(y)
y.head(10)

Unnamed: 0,1,2,3,4,5
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,1,0
3,0,0,0,0,1
4,0,0,0,0,1
5,0,0,0,1,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,0,1


### Convertendo para formato do tensorflow

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=42)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Testing set size: {len(test_dataset)}")

2022-09-05 15:04:47.357007: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-05 15:04:47.409642: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-05 15:04:47.410018: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


Training set size: 5600
Validation set size: 1400
Testing set size: 3000


2022-09-05 15:04:47.411646: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-05 15:04:47.412395: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-05 15:04:47.412717: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-05 15:04:47.412914: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

### Pré-processamento de dados

In [8]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, f"[{re.escape(string.punctuation)}]", ""
    )

vectorizer = layers.TextVectorization(3000, standardize=custom_standardization, output_sequence_length=150)

# Para um conjunto de dados muito grande, o adapt pode remover cópias sobressalentes do conjunto de dados da memória.
vectorizer.adapt(train_dataset.map(lambda x, y: x, num_parallel_calls=tf.data.AUTOTUNE).batch(BATCH_SIZE))

def vectorize_text(text, label):
    text = vectorizer(text)
    return text, label

# Vetorização dos dados e pré-busca/buffer assíncrona dos dados para melhor desempenho na GPU.
train_ds = train_dataset.batch(BATCH_SIZE).map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_dataset.batch(BATCH_SIZE).map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_dataset.batch(BATCH_SIZE).map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)

### Arquitetura do modelo

In [9]:
inputs = keras.Input(shape=(None,), dtype="int64")
x = layers.Embedding(input_dim=MAX_SEQUENCE_LENGTH, output_dim=EMBED_DIM)(inputs)
x = layers.Bidirectional(layers.LSTM(INTERMEDIATE_DIM, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(INTERMEDIATE_DIM, return_sequences=True))(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(5, activation="softmax")(x)
lstm_model = keras.Model(inputs, outputs)
lstm_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 128)         65536     
                                                                 
 bidirectional (Bidirectiona  (None, None, 64)         41216     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, None, 64)         24832     
 nal)                                                            
                                                                 
 global_average_pooling1d (G  (None, 64)               0         
 lobalAveragePooling1D)                                          
                                                             

In [10]:
lstm_model.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=[
            'accuracy',
        ],
    )

### Treinamento

In [11]:
lstm_model.fit(
    train_ds,
    epochs=20,
    validation_data=val_ds,
)

Epoch 1/20


2022-09-05 15:04:57.769895: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8204


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fe2605589d0>

### Avaliação

In [12]:
lstm_model.evaluate(test_ds, verbose=1, return_dict=True)



{'loss': 1.4231014251708984, 'accuracy': 0.4546666741371155}

## Comparativo de modelos

### Carregando dataset

In [13]:
dataset = tfds.load(
    "imdb_reviews",
    split="train + test",
    as_supervised=True,
    batch_size=-1,
    shuffle_files=False,
)
reviews, labels = tfds.as_numpy(dataset)

### Separando conjuntos de dados

In [14]:
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.30, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=42)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Testing set size: {len(test_dataset)}")

Training set size: 28000
Validation set size: 7000
Testing set size: 15000


### Pré-processamento de dados

In [15]:
# Vetorização dos dados e pré-busca/buffer assíncrona dos dados para melhor desempenho na GPU.
train_ds = train_dataset.batch(BATCH_SIZE).map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_dataset.batch(BATCH_SIZE).map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_dataset.batch(BATCH_SIZE).map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)

### Arquitetura de modelo 1: LSTM

In [16]:
inputs = keras.Input(shape=(None,), dtype="int64")
x = layers.Embedding(input_dim=MAX_SEQUENCE_LENGTH, output_dim=EMBED_DIM)(inputs)
x = layers.Bidirectional(layers.LSTM(INTERMEDIATE_DIM, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(INTERMEDIATE_DIM, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(INTERMEDIATE_DIM, return_sequences=True))(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
lstm_model = keras.Model(inputs, outputs)
lstm_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 128)         65536     
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 64)         41216     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, None, 64)         24832     
 nal)                                                            
                                                                 
 bidirectional_4 (Bidirectio  (None, None, 64)         24832     
 nal)                                                            
                                                           

In [17]:
lstm_model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=[
            'accuracy',
        ],
    )

#### Treinamento

In [18]:
lstm_model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fe2540ef2d0>

#### Avaliação

In [19]:
lstm_model.evaluate(test_ds, verbose=1, return_dict=True)



{'loss': 0.45852214097976685, 'accuracy': 0.7825999855995178}

In [20]:
inputs = keras.Input(shape=(None,), dtype="int64")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(inputs) ## Além de transformar o dados esparso em denso, a posição do token é levada em conta para que dar um sentido semantico ao dado

x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(x)
x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(x)
x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(x)

x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

fnet_model = keras.Model(inputs, outputs)
fnet_model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 token_and_position_embeddin  (None, None, 128)        1985536   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 f_net_encoder (FNetEncoder)  (None, None, 128)        8864      
                                                                 
 f_net_encoder_1 (FNetEncode  (None, None, 128)        8864      
 r)                                                              
                                                                 
 f_net_encoder_2 (FNetEncode  (None, None, 128)        8864      
 r)                                                        

In [21]:
fnet_model.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

#### Treinamento

In [22]:
fnet_model.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fe1b47dadd0>

#### Avaliação

In [23]:
fnet_model.evaluate(test_ds, verbose=1, return_dict=True)



{'loss': 0.44057032465934753, 'accuracy': 0.8101999759674072}

In [24]:
NUM_HEADS = 2
inputs = keras.Input(shape=(None,), dtype="int64", name="input_ids")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(inputs)

x = keras_nlp.layers.TransformerEncoder(intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS)(x)
x = keras_nlp.layers.TransformerEncoder(intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS)(x)
x = keras_nlp.layers.TransformerEncoder(intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS)(x)

x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

transformer_model = keras.Model(inputs, outputs)
transformer_model.summary()


Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids (InputLayer)      [(None, None)]            0         
                                                                 
 token_and_position_embeddin  (None, None, 128)        1985536   
 g_1 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_encoder (Transf  (None, None, 128)        74912     
 ormerEncoder)                                                   
                                                                 
 transformer_encoder_1 (Tran  (None, None, 128)        74912     
 sformerEncoder)                                                 
                                                                 
 transformer_encoder_2 (Tran  (None, None, 128)        7491

In [25]:
transformer_model.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

#### Treinamento

In [26]:
transformer_model.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fe1b40a5850>

#### Avaliação

In [27]:
transformer_model.evaluate(test_ds, verbose=1, return_dict=True)



{'loss': 0.5258623361587524, 'accuracy': 0.7649999856948853}

## Checkpoint 4.6
Aplicar FnetEncoder e TransformerEncoder na base do [yelp](https://raw.githubusercontent.com/prof-renato/data/main/yelp.csv).
Qual modelo performa melhor? 