## **Fase 1: Importando dependências**

In [2]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import csv
import matplotlib.pyplot as plt
# from google.colab import drive, files

In [3]:
try:
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds


## **Fase 2: Preprocessamento dos Dados**

### Loading data


In [4]:
# Just required for run it in google colab
# drive.mount('/content/drive')

In [5]:
cols= ["text"]
# path = "/content/drive/MyDrive/projeto/Projeto II - BDI Uniasselvi/Dataset AMAZON/amazon_train.ft.txt"
path = "./amazon_train.ft.txt"

f = open(path, encoding="latin")

data = []
for line in f:
    data_line = [line[9], line[11:]]
    data.append(data_line)

In [6]:
df = pd.DataFrame(data, columns=['label', 'text'])
df.head(7)


Unnamed: 0,label,text
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,"""Amazing!: This soundtrack is my favorite musi..."
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."
5,2,an absolute masterpiece: I am quite sure any o...
6,1,"""Buyer beware: This is a self-published book, ..."


### Limpeza

In [7]:
df['label'] = np.where(df['label'] == "2", 1, 0)
df = df[['text', 'label']]
df.head(7)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,1
1,The best soundtrack ever to anything.: I'm rea...,1
2,"""Amazing!: This soundtrack is my favorite musi...",1
3,Excellent Soundtrack: I truly like this soundt...,1
4,"Remember, Pull Your Jaw Off The Floor After He...",1
5,an absolute masterpiece: I am quite sure any o...,1
6,"""Buyer beware: This is a self-published book, ...",0


In [8]:
data_labels = df.label.values
set(data_labels)

{0, 1}

In [9]:
def clean(text):
  # text = BeautifulSoup(text, "lxml").get_text()
  text = re.sub(r"@[A-Za-z0-9]+", " ", text)
  text = re.sub(r"https?://[A-Za-z0-9./]+", " ", text)
  text = re.sub(r"[^A-Za-z0-9.:?!'\"]", " ", text)
  text = re.sub(r" +", " ", text)
  return text

In [10]:
data_clean = [clean(line) for line in df.text]

In [11]:
data_clean[1093]

"Holiday Tradition: when I was growing up during the week before Thanksgiving I would pull out my copy of 'A Charlie Brown Christmas' and would watch it constantly until Christmas its one of my FAVORITE holiday movies of all time. I recently started my own tradition with my little ones by introducing them to this movie and each year as the holidays start to arrive they beg me to pull it out and we watch it every day until Christmas. "

### Tokenização

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
num_words = 10000
oov_token = '<UNK>'
pad_type = 'post'
trunc_type = 'post'

In [14]:
# Tokenize our training data
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer.fit_on_texts(data_clean)

# Get our training data word index
word_index = tokenizer.word_index

# Encode training data sentences into sequences
train_sequences = tokenizer.texts_to_sequences(data_clean)

# Get max training sequence length
maxlen = max([len(x) for x in train_sequences])

# Pad the training sequences
train_padded = pad_sequences(train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)

### Split train / test

In [15]:
size = len(train_padded)
test_idx = np.random.randint(0, size, round(size/10))

In [16]:
test_inputs = train_padded[test_idx]
test_labels = data_labels[test_idx]

In [17]:
train_inputs = np.delete(train_padded, test_idx, axis=0)

In [18]:
train_labels = np.delete(data_labels, test_idx)

## **Fase 3: Construção do Modelo**

In [19]:
class DCNN(tf.keras.Model):
  def __init__(self, 
               vocab_size, 
               emb_dim=128, 
               nb_filters=50, 
               FFN_units=512,
               nb_classes=2,
               dropout_rate=0.1,
               training=False,
               name="dcnn"):
    super(DCNN, self).__init__(name=name)

    self.embeding = layers.Embedding(vocab_size, emb_dim)

    self.bigram = layers.Conv1D(filters=nb_filters,
                                kernel_size=2,
                                padding="valid",
                                activation="relu")
    
    self.pool_l = layers.GlobalMaxPool1D()

    self.trigram = layers.Conv1D(filters=nb_filters,
                                kernel_size=3,
                                padding="valid",
                                activation="relu")
    
    self.pool_2 = layers.GlobalMaxPool1D()

    self.fourgram = layers.Conv1D(filters=nb_filters,
                                kernel_size=4,
                                padding="valid",
                                activation="relu")
    
    self.pool_3 = layers.GlobalMaxPool1D()

    self.dense_l = layers.Dense(units=FFN_units, 
                                activation="relu")

    self.dropout = layers.Dropout(rate=dropout_rate)

    if nb_classes == 2:
      self.last_dense = layers.Dense(units=1, 
                                     activation="sigmoid")
    else:
      self.last_dense = layers.Dense(units=nb_classes,
                                     activation="softmax")
  def call(self, inputs, training):
    x = self.embeding(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool_l(x_1)
    x_2 = self.trigram(x)
    x_2 = self.pool_l(x_2)
    x_3 = self.trigram(x)
    x_3 = self.pool_l(x_3)

    merged = tf.concat([x_1, x_2, x_3], axis=-1)
    merged = self.dense_l(merged)
    merged = self.dropout(merged, training)
    output = self.last_dense(merged)

    return output

## **Fase 4: Treinamento dos dados**

### Config dos parâmetros

In [20]:
VOCAB_SIZE = num_words

EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 10

### Treino

In [21]:
model = DCNN(
    vocab_size=VOCAB_SIZE, 
    emb_dim=EMB_DIM, 
    nb_filters=EMB_DIM, 
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

2022-04-24 20:54:17.741622: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-24 20:54:18.290291: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5750 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [22]:
if NB_CLASSES == 2:
  model.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])
else:
  model.compile(loss="sparse_categorical_crossentropy",
                optimizer="adam",
                metrics=["sparse_categorical_accuracy"])

In [23]:
checkpoint_path = "/checkpoint"
ckpt = tf.train.Checkpoint(model=model)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print("Checkpoint carregado")

In [24]:
history = model.fit(train_inputs,
          train_labels,
          batch_size=BATCH_SIZE,
          epochs=NB_EPOCHS)

2022-04-24 20:54:18.401195: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 944908188 exceeds 10% of free system memory.


Epoch 1/10


2022-04-24 20:54:21.217135: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8400
2022-04-24 20:54:22.673478: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-04-24 20:54:22.765992: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## **Fase 5: Avaliação do modelo**

In [29]:
test_loss, test_acc = model.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE, verbose=2)

3277/3277 - 11s - loss: 0.4197 - accuracy: 0.9256 - 11s/epoch - 3ms/step


In [30]:
print(test_acc)

0.925556480884552


In [39]:
predict_positive = tokenizer.texts_to_sequences(["I love this", 
                                                 "I can buy a new one of this every year", 
                                                 "Definitely a good choice!",
                                                 "I wanna more, this is awesome. I would like to have bought it earlier",
                                                 ])
predict_padded = pad_sequences(predict_positive, padding=pad_type, truncating=trunc_type, maxlen=maxlen)


In [40]:
for i in model.predict(predict_padded):
    print([i, "Positive" if i > 0.5 else "Negative"])

[array([0.9998785], dtype=float32), 'Positive']
[array([0.8869326], dtype=float32), 'Positive']
[array([0.9868096], dtype=float32), 'Positive']
[array([0.99965215], dtype=float32), 'Positive']


In [41]:
predict_positive = tokenizer.texts_to_sequences(["I dont like this book, seems very silly", 
                                                 "I cant recommend this brand again",
                                                 "Not sure if this is a good choice for you, for me its absolutely useless",
                                                 "This? Just for my haters. Its impossible to make this works. I guess its the worst game Ive bought in my life"
                                                 ])
predict_padded = pad_sequences(predict_positive, padding=pad_type, truncating=trunc_type, maxlen=maxlen)

In [42]:
for i in model.predict(predict_padded):
    print([i, "Positive" if i > 0.5 else "Negative"])

[array([4.9001927e-09], dtype=float32), 'Negative']
[array([1.424223e-07], dtype=float32), 'Negative']
[array([0.0028689], dtype=float32), 'Negative']
[array([0.01327246], dtype=float32), 'Negative']
