## **Step 1: Importing dependencies**

In [1]:
import numpy as np
import re
import pandas as pd
from bs4 import BeautifulSoup
import csv
import matplotlib.pyplot as plt
# from google.colab import drive, files

In [2]:
try:
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow.keras import layers
import tqdm as notebook_tqdm

import tensorflow_datasets as tfds
from keras.utils.vis_utils import plot_model
print(tf.test.gpu_device_name())


  from .autonotebook import tqdm as notebook_tqdm


/device:GPU:0


2022-07-16 09:45:20.899083: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-16 09:45:21.273763: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /device:GPU:0 with 5776 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


## **Step 2: Data preprocessing**

### Loading data


In [3]:
# Just required for run it in google colab
# drive.mount('/content/drive')

In [4]:
cols= ["text"]
# path = "/content/drive/MyDrive/projeto/Projeto II - BDI Uniasselvi/Dataset AMAZON/amazon_train.ft.txt"
path = "./amazon.ft.txt"

f = open(path, encoding="latin")

data = []
for line in f:
    data_line = [line[9], line[11:]]
    data.append(data_line)

In [5]:
df = pd.DataFrame(data, columns=['label', 'text'])
df.head(7)


Unnamed: 0,label,text
0,2,Great CD: My lovely Pat has one of the GREAT v...
1,2,One of the best game music soundtracks - for a...
2,1,Batteries died within a year ...: I bought thi...
3,2,"works fine, but Maha Energy is better: Check o..."
4,2,Great for the non-audiophile: Reviewed quite a...
5,1,DVD Player crapped out after one year: I also ...
6,1,"Incorrect Disc: I love the style of this, but ..."


### Cleaning and preparing

In [6]:
df['label'] = np.where(df['label'] == "2", 1, 0)
df = df[['text', 'label']]
df.head(7)

Unnamed: 0,text,label
0,Great CD: My lovely Pat has one of the GREAT v...,1
1,One of the best game music soundtracks - for a...,1
2,Batteries died within a year ...: I bought thi...,0
3,"works fine, but Maha Energy is better: Check o...",1
4,Great for the non-audiophile: Reviewed quite a...,1
5,DVD Player crapped out after one year: I also ...,0
6,"Incorrect Disc: I love the style of this, but ...",0


In [7]:
data_labels = df.label.values
set(data_labels)

{0, 1}

In [8]:
def clean(text):
  # text = BeautifulSoup(text).get_text()
  text = re.sub(r"@[A-Za-z0-9]+", " ", text)
  text = re.sub(r"https?://[A-Za-z0-9./]+", " ", text)
  text = re.sub(r"[^A-Za-z0-9]", " ", text)
  text = re.sub(r" +", " ", text)
  return text

In [9]:
data_clean = [clean(line) for line in df.text]

In [10]:
data_clean[1093]

'Great Read Suspenseful When I first started reading the book I kept putting it down it just wasn t grabbing my attention but once I got further into the book my curiosity got the best of me I couldn t put the book down I could not believe that a woman would do the things that she did Adrian Jenkins was confused and deceitful All hell does break loose Gregory s relationship with his mother was sad I understand her point for leaving but at the same time it was selfish It not only destroyed Gregory but it also destroyed his little sister I am definately looking forward to her next book '

### Tokenizing

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
num_words = 1000
oov_token = '<UNK>'
pad_type = 'post'
trunc_type = 'post'

In [13]:
# Tokenize our training data
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer.fit_on_texts(data_clean)

# Get our training data word index
word_index = tokenizer.word_index

# Encode training data sentences into sequences
train_sequences = tokenizer.texts_to_sequences(data_clean)

# Get max training sequence length
maxlen = max([len(x) for x in train_sequences])

# Pad the training sequences
train_padded = pad_sequences(train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)

In [14]:
print(train_padded)

[[ 32  83  24 ...   0   0   0]
 [ 27   8   2 ...   0   0   0]
 [783   1 599 ...   0   0   0]
 ...
 [  1   1   1 ...   0   0   0]
 [  1 120   1 ...   0   0   0]
 [  1  21   1 ...   0   0   0]]


### Split train / test

In [15]:
size = len(train_padded)
test_idx = np.random.randint(0, size, round(size/10))

In [16]:
test_inputs = train_padded[test_idx]
test_labels = data_labels[test_idx]

In [17]:
train_inputs = np.delete(train_padded, test_idx, axis=0)

In [18]:
train_labels = np.delete(data_labels, test_idx)

## **Step 3: Building the Model**

In [19]:
class DCNN(tf.keras.Model):
  def __init__(self, 
               vocab_size, 
               emb_dim=128, 
               nb_filters=50, 
               FFN_units=512,
               nb_classes=2,
               dropout_rate=0.1,
               training=False,
               name="dcnn"):
    super(DCNN, self).__init__(name=name)

    self.embeding = layers.Embedding(vocab_size, emb_dim)

    self.bigram = layers.Conv1D(filters=nb_filters,
                                kernel_size=2,
                                padding="valid",
                                activation="relu")
    
    self.pool_l = layers.GlobalMaxPool1D()

    self.trigram = layers.Conv1D(filters=nb_filters,
                                kernel_size=3,
                                padding="valid",
                                activation="relu")
    
    self.pool_2 = layers.GlobalMaxPool1D()

    self.fourgram = layers.Conv1D(filters=nb_filters,
                                kernel_size=4,
                                padding="valid",
                                activation="relu")
    
    self.pool_3 = layers.GlobalMaxPool1D()

    self.dense_l = layers.Dense(units=FFN_units, 
                                activation="relu")

    self.dropout = layers.Dropout(rate=dropout_rate)

    if nb_classes == 2:
      self.last_dense = layers.Dense(units=1, 
                                     activation="sigmoid")
    else:
      self.last_dense = layers.Dense(units=nb_classes,
                                     activation="softmax")
  def call(self, inputs, training):
    x = self.embeding(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool_l(x_1)
    x_2 = self.trigram(x)
    x_2 = self.pool_l(x_2)
    x_3 = self.trigram(x)
    x_3 = self.pool_l(x_3)

    merged = tf.concat([x_1, x_2, x_3], axis=-1)
    merged = self.dense_l(merged)
    merged = self.dropout(merged, training)
    output = self.last_dense(merged)

    return output

## **Stpe 4: Training Model**

### Param config

In [20]:
VOCAB_SIZE = num_words

EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 8

In [21]:
model = DCNN(
    vocab_size=VOCAB_SIZE, 
    emb_dim=EMB_DIM, 
    nb_filters=EMB_DIM, 
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

2022-07-16 09:47:23.346476: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5776 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


### Train

In [22]:
if NB_CLASSES == 2:
  model.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])
else:
  model.compile(loss="sparse_categorical_crossentropy",
                optimizer="adam",
                metrics=["sparse_categorical_accuracy"])

In [23]:
checkpoint_path = "/checkpoint"
ckpt = tf.train.Checkpoint(model=model)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print("Checkpoint looaded from {}".format(ckpt_manager.latest_checkpoint))

In [24]:
history = model.fit(train_inputs,
                    train_labels,
                    batch_size=BATCH_SIZE,
                    epochs=NB_EPOCHS)


2022-07-16 09:47:23.456042: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1305502020 exceeds 10% of free system memory.


Epoch 1/8


2022-07-16 09:47:25.617062: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8100
2022-07-16 09:47:26.443332: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-07-16 09:47:27.423214: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


### Visualize Model

In [25]:
print(model.summary())

Model: "dcnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  200000    
                                                                 
 conv1d (Conv1D)             multiple                  80200     
                                                                 
 global_max_pooling1d (Globa  multiple                 0         
 lMaxPooling1D)                                                  
                                                                 
 conv1d_1 (Conv1D)           multiple                  120200    
                                                                 
 global_max_pooling1d_1 (Glo  multiple                 0 (unused)
 balMaxPooling1D)                                                
                                                                 
 conv1d_2 (Conv1D)           multiple                  0 (unus

In [1]:
tf.keras.utils.plot_model(loaded_model, to_file="model_plot.png", show_shapes=True)

NameError: name 'tf' is not defined

## **Step 5: Model evaluation**

In [27]:
test_loss, test_acc = model.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE, verbose=2)

4527/4527 - 16s - loss: 0.1983 - accuracy: 0.9250 - 16s/epoch - 3ms/step


In [28]:
print(test_acc)

0.9249609708786011


In [29]:
def predict(list_of_sequences):
    sentences = tokenizer.texts_to_sequences(list_of_sequences)
    predict_padded = pad_sequences(sentences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)
    for i in model.predict(predict_padded):
        print([i, "Positive" if i > 0.5 else "Negative"])

In [30]:
predict(["I love this", 
         "I can buy a new one of this every year", 
         "Definitely a good choice",
         "I wanna more this is awesome I would like to have bought it earlier",
         ])


[array([1.], dtype=float32), 'Positive']
[array([0.28406608], dtype=float32), 'Negative']
[array([0.9994722], dtype=float32), 'Positive']
[array([0.9879224], dtype=float32), 'Positive']


In [31]:
predict(["I dont like this book, seems very silly", 
         "I cant recommend this brand again",
         "Not sure if this is a good choice for you, for me its absolutely useless",
         "This? Just for my haters. Its impossible to make this works. I guess its the worst game Ive bought in my life"
         ])

[array([0.02223525], dtype=float32), 'Negative']
[array([0.98893994], dtype=float32), 'Positive']
[array([0.01104098], dtype=float32), 'Negative']
[array([0.0161139], dtype=float32), 'Negative']


### Save model

In [42]:
model.save("amazon_nlp", save_format='tf')

INFO:tensorflow:Assets written to: amazon_nlp/assets
