<a href="https://colab.research.google.com/github/diegovasconcelo/BERT-Tokenizator/blob/main/BERT_Tokenizador.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fase 1: Importar las dependencias

In [None]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece



In [None]:
try:
    # Ask the notebook runtime for TensorFlow 2.x through a line magic.
    %tensorflow_version 2.x
except Exception:
    # Deliberate best effort: any failure of the magic is ignored and the
    # imports below proceed with whatever TensorFlow is installed.
    # NOTE(review): presumably meant for runtimes where the magic is not
    # available - confirm.
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

# Fase 2: Pre procesado de datos

## Carga de los ficheros

Importamos los ficheros desde nuestro Google Drive personal.

In [None]:
# Mount Google Drive under /content/drive; the training CSV read further down
# lives below this mount point.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Column names, in file order, for the header-less training CSV.
cols = ["sentiment", "id", "date", "query", "user", "text"]

# Location of the raw dataset inside the mounted Drive.
TRAINING_CSV_PATH = "/content/drive/MyDrive/Curso de NLP/BERT/sentiment_data/training.csv"

data = pd.read_csv(TRAINING_CSV_PATH,
                   names=cols,
                   header=None,
                   encoding="latin1",
                   engine="python")


In [None]:
# Drop the metadata columns that are not needed; only `sentiment` and `text`
# are used from here on.
# The result is re-bound (no inplace=True) and errors="ignore" is passed so the
# cell is idempotent: running it again once the columns are gone is a no-op
# instead of a KeyError.
data = data.drop(columns=["id", "date", "query", "user"], errors="ignore")

In [None]:
# Show the first six rows as a quick sanity check of the remaining columns.
data.head(6)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
5,0,@Kwesidei not the whole crew


## Preprocessing

### Cleaning

In [None]:
def clean_tweet(tweet):
    """Normalize one raw tweet into plain text for tokenization.

    Steps, in order: strip HTML markup, remove @mentions, remove URLs, replace
    every character that is not an ASCII letter or one of . ! ? ' with a
    space, and collapse runs of spaces.

    Args:
        tweet: raw tweet text.

    Returns:
        The cleaned text. Leading/trailing single spaces left by removed
        tokens are kept.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Remove @mentions. The underscore is part of the character class so a
    # handle such as @john_doe is removed whole instead of leaving "doe".
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Remove URLs up to the next whitespace, so characters such as - _ ? = %
    # inside the link do not leave URL fragments behind as fake words.
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    # Keep only letters and basic punctuation; everything else becomes a space.
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Collapse the runs of spaces produced by the substitutions above.
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
# Clean every tweet of the `text` column, preserving row order.
data_clean = list(map(clean_tweet, data.text))

In [None]:
# Remap the label value 4 to 1 so the labels are 0 / 1; all other values are
# kept as they are.
# np.where allocates a new array, so the buffer behind `data.sentiment` is
# never written to: the DataFrame keeps its raw labels, and the cell also works
# when pandas exposes `.values` as a read-only array (copy-on-write mode).
raw_labels = data.sentiment.values
data_labels = np.where(raw_labels == 4, 1, raw_labels)

### Tokenization

Necesitaremos crear una capa BERT para tener acceso a los metadatos del tokenizador (como el fichero de vocabulario).

In [None]:
# Tokenizer class provided by the `bert` package.
FullTokenizer = bert.bert_tokenization.FullTokenizer
# The BERT layer is loaded from TF Hub only to read the assets bundled with it;
# trainable=False because it is not fine-tuned here.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
# Vocabulary file path and lower-casing flag stored inside the hub module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Tokenizer configured exactly as the pretrained model expects.
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
def encode_sentence(sent):
    """Return the vocabulary ids of `sent`, tokenized with the module-level `tokenizer`."""
    tokens = tokenizer.tokenize(sent)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

In [None]:
# Encode every cleaned tweet as a list of token ids, preserving order.
data_inputs = list(map(encode_sentence, data_clean))

### Creación del data set

Crearemos padded batches (rellenamos las frases de cada lote de forma independiente), de modo que añadimos el mínimo número posible de tokens de padding. Para ello, ordenamos las frases por longitud, aplicamos `padded_batch` y luego mezclamos los lotes (la primera mitad de los datos corresponde a sentimientos negativos y la otra mitad a positivos).

In [None]:
# One [token_ids, label, length] record per encoded sentence.
data_with_len = []
for idx, token_ids in enumerate(data_inputs):
    data_with_len.append([token_ids, data_labels[idx], len(token_ids)])

# Shuffle first, then sort by length: list.sort is stable, so sentences of the
# same length stay in the shuffled (random) order.
random.shuffle(data_with_len)
data_with_len.sort(key=lambda record: record[2])

# Keep (token_ids, label) pairs, discarding sentences of 7 tokens or fewer.
sorted_all = [(token_ids, label)
              for token_ids, label, length in data_with_len
              if length > 7]

Las frases de entrada no tienen todas la misma longitud, por lo que utilizo un generador para construir el dataset: cada vez que se le pide un elemento, devuelve el siguiente.

In [None]:
# Expose the (token_ids, label) pairs as a tf.data.Dataset. The callable simply
# returns the list, which is then iterated element by element; both components
# are declared as int32.
# NOTE(review): newer TensorFlow releases document `output_signature` as the
# replacement for `output_types` - confirm against the installed version.
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [None]:
# Inspect the first element of the dataset: a (token_ids, label) pair.
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=array([4283, 1045, 1005, 2222, 2156, 2017, 2574,  999], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [None]:
BATCH_SIZE = 32
# Group consecutive elements into batches. The token-id component is padded to
# the longest sequence of its own batch ((None,)); the label is a scalar (()).
all_batched = all_dataset.padded_batch(
    batch_size=BATCH_SIZE,
    padded_shapes=((None,), ()),
)

In [None]:
# Inspect the first padded batch: (token_ids matrix, labels vector).
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 4283,  1045,  1005,  2222,  2156,  2017,  2574,   999],
        [ 2003,  2125,  2000,  2156, 20773,  3892,  1999, 10330],
        [ 3383,  2065,  2051,  1998,  2769,  3499,  2009,  1012],
        [ 1040,  1040,  2129,  2001,  1038, 20492,  2232,  1029],
        [ 2183,  2000,  2225,  6261,  2050,  2000,  3942,  2155],
        [ 2003, 11812,  2007,  1045, 12036,  1005,  1055,  1012],
        [13843,  4691,  4440,  2689,  2000,  8991,  3436,  4440],
        [12855, 17969,  2025,  2026,  7967,  2100,  3924, 16780],
        [ 2073,  2115, 10722, 14905, 20974,  2015,  3632,  1029],
        [ 3693,  5821,  2082,  2005,  4714,  7408,  1045,  2106],
        [ 2025,  2146,  2187,  1997,  1996,  7570,  4877,  1060],
        [ 8271,  2039,  2023,  2851,  2007,  1037,  2919, 14978],
        [ 4365,  1045,  4299,  1057,  2020,  2182, 16525,  2205],
        [ 3071,  2323,  2272,  2000,  2256,  2208,  3892,  2012],
        [ 2204,  2000,  2022

Genero mis propios datasets de entrenamiento y test usando los datos ya limpios.

In [None]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10  # integer division: hold out ~10% of the batches

# Dataset transformations return a NEW dataset, so the shuffled dataset must be
# bound to a name; calling .shuffle() and discarding the result leaves the
# batches in length-sorted order.
# reshuffle_each_iteration=False freezes the shuffled order, so take/skip
# select the same disjoint batches on every pass and test batches never leak
# into training across epochs.
shuffled_batches = all_batched.shuffle(NB_BATCHES,
                                       reshuffle_each_iteration=False)
test_dataset = shuffled_batches.take(NB_BATCHES_TEST)
train_dataset = shuffled_batches.skip(NB_BATCHES_TEST)

# Fase 3: Construcción del modelo

In [None]:
class DCNN(tf.keras.Model):
    """Convolutional text classifier operating on token ids.

    Token ids are embedded, fed to three parallel 1-D convolutions (kernel
    sizes 2, 3 and 4), max-pooled over the sequence axis, concatenated and
    classified by a dense layer with dropout followed by the output layer.

    Args:
        vocab_size: number of entries of the embedding table.
        emb_dim: size of each embedding vector.
        nb_filters: number of filters of each convolution.
        FFN_units: units of the hidden dense layer.
        nb_classes: 2 builds a single sigmoid unit; any other value builds a
            softmax layer with `nb_classes` units.
        dropout_rate: rate of the dropout applied after the hidden dense layer.
        training: unused; accepted only so existing callers keep working. The
            training mode is taken from the `training` argument of `call`.
        name: model name passed to `tf.keras.Model`.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)

        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        # padding="valid": each convolution shortens the sequence by
        # kernel_size - 1, so inputs need at least 4 tokens for `fourgram`.
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        # Stateless, so one instance is shared by the three branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: batch of token-id sequences, (batch_size, seq_len).
            training: whether dropout is active. Defaults to None so Keras
                decides (True inside `fit`, False otherwise).

        Returns:
            (batch_size, 1) sigmoid output when built with nb_classes == 2,
            otherwise (batch_size, nb_classes) softmax output.
        """
        x = self.embedding(inputs)  # (batch_size, seq_len, emb_dim)
        x_1 = self.bigram(x)  # (batch_size, seq_len-1, nb_filters)
        x_1 = self.pool(x_1)  # (batch_size, nb_filters)
        x_2 = self.trigram(x)  # (batch_size, seq_len-2, nb_filters)
        x_2 = self.pool(x_2)  # (batch_size, nb_filters)
        x_3 = self.fourgram(x)  # (batch_size, seq_len-3, nb_filters)
        x_3 = self.pool(x_3)  # (batch_size, nb_filters)

        merged = tf.concat([x_1, x_2, x_3], axis=-1)  # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        # Pass `training` by keyword so it cannot be mistaken for another
        # positional argument of the layer.
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

# Fase 4: Entrenamiento

In [None]:
# Model hyperparameters.
VOCAB_SIZE = len(tokenizer.vocab)  # one embedding row per tokenizer vocabulary entry
EMB_DIM = 200  # embedding vector size
NB_FILTERS = 100  # filters per convolution branch
FFN_UNITS = 256  # units of the hidden dense layer
NB_CLASSES = 2  # 2 selects the single-unit sigmoid output

DROPOUT_RATE = 0.2

# Number of passes over the training dataset.
NB_EPOCHS = 5

In [None]:
# Instantiate the classifier from the hyperparameters defined above.
dcnn_kwargs = {
    "vocab_size": VOCAB_SIZE,
    "emb_dim": EMB_DIM,
    "nb_filters": NB_FILTERS,
    "FFN_units": FFN_UNITS,
    "nb_classes": NB_CLASSES,
    "dropout_rate": DROPOUT_RATE,
}
Dcnn = DCNN(**dcnn_kwargs)

In [None]:
# Choose the loss and metric that match the output layer (sigmoid for two
# classes, softmax otherwise), then compile once.
if NB_CLASSES == 2:
    loss_name = "binary_crossentropy"
    metric_names = ["accuracy"]
else:
    loss_name = "sparse_categorical_crossentropy"
    metric_names = ["sparse_categorical_accuracy"]

Dcnn.compile(loss=loss_name,
             optimizer="adam",
             metrics=metric_names)

In [None]:
checkpoint_path = "./drive/MyDrive/Curso de NLP/BERT/ckpt_bert_tok/"
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Último checkpoint restaurado!!")

Último checkpoint restaurado!!


In [None]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that saves a checkpoint at the end of every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Save through the module-level `ckpt_manager` and report the target path."""
        ckpt_manager.save()
        message = "Checkpoint guardado en {}.".format(checkpoint_path)
        print(message)

In [None]:
# Train for NB_EPOCHS epochs; the callback saves a checkpoint after each epoch.
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
Checkpoint guardado en ./drive/MyDrive/Curso de NLP/BERT/ckpt_bert_tok/.
Epoch 2/5
Checkpoint guardado en ./drive/MyDrive/Curso de NLP/BERT/ckpt_bert_tok/.
Epoch 3/5
Checkpoint guardado en ./drive/MyDrive/Curso de NLP/BERT/ckpt_bert_tok/.
Epoch 4/5
Checkpoint guardado en ./drive/MyDrive/Curso de NLP/BERT/ckpt_bert_tok/.
Epoch 5/5
Checkpoint guardado en ./drive/MyDrive/Curso de NLP/BERT/ckpt_bert_tok/.


<keras.callbacks.History at 0x7f70f35aa4d0>

# Fase 5: Evaluación

In [None]:
# Evaluate on the held-out batches; `results` holds the loss followed by the
# compiled metrics.
results = Dcnn.evaluate(test_dataset)
print(results)

In [None]:
# Print the evaluation results again: [loss, accuracy].
print(results)

[0.4206061363220215, 0.8330484628677368]




*   Training: 88.5%
*   Testing: 84.6%



In [None]:
def get_prediction(sentence):
    """Print the model's sentiment prediction for one raw sentence.

    The sentence is encoded with `encode_sentence`, given a batch dimension and
    passed through `Dcnn` in inference mode. This assumes the binary
    configuration, where the model outputs a single sigmoid probability.

    Args:
        sentence: text to classify.

    Returns:
        None. The raw model output and the predicted label are printed.
    """
    tokens = encode_sentence(sentence)
    inputs = tf.expand_dims(tokens, 0)  # add the batch dimension: (1, seq_len)

    output = Dcnn(inputs, training=False)

    # Compare the probability against 0.5 directly. floor(p * 2) would map a
    # saturated output of exactly 1.0 to class 2, which matches neither branch
    # and prints nothing for the most confident positive predictions.
    probability = float(tf.squeeze(output))

    if probability >= 0.5:
        print("Salida del modelo: {}\nSentimiento predicho: Positivo.".format(
            output))
    else:
        print("Salida del modelo: {}\nSentimiento predicho: Negativo.".format(
            output))

In [None]:
# Try the trained model on an example sentence.
get_prediction("The best time for new beginnings is now.")

Salida del modelo: [[0.8914482]]
Sentimiento predicho: Positivo.
