In [19]:
import os
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import json
import bert
import tqdm
from bert.tokenization.bert_tokenization import FullTokenizer

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

# https://www.youtube.com/watch?v=gE-95nFF4Cc 

In [99]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # With no GPU present the list is empty and the loop body never runs.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Raised when the devices have already been initialised; report it and carry on.
    print(err)

Physical devices cannot be modified after being initialized


In [None]:
# !gdown --id 1OlcvGWReJMuyYQuOZm149vHWwPtlboR6 --output intent/train.csv
# !gdown --id 1Oi5cRlTybuIF2Fl5Bfsr-KkqrXrdt77w --output intent/valid.csv
# !gdown --id 1ep9H6-HvhB4utJRLVcLzieWNUSG3P_uF --output intent/test.csv

In [None]:
# Load the three splits downloaded into the local `intent/` directory.
train = pd.read_csv("intent/train.csv")
validation = pd.read_csv("intent/valid.csv")
test = pd.read_csv("intent/test.csv")

# Fold the validation split into the training data.
# `DataFrame.append` was removed in pandas 2.0, so use `pd.concat`;
# `ignore_index=True` renumbers the rows 0..n-1 without leaving the old
# row labels behind as a stray "index" column.
train = pd.concat([train, validation], ignore_index=True)

In [None]:
# (rows, columns) of the combined train + validation frame.
train.shape

In [None]:
# Peek at the first few rows of the training frame.
train.head()

In [None]:
# Number of examples per intent.
# The column is passed as `x=` explicitly: recent seaborn releases treat the
# first positional argument as `data`, so a bare positional Series no longer
# produces one bar per category.
chart = sns.countplot(x=train.intent, palette=HAPPY_COLORS_PALETTE)
# Tilt the category labels so they do not overlap; the trailing `;` hides the
# list of Text objects that would otherwise be echoed as the cell output.
chart.set_xticklabels(chart.get_xticklabels(), rotation=30, horizontalalignment="right");

In [84]:
# This class handles the tokenizing and padding of our dataset. Each dataset needs a slightly different way of doing each step.
# With the regular TF packages (see https://www.coursera.org/learn/natural-language-processing-tensorflow/lecture/2Cyzs/padding) there are methods that tokenize and pad together seamlessly. Because we are using a pretrained model via bert-tf2, these are not available to us (not sure why; probably because we want to use the pretrained tokenizer, which requires us not to use the TF methods).
class IntentDetection:
    """Turn train/test data frames into padded BERT token-id matrices.

    Every row's text is tokenized, wrapped in ``[CLS]`` / ``[SEP]``, converted
    to vocabulary ids, and finally truncated / zero-padded to a common length.
    Labels are encoded as their position in ``classes``.

    Attributes set by ``__init__``:
        train_x, test_x: 2-D integer arrays, one padded id sequence per row.
        train_y, test_y: 1-D integer arrays of class indices.
        text_y: alias of ``test_y`` kept for code written against the
            original (misspelled) attribute name.
        max_sequence_length: final sequence length, i.e. the longest
            tokenized example capped at the ``max_sequence_length`` argument.
    """

    # Column holding the raw utterance.
    DATA = "text"
    # Column holding the intent label.
    LABELS = "intent"

    def __init__(self, train, test, classes, tokenizer: "FullTokenizer", max_sequence_length=192):
        """Tokenize and pad both splits.

        Args:
            train: data frame with ``text`` and ``intent`` columns.
            test: data frame with the same columns.
            classes: list of label values; a label's index is its class id.
            tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.
            max_sequence_length: upper bound on the padded sequence length.

        Raises:
            ValueError: if a label is not present in ``classes``.
        """
        self.tokenizer = tokenizer
        # Grown by _prepare to the longest tokenized sequence seen so far.
        self.max_sequence_length = 0
        self.classes = classes

        (self.train_x, self.train_y), (self.test_x, self.test_y) = map(self._prepare, [train, test])
        # Backward-compatible alias for the original misspelled attribute.
        self.text_y = self.test_y

        # Never pad beyond the longest example, never exceed the caller's cap.
        self.max_sequence_length = min(self.max_sequence_length, max_sequence_length)
        self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])

    def _prepare(self, data_frame):
        """Tokenize every row of ``data_frame``.

        Returns:
            ``(x, y)`` where ``x`` is a list of variable-length token-id lists
            and ``y`` is an integer array of class indices. ``x`` is kept as a
            plain list on purpose: building an array from ragged sequences
            fails on recent NumPy, and ``_pad`` expects list-like rows.
        """
        x, y = [], []
        rows = zip(data_frame[IntentDetection.DATA], data_frame[IntentDetection.LABELS])
        for text, label in tqdm.tqdm(rows, total=len(data_frame)):
            tokens = ["[CLS]"] + self.tokenizer.tokenize(text) + ["[SEP]"]
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

            self.max_sequence_length = max(self.max_sequence_length, len(token_ids))
            x.append(token_ids)
            y.append(self.classes.index(label))

        return x, np.array(y)

    def _pad(self, ids):
        """Truncate / right-pad each id sequence to ``self.max_sequence_length``.

        Sequences longer than the limit are cut (the original slice result was
        discarded, so nothing was ever truncated); shorter ones are filled with
        id ``0``.

        NOTE(review): truncation drops the trailing ``[SEP]`` id of over-long
        sequences - confirm whether the model needs it re-appended.

        Returns:
            A 2-D array with one row per input sequence.
        """
        max_len = self.max_sequence_length
        padded = []

        for input_ids in ids:
            # list() lets rows be either Python lists or arrays.
            input_ids = list(input_ids)[:max_len]
            input_ids = input_ids + [0] * (max_len - len(input_ids))
            padded.append(input_ids)

        return np.array(padded)

In [74]:
# Build the tokenizer from the vocab.txt stored under the BERT model directory's assets.
tokenizer = FullTokenizer(vocab_file="bert_en_uncased_L-12_H-768_A-12_2/assets/vocab.txt")

In [75]:
# Sanity check: tokenize a sample sentence and inspect the resulting tokens.
tokenizer.tokenize("what is this going to do")

['what', 'is', 'this', 'going', 'to', 'do']

In [39]:
# Load the BERT SavedModel from the local model directory.
bert_model = tf.saved_model.load("./bert_en_uncased_L-12_H-768_A-12_2")

In [102]:
def create_model(max_sequence_length, bert_model):
    """Wrap the loaded BERT module in a Keras model.

    The original version built the graph but returned nothing, so callers
    received ``None``. The model is now returned.

    Args:
        max_sequence_length: length of every (padded) input sequence.
        bert_model: loaded BERT module, passed to ``hub.KerasLayer``.

    Returns:
        A ``keras.Model`` taking ``[input_layer, input_mask, segment_ids]``
        (all int32, shape ``(batch, max_sequence_length)``) and producing the
        two BERT outputs ``[pooled, seq]``.
    """
    input_shape = (max_sequence_length,)
    input_layer = keras.layers.Input(shape=input_shape, dtype=tf.int32, name="input_layer")
    input_mask = keras.layers.Input(shape=input_shape, dtype=tf.int32, name="input_mask")
    segment_ids = keras.layers.Input(shape=input_shape, dtype=tf.int32, name="segment_ids")
    # trainable=True so the BERT weights are fine-tuned together with the rest of the model.
    bert_layer = hub.KerasLayer(bert_model, trainable=True)

    pooled, seq = bert_layer([input_layer, input_mask, segment_ids])
    # Show the symbolic output tensors so their shapes can be checked.
    print(pooled)
    print(seq)

    return keras.Model(
        inputs=[input_layer, input_mask, segment_ids],
        outputs=[pooled, seq],
    )


In [100]:
# Label vocabulary: the distinct intents found in `train`; a label's position in this list is its class id.
classes = train.intent.unique().tolist()
# Tokenize both splits, requesting a maximum padded length of 192 token ids.
data = IntentDetection(train, test, classes, tokenizer,  max_sequence_length=192)

13784it [00:02, 5818.10it/s]
700it [00:00, 5732.47it/s]


In [103]:
# Size the model inputs to the sequence length computed by IntentDetection.
model = create_model(data.max_sequence_length, bert_model)

Tensor("keras_layer_17/Identity:0", shape=(None, 768), dtype=float32)
Tensor("keras_layer_17/Identity_1:0", shape=(None, 30, 768), dtype=float32)
