In [None]:
import os
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import json
import bert
import tqdm
from bert.tokenization.bert_tokenization import FullTokenizer

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

from transformers import BertTokenizer


In [None]:
# Switch every visible GPU to on-demand memory allocation instead of letting
# TensorFlow reserve the whole device up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # An empty list simply makes this loop a no-op, so no separate guard is needed.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Report the problem and carry on rather than aborting the notebook.
    print(err)

In [None]:
# Load the three splits of the intent data set (train / validation / test).
train, validation, test = map(
    pd.read_csv,
    ("intent/train.csv", "intent/valid.csv", "intent/test.csv"),
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Class balance of the training split: one bar per intent.
# The column is passed as `x=` explicitly: newer seaborn releases accept only
# `data` positionally, so a bare positional Series is no longer read as `x`.
chart = sns.countplot(x=train.intent, palette=HAPPY_COLORS_PALETTE)
# Slant the intent names so long labels do not overlap; the trailing `;`
# keeps the list of Text objects out of the cell output.
chart.set_xticklabels(chart.get_xticklabels(), rotation=30, horizontalalignment="right");

In [None]:
# Tokenizer used for all encoding below, loaded from the pretrained
# 'bert-base-uncased' checkpoint with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [None]:
# Echo every training utterance, one per line.
for utterance in train["text"]:
    print(utterance)
    

In [None]:
class IntentDetection:
    """Encode the test split of an intent data set with a BERT-style tokenizer.

    On construction the instance
      1. derives ``max_sequence_length`` from the longest text found in the
         train and test frames, capped at ``absolute_max``;
      2. encodes the test frame into ``test_x`` (input ids), ``test_y``
         (class indices) and ``test_original`` (raw texts), collecting the
         matching attention masks in ``attention_mask``.

    The training frame is only used for sizing; it is not encoded here.
    """

    DATA = "text"      # name of the column holding the utterance
    LABELS = "intent"  # name of the column holding the class label

    def __init__(self, train, test, classes, tokenizer:FullTokenizer, absolute_max=192):
        """Size the sequences and encode ``test``.

        Args:
            train: DataFrame with ``DATA`` and ``LABELS`` columns (sizing only).
            test: DataFrame with ``DATA`` and ``LABELS`` columns (encoded).
            classes: list of label values; a label's position in this list
                becomes its integer class id.
            tokenizer: object exposing ``encode_plus`` with the keyword
                arguments used in ``_prepare``.
                NOTE(review): annotated as ``FullTokenizer`` but the notebook
                passes a ``transformers.BertTokenizer`` - confirm the
                intended type.
            absolute_max: upper bound applied to ``max_sequence_length``.
        """
        self.tokenizer = tokenizer
        self.classes = classes
        self.absolute_max = absolute_max
        self.max_sequence_length = 0
        self.attention_mask = []

        # Plain loop: _get_max_length works by side effect and returns None.
        for frame in (train, test):
            self._get_max_length(frame)
        if self.max_sequence_length > self.absolute_max:
            print("### overriding calculated max sequence length")
            self.max_sequence_length = self.absolute_max

        self.test_x, self.test_y, self.test_original = self._prepare(test)

    def _get_max_length(self, data_frame):
        """Raise ``max_sequence_length`` to the longest text in ``data_frame``.

        NOTE(review): length is measured with ``len()`` on the raw text, i.e.
        in characters, whereas the value is later used as a token budget -
        confirm this over-estimate is intended.
        """
        longest = max(map(len, data_frame[IntentDetection.DATA]), default=0)
        self.max_sequence_length = max(longest, self.max_sequence_length)

    def _prepare(self, data_frame):
        """Encode every row of ``data_frame``.

        Appends one attention mask per row to ``self.attention_mask``.

        Returns:
            tuple ``(x, y, z)`` where ``x`` is an array built from the
            per-row ``input_ids``, ``y`` an array of class indices and ``z``
            the list of original texts.
            # presumably x has shape (n, 1, max_sequence_length) because each
            # row is encoded separately with return_tensors='tf' - TODO confirm

        Raises:
            ValueError: if a label is not present in ``self.classes``.
        """
        x, y, z = [], [], []
        texts = data_frame[IntentDetection.DATA]
        labels = data_frame[IntentDetection.LABELS]
        # `total` lets tqdm render a real progress bar instead of a bare counter.
        for text, label in tqdm.tqdm(zip(texts, labels), total=len(data_frame)):
            encoded_dict = self.tokenizer.encode_plus(
                text,                                   # Sentence to encode.
                add_special_tokens=True,                # Add '[CLS]' and '[SEP]'
                max_length=self.max_sequence_length,    # Pad & truncate all sentences.
                padding='max_length',                   # replaces deprecated pad_to_max_length=True
                truncation=True,                        # truncates to max sequence length
                return_attention_mask=True,             # Construct attn. masks.
                return_tensors='tf',                    # Return tensorflow tensors.
            )
            x.append(encoded_dict["input_ids"])
            z.append(text)
            self.attention_mask.append(encoded_dict["attention_mask"])
            y.append(self.classes.index(label))
        return np.array(x), np.array(y), z


In [None]:
# Build the encoded data set; class ids follow the order in which intents
# first appear in the training frame.
data = IntentDetection(
    train=train,
    test=test,
    classes=train.intent.unique().tolist(),
    tokenizer=tokenizer,
)


In [None]:
# Inspect the encoded input ids of the first test example.
data.test_x[0]

In [None]:
# Raw text of the first test example, for comparison with its encoding.
data.test_original[0]

In [None]:
# Encode a single hand-written utterance to eyeball the resulting input ids.
encoded_dict = tokenizer.encode_plus(
    'add sabrina salerno to the grime instrumentals playlist',  # Sentence to encode.
    add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
    max_length=32,                 # Pad & truncate all sentences.
    padding='max_length',          # replaces deprecated pad_to_max_length=True
    truncation=True,               # truncates to max sequence length
    return_attention_mask=True,    # Construct attn. masks.
    return_tensors='tf',           # Return tensorflow tensors.
)
print(encoded_dict["input_ids"])