In [1]:
import os
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import json
import bert
import tqdm
import json
import jsonlines
from bert.tokenization.bert_tokenization import FullTokenizer

# Six hex colour codes kept as a notebook-wide palette constant.
HAPPY_COLORS_PALETTE = [
    "#01BEFE",
    "#FFDD00",
    "#FF7D00",
    "#FF006D",
    "#ADFF02",
    "#8F00FF",
]

# Reference link left by the author:
# https://www.youtube.com/watch?v=gE-95nFF4Cc

In [2]:
# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
# Iterating an empty device list is a no-op, so no explicit emptiness check
# is needed; a RuntimeError from TensorFlow is reported instead of aborting.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    print(e)

In [3]:
# The tokenizer reads vocab.txt from the assets folder of the local BERT
# model directory; the model itself is loaded from that same directory.
tokenizer = FullTokenizer(
    vocab_file="bert_en_uncased_L-12_H-768_A-12_2/assets/vocab.txt",
)
bert_model = tf.saved_model.load(
    "./bert_en_uncased_L-12_H-768_A-12_2",
)

In [4]:
# Relative paths of the three CSAbstruct JSONL splits used by the later cells.
dev_jsonl, train_jsonl, test_jsonl = (
    "CSAbstruct/dev.jsonl",
    "CSAbstruct/train.jsonl",
    "CSAbstruct/test.jsonl",
)

In [5]:
# Measure the longest training record, in token ids, when all of its sentences
# are packed into one sequence: [CLS] s1 [SEP] s2 [SEP] ...
# Only the "sentences" field is needed for this measurement.
max_length = 0
with jsonlines.open(train_jsonl) as rf:
    for line in rf:
        final = ["[CLS]"]
        for sentence in line["sentences"]:
            final += tokenizer.tokenize(sentence) + ["[SEP]"]
        out = tokenizer.convert_tokens_to_ids(final)
        max_length = max(max_length, len(out))
print(max_length)
        

471


In [6]:
# Label vocabulary. A label's integer id is its position in this list,
# e.g. classes.index("method") == 2.
classes = [
    "background",
    "objective",
    "method",
    "result",
    "other",
]

In [7]:
class PrepareData:
    """Tokenize the train/dev/test JSONL splits and pad them to one common length.

    Every JSONL record is read for two fields: "sentences" (a list of strings)
    and "labels" (a list of class names taken from ``classes``). All sentences
    of a record are packed into a single id sequence of the form
    ``[CLS] s1 [SEP] s2 [SEP] ...``.

    After construction the instance exposes:
        train_x, dev_x, test_x -- 2-D arrays of token ids, zero-padded on the
                                  right to ``max_sequence_length`` columns.
        train_y, dev_y, test_y -- 1-D integer arrays, one class id per record.
        max_sequence_length    -- length of the longest id sequence seen in
                                  any of the three splits.
    """

    def __init__(self, train, dev, test, classes, tokenizer: "FullTokenizer"):
        """Parse the three splits, then pad them.

        Args:
            train, dev, test: paths of the JSONL files of each split.
            classes: list of class names; a label's id is its index in it.
            tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        """
        self.tokenizer = tokenizer
        self.classes = classes
        self.max_sequence_length = 0

        # All splits are parsed before any padding happens so that
        # max_sequence_length already covers the longest sequence of every split.
        (self.train_x, self.train_y), (self.dev_x, self.dev_y), (self.test_x, self.test_y) = [
            self.parse_to_tokenize(path) for path in (train, dev, test)
        ]
        self.train_x, self.dev_x, self.test_x = [
            self._pad(ids) for ids in (self.train_x, self.dev_x, self.test_x)
        ]

    def parse_to_tokenize(self, jsonlfile):
        """Read one JSONL split and return ``(x, y)``.

        Returns:
            x: 1-D object array whose elements are the (unpadded) id lists.
            y: 1-D int64 array with one class id per record.

        Side effect: raises ``self.max_sequence_length`` to the longest id
        list encountered.

        Raises:
            ValueError: if a record has no labels or a label is not in
                ``self.classes``.
        """
        x, y = [], []
        with jsonlines.open(jsonlfile) as rf:
            for line in rf:
                final_token = ["[CLS]"]
                for sentence in line["sentences"]:
                    final_token += self.tokenizer.tokenize(sentence) + ["[SEP]"]
                final_token_ids = self.tokenizer.convert_tokens_to_ids(final_token)
                self.max_sequence_length = max(self.max_sequence_length, len(final_token_ids))
                x.append(final_token_ids)
                y.append(self._document_label(line["labels"]))

        # Every sample must have exactly one label.
        assert len(x) == len(y)
        return self._as_object_array(x), np.array(y, dtype=np.int64)

    def _document_label(self, labels):
        """Reduce the per-sentence labels of one record to a single class id.

        The most frequent label wins; ties go to the lowest class index.

        NOTE(review): the records carry one label per sentence, while this
        class returns one target per record. Majority vote replaces a
        hard-coded placeholder id -- confirm this is the intended target.
        """
        if not labels:
            raise ValueError("record has no labels")
        np_labels = [self.classes.index(label) for label in labels]
        return int(np.bincount(np_labels, minlength=len(self.classes)).argmax())

    @staticmethod
    def _as_object_array(sequences):
        """Store variable-length id lists in a 1-D object array, one list per slot."""
        # Filled element by element: np.array() on lists would raise on ragged
        # input with recent numpy, or silently become 2-D when lengths coincide.
        out = np.empty(len(sequences), dtype=object)
        for i, seq in enumerate(sequences):
            out[i] = seq
        return out

    def _pad(self, ids):
        """Truncate or right-pad every id sequence to ``max_sequence_length``.

        Padding uses id 0. Returns a 2-D array with one row per sequence.
        """
        x = []
        for input_ids in ids:
            # list() accepts both plain lists and array rows.
            input_ids = list(input_ids)[: self.max_sequence_length]
            input_ids = input_ids + [0] * (self.max_sequence_length - len(input_ids))
            x.append(np.array(input_ids))

        # reshape keeps the result 2-D even when there are no sequences at all.
        return np.array(x).reshape(len(x), self.max_sequence_length)


In [8]:
# Tokenize and pad all three splits with the shared tokenizer and label list.
data = PrepareData(
    train=train_jsonl,
    dev=dev_jsonl,
    test=test_jsonl,
    classes=classes,
    tokenizer=tokenizer,
)

In [None]:
# Sanity check on the label container: show its type and the distinct element
# types once, instead of printing one line per training sample.
print(type(data.train_y))
print({type(label).__name__ for label in data.train_y})

In [9]:
def create_model(max_sequence_length, bert_model, num_classes=None):
    """Build a classifier on top of the first (pooled) output of the BERT layer.

    Args:
        max_sequence_length: number of token positions of every model input.
        bert_model: loaded BERT model, wrapped in a trainable ``hub.KerasLayer``.
        num_classes: width of the softmax output; defaults to
            ``len(classes)`` (the notebook-level label list) when omitted.

    Returns:
        A ``keras.Model`` named "bert_model" taking three int32 inputs
        ``[input_layer, input_mask, segment_ids]``, each of shape
        ``(batch, max_sequence_length)``, and returning class probabilities.
    """
    if num_classes is None:
        num_classes = len(classes)

    keras.backend.clear_session()

    input_layer = keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int32, name="input_layer")
    input_mask = keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int32, name="input_mask")
    segment_ids = keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int32, name="segment_ids")

    bert_layer = hub.KerasLayer(bert_model, trainable=True)
    # Only the first output is used by the classification head.
    pooled, _ = bert_layer([input_layer, input_mask, segment_ids])

    # The head starts directly from `pooled`: the former
    # keras.layers.Layer(pooled.shape) was an identity layer that received a
    # shape in the position of Layer's first constructor argument (`trainable`).
    x = keras.layers.Dropout(0.5)(pooled)
    x = keras.layers.Dense(768, activation="relu")(x)
    x = keras.layers.Dropout(0.5)(x)
    x = keras.layers.Dense(num_classes, activation="softmax")(x)

    return keras.Model([input_layer, input_mask, segment_ids], x, name="bert_model")

In [10]:
# Build the classifier for the padded sequence length found by PrepareData,
# then print its layer summary.
model = create_model(
    max_sequence_length=data.max_sequence_length,
    bert_model=bert_model,
)
model.summary()

Model: "bert_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_layer (InputLayer)        [(None, 471)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 471)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 471)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_layer[0][0]                
                                                                 input_mask[0][0]        

In [11]:
# Sparse loss and metric: targets are integer class ids, not one-hot vectors.
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),
)

In [12]:
# Checkpointing: note that the callback is handed the *directory* part of
# `check_points` ("checkpoint"), not the full path. Only the model with the
# best validation accuracy so far is kept.
check_points = "checkpoint/checkpoint.hb"
check_point_dir = os.path.dirname(check_points)
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=check_point_dir,
    monitor="val_sparse_categorical_accuracy",
    save_best_only=True,
    verbose=1,
)

# TensorBoard event files are written under "log_dir".
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="log_dir")

In [20]:
# data.train_x is tokenized BEFORE training starts: the tokenizer is created
# from the vocabulary file of the BERT model, then applied to our sentences to
# turn them into token ids. Those ids are what the model is trained on.
#
# data.train_y must hold one integer class id per sample, because the model is
# compiled with a sparse categorical loss.

# input_mask: 1 on real tokens, 0 on padding. PrepareData pads with id 0, so
# the non-zero positions are exactly the real tokens. An all-zero mask would
# not distinguish padding from content.
mask = (data.train_x != 0).astype(np.int32)
# Every record is fed as a single segment, hence all-zero segment ids.
seg = np.zeros(data.train_x.shape, dtype=np.int32)

history = model.fit(
    x=[data.train_x, mask, seg],
    y=data.train_y,
    verbose=1,
    validation_split=0.1,
    batch_size=1,
    shuffle=True,
    use_multiprocessing=True,
    workers=5,
    epochs=5,
    callbacks=[cp_callback, tensorboard_callback],
)

Train on 1500 samples, validate on 167 samples
Epoch 1/5


KeyboardInterrupt: 

In [None]:
# Directory that receives the exported model; created if it does not exist yet.
model_folder = "model_path"
os.makedirs(name=model_folder, exist_ok=True)

In [None]:
# Export the trained model to <model_folder>/saved_model.
saved_model = "saved_model"
model_path = os.path.join(model_folder, saved_model)
tf.saved_model.save(
    model,
    model_path,
)

In [None]:
# model = tf.keras.models.load_model(model_path)

In [None]:
# input_mask is 1 on real tokens and 0 on padding (PrepareData pads with id 0);
# segment ids are all zero because each record is fed as a single segment.
mask = (data.train_x != 0).astype(np.int32)
seg = np.zeros(data.train_x.shape, dtype=np.int32)
tmask = (data.test_x != 0).astype(np.int32)
tseg = np.zeros(data.test_x.shape, dtype=np.int32)

train_loss, train_acc = model.evaluate([data.train_x, mask, seg], data.train_y, batch_size=32)
test_loss, test_acc = model.evaluate([data.test_x, tmask, tseg], data.test_y, batch_size=32)

print("train acc", train_acc)
# Previously this line was also labelled "train acc".
print("test acc", test_acc)

In [None]:
# Compare one encoded training record with a hand-tokenized sentence.
print(data.train_x[4])
# [CLS] is prepended as a ready-made token, the same way PrepareData builds its
# sequences. NOTE(review): passing "[CLS]" through tokenizer.tokenize as raw
# text presumably splits it like ordinary punctuation -- confirm.
tok = ["[CLS]"] + tokenizer.tokenize("listen to westbam alumb allergic on google music")
tokenizer.convert_tokens_to_ids(tok)

In [None]:
# Tokenize a short sentence and inspect the shape of its id array.
# [CLS] is prepended as a ready-made token, the same way PrepareData builds its
# sequences. NOTE(review): passing "[CLS]" through tokenizer.tokenize as raw
# text presumably splits it like ordinary punctuation -- confirm.
tok = ["[CLS]"] + tokenizer.tokenize("google play Bank account")
x = tokenizer.convert_tokens_to_ids(tok)
print(x)
np_x = np.array(x)
np_x.shape

In [None]:
sentences = [
  "how hot is it outside",
  "Rate this book as awful"
]
print(classes)
%precision 4

pred_tokens = map(tokenizer.tokenize, sentences)
pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))

pred_token_ids = map(lambda tids: tids +[0]*(data.max_sequence_length-len(tids)),pred_token_ids)
pred_token_ids = np.array(list(pred_token_ids))
x1, x2 = np.zeros(pred_token_ids.shape), np.zeros(pred_token_ids.shape)

predictions = model.predict([pred_token_ids, x1, x2])
print(predictions)
# for text, label in zip(sentences, predictions):
#   print("text:", text, "\nintent:", classes[label])
#   print()

# output = model.predict([data.test_x, tmask, tseg])
# print(output)