In [20]:
import tensorflow as tf

# from bilstm_crf import build_bilstm
import numpy as np

In [123]:
import pickle
import operator
import re
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# from plot_keras_history import plot_history
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import multilabel_confusion_matrix
# from keras_contrib.utils import save_load_utils

from keras import layers
from keras import optimizers

from keras.models import Model
# from keras import Input
from tensorflow_addons import metrics
from tensorflow_addons import losses
from tensorflow_addons.layers import CRF
from tensorflow_addons.text.crf import crf_log_likelihood



def build_bilstm(max_len, vocab_size, n_tags, embedding_matrix=None, embedding_dim=None, unit='lstm', num_units=100, dropout=0.1, recurrent_dropout=0.1, name='bilstm_crf'):
    input = layers.Input(shape=(max_len,))

    if embedding_matrix is not None:
        model = layers.Embedding(input_dim=vocab_size, output_dim=embedding_matrix.shape[-1], input_length=max_len, mask_zero=True, weights=[embedding_matrix], trainable=False)(input)
    elif embedding_dim is not None:
        model = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, mask_zero=True, embeddings_initializer='uniform')(input)
    else:
        raise ValueError('Must provide either an embedding matrix or an embedding dimension.')

    if unit == 'lstm':
        model = layers.Bidirectional(layers.LSTM(units=num_units, return_sequences=True, dropout=recurrent_dropout))(model)
    elif unit == 'gru':
        model = layers.Bidirectional(layers.GRU(units=num_units, return_sequences=True, dropout=recurrent_dropout))(model)
    elif unit == 'rnn':
        model = layers.Bidirectional(layers.SimpleRNN(units=num_units, return_sequences=True, dropout=recurrent_dropout))(model)
    else:
        raise ValueError('Invalid unit type. Must be one of lstm, gru, or rnn.')

    model = layers.Dropout(dropout)(model)
    model = layers.TimeDistributed(layers.Dense(n_tags, activation="relu"))(model)

    crf_layer = CRF(units=n_tags)
    output_layer  = crf_layer(model)

    output_model = Model(input, output_layer, name=name)

    # loss = losses.SigmoidFocalCrossEntropy()
    metric = metrics.F1Score(num_classes=n_tags, average='micro')
    
    opt = optimizers.RMSprop(learning_rate=0.01)

    output_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy', metric])
    output_model.summary()

    return output_model

# Load material

In [22]:
import json

# load embedding
embedding_maxtrix = np.load('embedding/embedding_matrix.npy')

# load vocab
with open('data/vocab.txt', 'r') as f:
    vocab = f.read().split('\n')
len(vocab)

# load tag_to_id
with open('data/tag_to_id.json', 'r') as f:
    tag_to_id = json.load((f))

# load train and dev data
TRAIN_PATH = 'data/span_detection_datasets_IOB/train.json'
DEV_PATH = 'data/span_detection_datasets_IOB/dev.json'

with open(TRAIN_PATH, 'r') as f:
    train_data = json.load(f)

with open(DEV_PATH, 'r') as f:
    dev_data = json.load(f)

train_sentences = list(train_data['text'].values())
dev_sentences = list(dev_data['text'].values())

train_labels = list(train_data['labels'].values())
dev_labels = list(dev_data['labels'].values())

# Convert data

## Sentences

In [23]:
import numpy as np

# Convert data to ids
def convert_to_ids(data, vocab, max_len=256):
    id_data = []

    pad_token_id = vocab.index('<PAD>')
    ukn_token_id = vocab.index('<UNK>')
    for sentence in data:
        ids = []
        for word in sentence.split():
            if word in vocab:
                ids.append(vocab.index(word))
            else:
                ids.append(ukn_token_id)

        if len(ids) < max_len:
            ids += [pad_token_id] * (max_len - len(ids))
        id_data.append(np.array(ids))
        
    return id_data

In [96]:
train_tokenized = np.array(convert_to_ids(train_sentences, vocab), dtype=np.float32)
dev_tokenized = np.array(convert_to_ids(dev_sentences, vocab), dtype=np.float32)

# train_tokenized = [torch.LongTensor(tokenized) for tokenized in train_tokenized]
# dev_tokenized = [torch.LongTensor(tokenized) for tokenized in dev_tokenized]

## Labels

In [108]:
# load tag_to_id
with open('data/tag_to_id.json', 'r') as f:
    tag_to_id = json.load((f))

# Convert labels to ids
# labels = [[start, end, tag], ...]
def convert_labels_to_ids(label, tag_to_id, max_len=256):
    ids = [int(tag_to_id[tag]) for tag in label]

    if len(ids) < max_len:
        ids += [int(tag_to_id['O'])] * (max_len - len(ids))
        
    return np.array(ids, dtype=np.int32)

In [109]:
train_labels_encoding = np.array([convert_labels_to_ids(label, tag_to_id) for label in train_labels], dtype=np.int32)
dev_labels_encoding = np.array([convert_labels_to_ids(label, tag_to_id) for label in dev_labels], dtype=np.int32)

# train_labels_encoding = np.expand_dims(train_labels_encoding, axis=2)
# dev_labels_encoding = np.expand_dims(dev_labels_encoding, axis=2)

# train_labels_encoding = [torch.LongTensor(label) for label in train_labels_encoding]
# dev_labels_encoding = [torch.LongTensor(label) for label in dev_labels_encoding]

## Loader

In [111]:
# create data loader tensorflow
BATCH_SIZE = 32
BUFFER_SIZE = 1000

train_dataset = tf.data.Dataset.from_tensor_slices((train_tokenized, train_labels_encoding))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dev_dataset = tf.data.Dataset.from_tensor_slices((dev_tokenized, dev_labels_encoding))
dev_dataset = dev_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Model

In [112]:
# Hyperparameters
MAX_LEN = 256
VOCAB_SIZE = len(vocab)
TAG_SIZE = len(tag_to_id)
UNITS = 100
EMBEDDING_DIM = 300

In [124]:
# build_bilstm(max_len, vocab_size, n_tags, embedding_matrix=None, embedding_dim=None, unit='lstm', num_units=100, dropout=0.1, recurrent_dropout=0.1)

model = build_bilstm(
    max_len=MAX_LEN,
    vocab_size=VOCAB_SIZE,
    n_tags=TAG_SIZE,
    embedding_dim=EMBEDDING_DIM,
    unit='lstm',
    num_units=UNITS,
    dropout=0.1,
    recurrent_dropout=0.1,
    name='bilstm-crf-v1'
)

Model: "bilstm-crf-v1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_21 (InputLayer)       [(None, 256)]             0         
                                                                 
 embedding_20 (Embedding)    (None, 256, 300)          2697900   
                                                                 
 bidirectional_20 (Bidirect  (None, 256, 200)          320800    
 ional)                                                          
                                                                 
 dropout_20 (Dropout)        (None, 256, 200)          0         
                                                                 
 time_distributed_20 (TimeD  (None, 256, 21)           4221      
 istributed)                                                     
                                                                 
 crf_19 (CRF)                [(None, 256),           

In [125]:
# train model
EPOCHS = 10

history = model.fit(
    train_dataset,
    validation_data=dev_dataset,
    epochs=EPOCHS
)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unknown node type <gast.gast.Assign object at 0x000001C61E146A40>


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unknown node type <gast.gast.Assign object at 0x000001C61E146A40>


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unknown node type <gast.gast.Assign object at 0x000001C61E146A40>


TypeError: Expected int32, but got 0.0 of type 'float'.

# End