In [1]:
import json
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
DATA_PATH = r"./data/tvtropes_books"

for fname in (f"tvtropes_books-{suffix}.json" for suffix in ["train", "test", "val"]):
    assert (Path(DATA_PATH) / fname).is_file(), f"File {fname} not found"

In [3]:
train_list, val_list, test_list = [], [], []
for path, which in ((Path(DATA_PATH) / f"tvtropes_books-{suffix}.json", suffix) for suffix in ["train", "test", "val"]):
    with open(path, "r") as f:
        for line in f:
            globals()[f"{which}_list"].append(json.loads(line))

In [4]:
import re

def split_with_position(str_):
    word_pos_list = []
    for m in re.finditer(r'\S+', str_):
        pos, word = m.span(), m.group()
        word_pos_list.append((word, pos))
    return word_pos_list

In [5]:
def text_to_word_sequence(
    input_text,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    return_pos=True
):
    if lower:
        input_text = input_text.lower()

    translate_dict = {c: split for c in filters}
    translate_map = str.maketrans(translate_dict)
    input_text = input_text.translate(translate_map)

    if return_pos:
        return tuple(zip(*[(word, pos) for word, pos in split_with_position(input_text) if word]))
    else:
        return tuple(word for word, pos in split_with_position(input_text) if word)

In [6]:
DIM = 512

In [7]:
def in_range(interval_1, interval_2):
    assert interval_1[0] <= interval_1[1] and interval_2[0] <= interval_2[1]
    return interval_1[0] >= interval_2[0] and interval_1[1] <= interval_2[1]

In [8]:
def prepare_dataset(data_list):
    X_list = []
    y_list = []
    for data in data_list:
        if data["has_spoiler"]:
            sentence_data = data["sentences"]
            i = 0
            while i < len(sentence_data):
                # i points to the sentence to process
                input_words_list = []
                input_labels_list = []
                cur_words_count = 0

                while i < len(sentence_data):
                    next_sentence_words, next_sentence_word_positions = text_to_word_sequence(sentence_data[i][1])
                    if cur_words_count + len(next_sentence_words) > DIM:
                        if len(next_sentence_words) > DIM:
                            i += 1
                        break
                    cur_words_count += len(next_sentence_words)
                    input_words_list.extend(next_sentence_words)
                    input_labels_list.extend(any(in_range(pos, spoiler_boundary) for spoiler_boundary in sentence_data[i][2]) for pos in next_sentence_word_positions)
                    i += 1

                if input_words_list:
                    X_list.append(input_words_list)
                    y_list.append(input_labels_list)
    X_list = [" ".join(s) for s in X_list]
    return X_list, y_list

In [9]:
X_train_list, y_train_list = prepare_dataset(train_list)

In [10]:
X_val_list, y_val_list = prepare_dataset(val_list)
X_test_list, y_test_list = prepare_dataset(test_list)

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
t = Tokenizer()
# Preprocessed_reviews contains all the cleaned reviews.
t.fit_on_texts(X_train_list)

2023-01-19 19:01:01.909444: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-19 19:01:02.694525: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-19 19:01:02.694601: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [12]:
from tensorflow.keras.preprocessing import sequence
import numpy as np

X_train = sequence.pad_sequences(t.texts_to_sequences(X_train_list), maxlen=DIM, padding='post')
y_train = np.expand_dims(sequence.pad_sequences(y_train_list, maxlen=DIM, padding='post'), axis=-1)

X_val = sequence.pad_sequences(t.texts_to_sequences(X_val_list), maxlen=DIM, padding='post')
y_val = np.expand_dims(sequence.pad_sequences(y_val_list, maxlen=DIM, padding='post'), axis=-1)

X_test = sequence.pad_sequences(t.texts_to_sequences(X_test_list), maxlen=DIM, padding='post')
y_test = np.expand_dims(sequence.pad_sequences(y_test_list, maxlen=DIM, padding='post'), axis=-1)

In [13]:
import numpy as np
embeddings_index = dict()
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    EMBEDDING_DIM = coefs.size

In [14]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((X_train.max()+1, EMBEDDING_DIM))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Input, Bidirectional, Attention, Dropout

Binary accuracy dziala dobrze, bo testowałem - PW

In [16]:
class JaccardSimilarity(tf.keras.metrics.Metric):
    def __init__(self, name, **kwargs):
        super().__init__(name=name)
        self.metric: tf.keras.metrics.Metric = tf.keras.metrics.IoU(**kwargs)

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_pred = tf.math.greater(y_pred, 0.5)
        self.metric.update_state(y_true, y_pred, sample_weight=sample_weight) # Dzięki temu, że tu mamy sample_weight, to metryka wspiera masking

    def reset_state(self):
        self.metric.reset_state()

    def result(self):
        return self.metric.result()

Podobienstwo Jaccarda gdy ground truth dla danego labela jest zbiorem pustym będzie 0. Jednak średnie podobieństwo poprawnie ignoruje taki wynik.

In [17]:
vocab_size = len(t.word_index) + 1

In [18]:
##LSTM
##fixing numpy RS
np.random.seed(0)
##fixing tensorflow RS
tf.random.set_seed(1)

HIDDEN_DIM= 256
DIM = 512

inputs = Input(shape=(DIM, ))
x = Embedding(vocab_size, EMBEDDING_DIM, weights=[embedding_matrix], trainable = False, mask_zero=True)(inputs)

x = Bidirectional(LSTM(HIDDEN_DIM, return_sequences=True))(x)

x = Bidirectional(LSTM(HIDDEN_DIM, return_sequences=True))(x)

x = Attention()([x, x, x])

x = Bidirectional(LSTM(HIDDEN_DIM, return_sequences=True))(x)

x = Dropout(0.1)(x)

x = Dense(1, activation='sigmoid')(x)

outputs = x

model = Model(inputs, outputs)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=[
                  'binary_accuracy',
                  JaccardSimilarity('jaccard_nonspoilers', num_classes=2, target_class_ids=[0]),
                  JaccardSimilarity('jaccard_spoilers', num_classes=2, target_class_ids=[1]),
                  JaccardSimilarity('mean_jaccard', num_classes=2, target_class_ids=[0, 1])
              ],
              optimizer='adam')
model.summary()

2023-01-19 19:01:13.414095: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-19 19:01:13.420170: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-19 19:01:13.420365: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-19 19:01:13.420889: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 512, 100)     8765500     ['input_1[0][0]']                
                                                                                                  
 bidirectional (Bidirectional)  (None, 512, 512)     731136      ['embedding[0][0]']              
                                                                                                  
 bidirectional_1 (Bidirectional  (None, 512, 512)    1574912     ['bidirectional[0][0]']          
 )                                                                                            

In [19]:
# # Sprawdzenie, czy poprawnie dziala masking
# test = np.array([[1, 2, 3] + [0] * 509, [200, 0, 1, 1] + [0] * 508])
# data_out = np.array([[0, 0, 0] + [1] * 509, [1, 0, 0, 1] + [1] * 508])
# data_out = np.expand_dims(data_out, axis = -1)
# model.evaluate(test, data_out, verbose=0)

In [20]:
checkpoint_name = f"./checkpoints/lstm-with-attention-best-val-512"

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_name,
    save_weights_only=True,
    monitor="val_binary_accuracy",
    mode="max",
    save_best_only=True,
)

model.fit(x=X_train, y=y_train, validation_data=(X_val, y_val), epochs=5, batch_size=32, callbacks=[model_checkpoint_callback])

Epoch 1/5


2023-01-19 19:01:28.480035: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA GeForce RTX 3060 Laptop GPU" frequency: 1425 num_cores: 30 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "11020" } environment { key: "cudnn" value: "8100" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 3145728 shared_memory_size_per_multiprocessor: 102400 memory_size: 4318887936 bandwidth: 336048000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2023-01-19 19:01:29.643140: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT



2023-01-19 19:04:05.556555: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA GeForce RTX 3060 Laptop GPU" frequency: 1425 num_cores: 30 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "11020" } environment { key: "cudnn" value: "8100" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 3145728 shared_memory_size_per_multiprocessor: 102400 memory_size: 4318887936 bandwidth: 336048000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb708165b40>

In [21]:
model.load_weights(checkpoint_name)
model.evaluate(X_test, y_test)



[0.5748364925384521,
 0.7059265375137329,
 0.5763131380081177,
 0.5098705291748047,
 0.5430918335914612]