In [None]:
import math
import os
import re
import json
import csv
import string
import numpy as np
import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
from tqdm import tqdm
from random import randrange

# Presumably the Kaggle input directory holding Hugging Face BERT checkpoints.
# NOTE(review): not referenced by any of the visible cells — confirm it is still needed.
MODEL_DIR = "/kaggle/input/huggingface-bert/"

In [None]:
# Shared WordPiece tokenizer built from the SciBERT vocabulary file; it
# lowercases its input. Used by the chunking and decoding cells below.
tokenizer = BertWordPieceTokenizer("../input/scibert-210605/vocab.txt", lowercase=True)

In [None]:
def load_json(path, pub_id):
    """Load and parse the JSON document of one publication.

    Args:
        path: Directory prefix. It is concatenated directly with the file
            name, so it must already end with a path separator.
        pub_id: Publication id; the file read is ``<path><pub_id>.json``.

    Returns:
        The decoded JSON content.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    filepath = path + "{}.json".format(pub_id)
    # JSON is UTF-8 by specification; do not depend on the locale default.
    # A failed open() raises by itself, so no extra error handling is needed.
    with open(filepath, "r", encoding="utf-8") as file:
        return json.load(file)

def clean_text(txt):
    """Replace every run of non-alphanumeric characters with a single space.

    ``txt`` is converted with ``str()`` first, so non-string values are
    accepted. Only ASCII letters and digits are kept; leading or trailing
    runs become a single leading or trailing space (no stripping).
    """
    non_alnum_run = re.compile('[^A-Za-z0-9]+')
    return non_alnum_run.sub(' ', str(txt))

def concat_sections(sections):
    """Join the ``'text'`` field of every section with single spaces.

    Args:
        sections: Iterable of mappings, each providing a ``'text'`` key.

    Returns:
        One string containing all section texts in their original order.
    """
    texts = [entry['text'] for entry in sections]
    return " ".join(texts)

def find_matches(text, label, ignore_case=False):
    """Return the start offsets of every literal occurrence of ``label``.

    Args:
        text: String to search in.
        label: String to look for; it is escaped, so regex metacharacters
            in it are matched literally.
        ignore_case: When true, match case-insensitively. Defaults to
            ``False``, which keeps the original case-sensitive behaviour.

    Returns:
        List of character offsets (ascending) where a non-overlapping
        match starts; empty when there is no match.
    """
    flags = re.IGNORECASE if ignore_case else 0
    pattern = re.compile(re.escape(label), flags)
    return [match.start() for match in pattern.finditer(text)]

In [None]:
def find_token_idx(offsets, start_char, end_char):
    """Map a character span onto a pair of token indices.

    Args:
        offsets: Sequence of ``(char_start, char_end)`` pairs, one per token.
        start_char: Character position where the span begins.
        end_char: Character position where the span ends.

    Returns:
        ``(first, last)``: ``first`` is the earliest token whose range
        contains ``start_char`` and from which a token containing
        ``end_char`` can be reached; ``last`` is the first such token at or
        after ``first``. Both range bounds are treated as inclusive.

    Raises:
        Exception: If no such pair of tokens exists.
    """
    token_count = len(offsets)
    for first in range(token_count):
        lo, hi = offsets[first]
        if not (lo <= start_char <= hi):
            continue
        # Candidate start token found; scan forward for the end token.
        for last in range(first, token_count):
            span_lo, span_hi = offsets[last]
            if span_lo <= end_char <= span_hi:
                return first, last
    raise Exception('invalid token indices: ({}, {}) for offsets {}'.format(start_char, end_char, offsets))

def find_token_indices(tokenized, matches, match_len):
    """Map several character matches onto token index pairs in one pass.

    Args:
        tokenized: Object exposing ``offsets``, a sequence of
            ``(char_start, char_end)`` pairs, one per token.
        matches: Character start positions of the matches, expected in
            ascending order because tokens are scanned once, left to right.
        match_len: Length in characters shared by every match.

    Returns:
        One ``(start_token, end_token)`` pair per match, in order. Range
        bounds are treated as inclusive. An empty ``matches`` yields ``[]``.

    Raises:
        Exception: If at least one match cannot be located in the tokens.
    """
    # Nothing to locate; previously this case failed on ``matches[0]``.
    if len(matches) == 0:
        return []

    offsets = tokenized.offsets
    token_matches = []
    match_idx = 0
    for start_idx, (tok_start, tok_end) in enumerate(offsets):
        if not (tok_start <= matches[match_idx] <= tok_end):
            continue
        end_char = matches[match_idx] + match_len
        # Scan forward from the start token for the token holding end_char.
        for end_idx in range(start_idx, len(offsets)):
            end_tok_start, end_tok_end = offsets[end_idx]
            if end_tok_start <= end_char <= end_tok_end:
                token_matches.append((start_idx, end_idx))
                match_idx += 1
                if match_idx >= len(matches):
                    return token_matches
                break
    raise Exception('could not find all matches in tokens')

In [None]:
def chunk_text(text):
    """Tokenize ``text`` and split it into fixed-width rows of token ids.

    Uses the module-level ``tokenizer`` and ``CONTEXT_LEN``.

    Returns:
        ``([ids, masks], offsets)`` where ``ids`` and ``masks`` are float32
        arrays of shape ``(chunk_count, CONTEXT_LEN)``. ``ids`` is
        zero-padded after the last token and ``masks`` is 1 for real tokens
        and 0 for padding. ``offsets`` is the tokenizer's offset list with
        its final entry replaced by ``(len(text), len(text))``.
    """
    encoding = tokenizer.encode(text)
    token_ids = encoding.ids
    n_tokens = len(token_ids)

    # Number of CONTEXT_LEN-wide rows needed to hold every token.
    n_chunks = int(math.ceil(n_tokens / CONTEXT_LEN))
    padded_len = n_chunks * CONTEXT_LEN

    flat_ids = np.zeros((padded_len,), dtype=np.float32)
    flat_ids[:n_tokens] = token_ids

    # Mask starts all-zero; only positions backed by a real token are set.
    flat_masks = np.zeros((padded_len,), dtype=np.float32)
    flat_masks[:n_tokens] = 1

    chunk_shape = (n_chunks, CONTEXT_LEN)
    # The last offset (presumably the trailing special token — TODO confirm)
    # is pinned to the end of the text.
    patched_offsets = encoding.offsets[:-1] + [(len(text), len(text))]

    return [flat_ids.reshape(chunk_shape), flat_masks.reshape(chunk_shape)], patched_offsets

In [None]:
CONTEXT_LEN = 512

def make_example(tokenized, label_start, label_end):
    """Cut a randomly shifted CONTEXT_LEN-wide token window around a label.

    Args:
        tokenized: Object exposing ``ids``, a list of token ids.
        label_start: Token index where the label begins.
        label_end: Token index just past the label (``label_end - label_start``
            is used as the label length).

    Returns:
        ``((type_ids, mask), (start_idx, end_idx))``: the zero-padded window
        of token ids, its 0/1 mask (1 for real tokens), and the label's
        start/end indices relative to the window.
    """
    span_len = label_end - label_start
    half = (CONTEXT_LEN - span_len) // 2
    # Random jitter so the label does not always sit in the window centre.
    shift = randrange(1 - half, half)

    # An odd label length leaves an odd amount of context; the spare token
    # goes to the left side so the window spans CONTEXT_LEN positions.
    odd_fix = 1 if span_len % 2 else 0
    window_left = label_start - half - odd_fix + shift
    window_right = label_end + half + shift

    # Clip the window to the available tokens and remember the overhang.
    clipped_left = max(0, window_left)
    clipped_right = min(len(tokenized.ids), window_right)
    pad_left = clipped_left - window_left
    pad_right = window_right - clipped_right

    left_fill = [0] * pad_left
    right_fill = [0] * pad_right
    type_ids = left_fill + tokenized.ids[clipped_left:clipped_right] + right_fill
    mask = left_fill + [1] * (CONTEXT_LEN - pad_right - pad_left) + right_fill

    # Translation from document token indices to window positions.
    to_window = pad_left - clipped_left
    return ((type_ids, mask), (label_start + to_window, label_end + to_window))

In [None]:
# Accumulators for the training set: one entry per kept chunk.
x_train_ids = []
x_train_masks = []
y_train_starts = []
y_train_ends = []
y_train_haslabel = []

# Every cleaned dataset label seen in train.csv.
labels = set()

# Chunks dropped because their character span could not be mapped to tokens.
skip_count = 0
# newline='' is what the csv module asks for so quoted fields that contain
# line breaks are parsed correctly.
with open('../input/coleridgeinitiative-show-us-the-data/train.csv', 'r', newline='') as traincsv:
    reader = csv.reader(traincsv)
    next(reader) # skip headers

    for pub_id, _, _, db_label, _ in tqdm(reader):
        labels.add(clean_text(db_label))

        sections = load_json("../input/coleridgeinitiative-show-us-the-data/train/", pub_id)
        text = concat_sections(sections)
        (chunk_ids, chunk_masks), offsets = chunk_text(text)

        for start_idx, ids, mask in zip(range(0, len(offsets), CONTEXT_LEN), chunk_ids, chunk_masks):
            # Token offsets of this chunk and the slice of raw text it covers.
            local_offsets = offsets[start_idx:min(start_idx + CONTEXT_LEN, len(offsets))]
            chunk_char_start = local_offsets[0][0]
            chunk_char_end = local_offsets[-1][1] # NOTE: not sure if this works
            orig_text = text[chunk_char_start:chunk_char_end]
            matches = find_matches(orig_text, db_label)

            if matches:
                # Positive example: the first occurrence of the label in the chunk.
                span_start = matches[0] + chunk_char_start
                span_end = span_start + len(db_label)
                has_label = 1.0
            else:
                # Negative example: a random span of the label's length,
                # clipped so it does not run past the end of the chunk text.
                span_start = randrange(0, max(1, len(orig_text) - len(db_label))) + chunk_char_start
                span_end = min(span_start + len(db_label), len(orig_text) + chunk_char_start)
                has_label = 0.0

            try:
                start, end = find_token_idx(local_offsets, span_start, span_end)
            except Exception:
                # find_token_idx raises when the span cannot be mapped to
                # tokens; count the chunk as skipped. Unlike a bare except,
                # this lets KeyboardInterrupt / SystemExit propagate.
                skip_count += 1
                continue

            x_train_ids.append(ids)
            x_train_masks.append(mask)
            y_train_starts.append(start)
            y_train_ends.append(end)
            y_train_haslabel.append(has_label)

x_train = [np.array(x_train_ids, dtype=np.float32), np.array(x_train_masks, dtype=np.float32)]
y_train = [np.array(y_train_starts, dtype=np.float32), np.array(y_train_ends, dtype=np.float32), np.array(y_train_haslabel, dtype=np.float32)]

In [None]:
def save_dataset(filepath, x_train, y_train):
    """Write the training arrays to a compressed ``.npz`` archive.

    Args:
        filepath: Destination passed to ``np.savez_compressed``.
        x_train: Sequence whose items 0 and 1 are the token ids and masks.
        y_train: Sequence whose items 0, 1 and 2 are the start indices, end
            indices and has-label flags.
    """
    arrays = {
        "x_train_ids": x_train[0],
        "x_train_masks": x_train[1],
        "y_train_starts": y_train[0],
        "y_train_ends": y_train[1],
        "y_train_haslabel": y_train[2],
    }
    np.savez_compressed(filepath, **arrays)

def load_dataset(filepath):
    """Read back an archive written by ``save_dataset``.

    Args:
        filepath: Path of the ``.npz`` file to read.

    Returns:
        ``(x_train, y_train)`` where ``x_train`` is ``[ids, masks]`` and
        ``y_train`` is ``[starts, ends, haslabel]``.
    """
    # The archive object keeps the file open until it is closed. Indexing
    # reads each array into memory, so everything is materialised before
    # the ``with`` block releases the file handle.
    with np.load(filepath) as data:
        x_train = [data['x_train_ids'], data['x_train_masks']]
        y_train = [data['y_train_starts'], data['y_train_ends'], data['y_train_haslabel']]
    return x_train, y_train

In [None]:
# Persist the full (unbalanced) training set.
save_dataset("unbalanced_dataset", x_train, y_train)

# Report, one value per line: total examples, positives, negatives, and the
# number of chunks that were skipped while building the set.
print(
    len(y_train_haslabel),
    sum(y_train_haslabel),
    sum(1 - haslabel for haslabel in y_train_haslabel),
    skip_count,
    sep="\n",
)

In [None]:
# Reload the archive written by save_dataset("unbalanced_dataset", ...);
# np.savez_compressed appends the ".npz" suffix. The previous path,
# "./dataset.npz", is never produced by this notebook, so a fresh
# Restart & Run All failed here.
x_train, y_train = load_dataset("./unbalanced_dataset.npz")

# Boolean selectors for examples with and without a label.
pos_example_matches = y_train[2] >= 0.5
neg_example_matches = y_train[2] < 0.5

# Down-sample the larger class so both classes have the same size.
examples_per_class = min(np.sum(pos_example_matches), np.sum(neg_example_matches))
indices = np.arange(0, y_train[2].shape[0])
pos_indices = np.random.choice(indices[pos_example_matches], examples_per_class, replace=False)
neg_indices = np.random.choice(indices[neg_example_matches], examples_per_class, replace=False)

# Mix both classes so batches are not grouped by class.
data_indices = np.concatenate([pos_indices, neg_indices])
np.random.shuffle(data_indices)

x_train_ids = x_train[0][data_indices]
x_train_masks = x_train[1][data_indices]
y_train_starts = y_train[0][data_indices]
y_train_ends = y_train[1][data_indices]
y_train_haslabels = y_train[2][data_indices]
x_train = [x_train_ids, x_train_masks]
y_train = [y_train_starts, y_train_ends, y_train_haslabels]

In [None]:
# Persist the class-balanced training set.
save_dataset("balanced_dataset", x_train, y_train)

# Report, one value per line: total examples, positives, negatives.
print(
    len(y_train[2]),
    sum(y_train[2]),
    sum(1 - haslabel for haslabel in y_train[2]),
    sep="\n",
)

In [None]:
def create_model():
    """Build and compile the span-extraction model on top of SciBERT.

    The model takes two ``(CONTEXT_LEN,)`` int32 inputs (token ids and
    attention mask) and produces three outputs:

    * a softmax over positions for the label start,
    * a softmax over positions for the label end,
    * a sigmoid confidence that the chunk contains a label.

    Returns:
        The compiled ``keras.Model``; start/end use sparse categorical
        cross-entropy and the confidence uses binary cross-entropy.
    """
    ## BERT encoder
    encoder = TFBertModel.from_pretrained("../input/scibert-210605/", from_pt=True)

    input_ids = layers.Input(shape=(CONTEXT_LEN,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(CONTEXT_LEN,), dtype=tf.int32)
    # First encoder output; presumably the per-token hidden states — TODO confirm.
    embedding = encoder(
        input_ids, attention_mask=attention_mask
    )[0]

    # One logit per position for the start and the end of the label.
    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    # Chunk-level confidence computed from the flattened encoder output.
    confidence = layers.Flatten()(embedding)
    confidence = layers.Dense(1, name="confidence", use_bias=False)(confidence)
    confidence = layers.Activation(keras.activations.sigmoid)(confidence)

    model = keras.Model(
        inputs=[input_ids, attention_mask],
        outputs=[start_probs, end_probs, confidence]
    )
    # The heads already apply softmax / sigmoid, hence from_logits=False.
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    # `learning_rate` is the supported argument name; `lr` is a deprecated
    # alias that newer Keras releases reject.
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss, keras.losses.BinaryCrossentropy(from_logits=False)])
    return model

In [None]:
# Toggle: build the model under a TPU distribution strategy, or directly.
use_tpu = True
if use_tpu:
    # Create distribution strategy: resolve the TPU, connect to it and
    # initialise it before the strategy object is constructed.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model inside the strategy scope so it is built under the strategy.
    with strategy.scope():
        model = create_model()
else:
    # No TPU requested: build the model without a distribution strategy.
    model = create_model()

model.summary()

In [None]:
# Train on the current x_train / y_train for 3 epochs with batches of 64.
# verbose=2 is passed to Keras to control progress logging; no callbacks are used.
model.fit(
    x_train,
    y_train,
    epochs=3,
    verbose=2,
    batch_size=64,
    callbacks=[],
)

In [None]:
def make_excerpts(pub_id):
    """Predict dataset-label excerpts for one test publication.

    The publication text is chunked, every chunk is scored by ``model``,
    and for each chunk whose confidence is at least 0.5 the tokens between
    the most likely start and end positions are decoded back to text.

    Args:
        pub_id: Id of the test publication (file name without ``.json``).

    Returns:
        List of decoded excerpt strings, one per confident chunk.
    """
    sections = load_json("../input/coleridgeinitiative-show-us-the-data/test/", pub_id)
    text = concat_sections(sections)
    (chunk_ids, chunk_masks), _ = chunk_text(text)

    start_probs, end_probs, confs = model.predict([chunk_ids, chunk_masks])

    # Keep only the chunks the confidence head believes contain a label.
    label_chunks = np.ravel(confs) >= 0.5
    starts = np.argmax(start_probs[label_chunks], axis=1)
    # The training targets come from find_token_idx, which returns the index
    # of the token containing the span end, so the predicted end index is
    # inclusive. It is turned into an exclusive slice bound exactly once
    # below; the earlier code added 1 twice and decoded one token too many.
    ends = np.argmax(end_probs[label_chunks], axis=1)
    return [
        # Chunk ids are stored as float32; hand the tokenizer a plain list of ints.
        tokenizer.decode(chunk[start:end + 1].astype(int).tolist())
        for chunk, start, end in zip(chunk_ids[label_chunks], starts, ends)
    ]

In [None]:
test_path = "../input/coleridgeinitiative-show-us-the-data/test"
# Only the files directly inside the test directory are considered.
_, _, filenames = next(os.walk(test_path))

# newline="" is what the csv module asks for when writing, so the writer
# alone controls line endings.
with open("submission.csv", "w", newline="") as submissions:
    writer = csv.writer(submissions)

    # headers
    writer.writerow(["Id", "PredictionString"])
    # entries (sorted so the output is reproducible between runs)
    for filename in sorted(filenames):
        pub_id, extension = os.path.splitext(filename)
        if extension != ".json":
            # Not a publication file; load_json could not read it anyway.
            continue
        excerpts = make_excerpts(pub_id)
        # De-duplicate, drop predictions that are blank after cleaning, and
        # sort: iterating a set directly gives a run-dependent order.
        cleaned = {clean_text(excerpt) for excerpt in excerpts}
        predictions = sorted(prediction for prediction in cleaned if prediction.strip())
        writer.writerow([pub_id, "|".join(predictions)])

In [None]:
import pickle

# Snapshot the weights of every layer of the trained model, layer by layer.
weights = [layer.get_weights() for layer in model.layers]

# Store the weight snapshot.
with open("model_weights.dat", 'wb') as file:
    pickle.dump(weights, file)

# Store the set of cleaned dataset labels collected from the training CSV.
with open("db_labels.dat", 'wb') as file:
    pickle.dump(labels, file)