In [1]:
# For Kernel Mode

# !pip install -q ../input/tensorflow-determinism
# !pip install -q ../input/huggingfacetokenizers/tokenizers-0.0.11-cp36-cp36m-manylinux1_x86_64.whl
# !pip uninstall --yes pytorch-transformers
# !pip install -q ../input/huggingface-transformers-master

In [2]:
"""
1. Use only title (100) + question (206) + answer (206) (failed)
2. LR decay factor=0.3 (failed)
3. Use one embedding input instead of two (failed)
4. Use three embedding inputs instead of two (failed)
5. Split question and answer FC layers (good)
7. Add category and domain as embeddings (good)
8. Drop out=0.2 (failed)
9. AdamW (failed)
10. Cyclic LR (failed)
11. Normalization for layer output (failed)
12. netloc as feature
"""

import pandas as pd
import numpy as np
import random
import random, math, time
import os, sys, re
from pathlib import Path
from urllib.parse import urlparse

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import bisect

import matplotlib.pyplot as plt
from tqdm import tqdm
# from tqdm.notebook import tqdm

import tensorflow as tf
import tensorflow.keras.backend as K
# https://github.com/NVIDIA/tensorflow-determinism
os.environ['TF_DETERMINISTIC_OPS'] = '1' # TF 2.1
# from tfdeterminism import patch
# patch()

import transformers
from transformers import *

import torch

from scipy.stats import spearmanr
from math import floor, ceil

from bs4 import BeautifulSoup

import gc
gc.enable()

np.set_printoptions(suppress=True)
print('Tensorflow version', tf.__version__)

print('PyTorch version', torch.__version__)

print('Transformers version',
      transformers.__version__)  # Current version: 2.3.0

Tensorflow version 2.1.0-rc0
PyTorch version 1.1.0
Transformers version 2.4.1


In [3]:
# https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        # Expose only the first physical GPU to TensorFlow.
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(f"{len(gpus)} Physical GPUs, {len(logical_gpus)} Logical GPU")
    except RuntimeError as err:
        # Raised when the visible devices are changed after the GPUs have
        # already been initialized; report it and carry on.
        print(err)

1 Physical GPUs, 1 Logical GPU


In [4]:
# --- Run-mode switches -------------------------------------------------------
debug_mode = False   # True: batch size 2, and the training loop stops after one fold
kernel_mode = False  # True: selects batch size 4 instead of 3 (non-debug runs)

# --- Reproducibility / cross-validation --------------------------------------
rand_seed = 20201120
n_splits = 5

# --- Paths -------------------------------------------------------------------
# Alternative locations that were used previously:
#   dataset_folder = Path("../input/google-quest-challenge/")
#   BERT_PATH = "../input/huggingface-transformers/"
dataset_folder = Path("/workspace/Kaggle/QA/")
BERT_PATH = "/workspace/Kaggle/QA/pretrained_models/"

# --- Sequence lengths --------------------------------------------------------
MAX_SEQUENCE_LENGTH = 512
max_title_length = 100  # previously tried: 50

# --- Optimisation ------------------------------------------------------------
learning_rate = 2e-5
embeddings_dropout = 0.2  # previously tried: 0.05
dense_dropout = 0.2       # previously tried: 0.05

epochs = 15  # previously tried: 2 (debug) and 6 (full run)
if debug_mode:
    batch_size = 2
elif kernel_mode:
    batch_size = 4
else:
    batch_size = 3  # previously tried: 4

lr_decay_patience = 2        # previously tried: 1
early_stopping_patience = 3  # previously tried: 2

In [5]:
# Read the three CSV files from the configured dataset folder.
df_train, df_test, df_sub = (
    pd.read_csv(dataset_folder / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")

Train shape: (6079, 41)
Test shape: (476, 11)


In [6]:
# Targets: every column from positional index 11 onwards.
output_categories = df_train.columns[11:].tolist()
# Text inputs: positional columns 1, 2 and 5
# (question_title, question_body and answer in the printed output).
input_categories = df_train.columns[[1, 2, 5]].tolist()

print(f"\nOutput categories:\n {output_categories}")
print(f"\nInput categories:\n {input_categories}")


Output categories:
 ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 'question_type_compare', 'question_type_consequence', 'question_type_definition', 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 'question_type_reason_explanation', 'question_type_spelling', 'question_well_written', 'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance', 'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure', 'answer_type_reason_explanation', 'answer_well_written']

Input categories:
 ['question_title', 'question_body', 'answer']


In [7]:
# Number of training rows per `host` value; dropna=False also counts missing hosts.
df_train['host'].value_counts(dropna=False)

stackoverflow.com                      1253
english.stackexchange.com               229
superuser.com                           227
electronics.stackexchange.com           221
serverfault.com                         213
                                       ... 
meta.christianity.stackexchange.com       4
robotics.stackexchange.com                2
meta.askubuntu.com                        2
meta.math.stackexchange.com               2
meta.codereview.stackexchange.com         2
Name: host, Length: 63, dtype: int64

In [8]:
# Number of training rows per `category` value; dropna=False also counts missing categories.
df_train['category'].value_counts(dropna=False)

TECHNOLOGY       2441
STACKOVERFLOW    1253
CULTURE           963
SCIENCE           713
LIFE_ARTS         709
Name: category, dtype: int64

In [9]:
# Extract domain
def extract_netloc(x):
    """Shorten a dotted host name to its leading label(s).

    Hosts with at most three dot-separated parts are reduced to their first
    part. Longer hosts are printed and reduced to their first two parts,
    re-joined with ".".
    """
    parts = x.split(".")
    if len(parts) <= 3:
        return parts[0]

    # Hosts this long look like "meta" sites, which are special and worth
    # keeping distinct: https://stackoverflow.com/help/whats-meta
    # (the part of the site where users discuss the workings and policies of
    # the site rather than its subject matter).
    print(x)
    return ".".join(parts[:2])


# TODO: test it
# df_train['netloc'] = df_train['host'].apply(
#     lambda x: extract_netloc(x))
# df_test['netloc'] = df_test['host'].apply(
#     lambda x: extract_netloc(x))

# `netloc` is the first dot-separated label of the host name.
df_train['netloc'] = df_train['host'].map(lambda host: host.split(".", 1)[0])
df_test['netloc'] = df_test['host'].map(lambda host: host.split(".", 1)[0])

In [10]:
def set_all_seeds(rand_seed):
    """Seed every random number generator this notebook relies on.

    Sets PYTHONHASHSEED and seeds the `random` module, NumPy, TensorFlow and
    PyTorch with `rand_seed`, then switches cuDNN to deterministic mode with
    benchmarking disabled.
    """
    os.environ['PYTHONHASHSEED'] = str(rand_seed)

    # Python standard library and NumPy
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(rand_seed)

    # TensorFlow 2.x global seed
    tf.random.set_seed(rand_seed)

    # PyTorch
    torch.manual_seed(rand_seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [11]:
# Seed all RNGs once up front, before any data processing or model creation.
set_all_seeds(rand_seed)

In [12]:
# Redirect everything written to sys.stdout from here on to /dev/stdout
# (the process console) instead of the notebook cell output.
# `jupyter_console` keeps the notebook's original stream so it can be restored
# later with `sys.stdout = jupyter_console`.
# NOTE(review): the file object opened here is never closed explicitly, and
# the '/dev/stdout' path presumably only exists on Unix-like systems - confirm
# before running elsewhere.
import sys
jupyter_console = sys.stdout
sys.stdout = open('/dev/stdout', 'w')

# Alternatives (disabled): append to a log file, or switch back to the notebook.
# sys.stdout = open(f"stdout.log", 'a')
# sys.stdout = jupyter_console



## Preprocessing Utilities

In [13]:
def _convert_to_transformer_inputs(title, question, answer, tokenizer,
                                   max_sequence_length):
    """Encode one (title, question, answer) triple as padded transformer inputs.

    The three texts are whitespace-normalised, stripped of tokens that start
    with "$" or a backslash, and have a fixed list of HTML entities replaced
    by their characters. The question and answer are then passed through
    BeautifulSoup to extract plain text (on failure the error is printed and
    the text is used as it is).

    Two sequences are encoded with `tokenizer.encode_plus`:
    the truncated title plus the question, and the answer on its own.

    Returns:
        A list of six lists: ids, masks and segments for the question
        sequence followed by ids, masks and segments for the answer sequence,
        each right-padded up to `max_sequence_length`.

    Reads the module-level `max_title_length` to truncate the title.
    """
    # Applied in this order; later replacements see the output of earlier ones.
    entity_table = (
        ("&quot;", "\""),
        ("&num;", "#"),
        ("&dollar;", "$"),
        ("&percnt;", "%"),
        ("&amp;", "&"),
        ("&apos;", "'"),
        ("&lpar;", "("),
        ("&rpar;", ")"),
        ("&ast;", "*"),
        ("&plus;", "+"),
        ("&comma;", ","),
        ("&minus;", "-"),
        ("&period;", "."),
        ("&sol;", "/"),
        ("&colon;", ":"),
        ("&semi;", ";"),
        ("&lt;", "<"),
        ("&equals;", "="),
        ("&gt;", ">"),
        ("&quest;", "?"),
        ("&commat;", "@"),
        ("&lsqb;", "["),
        ("&bsol;", "\\"),
        ("&rsqb;", "]"),
        ("&Hat;", "^"),
        ("&lowbar;", "_"),
        ("&grave;", "`"),
        ("&lcub;", "{"),
        ("&verbar;", "|"),
        ("&rcub;", "}"),
    )

    def encode_padded(str1, str2, truncation_strategy, length):
        """Tokenize and right-pad ids / attention mask / segment ids to `length`."""
        encoded = tokenizer.encode_plus(str1,
                                        str2,
                                        add_special_tokens=True,
                                        max_length=length,
                                        truncation_strategy=truncation_strategy)
        token_ids = encoded["input_ids"]
        n_real = len(token_ids)
        segment_ids = encoded["token_type_ids"]
        # A non-positive pad count yields empty padding lists below.
        pad_count = length - n_real
        pad_id = tokenizer.pad_token_id
        return [
            token_ids + [pad_id] * pad_count,
            [1] * n_real + [0] * pad_count,
            segment_ids + [0] * pad_count,
        ]

    def unescape_entities(text):
        """Replace each entity of `entity_table` by its character."""
        for entity, char in entity_table:
            text = text.replace(entity, char)
        return text

    def tidy(raw):
        """Collapse whitespace, drop "$..." / "\\..." tokens, unescape entities."""
        kept = [
            tok for tok in str(raw).split()
            if not tok.startswith(("$", "\\"))
        ]
        return unescape_entities(" ".join(kept)).strip()

    def html_to_text(markup):
        """Return the plain text of `markup`; on any error print it and keep `markup`."""
        try:
            return BeautifulSoup(markup).get_text()
        except Exception as err:
            print(err)
            return markup

    # Normalise all three texts first, then strip HTML from question and answer.
    title, question, answer = tidy(title), tidy(question), tidy(answer)
    question = html_to_text(question)
    answer = html_to_text(answer)

    question_text = ("[CLS] " + title[:max_title_length] + " [SEP] " +
                     question + " [SEP]")
    question_features = encode_padded(question_text, None, 'longest_first',
                                      max_sequence_length)

    answer_text = "[CLS] " + answer + " [SEP]"
    answer_features = encode_padded(answer_text, None, 'longest_first',
                                    max_sequence_length)

    return question_features + answer_features


def compute_input_arrays(df, columns, tokenizer, max_sequence_length):
    """Encode every row of `df[columns]` into six int32 arrays.

    Each row's `question_title`, `question_body` and `answer` are passed to
    `_convert_to_transformer_inputs`. The per-row results are stacked into a
    list of six arrays: ids, masks and segments for the question sequence,
    followed by ids, masks and segments for the answer sequence.
    """
    n_fields = 6
    collected = [[] for _ in range(n_fields)]

    for _, row in tqdm(df[columns].iterrows()):
        encoded = _convert_to_transformer_inputs(row.question_title,
                                                 row.question_body,
                                                 row.answer, tokenizer,
                                                 max_sequence_length)
        for bucket, values in zip(collected, encoded):
            bucket.append(values)

    return [np.asarray(bucket, dtype=np.int32) for bucket in collected]


def compute_output_arrays(df, columns):
    """Return the selected `columns` of `df` as a NumPy array."""
    selected = df[columns]
    return np.asarray(selected)

In [14]:
def compute_spearmanr_ignore_nan(trues, preds):
    """Mean of the column-wise Spearman correlations, ignoring NaN columns.

    `trues` and `preds` are transposed so that each target column is
    correlated separately; columns whose correlation is NaN are left out of
    the mean.
    """
    per_column_rho = [
        spearmanr(true_col, pred_col).correlation
        for true_col, pred_col in zip(np.transpose(trues), np.transpose(preds))
    ]
    return np.nanmean(per_column_rho)

In [15]:
class SpearmanMonitorCallback(tf.keras.callbacks.Callback):
    """Print the validation Spearman score after every epoch.

    The score is computed on the average of the validation predictions from
    all epochs seen so far in the current training run, not on the latest
    epoch alone.

    Args:
        valid_data: pair `(inputs, targets)` used for validation.
        batch_size: batch size passed to `model.predict`.
        fold: zero-based fold index used in the printed message, or None to
            omit the fold from the message.
    """
    def __init__(self, valid_data, batch_size=16, fold=None):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]

        self.batch_size = batch_size
        self.fold = fold
        self.valid_predictions = []

    def on_train_begin(self, logs=None):
        # Start every training run with an empty prediction history.
        self.valid_predictions = []

    def on_epoch_end(self, epoch, logs=None):
        self.valid_predictions.append(
            self.model.predict(self.valid_inputs, batch_size=self.batch_size))

        rho_val = compute_spearmanr_ignore_nan(
            self.valid_outputs, np.average(self.valid_predictions, axis=0))

        # `fold` is optional; only mention it when one was given.
        fold_label = f"Fold {self.fold+1} " if self.fold is not None else ""
        print(f" {fold_label}Validation Score: {rho_val:.6f}")
        
class SpearmanRhoEarlyStoppingCallback(tf.keras.callbacks.Callback):
    """Checkpoint and early-stop on the validation Spearman correlation.

    After every epoch the model predicts on the validation inputs and the
    column-wise Spearman correlations are averaged, ignoring NaN columns in
    the same way as `compute_spearmanr_ignore_nan`. When the score is at least
    as good as the best so far, the weights are saved to `model_save_path`.
    Training stops once `patience` consecutive epochs have failed to match the
    best score.

    Args:
        valid_data: pair `(inputs, targets)` used for validation.
        batch_size: batch size passed to `model.predict`.
        fold: zero-based fold index used in the printed messages, or None to
            omit the fold from the messages.
        model_save_path: where to save the best weights; None disables saving.
        patience: number of consecutive non-improving epochs tolerated.
    """
    def __init__(self, valid_data, batch_size=16, fold=None, model_save_path=None, patience=2):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.x_val = valid_data[0]
        self.y_val = valid_data[1]

        self.batch_size = batch_size
        self.fold = fold
        self.model_save_path = model_save_path

        self.patience = patience
        self.current_best = -1
        self.bad_epochs = 0

    def on_train_begin(self, logs=None):
        # Reset the tracking state so that an instance can be reused safely.
        self.current_best = -1
        self.bad_epochs = 0

    def on_epoch_end(self, epoch, logs=None):
        y_pred_val = self.model.predict(self.x_val, batch_size=self.batch_size)
        n_rows, n_targets = y_pred_val.shape

        # A tiny amount of noise is added to the predictions before ranking;
        # presumably this avoids an undefined correlation when a predicted
        # column is constant.
        rhos = [
            spearmanr(
                self.y_val[:, ind],
                y_pred_val[:, ind] + np.random.normal(0, 1e-7, n_rows)).correlation
            for ind in range(n_targets)
        ]
        # nanmean keeps one undefined column (e.g. a constant target column in
        # this fold) from turning the whole score into NaN, which would block
        # every checkpoint.
        rho_val = np.nanmean(rhos)

        if rho_val >= self.current_best:
            self.current_best = rho_val
            # Patience counts consecutive bad epochs, so start over.
            self.bad_epochs = 0
            if self.model_save_path is not None:
                self.model.save_weights(self.model_save_path)
        else:
            self.bad_epochs += 1
            print(f"\nEpoch {epoch}: no improvement")

        if self.bad_epochs >= self.patience:
            print(f"\nEpoch {epoch} early stopping ......")
            self.model.stop_training = True

        # `fold` is optional; only mention it when one was given.
        fold_label = f"Fold {self.fold+1} " if self.fold is not None else ""
        print(f"\n{fold_label}Validation Score: {rho_val:.6f}")

        return rho_val

## Load Pretrained Model Topology and Weights

In [16]:
is_tf = True
pretrained_model_name = "xlnet-base-cased"

# Only the model class depends on the backend; the tokenizer class is the
# same for TensorFlow and PyTorch.
model_class = TFAutoModel if is_tf else AutoModel
tokenizer_class = AutoTokenizer

In [17]:
# Load the tokenizer from the local pretrained-model directory.
tokenizer = tokenizer_class.from_pretrained(BERT_PATH + pretrained_model_name)

## Create Custom Model

In [18]:
def create_model(embed_info):
    """Build the two-tower question/answer model with categorical embeddings.

    A single pretrained transformer (loaded via the module-level `model_class`
    from `BERT_PATH + pretrained_model_name`) is applied to both the question
    sequence and the answer sequence, so both towers share its weights. The
    vector at sequence position 0 of each output is concatenated with a dense
    projection of the host / netloc / category embeddings and fed to separate
    dense heads.

    Args:
        embed_info: mapping with the keys "host", "netloc" and "category".
            Each value is indexed as `[0]` for the Embedding `input_dim` and
            `[1]` for its `output_dim`.

    Returns:
        An uncompiled `tf.keras.models.Model` taking nine inputs in this
        order: question ids, question mask, question segments, answer ids,
        answer mask, answer segments, host id, netloc id, category id.
        Its single output concatenates 21 sigmoid units from the question head
        with 9 sigmoid units from the answer head.

    Note:
        Layers are created in a fixed order. NOTE(review): saved weight files
        may depend on that order - confirm before reordering statements.
    """
    # Token id inputs for the question and answer sequences.
    q_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    a_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    # Attention mask inputs (passed as `attention_mask` below).
    q_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    a_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    # Despite the `_atn` suffix these are passed as `token_type_ids` below,
    # i.e. they carry the segment ids.
    q_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    a_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    # One transformer instance, reused for both sequences.
    pretrained_model = model_class.from_pretrained(BERT_PATH +
                                                   f"{pretrained_model_name}")

    # Get last hidden-state from 1st element of output
    q_embedding = pretrained_model(q_id,
                                   attention_mask=q_mask,
                                   token_type_ids=q_atn)[0]
    a_embedding = pretrained_model(a_id,
                                   attention_mask=a_mask,
                                   token_type_ids=a_atn)[0]

    # Take the hidden state at sequence position 0.
    # NOTE(review): this assumes position 0 holds the CLS-style summary token
    # for the chosen tokenizer/model - confirm.
    q = q_embedding[:, 0, :]
    a = a_embedding[:, 0, :]

    # One scalar id per sample for each categorical feature.
    host_input = tf.keras.Input(shape=(1,), name="host_input")
    netloc_input = tf.keras.Input(shape=(1,), name="netloc_input")
    cate_input = tf.keras.Input(shape=(1,), name="category_input")
    
    host_embed_info = embed_info["host"]
    host_embed = tf.keras.layers.Embedding(
        input_dim=host_embed_info[0],
        output_dim=host_embed_info[1],
        input_length=(1, ))(host_input)

    netloc_embed_info = embed_info["netloc"]
    netloc_embed = tf.keras.layers.Embedding(
        input_dim=netloc_embed_info[0],
        output_dim=netloc_embed_info[1],
        input_length=(1, ))(netloc_input)

    cate_embed_info = embed_info["category"]
    cate_embed = tf.keras.layers.Embedding(
        input_dim=cate_embed_info[0],
        output_dim=cate_embed_info[1],
        input_length=(1, ))(cate_input)
    
    # Drop the length-1 sequence axis so each embedding is a flat vector.
    host_embed = tf.keras.layers.Reshape(target_shape=(host_embed_info[1],))(host_embed)
    netloc_embed = tf.keras.layers.Reshape(target_shape=(netloc_embed_info[1],))(netloc_embed)
    cate_embed = tf.keras.layers.Reshape(target_shape=(cate_embed_info[1],))(cate_embed)

    # Batch normalization before concatenation (disabled experiment)
    # q_pooler_output = tf.keras.layers.BatchNormalization(momentum=0.99)(q_pooler_output)
    # a_pooler_output = tf.keras.layers.BatchNormalization(momentum=0.99)(a_pooler_output)

    # q = tf.keras.layers.BatchNormalization(momentum=0.99)(q)
    # host_embed = tf.keras.layers.BatchNormalization(momentum=0.99)(host_embed)
    # cate_embed = tf.keras.layers.BatchNormalization(momentum=0.99)(cate_embed)

    # Joint 128-unit projection of the three categorical embeddings; it is
    # shared by the question and answer heads.
    embed_concat = tf.keras.layers.Concatenate()([host_embed, netloc_embed, cate_embed])
    embed_concat = tf.keras.layers.Dense(128, activation='relu')(embed_concat)

    # Concatenation: each head sees its text vector plus the categorical projection
    q_concat = tf.keras.layers.Concatenate()([q, embed_concat])
    # q_concat = tf.keras.layers.Concatenate()([q, host_embed, cate_embed, q_pooler_output])
    q_concat = tf.keras.layers.Dense(256, activation='relu')(q_concat)

    a_concat = tf.keras.layers.Concatenate()([a, embed_concat])
    # a_concat = tf.keras.layers.Concatenate()([a, host_embed, cate_embed, a_pooler_output])
    a_concat = tf.keras.layers.Dense(256, activation='relu')(a_concat)
    
    # Dense dropout (disabled experiment)
    # q_concat = tf.keras.layers.Dropout(dense_dropout)(q_concat)
    # a_concat = tf.keras.layers.Dropout(dense_dropout)(a_concat)

    # Use sigmoid for multi-label predictions:
    # 21 units from the question head, 9 from the answer head.
    q_concat = tf.keras.layers.Dense(21, activation='sigmoid')(q_concat)
    a_concat = tf.keras.layers.Dense(9, activation='sigmoid')(a_concat)

    # Final output: question units first, then answer units (30 in total).
    x = tf.keras.layers.Concatenate()([q_concat, a_concat])

    # The input order must match the order in which the input arrays are
    # supplied to fit / predict.
    model = tf.keras.models.Model(inputs=[
        q_id,
        q_mask,
        q_atn,
        a_id,
        a_mask,
        a_atn,
        host_input,
        netloc_input,
        cate_input
    ],
                                  outputs=x)

    return model

In [19]:
# Training targets and tokenized training inputs.
outputs = compute_output_arrays(df=df_train, columns=output_categories)
inputs = compute_input_arrays(df=df_train,
                              columns=input_categories,
                              tokenizer=tokenizer,
                              max_sequence_length=MAX_SEQUENCE_LENGTH)

# Tokenized test inputs, encoded the same way as the training inputs.
test_inputs = compute_input_arrays(df=df_test,
                                   columns=input_categories,
                                   tokenizer=tokenizer,
                                   max_sequence_length=MAX_SEQUENCE_LENGTH)

6079it [00:15, 399.37it/s]
476it [00:01, 398.25it/s]


### Split K-Folds by Unique Group

In [20]:
set_all_seeds(rand_seed)
# Group by question body so that rows sharing the same question never end up
# on both sides of a train/validation split. The splits are materialised as a
# list so they can be iterated more than once.
gkf = list(
    GroupKFold(n_splits=n_splits).split(X=df_train.question_body,
                                        groups=df_train.question_body))
len(gkf)

5

### Learning Rate Finder

In [21]:
"`learner` contains essential learner utilities"

import matplotlib.pyplot as plt
import seaborn as sns
import math

class LRFinder:
    """
    Plots the change of the loss function of a Keras model when the learning rate is exponentially increasing.
    See for details:
    https://towardsdatascience.com/estimating-optimal-learning-rate-for-a-deep-neural-network-ce32f2556ce0

    Usage: build and compile a model, wrap it in `LRFinder(model)`, call
    `find(...)` or `find_generator(...)`, then inspect `plot_loss()` /
    `plot_loss_change()`. The model's weights and learning rate are restored
    after the sweep, even when training raises.
    """
    def __init__(self, model):
        self.model = model
        self.losses = []      # loss logged after every batch
        self.lrs = []         # learning rate in effect for every batch
        self.best_loss = 1e9
        self.lr_mult = 1.0    # per-batch multiplier, set by find / find_generator

    @staticmethod
    def _count_samples(x_train):
        """Return the number of training samples described by `x_train`.

        A multi-input model receives a list/tuple (or dict) of arrays that
        share their first axis; in that case the length of the first array is
        the sample count, not the length of the container.
        """
        if isinstance(x_train, dict):
            x_train = list(x_train.values())
        if (isinstance(x_train, (list, tuple)) and len(x_train) > 0
                and hasattr(x_train[0], "shape")):
            return len(x_train[0])
        return len(x_train)

    @staticmethod
    def _trim(values, n_skip_beginning, n_skip_end):
        """Drop `n_skip_beginning` leading and `n_skip_end` trailing items.

        An explicit stop index is used because `values[a:-0]` would be empty.
        """
        stop = max(len(values) - n_skip_end, 0)
        return values[n_skip_beginning:stop]

    def _run_sweep(self, start_lr, tolerance, run_fit):
        """Run `run_fit(callback)` with the LR sweep installed, then restore the model.

        The weights are saved to 'tmp.h5' beforehand and reloaded afterwards;
        the optimizer's learning rate is put back as well. Both restorations
        happen in a `finally` block so a failing fit cannot leave the model in
        the swept state.
        """
        # Save weights into a file
        self.model.save_weights('tmp.h5')

        # Remember the original learning rate
        original_lr = K.get_value(self.model.optimizer.lr)

        # Set the initial learning rate
        K.set_value(self.model.optimizer.lr, start_lr)

        callback = tf.keras.callbacks.LambdaCallback(
            on_batch_end=lambda batch, logs: self.on_batch_end(batch, logs, tolerance))

        try:
            run_fit(callback)
        finally:
            # Restore the weights to the state before model fitting
            self.model.load_weights('tmp.h5')

            # Restore the original learning rate
            K.set_value(self.model.optimizer.lr, original_lr)

    def on_batch_end(self, batch, logs, tolerance=4):
        """Record lr and loss for this batch, then raise the lr for the next one.

        Training is stopped when the loss is NaN or exceeds
        `best_loss * tolerance`.
        """
        # Log the learning rate
        lr = K.get_value(self.model.optimizer.lr)
        self.lrs.append(lr)

        # Log the loss
        loss = logs['loss']
        self.losses.append(loss)

        # Check whether the loss got too large or NaN
        if math.isnan(loss) or loss > self.best_loss * tolerance:
            self.model.stop_training = True
            return

        if loss < self.best_loss:
            self.best_loss = loss

        # Increase the learning rate for the next batch
        lr *= self.lr_mult
        K.set_value(self.model.optimizer.lr, lr)

    def find(self, x_train, y_train, start_lr=1e-6, end_lr=1e-1, batch_size=64, epochs=1, tolerance=4):
        """Sweep the learning rate from `start_lr` to `end_lr` on in-memory data.

        `x_train` may be a single array or, for multi-input models, a
        list/tuple/dict of arrays sharing their first axis.
        """
        # Keras runs ceil(n_samples / batch_size) batches per epoch.
        steps_per_epoch = math.ceil(self._count_samples(x_train) / batch_size)
        num_batches = max(epochs * steps_per_epoch, 1)

        self.lr_mult = (float(end_lr) / float(start_lr)) ** (1.0 / num_batches)

        self._run_sweep(
            start_lr, tolerance,
            lambda callback: self.model.fit(x_train, y_train,
                                            batch_size=batch_size, epochs=epochs,
                                            callbacks=[callback]))

    def find_generator(self, generator, start_lr=1e-6, end_lr=1e-1, epochs=1, tolerance=4, steps_per_epoch=None, **kw_fit):
        """Sweep the learning rate from `start_lr` to `end_lr` on a generator.

        Raises:
            ValueError: if `steps_per_epoch` is None and `generator` has no
                usable length.
        """
        if steps_per_epoch is None:
            try:
                steps_per_epoch = len(generator)
            except (TypeError, ValueError, NotImplementedError) as e:
                # Plain generators raise TypeError from len().
                raise ValueError('`steps_per_epoch=None` is only valid for a'
                                 ' generator based on the '
                                 '`keras.utils.Sequence`'
                                 ' class. Please specify `steps_per_epoch` '
                                 'or use the `keras.utils.Sequence` class.') from e

        # Spread the sweep over every batch of every epoch, as `find` does.
        num_batches = max(epochs * steps_per_epoch, 1)
        self.lr_mult = (float(end_lr) / float(start_lr)) ** (1.0 / num_batches)

        self._run_sweep(
            start_lr, tolerance,
            lambda callback: self.model.fit_generator(generator=generator,
                                                      epochs=epochs,
                                                      steps_per_epoch=steps_per_epoch,
                                                      callbacks=[callback],
                                                      **kw_fit))

    def plot_loss(self, n_skip_beginning=10, n_skip_end=5, log_scale=True):
        """
        Plots the loss.
        Parameters:
            n_skip_beginning - number of batches to skip on the left.
            n_skip_end - number of batches to skip on the right (0 keeps all).
            log_scale - use a logarithmic x axis.
        """
        plt.ylabel("loss")
        plt.xlabel("learning rate (log scale)")
        plt.plot(self._trim(self.lrs, n_skip_beginning, n_skip_end),
                 self._trim(self.losses, n_skip_beginning, n_skip_end))
        if log_scale:
            plt.xscale('log')

    def plot_loss_change(self, sma=1, n_skip_beginning=10, n_skip_end=5, y_lim=(-0.01, 0.01)):
        """
        Plots rate of change of the loss function.
        Parameters:
            sma - number of batches for simple moving average to smooth out the curve.
            n_skip_beginning - number of batches to skip on the left.
            n_skip_end - number of batches to skip on the right (0 keeps all).
            y_lim - limits for the y axis.
        """
        assert sma >= 1
        # The first `sma` points have no predecessor to difference against.
        derivatives = [0] * sma
        for i in range(sma, len(self.lrs)):
            derivatives.append((self.losses[i] - self.losses[i - sma]) / sma)

        plt.ylabel("rate of loss change")
        plt.xlabel("learning rate (log scale)")
        plt.plot(self._trim(self.lrs, n_skip_beginning, n_skip_end),
                 self._trim(derivatives, n_skip_beginning, n_skip_end))
        plt.xscale('log')
        plt.ylim(y_lim)

In [22]:
# K.clear_session()
# tmp_model = create_model(pretrained_model)
# tmp_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# tmp_model.compile(loss='binary_crossentropy', optimizer=tmp_optimizer)

In [23]:
# finder = LRFinder(tmp_model)

In [24]:
# train_idx, valid_idx = list(gkf)[0]

In [25]:
# tmp_train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]
# tmp_train_outputs = outputs[train_idx]
# # tmp_valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]
# # tmp_valid_outputs = outputs[valid_idx]

In [26]:
# set_all_seeds(rand_seed)
# finder.find(tmp_train_inputs, tmp_train_outputs,
#             start_lr=1e-7, end_lr=9e-5,
#             batch_size=4, epochs=5,
#             tolerance=500000)

In [27]:
# finder.plot_loss(log_scale=True, n_skip_beginning=5, n_skip_end=30)
# finder.plot_loss(n_skip_beginning=10, n_skip_end=5)

In [28]:
# del tmp_model, tmp_optimizer, tmp_train_inputs, tmp_train_outputs, finder
# del tmp_model, tmp_train_inputs, tmp_train_outputs, tmp_valid_inputs, tmp_valid_outputs, finder
# Run a garbage-collection pass; the clean-up `del` statements above are
# commented out together with the learning-rate-finder cells.
gc.collect()

2805

## Fine-Tuning

In [29]:
# Tag included in the file name of every saved fold checkpoint.
model_prefix = "exp_cate_embed"


# NOTE: this overrides the `batch_size` chosen in the configuration cell
# near the top of the notebook.
batch_size = 2
# Batch size used for validation predictions during and after training.
infer_batch_size = 2

In [None]:
val_scores = []


def extract_category_ids(train, test, c, info):
    """Label-encode column `c`, fitting on `train` only.

    Values of `test[c]` that do not occur in `train[c]` are replaced by
    '<unknown>', which is registered as an extra encoder class. Both frames
    are modified in place (`c` in `test`, and a new `c + "_label"` column in
    both), so pass copies.

    Args:
        train: DataFrame containing column `c`; used to fit the encoder.
        test: DataFrame containing column `c`; encoded with the fitted encoder.
        c: name of the categorical column.
        info: dict updated in place with `info[c] = (vocab_size, embedding_size)`.

    Returns:
        The `c + "_label"` Series of `train` and of `test`.
    """
    le = LabelEncoder()
    le.fit(train[c])

    # Set unknown category for values never seen in training
    known = set(le.classes_)
    test[c] = test[c].map(lambda s: s if s in known else '<unknown>')

    # Register '<unknown>' while keeping the classes sorted. The result is
    # stored as an ndarray, which is the type LabelEncoder itself uses for
    # `classes_`.
    le_classes = le.classes_.tolist()
    bisect.insort_left(le_classes, '<unknown>')
    le.classes_ = np.array(le_classes, dtype=object)

    train[c + "_label"] = le.transform(train[c])
    test[c + "_label"] = le.transform(test[c])

    no_of_unique_cat = train[c + "_label"].nunique()
    embedding_size = int(min(np.ceil(no_of_unique_cat / 2), 50))
    # +1 leaves room for the '<unknown>' class, which training rows never use.
    vocab_size = no_of_unique_cat + 1
    info[c] = (vocab_size, embedding_size)

    print(f"Extracted (vocab_size, embedding_size) for {c}: ({vocab_size}, {embedding_size})")

    return train[c + "_label"], test[c + "_label"]


for fold, (train_idx, valid_idx) in enumerate(gkf):

    # Re-seed per fold so that every fold starts from the same RNG state.
    set_all_seeds(rand_seed)

    print(f"Fine-tuning {pretrained_model_name} for Fold {fold+1} ......")
    SAVE_PATH = f"{dataset_folder}/{pretrained_model_name}_{model_prefix}_fold{fold+1}.h5"

    train_inputs = [arr[train_idx] for arr in inputs]
    train_outputs = outputs[train_idx]

    valid_inputs = [arr[valid_idx] for arr in inputs]
    valid_outputs = outputs[valid_idx]

    # Extra categorical embeddings. Only the column being encoded is copied,
    # because extract_category_ids modifies the frames it receives.
    embed_info = {}
    fold_train_df = df_train.iloc[train_idx]
    fold_valid_df = df_train.iloc[valid_idx]

    host_train, host_val = extract_category_ids(
        fold_train_df[["host"]].copy(), fold_valid_df[["host"]].copy(),
        "host", embed_info)
    netloc_train, netloc_val = extract_category_ids(
        fold_train_df[["netloc"]].copy(), fold_valid_df[["netloc"]].copy(),
        "netloc", embed_info)
    cate_train, cate_val = extract_category_ids(
        fold_train_df[["category"]].copy(), fold_valid_df[["category"]].copy(),
        "category", embed_info)

    # Order must match the model inputs: host, netloc, category.
    train_inputs.extend([host_train, netloc_train, cate_train])
    valid_inputs.extend([host_val, netloc_val, cate_val])

    K.clear_session()
    model = create_model(embed_info)

    # summary() prints the table itself and returns None.
    model.summary()

    cbs = [
        # Saves the best weights (by validation Spearman) to SAVE_PATH and
        # stops training when the score stops improving.
        SpearmanRhoEarlyStoppingCallback(valid_data=(valid_inputs,
                                                     valid_outputs),
                                         batch_size=infer_batch_size,
                                         fold=fold,
                                         model_save_path=SAVE_PATH,
                                         patience=early_stopping_patience),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            min_delta=1e-4,
            min_lr=1e-7,
            patience=lr_decay_patience,
            verbose=1),
    ]

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model.compile(loss='binary_crossentropy', optimizer=optimizer)

    model.fit(train_inputs,
              train_outputs,
              validation_data=(valid_inputs, valid_outputs),
              epochs=epochs,
              batch_size=batch_size,
              callbacks=cbs,
              shuffle=True,
              verbose=1)

    # Load best model weights
    model.load_weights(SAVE_PATH)

    fold_val_preds = model.predict(valid_inputs, batch_size=infer_batch_size)

    rho_val = compute_spearmanr_ignore_nan(valid_outputs, fold_val_preds)
    print(f"Fold {fold+1} Best Validation Score: {rho_val:.6f}")

    val_scores.append(rho_val)

    # Release the model before the next fold builds a new one.
    del model, rho_val, fold_val_preds
    gc.collect()

    if debug_mode:
        break

In [None]:
# Best validation Spearman score of each fold that was trained.
val_scores

In [None]:
# Average of the per-fold best validation scores.
print(f"Mean Validation Score: {np.mean(val_scores):.6f}")