# Settings

In [1]:
# CONTROLS
MODEL_PREFIX = "V01" 
MODEL_NUMBER = MODEL_PREFIX[-2:]

TRAIN_SPLIT_RATIO = 0.2
BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 32
DROPOUT = 0.3
LABEL_SMOOTHING_PARAM = 0.2

RUN_ON_SAMPLE = True
NUM_EPOCHS = [10, 40, 4]
MAX_LR = 5e-3
MID_LR = 1e-3
MIN_LR = 5e-5
STEP_SIZE = 2 # Number of training epochs per half-cycle in tclr

In [2]:
RESULTS_DIR = "../results/"
DATA_DIR = "../data/"
MODEL_DIR = "../models/"
EXT_MODEL_DIR = "../"

# Libraries

In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.utils import class_weight

import pickle, os, sys, re, json, gc
from time import time, ctime
from pprint import pprint

import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Conv1D, Conv2D, LSTM, Embedding, Dense, concatenate, MaxPooling2D, Softmax, Flatten
from tensorflow.keras.layers import BatchNormalization, Dropout, Reshape, Activation, Bidirectional, TimeDistributed
from tensorflow.keras.layers import RepeatVector, Multiply, Layer, LeakyReLU, Subtract
from tensorflow.keras.activations import softmax
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.callbacks import *
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint
import tensorflow.keras.backend as K
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.models import save_model, load_model

import tokenizers, transformers
from transformers import *

import tensorflow_addons as tfa
from tensorflow_addons.optimizers import TriangularCyclicalLearningRate

%matplotlib inline

In [4]:
seeded_value = 2510
pd.set_option('display.max_colwidth', None)
np.random.seed(seeded_value)
tf.random.set_seed(seeded_value)

In [5]:
print(ctime(time()))

Tue Jun 16 22:21:29 2020


In [6]:
print([
    tf.__version__,
    transformers.__version__,
    tokenizers.__version__
])

['2.1.0', '2.8.0', '0.5.2']


<a href="https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth"  target="_blank"><h2 id="limiting_gpu_memory_growth" data-text="Limiting GPU memory growth" tabindex="0">Limiting GPU memory growth</h2></a>
<p>By default, TensorFlow maps nearly all of the GPU memory of all GPUs (subject to
<a href="https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars"><code translate="no" dir="ltr">CUDA_VISIBLE_DEVICES</code></a>) visible to the process. This is done to more efficiently use the relatively precious GPU memory resources on the devices by reducing memory fragmentation. To limit TensorFlow to a specific set of GPUs we use the <code translate="no" dir="ltr">tf.config.experimental.set_visible_devices</code> method.</p>

In [7]:
print(tf.config.experimental.list_logical_devices('CPU'))
print(tf.config.experimental.list_logical_devices('GPU'))
print(tf.config.experimental.list_physical_devices('CPU'))
print(tf.config.experimental.list_physical_devices('GPU'))

[LogicalDevice(name='/device:CPU:0', device_type='CPU')]
[LogicalDevice(name='/device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [8]:
physical_devices = tf.config.list_physical_devices('GPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    # Invalid device or cannot modify virtual devices once initialized.
    pass

# Import Data

In [9]:
train = pd.read_csv(DATA_DIR+'jigsaw-toxic-comment-train.csv')
validation = pd.read_csv(DATA_DIR+'validation.csv')
test = pd.read_csv(DATA_DIR+'test.csv')

In [10]:
train['lang'] = 'en'

train['set'] = 'train'
validation['set'] = 'valid'
test['set'] = 'test'

test['toxic'] = 0

In [11]:
print(train.columns)
print(validation.columns)
print(test.columns)

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate', 'lang', 'set'],
      dtype='object')
Index(['id', 'comment_text', 'lang', 'toxic', 'set'], dtype='object')
Index(['id', 'content', 'lang', 'set', 'toxic'], dtype='object')


In [12]:
train.columns = ['id', 'text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'lang', 'set']
validation.columns = ['id', 'text', 'lang', 'toxic', 'set']
test.columns = ['id', 'text', 'lang', 'set', 'toxic']

In [13]:
REQ_COLS = ['id', 'set', 'text', 'lang', 'toxic']

In [14]:
data = pd.concat([train[REQ_COLS], validation[REQ_COLS], test[REQ_COLS]], axis=0)

In [15]:
data.shape

(295361, 5)

In [16]:
data.sample(2)

Unnamed: 0,id,set,text,lang,toxic
124131,97ed91f08edf2244,train,"This is just world bank published, this is an academic resource which is acceptable under the WP:RS clause of WP:SCHOLARSHIP. Sorry I did not realize I made a mistake on who really did the study.",en,0
65626,af8e60ad3dc2d65e,train,"""\nI think is unaware of the connotations of the word """"defame"""", as I don't think English is his first language. He does use it a lot in connection with . I nearly posted him a message about WP:LIBEL as a kind of friendly warning, but didn't think it was my place to. I am not too happy about the collapsible thread either, PBS. I know we cannot have open edit-warring on the Talk page, but I was a bit uncomfortable with it for perhaps obvious reasons. ~ """,en,0


In [17]:
data.groupby(["set", "lang"]).agg({'id':'count', 'toxic':np.mean})

Unnamed: 0_level_0,Unnamed: 1_level_0,id,toxic
set,lang,Unnamed: 2_level_1,Unnamed: 3_level_1
test,es,8438,0.0
test,fr,10920,0.0
test,it,8494,0.0
test,pt,11012,0.0
test,ru,10948,0.0
test,tr,14000,0.0
train,en,223549,0.095657
valid,es,2500,0.1688
valid,it,2500,0.1952
valid,tr,3000,0.106667


In [18]:
train['text'] = " " + train["text"].astype(str)
validation['text'] = " " + validation["text"].astype(str)
test['text'] = " " + test["text"].astype(str)
data['text'] = " " + data["text"].astype(str)

# XLM Roberta Tokenizer, Config & Model Initialization

1. https://arxiv.org/pdf/1911.02116.pdf
2. https://huggingface.co/transformers/model_doc/xlmroberta.html

In [19]:
xlmr_tok = transformers.XLMRobertaTokenizer.from_pretrained(MODEL_DIR)

In [20]:
with open(MODEL_DIR+"/special_tokens_map.json") as f:
    special_tokens = json.load(f)
xlmr_tok.add_special_tokens(special_tokens)

0

In [21]:
VOCAB_SIZE = xlmr_tok.vocab_size
print(VOCAB_SIZE)

250002


# Sample

In [22]:
SAMPLE_SIZE = 2000
if RUN_ON_SAMPLE:
    train = train.sample(SAMPLE_SIZE).copy()
    train = train.reset_index(drop=True)
    print("Train RUN_ON_SAMPLE", train.shape)
    
    validation = validation.sample(SAMPLE_SIZE).copy()
    validation = validation.reset_index(drop=True)
    print("Validation  RUN_ON_SAMPLE", validation.shape)
    
    test = test.sample(SAMPLE_SIZE).copy()
    test = test.reset_index(drop=True)
    print("Test  RUN_ON_SAMPLE", test.shape)

    data = data.sample(SAMPLE_SIZE).copy()
    data = data.reset_index(drop=True)
    print("Data  RUN_ON_SAMPLE", data.shape)

Train RUN_ON_SAMPLE (2000, 10)
Validation  RUN_ON_SAMPLE (2000, 5)
Test  RUN_ON_SAMPLE (2000, 5)
Data  RUN_ON_SAMPLE (2000, 5)


# Tokenization

In [23]:
MAX_SEQ_LEN = 75

In [24]:
X_tokens, X_att = [], []
for t in data.text.tolist():
    encoded_text = xlmr_tok.encode_plus(t, pad_to_max_length=True, max_length=MAX_SEQ_LEN)
    X_tokens.append(encoded_text['input_ids'])
    X_att.append(encoded_text['attention_mask'])

X_tokens, X_att = np.array(X_tokens), np.array(X_att)
X_tokens.shape, X_att.shape

((2000, 75), (2000, 75))

In [25]:
from tensorflow.keras.utils import to_categorical

In [26]:
Y_lang, Y_toxic = data['lang'].astype('category').cat.codes.values, data['toxic'].values
Y_lang = to_categorical(Y_lang)

In [27]:
NUM_LANG = data['lang'].nunique()

In [28]:
print("\n",
      X_tokens.shape, "\t: X_tokens ", "\n",
      X_att.shape, "\t: X_att ", "\n",
      Y_lang.shape, "\t: Y_lang ", "\n",
      Y_toxic.shape, "\t: Y_toxic ", "\n",
)


 (2000, 75) 	: X_tokens  
 (2000, 75) 	: X_att  
 (2000, 7) 	: Y_lang  
 (2000,) 	: Y_toxic  



# Model Specifications

In [29]:
def build_model():
    input_sequences = Input((MAX_SEQ_LEN), dtype=tf.int32, name="words")
    input_att_flags = Input((MAX_SEQ_LEN), dtype=tf.int32, name="att_flags")
    
    xlmr_config = transformers.XLMRobertaConfig.from_pretrained(MODEL_DIR)
    xlmr_model = transformers.TFXLMRobertaModel.from_pretrained(MODEL_DIR, config=xlmr_config)
    x = xlmr_model(inputs=input_sequences, attention_mask=input_att_flags)
    
    x1 = tf.keras.layers.Dropout(DROPOUT)(x[0])
    x1 = tf.keras.layers.Conv1D(768, 2, padding='same')(x1)
    x1 = tf.keras.layers.BatchNormalization()(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Dense(1)(x1)
    x1 = tf.keras.layers.BatchNormalization()(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    x1 = tf.keras.layers.Dense(1)(x1)
    toxic_output = tf.keras.layers.Activation('sigmoid', name="toxic_output")(x1)
    
    x2 = tf.keras.layers.Dropout(DROPOUT)(x[0])
    x2 = tf.keras.layers.Conv1D(768, 2, padding='same')(x2)
    x2 = tf.keras.layers.BatchNormalization()(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Dense(1)(x2)
    x2 = tf.keras.layers.BatchNormalization()(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    x2 = tf.keras.layers.Dense(NUM_LANG)(x2)
    lang_output = tf.keras.layers.Activation('softmax', name="lang_output")(x2)
    
    model = Model([input_att_flags, input_sequences],
                  [toxic_output, lang_output])
    
    return model

In [30]:
model = build_model()

In [31]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words (InputLayer)              [(None, 75)]         0                                            
__________________________________________________________________________________________________
att_flags (InputLayer)          [(None, 75)]         0                                            
__________________________________________________________________________________________________
tfxlm_roberta_model (TFXLMRober ((None, 75, 768), (N 278043648   words[0][0]                      
__________________________________________________________________________________________________
dropout_38 (Dropout)            (None, 75, 768)      0           tfxlm_roberta_model[0][0]        
______________________________________________________________________________________________

In [32]:
def print_metrics(pred_dict):
    print("[INFO] ","="*15,"Validation for FOLD#", num, "="*15)
    funcs = [accuracy_score, f1_score, precision_score, recall_score, confusion_matrix]
    for f in funcs:
        for data_set in ["train","valid"]:
            for var in ["starts", "stops"]:
                if f in [accuracy_score]:
                    res = f(**pred_dict[data_set][var])
                    print("[INFO] {:.2f}".format(100 * res), "\t||", data_set, "\t||", var, "\t||", f.__name__)
                elif f in [confusion_matrix]:
                    res = f(**pred_dict[data_set][var], labels=np.arange(MAX_SEQ_LEN))
                    np.savetxt(X=res, fmt='%i', delimiter=",",
                               fname=RESULTS_DIR+"ConfusionMatrix_"+MODEL_PREFIX+"_"+data_set+"_"+var+".csv")
                    print("[INFO] \t||", data_set, "\t||", var, "\t||", f.__name__, "\t||", 
                          RESULTS_DIR+"ConfusionMatrix_"+MODEL_PREFIX+"_"+data_set+"_"+var+".csv")
                else:
                    res = f(**pred_dict[data_set][var], average="macro")
                    print("[INFO] {:.2f}".format(100 * res), "\t||", data_set, "\t||", var, "\t||", f.__name__)
        print("=======================================================================")

<a href="https://keras.io/guides/transfer_learning/#finetuning" target="_blank"><h2 id="finetuning">Fine-tuning</h2></a>
<p>Once your model has converged on the new data, you can try to unfreeze all or part of
 the base model and retrain the whole model end-to-end with a very low learning rate.</p>
 <p>This is an optional last step that can potentially give you incremental improvements.
 It could also potentially lead to quick overfitting -- keep that in mind.</p>
 <p>It is critical to only do this step <em>after</em> the model with frozen layers has been
trained to convergence. If you mix randomly-initialized trainable layers with
trainable layers that hold pre-trained features, the randomly-initialized layers will
cause very large gradient updates during training, which will destroy your pre-trained
 features.</p>
 <p>It's also critical to use a very low learning rate at this stage, because
you are training a much larger model than in the first round of training, on a dataset
 that is typically very small.
As a result, you are at risk of overfitting very quickly if you apply large weight
 updates. Here, you only want to readapt the pretrained weights in an incremental way.</p>

<a href="https://keras.io/guides/transfer_learning/#finetuning" target="_blank"><p><strong>Important note about <code>compile()</code> and <code>trainable</code></strong></p></a>
<p>Calling <code>compile()</code> on a model is meant to "freeze" the behavior of that model. This
 implies that the <code>trainable</code>
attribute values at the time the model is compiled should be preserved throughout the
 lifetime of that model,
until <code>compile</code> is called again. Hence, if you change any <code>trainable</code> value, make sure
 to call <code>compile()</code> again on your
model for your changes to be taken into account.</p>

In [33]:
class LossWeightAdjust(Callback):
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta
        
    def on_epoch_end(self, epoch, logs):
        losses = np.array([v for k,v in logs.items() if k in ['val_toxic_output_loss', 'val_lang_output_loss']], dtype=np.float64)
        
        total_loss = np.sum(losses)
        
        losses = (losses - 0.5*losses.min()) / (losses.max() - 0.5*losses.min())
        losses = losses/np.sum(losses)

        K.set_value(self.alpha, losses[0])
        K.set_value(self.beta, losses[1])
        
        logs['val_total_loss'] = total_loss
        
        print("\n Loss weights recalibrated to alpha = %s, beta = %s, gamma = %s, delta = %s " % (np.round(losses[0],2),
                                                                                                  np.round(losses[1],2)))
        print("Total Val Loss", np.round(total_loss,3))
        logger.info("Loss weights recalibrated to alpha = %s, beta = %s, gamma = %s, delta = %s " % (K.get_value(self.alpha),
                                                                                                     K.get_value(self.beta)))

###### What does the Loss Weight Adjust Callback do?

In [34]:
losses = np.array([2.7892, 2.7021])
losses = (losses - 0.5*losses.min()) / (losses.max() - 0.5*losses.min())
losses = losses/np.sum(losses)
losses

array([0.5156138, 0.4843862])

In [36]:
t_index, v_index = train_test_split(np.arange(X_tokens.shape[0]), shuffle=True, random_state=seeded_value)

In [37]:
tclr = tfa.optimizers.TriangularCyclicalLearningRate(
    initial_learning_rate=MIN_LR,
    maximal_learning_rate=MID_LR,
    step_size=STEP_SIZE*len(t_index)
)

In [38]:
num = 0
alpha = K.variable(0.25)
beta = K.variable(0.25)

mcp = ModelCheckpoint(filepath=RESULTS_DIR+MODEL_PREFIX+"BestCheckpoint_"+str(num)+".h5", monitor='val_total_loss',
                      verbose=1, save_best_only=True, save_weights_only=True, mode='min', save_freq='epoch')

csvl = CSVLogger(filename=RESULTS_DIR+MODEL_PREFIX+"_LossLogs_"+str(num)+".csv",
                 separator=",", append=True)

In [39]:
print("[INFO] Training only the final layers at higher learning rates.")
model.layers[3].trainable = False
adam = Adam(learning_rate=MAX_LR)
model.compile(loss={"lang_output":tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=LABEL_SMOOTHING_PARAM),
                   "toxic_output":tf.keras.losses.BinaryCrossentropy()},
              optimizer=adam,
              metrics=['accuracy'],
              loss_weights={"lang_output":alpha,"toxic_output":beta})
np.random.shuffle(t_index); np.random.shuffle(v_index);
history = model.fit(x={"att_flags":X_att[t_index],
                       "words":X_tokens[t_index]},
                    y={"lang_output":Y_lang[t_index],
                       "toxic_output":Y_toxic[t_index]},
                    shuffle=False,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS[0],
                    validation_data=({"att_flags":X_att[v_index],
                                      "words":X_tokens[v_index]},
                                     {"lang_output":Y_lang[v_index],
                                      "toxic_output":Y_toxic[v_index]}),
                    verbose=1,
                    callbacks=[LossWeightAdjust(alpha=alpha, beta=beta), mcp, csvl])

[INFO] Training only the final layers at higher learning rates.
Train on 1500 samples, validate on 500 samples
Epoch 1/10
   8/1500 [..............................] - ETA: 2:02:27

ValueError: zero-size array to reduction operation minimum which has no identity

In [None]:
print("[INFO] Training only the final layers with a triangular cyclical learning rate policy.")
model.layers[3].trainable = False
adam = Adam(learning_rate=tclr)
model.compile(loss={"lang_output":tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=LABEL_SMOOTHING_PARAM),
                   "toxic_output":tf.keras.losses.BinaryCrossentropy()},
              optimizer=adam,
              metrics=['accuracy'],
              loss_weights={"lang_output":alpha,"toxic_output":beta})
np.random.shuffle(t_index); np.random.shuffle(v_index);
history = model.fit(x={"att_flags":X_att[t_index],
                       "words":X_tokens[t_index]},
                    y={"lang_output":Y_lang[t_index],
                       "toxic_output":Y_toxic[t_index]},
                    shuffle=False,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS[1],
                    validation_data=({"att_flags":X_att[v_index],
                                      "words":X_tokens[v_index]},
                                     {"lang_output":Y_lang[v_index],
                                      "toxic_output":Y_toxic[v_index]}),
                    verbose=1,
                    callbacks=[LossWeightAdjust(alpha=alpha, beta=beta), mcp, csvl])

In [None]:
print("[INFO] Unfreezing RoBerta layer and training at lowest learning rates.")
model.layers[3].trainable = True
adam = Adam(learning_rate=MIN_LR*0.25)
model.compile(loss={"lang_output":tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=LABEL_SMOOTHING_PARAM),
                   "toxic_output":tf.keras.losses.BinaryCrossentropy()},
              optimizer=adam,
              metrics=['accuracy'],
              loss_weights={"lang_output":alpha,"toxic_output":beta})
np.random.shuffle(t_index); np.random.shuffle(v_index);
history = model.fit(x={"att_flags":X_att[t_index],
                       "words":X_tokens[t_index]},
                    y={"lang_output":Y_lang[t_index],
                       "toxic_output":Y_toxic[t_index]},
                    shuffle=False,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS[2],
                    validation_data=({"att_flags":X_att[v_index],
                                      "words":X_tokens[v_index]},
                                     {"lang_output":Y_lang[v_index],
                                      "toxic_output":Y_toxic[v_index]}),
                    verbose=1,
                    callbacks=[LossWeightAdjust(alpha=alpha, beta=beta), mcp, csvl])

In [None]:
# Loading best weights per fold
model.load_weights(RESULTS_DIR+MODEL_PREFIX+"BestCheckpoint_"+str(num)+".h5")

pred_train = model.predict(x = {"att_flags":X_att[t_index],
                                "words":X_tokens[t_index]},
                           batch_size=PREDICT_BATCH_SIZE)

pred_val = model.predict(x = {"att_flags":X_att[v_index],
                              "words":X_tokens[v_index]},
                         batch_size=PREDICT_BATCH_SIZE)

In [None]:
# Accumulate test results after training every fold
pred_test_fold = model.predict(x = {"att_flags":X_att_test,
                                    "words":X_test},
                               batch_size=PREDICT_BATCH_SIZE)
if num==0:
    pred_test = []
    pred_test.append(pred_test_fold[0]/2.0 + pred_test_fold[2]/2.0)
    pred_test.append(pred_test_fold[1]/2.0 + pred_test_fold[3]/2.0)

# Tabulate
preds = {
    "train":{
        "Lang":{
            "y_true":Y_lang[t_index].argmax(axis=1),
            "y_pred":pred_train[0].argmax(axis=1)
        },
        "Toxic":{
            "y_true":Y_toxic[t_index].argmax(axis=1),
            "y_pred":pred_train[1].argmax(axis=1)
        }
    },
    "valid":{
        "Lang":{
            "y_true":Y_lang[v_index].argmax(axis=1),
            "y_pred":pred_val[0].argmax(axis=1)
        },
        "Toxic":{
            "y_true":Y_toxic[v_index].argmax(axis=1),
            "y_pred":pred_val[1].argmax(axis=1)
        }        
    }
}

In [None]:
print_metrics(pred_dict=preds)

print("[INFO] Prediction shape for training data: ", pred_starts_train.shape, pred_stops_train.shape)
print("[INFO] Prediction shape for validation data: ", pred_starts_val.shape, pred_stops_val.shape)

print("[INFO] Normal predictions (StartIndex less than EndIndex) for training data: ",
      sum([s<e for s,e in zip(pred_starts_train.argmax(axis=1),
                              pred_stops_train.argmax(axis=1))]),
      "out of", pred_starts_train.shape[0])
print("[INFO] Normal predictions (StartIndex less than EndIndex) for validation data: ",
      sum([s<e for s,e in zip(pred_starts_val.argmax(axis=1),
                              pred_stops_val.argmax(axis=1))]),
      "out of", pred_starts_val.shape[0],)

print("[INFO] Training Jaccard Score: ",
      np.mean([jaccard(str1=i, str2=j) for i,j in zip([t for n,t in enumerate(Y_words) if n in t_index],
                                                      pred_words_train)]))
print("[INFO] Validation Jaccard Score: ",
      np.mean([jaccard(str1=i, str2=j) for i,j in zip([t for n,t in enumerate(Y_words) if n in v_index],
                                                      pred_words_val)]))
print("[INFO] Training for fold:", num, "finished at", ctime(time()))

print(ctime(time()))

## Validation

#### Inference

In [None]:
pred_starts_test, pred_stops_test = pred_test[0]/NUM_FOLDS, pred_test[1]/NUM_FOLDS
print("[INFO] Prediction shape for testing data: ", pred_starts_test.shape, pred_stops_test.shape)

#### Postprocessing

In [None]:
print("Normal predictions (StartIndex less than EndIndex) for testing data:",
      sum([s<e for s,e in zip(pred_starts_test.argmax(axis=1),
                              pred_stops_test.argmax(axis=1))]), 
      "out of",
      pred_starts_test.shape[0])

In [None]:
pred_words_test = [
    post_process(tokenizer.decode(t[s:e+1])) if s<e else post_process(tokenizer.decode(t[e:])) for t,s,e in zip(X_span_test,
                                                                                                              pred_starts_test.argmax(axis=1),
                                                                                                              pred_stops_test.argmax(axis=1))
]

In [None]:
check_idx = 1111
#print([[t,i,j,k] for t,i,j,k in zip(tokenizer.decode(),X_test[check_idx],pred_starts_test[check_idx],pred_stops_test[check_idx])])
print(tokenizer.decode(X_span_test[check_idx]))
print(pred_starts_test.argmax(axis=1)[check_idx])
print(pred_stops_test.argmax(axis=1)[check_idx])
print(post_process(tokenizer.decode(X_span_test[check_idx][pred_starts_test.argmax(axis=1)[check_idx]:1+pred_stops_test.argmax(axis=1)[check_idx]])))

## Submission

In [None]:
test_df_span['selected_text'] = pred_words_test

In [None]:
test_df_span["selected_text"] = np.where(test_df_span["sentiment"] == "neutral",
                                         test_df_span["text"],
                                         test_df_span["selected_text"])

In [None]:
test_df_span[["textID", "selected_text"]].to_csv(RESULTS_DIR+"submission.csv", index=False)

In [None]:
test_df_span.loc[test_df_span.sentiment!="neutral"][["text", "sentiment","selected_text"]].sample(25)