## Settings

In [1]:
# CONTROLS
MODEL_PREFIX = "V37" # OnlySpanModelKF3 2X2Tasks LabelSmoothed with NeutralSamples and Better Preprocessing
MODEL_NUMBER = MODEL_PREFIX[-2:]

TRAIN_SPLIT_RATIO = 0.2
BATCH_SIZE = 16
PREDICT_BATCH_SIZE = 512
DROPOUT = 0.3
LABEL_SMOOTHING_PARAM = 0.2

RUN_ON_SAMPLE = False
EXCLUDE_NEUTRAL_CLASS = False
NUM_EPOCHS = [3, 3, 1]
NUM_FOLDS = 3
#LRs = [5e-3, 1e-4, 1e-6]
MAX_LR = 5e-3 #5e-4 #5e-3 #3e-5
MID_LR = 5e-4 #5e-5 #1e-4 #3e-5
MIN_LR = 1e-6 #5e-6 #1e-6 #3e-5

In [2]:
RESULTS_DIR = "../results/"
DATA_DIR = "../data/"
MODEL_DIR = "../data/models/roberta-base/"
EXT_MODEL_DIR = "../data/models/roberta-tokenizer/"

## Libraries

In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.utils import class_weight

import pickle, os, sys, re, json, gc
from time import time, ctime
from pprint import pprint

import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Conv1D, Conv2D, LSTM, Embedding, Dense, concatenate, MaxPooling2D, Softmax, Flatten
from tensorflow.keras.layers import BatchNormalization, Dropout, Reshape, Activation, Bidirectional, TimeDistributed
from tensorflow.keras.layers import RepeatVector, Multiply, Layer, LeakyReLU, Subtract
from tensorflow.keras.activations import softmax
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.callbacks import *
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint
import tensorflow.keras.backend as K
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.models import save_model, load_model

import tokenizers, transformers
from transformers import *

%matplotlib inline

In [4]:
def jaccard(str1, str2):
    a = set(str1)
    b = set(str2)
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [5]:
seeded_value = 753951
pd.set_option('display.max_colwidth', None)
np.random.seed(seeded_value)
tf.random.set_seed(seeded_value)

In [6]:
print(ctime(time()))

Sun Jun 14 15:06:13 2020


In [7]:
print([
    tf.__version__,
    transformers.__version__,
    tokenizers.__version__
])

['2.1.0', '2.8.0', '0.5.2']


In [8]:
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)
    
if not os.path.exists(MODEL_DIR+"tokenizers"):
    os.mkdir(MODEL_DIR+"tokenizers")

if not os.path.exists(MODEL_DIR+"tokenizers/roberta_tokenizer"):
    os.mkdir(MODEL_DIR+"tokenizers/roberta_tokenizer")

<a href="https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth"  target="_blank"><h2 id="limiting_gpu_memory_growth" data-text="Limiting GPU memory growth" tabindex="0">Limiting GPU memory growth</h2></a>
<p>By default, TensorFlow maps nearly all of the GPU memory of all GPUs (subject to
<a href="https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars"><code translate="no" dir="ltr">CUDA_VISIBLE_DEVICES</code></a>) visible to the process. This is done to more efficiently use the relatively precious GPU memory resources on the devices by reducing memory fragmentation. To limit TensorFlow to a specific set of GPUs we use the <code translate="no" dir="ltr">tf.config.experimental.set_visible_devices</code> method.</p>

In [9]:
print(tf.config.experimental.list_logical_devices('CPU'))
print(tf.config.experimental.list_logical_devices('GPU'))
print(tf.config.experimental.list_physical_devices('CPU'))
print(tf.config.experimental.list_physical_devices('GPU'))

[LogicalDevice(name='/device:CPU:0', device_type='CPU')]
[LogicalDevice(name='/device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [10]:
physical_devices = tf.config.list_physical_devices('GPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    # Invalid device or cannot modify virtual devices once initialized.
    pass

#### Tokenization

In [11]:
tokenizer = tokenizers.ByteLevelBPETokenizer(vocab_file=EXT_MODEL_DIR+'/vocab.json',
                                             merges_file=EXT_MODEL_DIR+'/merges.txt',                                         
                                             add_prefix_space=True,
                                             lowercase=True)

In [12]:
with open(EXT_MODEL_DIR+"/special_tokens_map.json") as f:
    special_tokens = json.load(f)

tokenizer.add_special_tokens([i for i in special_tokens.values()])

0

In [13]:
VOCAB_SIZE = tokenizer.get_vocab_size(); VOCAB_SIZE

50265

# Import data

In [14]:
df_span = pd.read_csv(DATA_DIR+"train.csv", encoding="utf8").fillna('')

print(pd.concat((df_span.dtypes, df_span.isna().sum()), axis=1))
print(df_span.shape)

# Counts of various columns
print({i:df_span[i].nunique() for i in df_span.columns})
print(df_span.describe())
df_span.head(2)

                    0  1
textID         object  0
text           object  0
selected_text  object  0
sentiment      object  0
(27481, 4)
{'textID': 27481, 'text': 27481, 'selected_text': 22464, 'sentiment': 3}
            textID                                             text  \
count        27481                                            27481   
unique       27481                                            27481   
top     a3283b0178  Just got up from a nap.. Relaxing for the night   
freq             1                                                1   

       selected_text sentiment  
count          27481     27481  
unique         22464         3  
top             good   neutral  
freq             199     11118  


Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [15]:
test_df_span = pd.read_csv(DATA_DIR+"test.csv", encoding="utf8").fillna('')
print(pd.concat((test_df_span.dtypes, test_df_span.isna().sum()), axis=1))
print(test_df_span.shape)

# Counts of various columns
print({i:test_df_span[i].nunique() for i in test_df_span.columns})
print(test_df_span.describe())
test_df_span.head(2)

                0  1
textID     object  0
text       object  0
sentiment  object  0
(3534, 3)
{'textID': 3534, 'text': 3534, 'sentiment': 3}
            textID                            text sentiment
count         3534                            3534      3534
unique        3534                            3534         3
top     c8b610923d   well thank your phone for me.   neutral
freq             1                               1      1430


Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -- skyscrapers galore). Good tweeps in China: (SH) (BJ).,positive


#### Preprocessing for span detection

In [16]:
df_span["text"] = df_span["text"].astype(str)
df_span["selected_text"] = df_span["selected_text"].astype(str)
test_df_span["text"] = test_df_span["text"].astype(str)

In [17]:
def trim_addspace(text:str) -> str:
    text = text.lower()
    text = " " + text.strip(" ") + " "
    return text

In [18]:
def find_indices(text:str, selected_text:str) -> (str, str, int, int):
    
    text, selected_text = text.lower(), selected_text.lower()
    
    text = trim_addspace(text)
    
    substring_ = re.findall(pattern="\\s[^\s]*?"+re.escape(selected_text)+"[^\s]*?\\s", string=text)[0]
    
    return pd.Series([text, " "+substring_.strip(" "), text.find(substring_), len(substring_) + text.find(substring_)])

In [19]:
df_span[["text_mod", "selected_text_mod", "start", "stop"]] = df_span[['text','selected_text']].apply(lambda x: find_indices(x.text, x.selected_text), axis=1)

In [20]:
df_span.iloc[27476].to_dict()

{'textID': '4eac33d1c0',
 'text': ' wish we could come see u on Denver  husband lost his job and can`t afford it',
 'selected_text': 'd lost',
 'sentiment': 'negative',
 'text_mod': ' wish we could come see u on denver  husband lost his job and can`t afford it ',
 'selected_text_mod': ' husband lost',
 'start': 36,
 'stop': 50}

In [21]:
test_df_span['text_mod'] = test_df_span['text'].apply(trim_addspace)

In [22]:
df_span.loc[df_span.text_mod.str.contains("tonight, but no one will go")].to_dict()

{'textID': {12154: 'adfbcc6806'},
 'text': {12154: 'i wanna see `up` tonight, but no one will go with me. whhhyyy'},
 'selected_text': {12154: 'but no one will go with me.'},
 'sentiment': {12154: 'negative'},
 'text_mod': {12154: ' i wanna see `up` tonight, but no one will go with me. whhhyyy '},
 'selected_text_mod': {12154: ' but no one will go with me.'},
 'start': {12154: 26},
 'stop': {12154: 55}}

#### Cleaning for span detection

#### Adding special tokens

In [23]:
{t:tokenizer.encode(" "+t).ids for t in df_span.sentiment.unique()}

{'neutral': [7974], 'negative': [2430], 'positive': [1313]}

In [24]:
df_span["text_mod"] = "<s>" + df_span['text_mod'] + "</s> </s> " + df_span.sentiment + " </s>"
test_df_span["text_mod"] = "<s>" + test_df_span['text_mod'] + "</s> </s> " + test_df_span.sentiment + " </s>"

#### Exclusions for span detection

In [25]:
if EXCLUDE_NEUTRAL_CLASS:
    df_span = df_span.loc[df_span.sentiment!="neutral"].copy()
    df_span = df_span.reset_index(drop=True)
    print("EXCLUDE_NEUTRAL_CLASS:", df_span.shape)


if RUN_ON_SAMPLE:
    df_span = df_span.sample(2000).copy()
    df_span = df_span.reset_index(drop=True)
    print("Train RUN_ON_SAMPLE", df_span.shape)
    test_df_span = test_df_span.sample(2000).copy()
    test_df_span = test_df_span.reset_index(drop=True)
    print("Test  RUN_ON_SAMPLE", test_df_span.shape)

#### Tokenization for span detection

In [26]:
X_span_tokens = tokenizer.encode_batch(df_span.text_mod.tolist())
Y_span_tokens = tokenizer.encode_batch(df_span.selected_text_mod.tolist())
X_span_tokens_test = tokenizer.encode_batch(test_df_span.text_mod.tolist())

In [27]:
X_span = [i.ids for i in X_span_tokens]
Y_span = [i.ids for i in Y_span_tokens]
X_span_test = [i.ids for i in X_span_tokens_test]

In [28]:
X_span_att = [i.attention_mask for i in X_span_tokens]
Y_span_att = [i.attention_mask for i in Y_span_tokens] # Useless
X_span_att_test = [i.attention_mask for i in X_span_tokens_test]

In [29]:
MAX_SEQ_LEN_SPAN = max([len(i) for i in X_span])

In [30]:
def get_extremities(l_string, s_string, print_it=False):
    len_l = len(l_string)
    len_s = len(s_string)
    
    for i in range(len_l - len_s + 1):
        if (i + len_s) <= len_l:
            substring = l_string[i:i+len_s]
            if substring == s_string:
                if print_it:
                    print(l_string)
                    print(substring)
                    print(i, i+len_s, substring)
                
                start_vector, end_vector = [0] * len_l, [0] * len_l
                start_vector[i], end_vector[i+len_s-1] = 1, 1
                
                return (start_vector, end_vector)

In [31]:
Y_span_starts, Y_span_stops = [], []
anomaly_idx, counter = [], 0
for num, (i,j) in enumerate(zip(X_span_tokens, Y_span_tokens)):
    x,y = i.ids, j.ids
    try:
        s,e = get_extremities(x, y)
        Y_span_starts.append(s)
        Y_span_stops.append(e)
    except TypeError as t:
        counter += 1
        anomaly_idx.append(num)
        Y_span_starts.append([0]*15)
        Y_span_stops.append([0]*15)
print(num + 1, "\t: #Processed")

print(counter,"\t: # of Anomalies")

27481 	: #Processed
2 	: # of Anomalies


In [32]:
check_idx = 1572
print(df_span.text[check_idx])
print(df_span.selected_text[check_idx])
print([[i,j,k,l] for i,j,k,l in zip(X_span_tokens[check_idx].tokens,
                                    X_span_tokens[check_idx].ids,
                                    Y_span_starts[check_idx],
                                    Y_span_stops[check_idx])])
print([[i,j] for i,j in zip(Y_span_tokens[check_idx].ids,
                            Y_span_tokens[check_idx].tokens)])

 Hello, yourself. Enjoy London. Watch out for the Hackneys. They`re mental.
They`re mental.
[['<s>', 0, 0, 0], ['Ġhello', 20760, 0, 0], [',', 6, 0, 0], ['Ġyourself', 2512, 0, 0], ['.', 4, 0, 0], ['Ġenjoy', 2254, 0, 0], ['Ġl', 784, 0, 0], ['ondon', 24639, 0, 0], ['.', 4, 0, 0], ['Ġwatch', 1183, 0, 0], ['Ġout', 66, 0, 0], ['Ġfor', 13, 0, 0], ['Ġthe', 5, 0, 0], ['Ġhack', 14157, 0, 0], ['neys', 30915, 0, 0], ['.', 4, 0, 0], ['Ġthey', 51, 1, 0], ['`', 12905, 0, 0], ['re', 241, 0, 0], ['Ġmental', 2536, 0, 0], ['.', 4, 0, 1], ['Ġ', 1437, 0, 0], ['</s>', 2, 0, 0], ['Ġ', 1437, 0, 0], ['</s>', 2, 0, 0], ['Ġnegative', 2430, 0, 0], ['Ġ', 1437, 0, 0], ['</s>', 2, 0, 0]]
[[51, 'Ġthey'], [12905, '`'], [241, 're'], [2536, 'Ġmental'], [4, '.']]


#### Padding for span detection

In [33]:
X_span = pad_sequences(X_span, maxlen=MAX_SEQ_LEN_SPAN, padding="post")
X_span_att = pad_sequences(X_span_att, maxlen=MAX_SEQ_LEN_SPAN, padding="post")
Y_span = pad_sequences(Y_span, maxlen=MAX_SEQ_LEN_SPAN, padding="post")

Y_span_starts = pad_sequences(Y_span_starts, maxlen=MAX_SEQ_LEN_SPAN, padding="post")#.argmax(axis=1)
Y_span_stops = pad_sequences(Y_span_stops, maxlen=MAX_SEQ_LEN_SPAN, padding="post")#.argmax(axis=1)

X_span_test = pad_sequences(X_span_test, maxlen=MAX_SEQ_LEN_SPAN, padding="post")
X_span_att_test = pad_sequences(X_span_att_test, maxlen=MAX_SEQ_LEN_SPAN, padding="post")

In [34]:
pprint({
    "X_span" : X_span.shape,
    "X_span_att" : X_span_att.shape,
    "Y_span" : Y_span.shape,
    "Y_span_starts" : Y_span_starts.shape,
    "Y_span_stops" : Y_span_stops.shape,
    "X_span_test" : X_span_test.shape,
    "X_span_att_test" : X_span_att_test.shape,
    "VOCAB_SIZE":VOCAB_SIZE,
    "MAX_SEQ_LEN_SPAN":MAX_SEQ_LEN_SPAN
})

{'MAX_SEQ_LEN_SPAN': 108,
 'VOCAB_SIZE': 50265,
 'X_span': (27481, 108),
 'X_span_att': (27481, 108),
 'X_span_att_test': (3534, 108),
 'X_span_test': (3534, 108),
 'Y_span': (27481, 108),
 'Y_span_starts': (27481, 108),
 'Y_span_stops': (27481, 108)}


#### Cross validation for span detection

In [35]:
keep_flag = np.isin(Y_span_stops.argmax(axis=1),
                    np.unique(Y_span_stops.argmax(axis=1),
                              return_counts=True)[0][np.unique(Y_span_stops.argmax(axis=1),
                                                               return_counts=True)[1]>1])

In [36]:
sum(keep_flag), df_span.shape[0], df_span.shape[0] - sum(keep_flag)

(27475, 27481, 6)

In [37]:
print("\n",
     X_span.shape, "\t: X ", "\n",
     X_span_att.shape, "\t: X_att ", "\n",
     Y_span.shape, "\t: Y ", "\n",
     Y_span_starts.shape, "\t: Y_starts ", "\n",
     Y_span_stops.shape, "\t: Y_stops ", "\n",
     X_span_test.shape, "\t: X_test ", "\n",
     X_span_att_test.shape, "\t: X_att_test ", "\n"
)


 (27481, 108) 	: X  
 (27481, 108) 	: X_att  
 (27481, 108) 	: Y  
 (27481, 108) 	: Y_starts  
 (27481, 108) 	: Y_stops  
 (3534, 108) 	: X_test  
 (3534, 108) 	: X_att_test  



In [38]:
X_span = X_span[keep_flag]
X_span_att = X_span_att[keep_flag]
Y_span = Y_span[keep_flag]
Y_span_starts = Y_span_starts[keep_flag]
Y_span_stops = Y_span_stops[keep_flag]
X_span_test = X_span_test
X_span_att_test = X_span_att_test

In [39]:
print("\n",
     X_span.shape, "\t: X ", "\n",
     X_span_att.shape, "\t: X_att ", "\n",
     Y_span.shape, "\t: Y ", "\n",
     Y_span_starts.shape, "\t: Y_starts ", "\n",
     Y_span_stops.shape, "\t: Y_stops ", "\n",
     X_span_test.shape, "\t: X_test ", "\n",
     X_span_att_test.shape, "\t: X_att_test ", "\n"
)


 (27475, 108) 	: X  
 (27475, 108) 	: X_att  
 (27475, 108) 	: Y  
 (27475, 108) 	: Y_starts  
 (27475, 108) 	: Y_stops  
 (3534, 108) 	: X_test  
 (3534, 108) 	: X_att_test  



In [41]:
Y_span_words = [tokenizer.decode(i) for i in Y_span]

In [42]:
MAX_SEQ_LEN =  MAX_SEQ_LEN_SPAN
MAX_SEQ_LEN, MAX_SEQ_LEN_SPAN

(108, 108)

In [43]:
X_span = pad_sequences(X_span, maxlen=MAX_SEQ_LEN, padding="post")
X_span_att = pad_sequences(X_span_att, maxlen=MAX_SEQ_LEN, padding="post")
Y_span = pad_sequences(Y_span, maxlen=MAX_SEQ_LEN, padding="post")
Y_span_starts = pad_sequences(Y_span_starts, maxlen=MAX_SEQ_LEN, padding="post")
Y_span_stops = pad_sequences(Y_span_stops, maxlen=MAX_SEQ_LEN, padding="post")

X_span_test = pad_sequences(X_span_test, maxlen=MAX_SEQ_LEN, padding="post")
X_span_att_test = pad_sequences(X_span_att_test, maxlen=MAX_SEQ_LEN, padding="post")

pprint({
    "X_span" : X_span.shape,
    "X_span_att" : X_span_att.shape,
    "Y_span" : Y_span.shape,
    
    "X_span_test" : X_span_test.shape,
    "X_span_att_test" : X_span_att_test.shape,
})

{'X_span': (27475, 108),
 'X_span_att': (27475, 108),
 'X_span_att_test': (3534, 108),
 'X_span_test': (3534, 108),
 'Y_span': (27475, 108)}


#### Model Specifications

In [44]:
def build_model():
    input_sequences = Input((MAX_SEQ_LEN), dtype=tf.int32, name="words")
    input_att_flags = Input((MAX_SEQ_LEN), dtype=tf.int32, name="att_flags")
    input_token_ids = Input((MAX_SEQ_LEN), dtype=tf.int32, name="token_ids")
    
    config = RobertaConfig.from_pretrained(MODEL_DIR+'config.json')
    roberta_model = TFRobertaModel.from_pretrained(MODEL_DIR+'tf_model.h5', config=config)
    x = roberta_model(inputs=input_sequences, attention_mask=input_att_flags, token_type_ids=input_token_ids)
    
    x1 = tf.keras.layers.Dropout(DROPOUT)(x[0])
    x1 = tf.keras.layers.Conv1D(768, 2,padding='same')(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Dense(1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    output_starts_0 = tf.keras.layers.Activation('softmax', name="starts_0")(x1)
    
    x2 = tf.keras.layers.Dropout(DROPOUT)(x[0]) 
    x2 = tf.keras.layers.Conv1D(768, 2,padding='same')(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Dense(1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    output_stops_0 = tf.keras.layers.Activation('softmax', name="stops_0")(x2)
    
    output_subtract = tf.keras.layers.Subtract()([output_starts_0, output_stops_0])
    output_flat = concatenate([output_starts_0, output_stops_0, output_subtract])
    output_starts_1 = Dense(MAX_SEQ_LEN, activation='softmax', name="starts_1")(output_flat)
    output_stops_1 = Dense(MAX_SEQ_LEN, activation='softmax', name="stops_1")(output_flat)
    
    span_detection_model = Model([input_att_flags, input_sequences, input_token_ids],
                                 [output_starts_0, output_stops_0, output_starts_1, output_stops_1])
    
    return span_detection_model

In [45]:
span_detection_model = build_model()

In [46]:
span_detection_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words (InputLayer)              [(None, 108)]        0                                            
__________________________________________________________________________________________________
att_flags (InputLayer)          [(None, 108)]        0                                            
__________________________________________________________________________________________________
token_ids (InputLayer)          [(None, 108)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 108, 768), ( 124645632   words[0][0]                      
______________________________________________________________________________________________

### Span Detection Model Fit

In [47]:
for i in range(NUM_FOLDS):
    if os.path.exists(RESULTS_DIR+MODEL_PREFIX+"_LossLogs_"+str(i)+".csv"):
        os.remove(RESULTS_DIR+MODEL_PREFIX+"_LossLogs_"+str(i)+".csv")

In [48]:
def print_metrics(pred_dict):
    print("[INFO] ","="*15,"Validation for FOLD#", num, "="*15)
    funcs = [accuracy_score, f1_score, precision_score, recall_score, confusion_matrix]
    for f in funcs:
        for data_set in ["train","valid"]:
            for var in ["starts", "stops"]:
                if f in [accuracy_score]:
                    res = f(**pred_dict[data_set][var])
                    print("[INFO] {:.2f}".format(100 * res), "\t||", data_set, "\t||", var, "\t||", f.__name__)
                elif f in [confusion_matrix]:
                    res = f(**pred_dict[data_set][var], labels=np.arange(MAX_SEQ_LEN))
                    np.savetxt(X=res, fmt='%i', delimiter=",",
                               fname=RESULTS_DIR+"ConfusionMatrix_"+MODEL_PREFIX+"_"+data_set+"_"+var+".csv")
                    print("[INFO] \t||", data_set, "\t||", var, "\t||", f.__name__, "\t||", 
                          RESULTS_DIR+"ConfusionMatrix_"+MODEL_PREFIX+"_"+data_set+"_"+var+".csv")
                else:
                    res = f(**pred_dict[data_set][var], average="macro")
                    print("[INFO] {:.2f}".format(100 * res), "\t||", data_set, "\t||", var, "\t||", f.__name__)
        print("=======================================================================")

In [49]:
def post_process(string):
    string = re.sub(pattern=" (negative|positive|neutral)[ ]+$", repl="", string=string)
    string = re.sub(pattern=" +", repl=" ", string=string.strip())
    return string

<a href="https://keras.io/guides/transfer_learning/#finetuning" target="_blank"><h2 id="finetuning">Fine-tuning</h2></a>
<p>Once your model has converged on the new data, you can try to unfreeze all or part of
 the base model and retrain the whole model end-to-end with a very low learning rate.</p>
 <p>This is an optional last step that can potentially give you incremental improvements.
 It could also potentially lead to quick overfitting -- keep that in mind.</p>
 <p>It is critical to only do this step <em>after</em> the model with frozen layers has been
trained to convergence. If you mix randomly-initialized trainable layers with
trainable layers that hold pre-trained features, the randomly-initialized layers will
cause very large gradient updates during training, which will destroy your pre-trained
 features.</p>
 <p>It's also critical to use a very low learning rate at this stage, because
you are training a much larger model than in the first round of training, on a dataset
 that is typically very small.
As a result, you are at risk of overfitting very quickly if you apply large weight
 updates. Here, you only want to readapt the pretrained weights in an incremental way.</p>

<a href="https://keras.io/guides/transfer_learning/#finetuning" target="_blank"><p><strong>Important note about <code>compile()</code> and <code>trainable</code></strong></p></a>
<p>Calling <code>compile()</code> on a model is meant to "freeze" the behavior of that model. This
 implies that the <code>trainable</code>
attribute values at the time the model is compiled should be preserved throughout the
 lifetime of that model,
until <code>compile</code> is called again. Hence, if you change any <code>trainable</code> value, make sure
 to call <code>compile()</code> again on your
model for your changes to be taken into account.</p>

In [50]:
class LossWeightAdjust(Callback):
    def __init__(self, alpha, beta, gamma, delta):
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.delta = delta
    
    # customize your behavior
    def on_epoch_end(self, epoch, logs):
        losses = np.array([v for k,v in logs.items() if k in ['val_starts_0_loss', 'val_stops_0_loss', 'val_starts_1_loss', 'val_stops_1_loss']], dtype=np.float64)
        losses = (losses - 0.5*losses.min()) / (losses.max() - 0.5*losses.min())
        losses = losses/np.sum(losses)

        K.set_value(self.alpha, losses[0])
        K.set_value(self.beta, losses[1])
        K.set_value(self.gamma, losses[2])
        K.set_value(self.delta, losses[3])
        
        print("\n Loss weights recalibrated to alpha = %s, beta = %s, gamma = %s, delta = %s " % (np.round(losses[0],2),
                                                                                                  np.round(losses[1],2),
                                                                                                  np.round(losses[2],2),
                                                                                                  np.round(losses[3],2)))
        
        logger.info("Loss weights recalibrated to alpha = %s, beta = %s, gamma = %s, delta = %s " % (K.get_value(self.alpha), K.get_value(self.beta), K.get_value(self.gamma), K.get_value(self.delta)))
        #logger.info("epoch %s, alpha = %s, beta = %s, gamma = %s, delta = %s" % (epoch, K.get_value(self.alpha), K.get_value(self.beta), K.get_value(self.gamma), K.get_value(self.delta)))

###### What does the Loss Weight Adjust Callback do?

In [51]:
losses = np.array([2.7892, 2.7021, 4.1144, 4.1274])
losses = (losses - 0.5*losses.min()) / (losses.max() - 0.5*losses.min())
losses = losses/np.sum(losses)
losses

array([0.17266986, 0.1622123 , 0.33177851, 0.33333934])

In [40]:
span_kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=seeded_value)

In [52]:
for num, (t_index, v_index) in enumerate(span_kf.split(X_span, Y_span_stops)):
    print("[INFO] ==================== FOLD#", num, "====================")
    
    if num > 0:
        del span_history
        del span_detection_model
        gc.collect()
        K.clear_session()
    
    span_detection_model = build_model()
    
    alpha = K.variable(0.25)
    beta = K.variable(0.25)
    gamma = K.variable(0.25)
    delta = K.variable(0.75)
    
    span_mcp = ModelCheckpoint(filepath=RESULTS_DIR+MODEL_PREFIX+"BestCheckpoint_"+str(num)+".h5", monitor='val_loss',
                               verbose=0, save_best_only=False, save_weights_only=True, mode='auto', save_freq='epoch')
    
    span_csvl = CSVLogger(filename=RESULTS_DIR+MODEL_PREFIX+"_LossLogs_"+str(num)+".csv",
                          separator=",",
                          append=True)
    
    print("[INFO] Training only the final layers at higher learning rates.")
    span_detection_model.layers[3].trainable = False
    adam = Adam(learning_rate=MAX_LR)
    span_detection_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=LABEL_SMOOTHING_PARAM),
                                 optimizer=adam,
                                 metrics=['accuracy'],
                                 loss_weights={"starts_0":alpha,"stops_0":beta,"starts_1":gamma,"stops_1":delta})
    span_history = span_detection_model.fit(x={"att_flags":X_span_att[t_index],
                                               "words":X_span[t_index],
                                               "token_ids":np.zeros_like(X_span_att[t_index])},
                                            y={"starts_0":Y_span_starts[t_index],
                                               "stops_0":Y_span_stops[t_index], 
                                               "starts_1":Y_span_starts[t_index],
                                               "stops_1":Y_span_stops[t_index]},
                                            shuffle=True,
                                            batch_size=BATCH_SIZE,
                                            epochs=NUM_EPOCHS[0],
                                            validation_data=({"att_flags":X_span_att[v_index],
                                                              "words":X_span[v_index],
                                                              "token_ids":np.zeros_like(X_span_att[v_index])},
                                                             {"starts_0":Y_span_starts[v_index],
                                                              "stops_0":Y_span_stops[v_index], 
                                                              "starts_1":Y_span_starts[v_index],
                                                              "stops_1":Y_span_stops[v_index]}),
                                            verbose=1,
                                            callbacks=[span_mcp, span_csvl, LossWeightAdjust(alpha=alpha, beta=beta, gamma=gamma, delta=delta)])
    
    print("[INFO] Training only the final layers at lower learning rates.")
    adam = Adam(learning_rate=MID_LR)
    span_detection_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=LABEL_SMOOTHING_PARAM),
                                 optimizer=adam,
                                 metrics=['accuracy'],
                                 loss_weights={"starts_0":alpha,"stops_0":beta,"starts_1":gamma,"stops_1":delta})
    span_history = span_detection_model.fit(x={"att_flags":X_span_att[t_index],
                                               "words":X_span[t_index],
                                               "token_ids":np.zeros_like(X_span_att[t_index])},
                                            y={"starts_0":Y_span_starts[t_index],
                                               "stops_0":Y_span_stops[t_index], 
                                               "starts_1":Y_span_starts[t_index],
                                               "stops_1":Y_span_stops[t_index]},
                                            shuffle=True,
                                            batch_size=BATCH_SIZE,
                                            epochs=NUM_EPOCHS[1],
                                            validation_data=({"att_flags":X_span_att[v_index],
                                                              "words":X_span[v_index],
                                                              "token_ids":np.zeros_like(X_span_att[v_index])},
                                                             {"starts_0":Y_span_starts[v_index],
                                                              "stops_0":Y_span_stops[v_index], 
                                                              "starts_1":Y_span_starts[v_index],
                                                              "stops_1":Y_span_stops[v_index]}),
                                            verbose=1,
                                            callbacks=[span_mcp, span_csvl, LossWeightAdjust(alpha=alpha, beta=beta, gamma=gamma, delta=delta)])
    
    print("[INFO] Unfreezing RoBerta layer and training at lowest learning rates.")
    span_detection_model.layers[3].trainable = True
    adam = Adam(learning_rate=MIN_LR)
    span_detection_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=LABEL_SMOOTHING_PARAM),
                                 optimizer=adam,
                                 metrics=['accuracy'],
                                 loss_weights={"starts_0":alpha,"stops_0":beta,"starts_1":gamma,"stops_1":delta})
    span_history = span_detection_model.fit(x={"att_flags":X_span_att[t_index],
                                               "words":X_span[t_index],
                                               "token_ids":np.zeros_like(X_span_att[t_index])},
                                            y={"starts_0":Y_span_starts[t_index],
                                               "stops_0":Y_span_stops[t_index], 
                                               "starts_1":Y_span_starts[t_index],
                                               "stops_1":Y_span_stops[t_index]},
                                            shuffle=True,
                                            batch_size=BATCH_SIZE,
                                            epochs=NUM_EPOCHS[2],
                                            validation_data=({"att_flags":X_span_att[v_index],
                                                              "words":X_span[v_index],
                                                              "token_ids":np.zeros_like(X_span_att[v_index])},
                                                             {"starts_0":Y_span_starts[v_index],
                                                              "stops_0":Y_span_stops[v_index], 
                                                              "starts_1":Y_span_starts[v_index],
                                                              "stops_1":Y_span_stops[v_index]}),
                                            verbose=1,
                                            callbacks=[span_mcp, span_csvl, LossWeightAdjust(alpha=alpha, beta=beta, gamma=gamma, delta=delta)])
    
    # Loading best weights per fold
    span_detection_model.load_weights(RESULTS_DIR+MODEL_PREFIX+"BestCheckpoint_"+str(num)+".h5")
    
    pred_train = span_detection_model.predict(x = {"att_flags":X_span_att[t_index],
                                                   "words":X_span[t_index],
                                                   "token_ids":np.zeros_like(X_span_att[t_index])},
                                              batch_size=PREDICT_BATCH_SIZE)

    pred_val = span_detection_model.predict(x = {"att_flags":X_span_att[v_index],
                                                 "words":X_span[v_index],
                                                 "token_ids":np.zeros_like(X_span_att[v_index])},
                                            batch_size=PREDICT_BATCH_SIZE)
    
    pred_starts_train, pred_stops_train = (pred_train[0]/2.0 + pred_train[2]/2.0), (pred_train[1]/2.0 + pred_train[3]/2.0)
    pred_starts_val, pred_stops_val = (pred_val[0]/2.0 + pred_val[2]/2.0), (pred_val[1]/2.0 + pred_val[3]/2.0)
    
    # Accumulate test results after training every fold
    pred_test_fold = span_detection_model.predict(x = {"att_flags":X_span_att_test,
                                                       "words":X_span_test,
                                                       "token_ids":np.zeros_like(X_span_att_test)},
                                                  batch_size=PREDICT_BATCH_SIZE)
    if num==0:
        pred_test = []
        pred_test.append(pred_test_fold[0]/2.0 + pred_test_fold[2]/2.0)
        pred_test.append(pred_test_fold[1]/2.0 + pred_test_fold[3]/2.0)
    else:
        pred_test[0] += (pred_test_fold[0]/2.0 + pred_test_fold[2]/2.0)
        pred_test[1] += (pred_test_fold[1]/2.0 + pred_test_fold[3]/2.0)
    
    # Tabulate
    preds = {
        "train":{
            "starts":{
                "y_true":Y_span_starts[t_index].argmax(axis=1),
                "y_pred":pred_train[0].argmax(axis=1)
            },
            "stops":{
                "y_true":Y_span_stops[t_index].argmax(axis=1),
                "y_pred":pred_train[1].argmax(axis=1)
            }
        },
        "valid":{
            "starts":{
                "y_true":Y_span_starts[v_index].argmax(axis=1),
                "y_pred":pred_val[0].argmax(axis=1)
            },
            "stops":{
                "y_true":Y_span_stops[v_index].argmax(axis=1),
                "y_pred":pred_val[1].argmax(axis=1)
            }        
        }
    }
    
    print_metrics(pred_dict=preds)

    print("[INFO] Prediction shape for training data: ", pred_starts_train.shape, pred_stops_train.shape)
    print("[INFO] Prediction shape for validation data: ", pred_starts_val.shape, pred_stops_val.shape)

    print("[INFO] Normal predictions (StartIndex less than EndIndex) for training data: ",
          sum([s<e for s,e in zip(pred_starts_train.argmax(axis=1),
                                  pred_stops_train.argmax(axis=1))]),
          "out of", pred_starts_train.shape[0])
    print("[INFO] Normal predictions (StartIndex less than EndIndex) for validation data: ",
          sum([s<e for s,e in zip(pred_starts_val.argmax(axis=1),
                                  pred_stops_val.argmax(axis=1))]),
          "out of", pred_starts_val.shape[0],)

    pred_words_train = [post_process(tokenizer.decode(t[s:e])) if s<e else post_process(tokenizer.decode(t[e:])) for t, s,e in zip(X_span[t_index],
                                                                                                                                   pred_starts_train.argmax(axis=1),
                                                                                                                                   pred_stops_train.argmax(axis=1))]
    
    pred_words_val = [post_process(tokenizer.decode(t[s:e])) if s<e else post_process(tokenizer.decode(t[e:])) for t, s,e in zip(X_span[v_index],
                                                                                                                                 pred_starts_val.argmax(axis=1),
                                                                                                                                 pred_stops_val.argmax(axis=1))]

    print("[INFO] Training Jaccard Score: ",
          np.mean([jaccard(str1=i, str2=j) for i,j in zip([t for n,t in enumerate(Y_span_words) if n in t_index],
                                                          pred_words_train)]))
    print("[INFO] Validation Jaccard Score: ",
          np.mean([jaccard(str1=i, str2=j) for i,j in zip([t for n,t in enumerate(Y_span_words) if n in v_index],
                                                          pred_words_val)]))
    print("[INFO] Training for fold:", num, "finished at", ctime(time()))

print(ctime(time()))

[INFO] Training only the final layers at higher learning rates.
Train on 18316 samples, validate on 9159 samples
Epoch 1/3
 Loss weights recalibrated to alpha = 0.21, beta = 0.32, gamma = 0.23, delta = 0.23 
Epoch 2/3
 Loss weights recalibrated to alpha = 0.25, beta = 0.23, gamma = 0.26, delta = 0.26 
Epoch 3/3
 Loss weights recalibrated to alpha = 0.24, beta = 0.25, gamma = 0.25, delta = 0.26 
[INFO] Training only the final layers at lower learning rates.
Train on 18316 samples, validate on 9159 samples
Epoch 1/3
 Loss weights recalibrated to alpha = 0.24, beta = 0.25, gamma = 0.25, delta = 0.26 
Epoch 2/3
 Loss weights recalibrated to alpha = 0.24, beta = 0.24, gamma = 0.25, delta = 0.26 
Epoch 3/3
 Loss weights recalibrated to alpha = 0.25, beta = 0.23, gamma = 0.26, delta = 0.26 
[INFO] Unfreezing RoBerta layer and training at lowest learning rates.
Train on 18316 samples, validate on 9159 samples
 Loss weights recalibrated to alpha = 0.25, beta = 0.24, gamma = 0.27, delta = 0.25 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[INFO] 	|| train 	|| starts 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V37_train_starts.csv
[INFO] 	|| train 	|| stops 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V37_train_stops.csv
[INFO] 	|| valid 	|| starts 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V37_valid_starts.csv
[INFO] 	|| valid 	|| stops 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V37_valid_stops.csv
[INFO] Prediction shape for training data:  (18316, 108) (18316, 108)
[INFO] Prediction shape for validation data:  (9159, 108) (9159, 108)
[INFO] Normal predictions (StartIndex less than EndIndex) for training data:  16866 out of 18316
[INFO] Normal predictions (StartIndex less than EndIndex) for validation data:  8462 out of 9159
[INFO] Training Jaccard Score:  0.681903762764903
[INFO] Validation Jaccard Score:  0.6784706172994983
[INFO] Training for fold: 0 finished at Sun Jun 14 15:47:11 2020
[INFO] Training only the final layers at higher learning rates.
Train on 18317 samples, validate o

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[INFO] 	|| train 	|| starts 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V37_train_starts.csv
[INFO] 	|| train 	|| stops 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V37_train_stops.csv
[INFO] 	|| valid 	|| starts 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V37_valid_starts.csv
[INFO] 	|| valid 	|| stops 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V37_valid_stops.csv
[INFO] Prediction shape for training data:  (18317, 108) (18317, 108)
[INFO] Prediction shape for validation data:  (9158, 108) (9158, 108)
[INFO] Normal predictions (StartIndex less than EndIndex) for training data:  16789 out of 18317
[INFO] Normal predictions (StartIndex less than EndIndex) for validation data:  8397 out of 9158
[INFO] Training Jaccard Score:  0.6762098000955359
[INFO] Validation Jaccard Score:  0.6750362260146804
[INFO] Training for fold: 1 finished at Sun Jun 14 16:27:48 2020
[INFO] Training only the final layers at higher learning rates.
Train on 18317 samples, validate 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


 stops 	|| recall_score
[INFO] 	|| train 	|| starts 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V37_train_starts.csv
[INFO] 	|| train 	|| stops 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V37_train_stops.csv
[INFO] 	|| valid 	|| starts 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V37_valid_starts.csv
[INFO] 	|| valid 	|| stops 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V37_valid_stops.csv
[INFO] Prediction shape for training data:  (18317, 108) (18317, 108)
[INFO] Prediction shape for validation data:  (9158, 108) (9158, 108)
[INFO] Normal predictions (StartIndex less than EndIndex) for training data:  17062 out of 18317
[INFO] Normal predictions (StartIndex less than EndIndex) for validation data:  8565 out of 9158
[INFO] Training Jaccard Score:  0.6806910109882429
[INFO] Validation Jaccard Score:  0.6743926937201156
[INFO] Training for fold: 2 finished at Sun Jun 14 17:08:30 2020
Sun Jun 14 17:08:30 2020


## Validation

#### Inference

In [53]:
pred_starts_test, pred_stops_test = pred_test[0]/NUM_FOLDS, pred_test[1]/NUM_FOLDS
print("[INFO] Prediction shape for testing data: ", pred_starts_test.shape, pred_stops_test.shape)

[INFO] Prediction shape for testing data:  (3534, 108) (3534, 108)


#### Postprocessing

In [54]:
print("Normal predictions (StartIndex less than EndIndex) for testing data:",
      sum([s<e for s,e in zip(pred_starts_test.argmax(axis=1),
                              pred_stops_test.argmax(axis=1))]), 
      "out of",
      pred_starts_test.shape[0])

Normal predictions (StartIndex less than EndIndex) for testing data: 3285 out of 3534


In [55]:
pred_words_test = [
    post_process(tokenizer.decode(t[s:e+1])) if s<e else post_process(tokenizer.decode(t[e:])) for t,s,e in zip(X_span_test,
                                                                                                              pred_starts_test.argmax(axis=1),
                                                                                                              pred_stops_test.argmax(axis=1))
]

In [56]:
check_idx = 158
#print([[t,i,j,k] for t,i,j,k in zip(tokenizer.decode(),X_test[check_idx],pred_starts_test[check_idx],pred_stops_test[check_idx])])
print(tokenizer.decode(X_span_test[check_idx]))
print(pred_starts_test.argmax(axis=1)[check_idx])
print(pred_stops_test.argmax(axis=1)[check_idx])
print(post_process(tokenizer.decode(X_span_test[check_idx][pred_starts_test.argmax(axis=1)[check_idx]:1+pred_stops_test.argmax(axis=1)[check_idx]])))

 yes! im down to 50% full on my dvr  i was at 98% like 3 days ago... lol i swear if i didnt have a dvr i would never watch tv   neutral 
1
38
yes! im down to 50% full on my dvr i was at 98% like 3 days ago... lol i swear if i didnt have a dvr i would never watch tv


## Submission

In [57]:
test_df_span['selected_text'] = pred_words_test

In [58]:
test_df_span["selected_text"] = np.where(test_df_span["sentiment"] == "neutral",
                                         test_df_span["text"],
                                         test_df_span["selected_text"])

In [59]:
test_df_span[["textID", "selected_text"]].to_csv(RESULTS_DIR+"submission.csv", index=False)

In [63]:
test_df_span.loc[test_df_span.sentiment!="neutral"][["text", "sentiment","selected_text"]].sample(25)

Unnamed: 0,text,sentiment,selected_text
3051,Happy Mothers` Day to my mom and all the mothers in the world,positive,happy mothers` day to my mom and all the mothers in the world
2242,thanks for a nice blog post! should however be given some creds since he has done at least half of the work on it,positive,thanks for a nice
694,"It depends on your goals & how much you want to spend Cannondale, Specialized and Cervelo are all good brands.",positive,"it depends on your goals & how much you want to spend cannondale, specialized and cervelo are all good brands."
3101,Oooh you just spoiled my teenage fantasy,negative,oooh you just spoiled my teenage fantasy
2483,Driving home to trade cars hopefully it makes it! http://myloc.me/21SL,positive,hopefully it makes it!
1378,Conference call with HP. They gave me an invalid pass code so I can`t attend,negative,conference call with hp. they gave me an invalid pass code so i can`t attend
3139,"I`m the bird with broken wings, she`s the song i love to sing you know who you are ****",positive,"i`m the bird with broken wings, she`s the song i love"
1857,the day goes on and on...i think im gonna write a song about it! still thinking itï¿½s impossible for me to get a true friend why????,negative,the day goes on and on...i think im gonna write a song about it! still thinking itï¿½s impossible for me to get a true friend why????
2227,guns and roses baby! yay! was hopin you would blip.fm it,positive,guns and roses baby! yay!
1305,my stomach hurts,negative,my stomach hurts
