## Settings

In [1]:
# CONTROLS
MODEL_PREFIX = "V35" # Better start stop indices No removal of samples, Using my tokenizer, Roberta weights, Sentiment(LeakyKFold), SpanNoNeutral(NoLeakKFlod) With Label Smoothing
MODEL_NUMBER = MODEL_PREFIX[-2:]

TRAIN_SPLIT_RATIO = 0.2
BATCH_SIZE = 16
PREDICT_BATCH_SIZE = 512
DROPOUT = 0.3
LABEL_SMOOTHING_PARAM = 0.2

RUN_ON_SAMPLE = False
EXCLUDE_NEUTRAL_CLASS = True
SENTIMENT_MAX_LR = 5e-4
SENTIMENT_MIN_LR = 5e-6
SENTIMENT_NUM_EPOCHS = [4, 2]
MAX_LR = 5e-3 #3e-5
MID_LR = 1e-4 #3e-5
MIN_LR = 1e-6 #3e-5
NUM_EPOCHS = [4, 4, 1]
NUM_FOLDS = 2

In [2]:
RESULTS_DIR = "../working/"
DATA_DIR = "../input/tweet-sentiment-extraction/"
MODEL_DIR = "../working/models/"
EXT_MODEL_DIR = "../input/robertamodelobjects/"

## Libraries

In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.utils import class_weight

import pickle, os, sys, re, json, gc
from time import time, ctime
from pprint import pprint

import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Conv1D, Conv2D, LSTM, Embedding, Dense, concatenate, MaxPooling2D, Softmax, Flatten
from tensorflow.keras.layers import BatchNormalization, Dropout, Reshape, Activation, Bidirectional, TimeDistributed
from tensorflow.keras.layers import RepeatVector, Multiply, Layer, LeakyReLU, Subtract
from tensorflow.keras.activations import softmax
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.callbacks import *
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint
import tensorflow.keras.backend as K
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.models import save_model, load_model

import tokenizers, transformers
from transformers import *

%matplotlib inline

In [4]:
def jaccard(str1, str2):
    a = set(str1)
    b = set(str2)
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [5]:
seeded_value = 88888
pd.set_option('display.max_colwidth', None)
np.random.seed(seeded_value)

In [6]:
print(ctime(time()))

Mon Jun  8 17:02:44 2020


In [7]:
print([
    tf.__version__,
    transformers.__version__,
    tokenizers.__version__
])

['2.1.0', '2.9.0', '0.7.0']


In [8]:
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)
    
if not os.path.exists(MODEL_DIR+"tokenizers"):
    os.mkdir(MODEL_DIR+"tokenizers")

if not os.path.exists(MODEL_DIR+"tokenizers/roberta_tokenizer"):
    os.mkdir(MODEL_DIR+"tokenizers/roberta_tokenizer")

<a href="https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth"  target="_blank"><h2 id="limiting_gpu_memory_growth" data-text="Limiting GPU memory growth" tabindex="0">Limiting GPU memory growth</h2></a>
<p>By default, TensorFlow maps nearly all of the GPU memory of all GPUs (subject to
<a href="https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars"><code translate="no" dir="ltr">CUDA_VISIBLE_DEVICES</code></a>) visible to the process. This is done to more efficiently use the relatively precious GPU memory resources on the devices by reducing memory fragmentation. To limit TensorFlow to a specific set of GPUs we use the <code translate="no" dir="ltr">tf.config.experimental.set_visible_devices</code> method.</p>

In [9]:
print(tf.config.experimental.list_logical_devices('CPU'))
print(tf.config.experimental.list_logical_devices('GPU'))
print(tf.config.experimental.list_physical_devices('CPU'))
print(tf.config.experimental.list_physical_devices('GPU'))

[LogicalDevice(name='/device:CPU:0', device_type='CPU')]
[LogicalDevice(name='/device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [10]:
physical_devices = tf.config.list_physical_devices('GPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    # Invalid device or cannot modify virtual devices once initialized.
    pass

## Is it okay to exclude neutral text from models and force it later?

In [11]:
df2 = pd.read_csv(DATA_DIR+"train.csv", encoding="utf8")
# cases where neutral's text and selected text columns are the same
neutrals = df2.loc[df2.sentiment=="neutral"].copy()
neutrals = neutrals.reset_index(drop=True)
neutrals["text"] = neutrals["text"].astype(str)
neutrals["selected_text"] = neutrals["selected_text"].astype(str)
print("COVERAGE:", np.sum(np.where((neutrals.text == neutrals.selected_text), 1, 0))/neutrals.shape[0])
print("MEAN JACCARD:", np.mean([jaccard(str1=i, str2=j) for i,j in zip(neutrals.selected_text,neutrals.text)]))

neutrals.loc[neutrals.text != neutrals.selected_text].sample(5)

COVERAGE: 0.5334592552617378
MEAN JACCARD: 0.9808316180492903


Unnamed: 0,textID,text,selected_text,sentiment
4896,c68863089e,_tifullyTragic ... in London last night so give it a few hours & I`m sure a few will appear,in London last night so give it a few hours & I`m sure a few will appear,neutral
2211,edd2aceb1c,who sings `I Remember`? i alwaysss hear it on Radio 1 but never catch the artist,who sings `I Remember`? i alwaysss hear it on Radio 1 but never catch the artist,neutral
8286,ee53bf0e43,this time there is a theme and it is 'purple',this time there is a theme and it is 'purple',neutral
4240,2736a522fa,why isnt everyone with you?,why isnt everyone with you?,neutral
10069,8ba7a1720e,"and by the way it`s short stack, not sway sway sway sway baby is a song of theirs","and by the way it`s short stack, not sway sway sway sway baby is a song of theirs",neutral


## Import Data

In [12]:
df = pd.read_csv(DATA_DIR+"train.csv", encoding="utf8")

print(pd.concat((df.dtypes, df.isna().sum()), axis=1))
print(df.shape)

# Counts of various columns
print({i:df[i].nunique() for i in df.columns})
print(df.describe())
df.head(2)

                    0  1
textID         object  0
text           object  1
selected_text  object  1
sentiment      object  0
(27481, 4)
{'textID': 27481, 'text': 27480, 'selected_text': 22463, 'sentiment': 3}
            textID  \
count        27481   
unique       27481   
top     b797972363   
freq             1   

                                                                                               text  \
count                                                                                         27480   
unique                                                                                        27480   
top     Power Outage, door to freezer propped open, 3G ice cream make for slippy floor  M-er F-er!!   
freq                                                                                              1   

       selected_text sentiment  
count          27480     27481  
unique         22463         3  
top             good   neutral  
freq             199     11118  

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [13]:
test_df = pd.read_csv(DATA_DIR+"test.csv", encoding="utf8")
print(pd.concat((test_df.dtypes, test_df.isna().sum()), axis=1))
print(test_df.shape)

# Counts of various columns
print({i:test_df[i].nunique() for i in test_df.columns})
print(test_df.describe())
test_df.head(2)

                0  1
textID     object  0
text       object  0
sentiment  object  0
(3534, 3)
{'textID': 3534, 'text': 3534, 'sentiment': 3}
            textID  \
count         3534   
unique        3534   
top     68a6cf591a   
freq             1   

                                                              text sentiment  
count                                                         3534      3534  
unique                                                        3534         3  
top     Anybody else experiencing painful slowdowns with facebook?   neutral  
freq                                                             1      1430  


Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -- skyscrapers galore). Good tweeps in China: (SH) (BJ).,positive


In [14]:
df.loc[df['text'].astype('str').apply(lambda x : len(re.findall(pattern="ï¿½", string=x))>0)].head(2)

Unnamed: 0,textID,text,selected_text,sentiment
44,c77717b103,I love to! But I`m only available from 5pm. and where dear? Would love to help convert her vids.ï¿½,I love to!,positive
192,28dbada620,*phew* Will make a note in case anyone else runs into the same issueï¿½,*phew* Will make a note in case anyone else runs into the same issueï¿½,neutral


In [15]:
test_df.loc[test_df['text'].astype('str').apply(lambda x : len(re.findall(pattern="ï¿½", string=x))>0)].head(2)

Unnamed: 0,textID,text,sentiment
145,7223fdccc2,tikcets are only ï¿½91...each...BUT I SO WANT TO GO,positive
618,43ad351369,"AHHH - Whatchu talkinï¿½ baby? HAHAHA I canï¿½t believe youu:O heh, actually I can. Life is worth taking risks... http://tumblr.com/xs81qy54s",positive


In [16]:
df["set"], test_df["set"] = "train", "test"

### Create smaller sample for experimentation

In [17]:
if RUN_ON_SAMPLE:
    df = df.sample(500).reset_index(drop=True)
    test_df = test_df.sample(500).reset_index(drop=True)

#### Combine datasets for pretraining using sentiment labels

In [18]:
data = pd.concat((df[["text","set","sentiment"]],
                  test_df[["text","set","sentiment"]]), axis=0)
data["text"] = data["text"].astype(str)
data = data.sample(frac=1.0).reset_index(drop=True)
print(data.shape)

(31015, 3)


#### Sentiment count in combined data

In [19]:
data.groupby(["set","sentiment"])[["text"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
set,sentiment,Unnamed: 2_level_1
test,negative,1001
test,neutral,1430
test,positive,1103
train,negative,7781
train,neutral,11118
train,positive,8582


#### Tokenization

In [20]:
tokenizer = tokenizers.ByteLevelBPETokenizer(vocab_file='../input/robertamodelobjects/vocab.json',
                                             merges_file='../input/robertamodelobjects/merges.txt',                                         
                                             add_prefix_space=True,
                                             lowercase=True)

In [21]:
with open("../input/robertamodelobjects/special_tokens_map.json") as f:
    special_tokens = json.load(f)

tokenizer.add_special_tokens([i for i in special_tokens.values()])

7

#### Preprocessing for sentiment detection

In [22]:
def trim_addspace(text:str) -> str:
    text = text.lower()
    text = " " + text.strip(" ") + " "
    return text

In [23]:
data["text_mod"] = data.apply(lambda x: trim_addspace(x.text), axis=1)
data["text_mod"] = "<s>"  + data["text_mod"] + "</s>"

In [24]:
sentiment_lookup = {"positive":2,"neutral":1,"negative":0}

In [25]:
def preprocess_sentiment(text_series=data.text_mod.tolist(), sentiment_series=data.sentiment):

    X_tokens = tokenizer.encode_batch(text_series)

    X = [i.ids for i in X_tokens]
    MAX_SEQ_LEN = max([len(i) for i in X])
    X = pad_sequences(X, maxlen=MAX_SEQ_LEN, padding="post")

    X_att = [i.attention_mask for i in X_tokens]
    X_att = pad_sequences(X_att, maxlen=MAX_SEQ_LEN, padding="post")

    Y = sentiment_series.apply(lambda x: sentiment_lookup[x]).values

    VOCAB_SIZE = tokenizer.get_vocab_size()

    print({
        "X":X.shape,
        "X_att":X_att.shape,
        "Y":Y.shape,
        "VOCAB_SIZE":VOCAB_SIZE,
        "MAX_SEQ_LEN":MAX_SEQ_LEN
    })
    
    return X_tokens, X, X_att, Y, VOCAB_SIZE, MAX_SEQ_LEN

X_sent_tokens, X_sent, X_sent_att, Y_sent, VOCAB_SIZE, MAX_SEQ_LEN_SENT = preprocess_sentiment(**{
    "text_series" : data.text.tolist(),
    "sentiment_series" : data.sentiment
})

{'X': (31015, 100), 'X_att': (31015, 100), 'Y': (31015,), 'VOCAB_SIZE': 50265, 'MAX_SEQ_LEN': 100}


# Import data for span detection

In [26]:
df_span = pd.read_csv(DATA_DIR+"train.csv", encoding="utf8").fillna('')

print(pd.concat((df_span.dtypes, df_span.isna().sum()), axis=1))
print(df_span.shape)

# Counts of various columns
print({i:df_span[i].nunique() for i in df_span.columns})
print(df_span.describe())
df_span.head(2)

                    0  1
textID         object  0
text           object  0
selected_text  object  0
sentiment      object  0
(27481, 4)
{'textID': 27481, 'text': 27481, 'selected_text': 22464, 'sentiment': 3}
            textID  \
count        27481   
unique       27481   
top     b797972363   
freq             1   

                                                                                               text  \
count                                                                                         27481   
unique                                                                                        27481   
top     Power Outage, door to freezer propped open, 3G ice cream make for slippy floor  M-er F-er!!   
freq                                                                                              1   

       selected_text sentiment  
count          27481     27481  
unique         22464         3  
top             good   neutral  
freq             199     11118  

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [27]:
test_df_span = pd.read_csv(DATA_DIR+"test.csv", encoding="utf8").fillna('')
print(pd.concat((test_df_span.dtypes, test_df_span.isna().sum()), axis=1))
print(test_df_span.shape)

# Counts of various columns
print({i:test_df_span[i].nunique() for i in test_df_span.columns})
print(test_df_span.describe())
test_df_span.head(2)

                0  1
textID     object  0
text       object  0
sentiment  object  0
(3534, 3)
{'textID': 3534, 'text': 3534, 'sentiment': 3}
            textID  \
count         3534   
unique        3534   
top     68a6cf591a   
freq             1   

                                                              text sentiment  
count                                                         3534      3534  
unique                                                        3534         3  
top     Anybody else experiencing painful slowdowns with facebook?   neutral  
freq                                                             1      1430  


Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -- skyscrapers galore). Good tweeps in China: (SH) (BJ).,positive


#### Preprocessing for span detection

In [28]:
df_span["text"] = df_span["text"].astype(str)
df_span["selected_text"] = df_span["selected_text"].astype(str)
test_df_span["text"] = test_df_span["text"].astype(str)

In [29]:
def trim_addspace(text:str) -> str:
    text = text.lower()
    text = " " + text.strip(" ") + " "
    return text

In [30]:
def find_indices(text:str, selected_text:str) -> (str, str, int, int):
    
    text, selected_text = text.lower(), selected_text.lower()
    
    text = trim_addspace(text)
    
    substring_ = re.findall(pattern="\\s[^\s]*?"+re.escape(selected_text)+"[^\s]*?\\s", string=text)[0]
    
    return pd.Series([text, " "+substring_.strip(" "), text.find(substring_), len(substring_) + text.find(substring_)])

In [31]:
df_span[["text_mod", "selected_text_mod", "start", "stop"]] = df_span[['text','selected_text']].apply(lambda x: find_indices(x.text, x.selected_text), axis=1)

In [32]:
df_span.iloc[27476].to_dict()

{'textID': '4eac33d1c0',
 'text': ' wish we could come see u on Denver  husband lost his job and can`t afford it',
 'selected_text': 'd lost',
 'sentiment': 'negative',
 'text_mod': ' wish we could come see u on denver  husband lost his job and can`t afford it ',
 'selected_text_mod': ' husband lost',
 'start': 36,
 'stop': 50}

In [33]:
test_df_span['text_mod'] = test_df_span['text'].apply(trim_addspace)

In [34]:
df_span.loc[df_span.text_mod.str.contains("tonight, but no one will go")].to_dict()

{'textID': {12154: 'adfbcc6806'},
 'text': {12154: 'i wanna see `up` tonight, but no one will go with me. whhhyyy'},
 'selected_text': {12154: 'but no one will go with me.'},
 'sentiment': {12154: 'negative'},
 'text_mod': {12154: ' i wanna see `up` tonight, but no one will go with me. whhhyyy '},
 'selected_text_mod': {12154: ' but no one will go with me.'},
 'start': {12154: 26},
 'stop': {12154: 55}}

#### Cleaning for span detection

In [35]:
df_span["sentiment_code"] = df_span["sentiment"].astype("category")
X_sentiments = df_span["sentiment_code"].cat.codes.values

test_df_span["sentiment_code"] = test_df_span["sentiment"].astype("category")
X_sentiments_test = test_df_span["sentiment_code"].cat.codes.values

#### Adding special tokens

In [36]:
{t:tokenizer.encode(" "+t).ids for t in df_span.sentiment.unique()}

{'neutral': [7974], 'negative': [2430], 'positive': [1313]}

In [37]:
df_span["text_mod"] = "<s>" + df_span['text_mod'] + "</s> </s> " + df_span.sentiment + " </s>"
test_df_span["text_mod"] = "<s>" + test_df_span['text_mod'] + "</s> </s> " + test_df_span.sentiment + " </s>"

#### Exclusions for span detection

In [38]:
if EXCLUDE_NEUTRAL_CLASS:
    df_span = df_span.loc[df_span.sentiment!="neutral"].copy()
    df_span = df_span.reset_index(drop=True)
    print("EXCLUDE_NEUTRAL_CLASS:", df_span.shape)


if RUN_ON_SAMPLE:
    df_span = df_span.sample(1000).copy()
    df_span = df_span.reset_index(drop=True)
    print("Train RUN_ON_SAMPLE", df_span.shape)
    test_df_span = test_df_span.sample(1000).copy()
    test_df_span = test_df_span.reset_index(drop=True)
    print("Test  RUN_ON_SAMPLE", test_df_span.shape)

EXCLUDE_NEUTRAL_CLASS: (16363, 9)


#### Tokenization for span detection

In [39]:
X_span_tokens = tokenizer.encode_batch(df_span.text_mod.tolist())
Y_span_tokens = tokenizer.encode_batch(df_span.selected_text_mod.tolist())
X_span_tokens_test = tokenizer.encode_batch(test_df_span.text_mod.tolist())

In [40]:
X_span = [i.ids for i in X_span_tokens]
Y_span = [i.ids for i in Y_span_tokens]
X_span_test = [i.ids for i in X_span_tokens_test]

In [41]:
X_span_att = [i.attention_mask for i in X_span_tokens]
Y_span_att = [i.attention_mask for i in Y_span_tokens] # Useless
X_span_att_test = [i.attention_mask for i in X_span_tokens_test]

In [42]:
MAX_SEQ_LEN_SPAN = max([len(i) for i in X_span])

In [43]:
def get_extremities(l_string, s_string, print_it=False):
    len_l = len(l_string)
    len_s = len(s_string)
    
    for i in range(len_l - len_s + 1):
        if (i + len_s) <= len_l:
            substring = l_string[i:i+len_s]
            if substring == s_string:
                if print_it:
                    print(l_string)
                    print(substring)
                    print(i, i+len_s, substring)
                
                start_vector, end_vector = [0] * len_l, [0] * len_l
                start_vector[i], end_vector[i+len_s-1] = 1, 1
                
                return (start_vector, end_vector)

In [44]:
Y_span_starts, Y_span_stops = [], []
anomaly_idx, counter = [], 0
for num, (i,j) in enumerate(zip(X_span_tokens, Y_span_tokens)):
    x,y = i.ids, j.ids
    try:
        s,e = get_extremities(x, y)
        Y_span_starts.append(s)
        Y_span_stops.append(e)
    except TypeError as t:
        counter += 1
        anomaly_idx.append(num)
        Y_span_starts.append([0]*15)
        Y_span_stops.append([0]*15)
print(num + 1, "\t: #Processed")

print(counter,"\t: # of Anomalies")

16363 	: #Processed
2 	: # of Anomalies


In [45]:
check_idx = 758
print(df_span.text[check_idx])
print(df_span.selected_text[check_idx])
print([[i,j,k,l] for i,j,k,l in zip(X_span_tokens[check_idx].tokens,
                                    X_span_tokens[check_idx].ids,
                                    Y_span_starts[check_idx],
                                    Y_span_stops[check_idx])])
print([[i,j] for i,j in zip(Y_span_tokens[check_idx].ids,
                            Y_span_tokens[check_idx].tokens)])

_Luck13 they are SO cute
SO cute
[['<s>', 0, 0, 0], ['Ġ_', 18134, 0, 0], ['luck', 20540, 0, 0], ['13', 1558, 0, 0], ['Ġthey', 51, 0, 0], ['Ġare', 32, 0, 0], ['Ġso', 98, 1, 0], ['Ġcute', 11962, 0, 1], ['Ġ', 1437, 0, 0], ['</s>', 2, 0, 0], ['Ġ', 1437, 0, 0], ['</s>', 2, 0, 0], ['Ġpositive', 1313, 0, 0], ['Ġ', 1437, 0, 0], ['</s>', 2, 0, 0]]
[[98, 'Ġso'], [11962, 'Ġcute']]


#### Padding for span detection

In [46]:
X_span = pad_sequences(X_span, maxlen=MAX_SEQ_LEN_SPAN, padding="post")
X_span_att = pad_sequences(X_span_att, maxlen=MAX_SEQ_LEN_SPAN, padding="post")
Y_span = pad_sequences(Y_span, maxlen=MAX_SEQ_LEN_SPAN, padding="post")

Y_span_starts = pad_sequences(Y_span_starts, maxlen=MAX_SEQ_LEN_SPAN, padding="post")#.argmax(axis=1)
Y_span_stops = pad_sequences(Y_span_stops, maxlen=MAX_SEQ_LEN_SPAN, padding="post")#.argmax(axis=1)

X_span_test = pad_sequences(X_span_test, maxlen=MAX_SEQ_LEN_SPAN, padding="post")
X_span_att_test = pad_sequences(X_span_att_test, maxlen=MAX_SEQ_LEN_SPAN, padding="post")

In [47]:
pprint({
    "X_span" : X_span.shape,
    "X_span_att" : X_span_att.shape,
    "Y_span" : Y_span.shape,
    "Y_span_starts" : Y_span_starts.shape,
    "Y_span_stops" : Y_span_stops.shape,
    "X_span_test" : X_span_test.shape,
    "X_span_att_test" : X_span_att_test.shape,
    "VOCAB_SIZE":VOCAB_SIZE,
    "MAX_SEQ_LEN_SPAN":MAX_SEQ_LEN_SPAN
})

{'MAX_SEQ_LEN_SPAN': 78,
 'VOCAB_SIZE': 50265,
 'X_span': (16363, 78),
 'X_span_att': (16363, 78),
 'X_span_att_test': (3534, 78),
 'X_span_test': (3534, 78),
 'Y_span': (16363, 78),
 'Y_span_starts': (16363, 78),
 'Y_span_stops': (16363, 78)}


#### Cross validation for span detection

In [48]:
keep_flag = np.isin(Y_span_stops.argmax(axis=1),
                    np.unique(Y_span_stops.argmax(axis=1),
                              return_counts=True)[0][np.unique(Y_span_stops.argmax(axis=1),
                                                               return_counts=True)[1]>1])

In [49]:
sum(keep_flag), df_span.shape[0], df_span.shape[0] - sum(keep_flag)

(16359, 16363, 4)

In [50]:
print("\n",
     X_span.shape, "\t: X ", "\n",
     X_span_att.shape, "\t: X_att ", "\n",
     Y_span.shape, "\t: Y ", "\n",
     Y_span_starts.shape, "\t: Y_starts ", "\n",
     Y_span_stops.shape, "\t: Y_stops ", "\n",
     X_span_test.shape, "\t: X_test ", "\n",
     X_span_att_test.shape, "\t: X_att_test ", "\n"
)


 (16363, 78) 	: X  
 (16363, 78) 	: X_att  
 (16363, 78) 	: Y  
 (16363, 78) 	: Y_starts  
 (16363, 78) 	: Y_stops  
 (3534, 78) 	: X_test  
 (3534, 78) 	: X_att_test  



In [51]:
X_span = X_span[keep_flag]
X_span_att = X_span_att[keep_flag]
Y_span = Y_span[keep_flag]
Y_span_starts = Y_span_starts[keep_flag]
Y_span_stops = Y_span_stops[keep_flag]
X_span_test = X_span_test
X_span_att_test = X_span_att_test

In [52]:
print("\n",
     X_span.shape, "\t: X ", "\n",
     X_span_att.shape, "\t: X_att ", "\n",
     Y_span.shape, "\t: Y ", "\n",
     Y_span_starts.shape, "\t: Y_starts ", "\n",
     Y_span_stops.shape, "\t: Y_stops ", "\n",
     X_span_test.shape, "\t: X_test ", "\n",
     X_span_att_test.shape, "\t: X_att_test ", "\n"
)


 (16359, 78) 	: X  
 (16359, 78) 	: X_att  
 (16359, 78) 	: Y  
 (16359, 78) 	: Y_starts  
 (16359, 78) 	: Y_stops  
 (3534, 78) 	: X_test  
 (3534, 78) 	: X_att_test  



In [53]:
span_kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=seeded_value)

In [54]:
Y_span_words = [tokenizer.decode(i) for i in Y_span]

In [55]:
MAX_SEQ_LEN = max(MAX_SEQ_LEN_SENT, MAX_SEQ_LEN_SPAN)
MAX_SEQ_LEN, MAX_SEQ_LEN_SPAN, MAX_SEQ_LEN_SENT

(100, 78, 100)

In [56]:
X_span = pad_sequences(X_span, maxlen=MAX_SEQ_LEN, padding="post")
X_span_att = pad_sequences(X_span_att, maxlen=MAX_SEQ_LEN, padding="post")
Y_span = pad_sequences(Y_span, maxlen=MAX_SEQ_LEN, padding="post")
Y_span_starts = pad_sequences(Y_span_starts, maxlen=MAX_SEQ_LEN, padding="post")
Y_span_stops = pad_sequences(Y_span_stops, maxlen=MAX_SEQ_LEN, padding="post")

X_span_test = pad_sequences(X_span_test, maxlen=MAX_SEQ_LEN, padding="post")
X_span_att_test = pad_sequences(X_span_att_test, maxlen=MAX_SEQ_LEN, padding="post")

X_sent = pad_sequences(X_sent, maxlen=MAX_SEQ_LEN, padding="post")
X_sent_att = pad_sequences(X_sent_att, maxlen=MAX_SEQ_LEN, padding="post")

pprint({
    "X_span" : X_span.shape,
    "X_span_att" : X_span_att.shape,
    "Y_span" : Y_span.shape,
    
    "X_span_test" : X_span_test.shape,
    "X_span_att_test" : X_span_att_test.shape,
    
    "X_sent" : X_sent.shape,
    "X_sent_att" : X_sent_att.shape,
})

{'X_sent': (31015, 100),
 'X_sent_att': (31015, 100),
 'X_span': (16359, 100),
 'X_span_att': (16359, 100),
 'X_span_att_test': (3534, 100),
 'X_span_test': (3534, 100),
 'Y_span': (16359, 100)}


#### Model Specifications

In [57]:
def build_model():
    input_sequences = Input((MAX_SEQ_LEN), dtype=tf.int32, name="words")
    input_att_flags = Input((MAX_SEQ_LEN), dtype=tf.int32, name="att_flags")
    input_token_ids = Input((MAX_SEQ_LEN), dtype=tf.int32, name="token_ids")
    
    config = RobertaConfig.from_pretrained(EXT_MODEL_DIR+'config.json')
    roberta_model = TFRobertaModel.from_pretrained(EXT_MODEL_DIR+'tf_model.h5', config=config)
    x = roberta_model(inputs=input_sequences, attention_mask=input_att_flags, token_type_ids=input_token_ids)
    
    x1 = tf.keras.layers.Dropout(DROPOUT)(x[0])
    x1 = tf.keras.layers.Conv1D(768, 2,padding='same')(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Dense(1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    output_starts_0 = tf.keras.layers.Activation('softmax', name="starts_0")(x1)
    
    x2 = tf.keras.layers.Dropout(DROPOUT)(x[0]) 
    x2 = tf.keras.layers.Conv1D(768, 2,padding='same')(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Dense(1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    output_stops_0 = tf.keras.layers.Activation('softmax', name="stops_0")(x2)
    
    x3 = tf.keras.layers.Dropout(DROPOUT)(x[0]) 
    x3 = tf.keras.layers.Conv1D(768, 2,padding='same')(x3)
    x3 = tf.keras.layers.LeakyReLU()(x3)
    x3 = tf.keras.layers.Dense(1)(x3)
    x3 = tf.keras.layers.Flatten()(x3)
    x3 = tf.keras.layers.Dense(3)(x3)
    output_sentiment = tf.keras.layers.Activation('softmax', name="output_sentiments")(x3)
    
    sentiment_model = Model([input_att_flags, input_sequences, input_token_ids], [output_sentiment])
    
    output_subtract = tf.keras.layers.Subtract()([output_starts_0, output_stops_0])
    output_flat = concatenate([output_starts_0, output_stops_0, output_subtract, output_sentiment])
    output_starts_1 = Dense(MAX_SEQ_LEN, activation='softmax', name="starts_1")(output_flat)
    output_stops_1 = Dense(MAX_SEQ_LEN, activation='softmax', name="stops_1")(output_flat)
    
    span_detection_model = Model([input_att_flags, input_sequences, input_token_ids],
                                 [output_starts_0, output_stops_0, output_starts_1, output_stops_1])
    
    return sentiment_model, span_detection_model

In [58]:
sentiment_model, span_detection_model = build_model()

In [59]:
sentiment_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words (InputLayer)              [(None, 100)]        0                                            
__________________________________________________________________________________________________
att_flags (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
token_ids (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 100, 768), ( 124645632   words[0][0]                      
______________________________________________________________________________________________

In [60]:
span_detection_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words (InputLayer)              [(None, 100)]        0                                            
__________________________________________________________________________________________________
att_flags (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
token_ids (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 100, 768), ( 124645632   words[0][0]                      
____________________________________________________________________________________________

### Sentiment Model Fit

In [61]:
def infer_sentiment(x):
    encoded_repr = tokenizer.encode_batch(x.tolist())

    sample_text_ids = pad_sequences([i.ids for i in encoded_repr],
                                    maxlen=MAX_SEQ_LEN,
                                    padding="post")
    sample_text_att = pad_sequences([i.attention_mask for i in encoded_repr],
                                    maxlen=MAX_SEQ_LEN,
                                    padding="post")
    pred = sentiment_model.predict({"att_flags":sample_text_att,
                                    "words":sample_text_ids,
                                    "token_ids":np.zeros_like(sample_text_att)},
                                  batch_size=PREDICT_BATCH_SIZE)
    
    res = pd.DataFrame({"predicted_sentiment":pred.argmax(axis=1)})
    
    return res.predicted_sentiment.apply(lambda x:[k for k,v in sentiment_lookup.items() if v==x][0])

In [62]:
def get_sentiment_results(data):
    data["predicted_sentiment"] = infer_sentiment(x=data.text)

    print(classification_report(y_true=data.sentiment[data.index.isin(tr_index)],
                                y_pred=data.predicted_sentiment[data.index.isin(tr_index)]))

    print(classification_report(y_true=data.sentiment[data.index.isin(va_index)],
                                y_pred=data.predicted_sentiment[data.index.isin(va_index)]))

    print(confusion_matrix(y_true=data.sentiment,
                     y_pred=data.predicted_sentiment,
                     labels=['positive', 'neutral', 'negative']))

    print(confusion_matrix(y_true=data.sentiment,
                     y_pred=data.predicted_sentiment,
                     labels=['positive', 'neutral', 'negative'],
                     normalize="all"))

    data["set2"] = np.where(data.index.isin(tr_index), "train", "valid")
    print(data.groupby("set2").apply(lambda x : accuracy_score(y_true=x.sentiment, y_pred=x.predicted_sentiment)))

    print(data.groupby("set").apply(lambda x : accuracy_score(y_true=x.sentiment, y_pred=x.predicted_sentiment)))

    print(pd.concat({
        "accuracy" : data.groupby(["set", "set2"]).apply(lambda x : accuracy_score(y_true=x.sentiment,
                                                                                   y_pred=x.predicted_sentiment)),
        "count" : data.groupby(["set", "set2"])["sentiment"].count()
    }, axis=1))

In [63]:
p,c = np.unique(Y_sent, return_counts=True)
cw = class_weight.compute_class_weight('balanced', np.unique(Y_sent), Y_sent)

In [64]:
sentiment_kf = KFold(n_splits=NUM_FOLDS,shuffle=True,random_state=seeded_value)

In [65]:
sentiment_mcp = ModelCheckpoint(filepath=RESULTS_DIR+"Sentiment_"+MODEL_PREFIX+"BestCheckpoint.h5", monitor='val_loss',
                                verbose=0, save_best_only=False, save_weights_only=True, mode='auto', save_freq='epoch')

sentiment_csvl = CSVLogger(filename=RESULTS_DIR+"Sentiment_"+MODEL_PREFIX+"_LossLogs.csv",
                           separator=",", append=True)

In [66]:
for num, (tr_index, va_index) in enumerate(sentiment_kf.split(X_sent, Y_sent)):
    print("[INFO] ==================== FOLD#", num, "====================")

    sentiment_model.layers[3].trainable = False
    adam = Adam(learning_rate=SENTIMENT_MAX_LR)
    sentiment_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                            optimizer=adam,
                            metrics=['accuracy'])

    sent_history = sentiment_model.fit(x={"att_flags":X_sent_att[tr_index],
                                          "words":X_sent[tr_index],
                                          "token_ids":np.zeros_like(X_sent_att[tr_index])},
                                       y={"output_sentiments":Y_sent[tr_index]},
                                       shuffle=True,
                                       batch_size=BATCH_SIZE,
                                       epochs=SENTIMENT_NUM_EPOCHS[0],
                                       validation_data=({"att_flags":X_sent_att[va_index],
                                                         "words":X_sent[va_index],
                                                         "token_ids":np.zeros_like(X_sent_att[va_index])},
                                                        {"output_sentiments":Y_sent[va_index]}),
                                       verbose=1,
                                       class_weight=cw,
                                       callbacks=[sentiment_mcp, sentiment_csvl])

    sentiment_model.layers[3].trainable = True
    adam = Adam(learning_rate=SENTIMENT_MIN_LR)
    sentiment_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                            optimizer=adam,
                            metrics=['accuracy'])

    sent_history_finetuned = sentiment_model.fit(x={"att_flags":X_sent_att[tr_index],
                                                    "words":X_sent[tr_index],
                                                    "token_ids":np.zeros_like(X_sent_att[tr_index])},
                                                 y={"output_sentiments":Y_sent[tr_index]},
                                                 shuffle=True,
                                                 batch_size=BATCH_SIZE,
                                                 epochs=SENTIMENT_NUM_EPOCHS[1],
                                                 validation_data=({"att_flags":X_sent_att[va_index],
                                                                   "words":X_sent[va_index],
                                                                   "token_ids":np.zeros_like(X_sent_att[va_index])},
                                                                  {"output_sentiments":Y_sent[va_index]}),
                                                 verbose=1,
                                                 class_weight=cw,
                                                 callbacks=[sentiment_mcp, sentiment_csvl])
    
    get_sentiment_results(data)

Train on 15507 samples, validate on 15508 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Train on 15507 samples, validate on 15508 samples
Epoch 1/2
Epoch 2/2
Epoch 2/2
              precision    recall  f1-score   support

    negative       0.85      0.81      0.83      4386
     neutral       0.77      0.84      0.80      6244
    positive       0.89      0.82      0.86      4878

    accuracy                           0.83     15508
   macro avg       0.84      0.83      0.83     15508
weighted avg       0.83      0.83      0.83     15508

              precision    recall  f1-score   support

    negative       0.82      0.77      0.80      4396
     neutral       0.74      0.82      0.78      6304
    positive       0.87      0.80      0.83      4807

    accuracy                           0.80     15507
   macro avg       0.81      0.80      0.80     15507
weighted avg       0.80      0.80      0.80     15507

[[ 7888  1684   113]
 [  933 10387  1228]
 [  137  1699  6946]]
[[0.

#### Validate sentiment model

In [67]:
sample_text = data.text.sample(5).tolist()

encoded_repr = tokenizer.encode_batch(sample_text)

sample_text_ids = pad_sequences([i.ids for i in encoded_repr],
                                maxlen=MAX_SEQ_LEN,
                                padding="post")
sample_text_att = pad_sequences([i.attention_mask for i in encoded_repr],
                                maxlen=MAX_SEQ_LEN,
                                padding="post")
pred = sentiment_model.predict({"att_flags":sample_text_att,
                                "words":sample_text_ids,
                                "token_ids":np.zeros_like(sample_text_att)})

pprint({
    num:{
        "text":i,
        "predicted_sentiment":[k for k,v in sentiment_lookup.items() if v==j][0]
    } for num,(i,j) in enumerate(zip(sample_text, pred.argmax(axis=1)))
})

{0: {'predicted_sentiment': 'negative',
     'text': 'Going out to Miranda shopping centre to spend time with the '
             'family, before going away for 2 weeks to Malaysia. Gonna miss '
             'them!'},
 1: {'predicted_sentiment': 'neutral',
     'text': '  Now I need to find the Keynote one! At least I know where to '
             'go! #ScreenCastsOnline'},
 2: {'predicted_sentiment': 'positive',
     'text': 'relaxing night at home with best people'},
 3: {'predicted_sentiment': 'negative',
     'text': 'Watching Ace of Cakes: LOST edition omfgggg'},
 4: {'predicted_sentiment': 'negative',
     'text': 'doneeee wheeee hahaaaaaaaa so tired and sleepy  peter u suck not '
             'coming to my bday!'}}


In [68]:
sentiment_lookup

{'positive': 2, 'neutral': 1, 'negative': 0}

In [69]:
data.loc[(data.sentiment == "neutral") & (data.predicted_sentiment == "negative")].sample(5) # most incorrect in this cohort

Unnamed: 0,text,set,sentiment,text_mod,predicted_sentiment,set2
7520,i know its awful but never fear i`ve got loads of funds so we can have a drunken catch up post-exams ja?,train,neutral,<s> i know its awful but never fear i`ve got loads of funds so we can have a drunken catch up post-exams ja? </s>,negative,valid
9871,ahh naa i dont like rain nm really friend is comin over then goin 2 my grans,train,neutral,<s> ahh naa i dont like rain nm really friend is comin over then goin 2 my grans </s>,negative,train
13665,good news: finally finished my #EASactive workout that has been paused for 6 hours. bad news: my resistance band is torn,train,neutral,<s> good news: finally finished my #easactive workout that has been paused for 6 hours. bad news: my resistance band is torn </s>,negative,train
24207,Not actually managed to purchase anything from the ovi store yet. Constant server error on check out,train,neutral,<s> not actually managed to purchase anything from the ovi store yet. constant server error on check out </s>,negative,valid
4546,why not now you made me sad I thought you`d be jumping for joy,train,neutral,<s> why not now you made me sad i thought you`d be jumping for joy </s>,negative,valid


### Span Detection Model Fit

In [70]:
for i in range(NUM_FOLDS):
    if os.path.exists(RESULTS_DIR+MODEL_PREFIX+"_LossLogs_"+str(i)+".csv"):
        os.remove(RESULTS_DIR+MODEL_PREFIX+"_LossLogs_"+str(i)+".csv")

In [71]:
def print_metrics(pred_dict):
    print("[INFO] ","="*15,"Validation for FOLD#", num, "="*15)
    funcs = [accuracy_score, f1_score, precision_score, recall_score, confusion_matrix]
    for f in funcs:
        for data_set in ["train","valid"]:
            for var in ["starts", "stops"]:
                if f in [accuracy_score]:
                    res = f(**pred_dict[data_set][var])
                    print("[INFO] {:.2f}".format(100 * res), "\t||", data_set, "\t||", var, "\t||", f.__name__)
                elif f in [confusion_matrix]:
                    res = f(**pred_dict[data_set][var], labels=np.arange(MAX_SEQ_LEN))
                    np.savetxt(X=res, fmt='%i', delimiter=",",
                               fname=RESULTS_DIR+"ConfusionMatrix_"+MODEL_PREFIX+"_"+data_set+"_"+var+".csv")
                    print("[INFO] \t||", data_set, "\t||", var, "\t||", f.__name__, "\t||", 
                          RESULTS_DIR+"ConfusionMatrix_"+MODEL_PREFIX+"_"+data_set+"_"+var+".csv")
                else:
                    res = f(**pred_dict[data_set][var], average="macro")
                    print("[INFO] {:.2f}".format(100 * res), "\t||", data_set, "\t||", var, "\t||", f.__name__)
        print("=======================================================================")

In [72]:
def post_process(string):
    string = re.sub(pattern=" (negative|positive|neutral)[ ]+$", repl="", string=string)
    string = re.sub(pattern=" +", repl=" ", string=string.strip())
    return string

<a href="https://keras.io/guides/transfer_learning/#finetuning" target="_blank"><h2 id="finetuning">Fine-tuning</h2></a>
<p>Once your model has converged on the new data, you can try to unfreeze all or part of
 the base model and retrain the whole model end-to-end with a very low learning rate.</p>
 <p>This is an optional last step that can potentially give you incremental improvements.
 It could also potentially lead to quick overfitting -- keep that in mind.</p>
 <p>It is critical to only do this step <em>after</em> the model with frozen layers has been
trained to convergence. If you mix randomly-initialized trainable layers with
trainable layers that hold pre-trained features, the randomly-initialized layers will
cause very large gradient updates during training, which will destroy your pre-trained
 features.</p>
 <p>It's also critical to use a very low learning rate at this stage, because
you are training a much larger model than in the first round of training, on a dataset
 that is typically very small.
As a result, you are at risk of overfitting very quickly if you apply large weight
 updates. Here, you only want to readapt the pretrained weights in an incremental way.</p>

<a href="https://keras.io/guides/transfer_learning/#finetuning" target="_blank"><p><strong>Important note about <code>compile()</code> and <code>trainable</code></strong></p></a>
<p>Calling <code>compile()</code> on a model is meant to "freeze" the behavior of that model. This
 implies that the <code>trainable</code>
attribute values at the time the model is compiled should be preserved throughout the
 lifetime of that model,
until <code>compile</code> is called again. Hence, if you change any <code>trainable</code> value, make sure
 to call <code>compile()</code> again on your
model for your changes to be taken into account.</p>

In [73]:
span_detection_model_bkup = span_detection_model
span_detection_model.save_weights(filepath=RESULTS_DIR+"FinalSentimentModel.h5", overwrite=True)

In [74]:
for num, (t_index, v_index) in enumerate(span_kf.split(X_span, Y_span_stops)):
    print("[INFO] ==================== FOLD#", num, "====================")
    
    if num > 0:
        del span_history
        del span_detection_model
        gc.collect()
        K.clear_session()
    
    span_detection_model = span_detection_model_bkup
    span_detection_model.load_weights(RESULTS_DIR+"FinalSentimentModel.h5")
    
    span_mcp = ModelCheckpoint(filepath=RESULTS_DIR+MODEL_PREFIX+"BestCheckpoint_"+str(num)+".h5", monitor='val_loss',
                               verbose=0, save_best_only=False, save_weights_only=True, mode='auto', save_freq='epoch')
    
    span_csvl = CSVLogger(filename=RESULTS_DIR+MODEL_PREFIX+"_LossLogs_"+str(num)+".csv",
                          separator=",",
                          append=True)
    
    print("[INFO] Training only the final layers at higher learning rates.")
    span_detection_model.layers[3].trainable = False
    adam = Adam(learning_rate=MAX_LR)
    span_detection_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=LABEL_SMOOTHING_PARAM),
                                 optimizer=adam,
                                 metrics=['accuracy'],
                                 loss_weights={"starts_0":1.0,"stops_0":1.0,"starts_1":1.0,"stops_1":1.0})
    span_history = span_detection_model.fit(x={"att_flags":X_span_att[t_index],
                                               "words":X_span[t_index],
                                               "token_ids":np.zeros_like(X_span_att[t_index])},
                                            y={"starts_0":Y_span_starts[t_index],
                                               "stops_0":Y_span_stops[t_index], 
                                               "starts_1":Y_span_starts[t_index],
                                               "stops_1":Y_span_stops[t_index]},
                                            shuffle=True,
                                            batch_size=BATCH_SIZE,
                                            epochs=NUM_EPOCHS[0],
                                            validation_data=({"att_flags":X_span_att[v_index],
                                                              "words":X_span[v_index],
                                                              "token_ids":np.zeros_like(X_span_att[v_index])},
                                                             {"starts_0":Y_span_starts[v_index],
                                                              "stops_0":Y_span_stops[v_index], 
                                                              "starts_1":Y_span_starts[v_index],
                                                              "stops_1":Y_span_stops[v_index]}),
                                            verbose=1,
                                            callbacks=[span_mcp, span_csvl])
    
    print("[INFO] Training only the final layers at lower learning rates.")
    adam = Adam(learning_rate=MID_LR)
    span_detection_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=LABEL_SMOOTHING_PARAM),
                                 optimizer=adam,
                                 metrics=['accuracy'],
                                 loss_weights={"starts_0":1.0,"stops_0":1.0,"starts_1":1.0,"stops_1":1.0})
    span_history = span_detection_model.fit(x={"att_flags":X_span_att[t_index],
                                               "words":X_span[t_index],
                                               "token_ids":np.zeros_like(X_span_att[t_index])},
                                            y={"starts_0":Y_span_starts[t_index],
                                               "stops_0":Y_span_stops[t_index], 
                                               "starts_1":Y_span_starts[t_index],
                                               "stops_1":Y_span_stops[t_index]},
                                            shuffle=True,
                                            batch_size=BATCH_SIZE,
                                            epochs=NUM_EPOCHS[1],
                                            validation_data=({"att_flags":X_span_att[v_index],
                                                              "words":X_span[v_index],
                                                              "token_ids":np.zeros_like(X_span_att[v_index])},
                                                             {"starts_0":Y_span_starts[v_index],
                                                              "stops_0":Y_span_stops[v_index], 
                                                              "starts_1":Y_span_starts[v_index],
                                                              "stops_1":Y_span_stops[v_index]}),
                                            verbose=1,
                                            callbacks=[span_mcp, span_csvl])
    
    print("[INFO] Unfreezing RoBerta layer and training at lowest learning rates.")
    span_detection_model.layers[3].trainable = True
    adam = Adam(learning_rate=MIN_LR)
    span_detection_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=LABEL_SMOOTHING_PARAM),
                                 optimizer=adam,
                                 metrics=['accuracy'],
                                 loss_weights={"starts_0":1.0,"stops_0":1.0,"starts_1":1.0,"stops_1":1.0})
    span_history = span_detection_model.fit(x={"att_flags":X_span_att[t_index],
                                               "words":X_span[t_index],
                                               "token_ids":np.zeros_like(X_span_att[t_index])},
                                            y={"starts_0":Y_span_starts[t_index],
                                               "stops_0":Y_span_stops[t_index], 
                                               "starts_1":Y_span_starts[t_index],
                                               "stops_1":Y_span_stops[t_index]},
                                            shuffle=True,
                                            batch_size=BATCH_SIZE,
                                            epochs=NUM_EPOCHS[2],
                                            validation_data=({"att_flags":X_span_att[v_index],
                                                              "words":X_span[v_index],
                                                              "token_ids":np.zeros_like(X_span_att[v_index])},
                                                             {"starts_0":Y_span_starts[v_index],
                                                              "stops_0":Y_span_stops[v_index], 
                                                              "starts_1":Y_span_starts[v_index],
                                                              "stops_1":Y_span_stops[v_index]}),
                                            verbose=1,
                                            callbacks=[span_mcp, span_csvl])
    
    # Loading best weights per fold
    span_detection_model.load_weights(RESULTS_DIR+MODEL_PREFIX+"BestCheckpoint_"+str(num)+".h5")
    
    pred_train = span_detection_model.predict(x = {"att_flags":X_span_att[t_index],
                                                   "words":X_span[t_index],
                                                   "token_ids":np.zeros_like(X_span_att[t_index])},
                                              batch_size=PREDICT_BATCH_SIZE)

    pred_val = span_detection_model.predict(x = {"att_flags":X_span_att[v_index],
                                                 "words":X_span[v_index],
                                                 "token_ids":np.zeros_like(X_span_att[v_index])},
                                            batch_size=PREDICT_BATCH_SIZE)
    
    pred_starts_train, pred_stops_train = (pred_train[0]/2.0 + pred_train[2]/2.0), (pred_train[1]/2.0 + pred_train[3]/2.0)
    pred_starts_val, pred_stops_val = (pred_val[0]/2.0 + pred_val[2]/2.0), (pred_val[1]/2.0 + pred_val[3]/2.0)
    
    # Accumulate test results after training every fold
    pred_test_fold = span_detection_model.predict(x = {"att_flags":X_span_att_test,
                                                       "words":X_span_test,
                                                       "token_ids":np.zeros_like(X_span_att_test)},
                                                  batch_size=PREDICT_BATCH_SIZE)
    if num==0:
        pred_test = []
        pred_test.append(pred_test_fold[0]/2.0 + pred_test_fold[2]/2.0)
        pred_test.append(pred_test_fold[1]/2.0 + pred_test_fold[3]/2.0)
    else:
        pred_test[0] += (pred_test_fold[0]/2.0 + pred_test_fold[2]/2.0)
        pred_test[1] += (pred_test_fold[1]/2.0 + pred_test_fold[3]/2.0)
    
    # Tabulate
    preds = {
        "train":{
            "starts":{
                "y_true":Y_span_starts[t_index].argmax(axis=1),
                "y_pred":pred_train[0].argmax(axis=1)
            },
            "stops":{
                "y_true":Y_span_stops[t_index].argmax(axis=1),
                "y_pred":pred_train[1].argmax(axis=1)
            }
        },
        "valid":{
            "starts":{
                "y_true":Y_span_starts[v_index].argmax(axis=1),
                "y_pred":pred_val[0].argmax(axis=1)
            },
            "stops":{
                "y_true":Y_span_stops[v_index].argmax(axis=1),
                "y_pred":pred_val[1].argmax(axis=1)
            }        
        }
    }
    
    print_metrics(pred_dict=preds)

    print("[INFO] Prediction shape for training data: ", pred_starts_train.shape, pred_stops_train.shape)
    print("[INFO] Prediction shape for validation data: ", pred_starts_val.shape, pred_stops_val.shape)

    print("[INFO] Normal predictions (StartIndex less than EndIndex) for training data: ",
          sum([s<e for s,e in zip(pred_starts_train.argmax(axis=1),
                                  pred_stops_train.argmax(axis=1))]),
          "out of", pred_starts_train.shape[0])
    print("[INFO] Normal predictions (StartIndex less than EndIndex) for validation data: ",
          sum([s<e for s,e in zip(pred_starts_val.argmax(axis=1),
                                  pred_stops_val.argmax(axis=1))]),
          "out of", pred_starts_val.shape[0],)

    pred_words_train = [post_process(tokenizer.decode(t[s:e])) if s<e else post_process(tokenizer.decode(t[e:])) for t, s,e in zip(X_span[t_index],
                                                                                                                                   pred_starts_train.argmax(axis=1),
                                                                                                                                   pred_stops_train.argmax(axis=1))]
    
    pred_words_val = [post_process(tokenizer.decode(t[s:e])) if s<e else post_process(tokenizer.decode(t[e:])) for t, s,e in zip(X_span[v_index],
                                                                                                                                 pred_starts_val.argmax(axis=1),
                                                                                                                                 pred_stops_val.argmax(axis=1))]

    print("[INFO] Training Jaccard Score: ",
          np.mean([jaccard(str1=i, str2=j) for i,j in zip([t for n,t in enumerate(Y_span_words) if n in t_index],
                                                          pred_words_train)]))
    print("[INFO] Validation Jaccard Score: ",
          np.mean([jaccard(str1=i, str2=j) for i,j in zip([t for n,t in enumerate(Y_span_words) if n in v_index],
                                                          pred_words_val)]))
    print("[INFO] Training for fold:", num, "finished at", ctime(time()))
print(ctime(time()))

[INFO] Training only the final layers at higher learning rates.
Train on 8179 samples, validate on 8180 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[INFO] Training only the final layers at lower learning rates.
Train on 8179 samples, validate on 8180 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[INFO] Unfreezing RoBerta layer and training at lowest learning rates.
Train on 8179 samples, validate on 8180 samples
[INFO] 52.46 	|| train 	|| starts 	|| accuracy_score
[INFO] 53.92 	|| train 	|| stops 	|| accuracy_score
[INFO] 50.87 	|| valid 	|| starts 	|| accuracy_score
[INFO] 50.88 	|| valid 	|| stops 	|| accuracy_score
[INFO] 41.99 	|| train 	|| starts 	|| f1_score
[INFO] 47.62 	|| train 	|| stops 	|| f1_score
[INFO] 39.92 	|| valid 	|| starts 	|| f1_score
[INFO] 43.50 	|| valid 	|| stops 	|| f1_score
[INFO] 40.01 	|| train 	|| starts 	|| precision_score
[INFO] 44.94 	|| train 	|| stops 	|| precision_score
[INFO] 37.55 	|| valid 	|| starts 	|| precision_score
[INFO] 40.67 	|| vali

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[INFO] Normal predictions (StartIndex less than EndIndex) for training data:  6185 out of 8179
[INFO] Normal predictions (StartIndex less than EndIndex) for validation data:  6191 out of 8180
[INFO] Training Jaccard Score:  0.5403877637739355
[INFO] Validation Jaccard Score:  0.532749916223115
[INFO] Training for fold: 0 finished at Mon Jun  8 18:09:34 2020
[INFO] Training only the final layers at higher learning rates.
Train on 8180 samples, validate on 8179 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[INFO] Training only the final layers at lower learning rates.
Train on 8180 samples, validate on 8179 samples
Epoch 1/4
Epoch 2/4
[INFO] 52.11 	|| train 	|| starts 	|| accuracy_score
[INFO] 53.74 	|| train 	|| stops 	|| accuracy_score
[INFO] 49.86 	|| valid 	|| starts 	|| accuracy_score
[INFO] 51.90 	|| valid 	|| stops 	|| accuracy_score
[INFO] 40.54 	|| train 	|| starts 	|| f1_score
[INFO] 45.24 	|| train 	|| stops 	|| f1_score
[INFO] 41.58 	|| valid 	|| starts 	|| f1_score
[INFO] 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[INFO] Normal predictions (StartIndex less than EndIndex) for validation data:  6306 out of 8179
[INFO] Training Jaccard Score:  0.5327974569639415
[INFO] Validation Jaccard Score:  0.5297035385868454
[INFO] Training for fold: 1 finished at Mon Jun  8 18:27:40 2020
Mon Jun  8 18:27:40 2020


## Validation

#### Inference

In [75]:
pred_starts_test, pred_stops_test = pred_test[0]/NUM_FOLDS, pred_test[1]/NUM_FOLDS
print("[INFO] Prediction shape for testing data: ", pred_starts_test.shape, pred_stops_test.shape)

[INFO] Prediction shape for testing data:  (3534, 100) (3534, 100)


#### Postprocessing

In [76]:
print("Normal predictions (StartIndex less than EndIndex) for testing data:",
      sum([s<e for s,e in zip(pred_starts_test.argmax(axis=1),
                              pred_stops_test.argmax(axis=1))]), 
      "out of",
      pred_starts_test.shape[0])

Normal predictions (StartIndex less than EndIndex) for testing data: 2924 out of 3534


In [77]:
pred_words_test = [
    post_process(tokenizer.decode(t[s:e+1])) if s<e else post_process(tokenizer.decode(t[e:])) for t,s,e in zip(X_span_test,
                                                                                                              pred_starts_test.argmax(axis=1),
                                                                                                              pred_stops_test.argmax(axis=1))
]

In [78]:
check_idx = 158
#print([[t,i,j,k] for t,i,j,k in zip(tokenizer.decode(),X_test[check_idx],pred_starts_test[check_idx],pred_stops_test[check_idx])])
print(tokenizer.decode(X_span_test[check_idx]))
print(pred_starts_test.argmax(axis=1)[check_idx])
print(pred_stops_test.argmax(axis=1)[check_idx])
print(post_process(tokenizer.decode(X_span_test[check_idx][pred_starts_test.argmax(axis=1)[check_idx]:1+pred_stops_test.argmax(axis=1)[check_idx]])))

 yes! im down to 50% full on my dvr  i was at 98% like 3 days ago... lol i swear if i didnt have a dvr i would never watch tv   neutral 
25
38
i swear if i didnt have a dvr i would never watch tv


## Submission

In [79]:
test_df_span['selected_text'] = pred_words_test

In [80]:
test_df_span["selected_text"] = np.where(test_df_span["sentiment"] == "neutral",
                                         test_df_span["text"],
                                         test_df_span["selected_text"])

In [81]:
test_df_span[["textID", "selected_text"]].to_csv(RESULTS_DIR+"submission.csv", index=False)

In [82]:
test_df_span.loc[test_df_span.sentiment!="neutral"][["text", "sentiment","selected_text"]].sample(25)

Unnamed: 0,text,sentiment,selected_text
325,Thank you so much phaoloo !!!!,positive,thank you so much phaoloo
334,"Well, good luck then.",positive,good luck then.
866,me because I might not have enough money for college!!!,negative,me because i might not have enough money for college!!!
1484,"#warmfuzzies to you, my friend",positive,"#warmfuzzies to you, my friend"
130,Welcome!,positive,welcome!
66,"Hi all, just recovering from a party, looking forward to an exciting bank holiday around the diy shops...life cant get much better.surely",positive,exciting bank holiday around the diy shops...life cant get much better.surely
1474,"`If I don`t believe in Him, why would He believe in me?` -Bring Me The Horizon A chill goes down my spine whenever I hear that line.",negative,a chill goes down my spine whenever i hear that line.
321,Jus got back from a run up sunset blvd! My cuzin tried to kill me my legs are still movin and I`m sittin down!!,negative,kill me my legs are still movin and i`m sittin down!!
3316,Listening `Hallelujah` on Youtube. Leonard Cohen wins. #xfactor,positive,listening `hallelujah` on youtube. leonard cohen wins.
3211,hello I`m up late playing on the internet. I love you!,positive,i love you!
