## Settings

In [1]:
# CONTROLS
MODEL_PREFIX = "V36" # Better start stop indices No removal of samples, Using my tokenizer, Roberta weights, Sentiment(LeakyKFold), SpanNoNeutral(NoLeakKFlod) With Label Smoothing
MODEL_NUMBER = MODEL_PREFIX[-2:]

TRAIN_SPLIT_RATIO = 0.2
BATCH_SIZE = 16
PREDICT_BATCH_SIZE = 512
DROPOUT = 0.3
LABEL_SMOOTHING_PARAM = 0.2

RUN_ON_SAMPLE = True
EXCLUDE_NEUTRAL_CLASS = True
SENTIMENT_MAX_LR = 5e-4
SENTIMENT_MID_LR = 5e-5
SENTIMENT_MIN_LR = 5e-6
SENTIMENT_NUM_EPOCHS = [4, 4, 2]
MAX_LR = 5e-4 #5e-3 #3e-5
MID_LR = 5e-5 #1e-4 #3e-5
MIN_LR = 5e-6 #1e-6 #3e-5
NUM_EPOCHS = [4, 4, 1]
NUM_FOLDS = 3

In [2]:
RESULTS_DIR = "../results/"
DATA_DIR = "../data/"
MODEL_DIR = "../data/models/roberta-base/"
EXT_MODEL_DIR = "../data/models/roberta-tokenizer/"

## Libraries

In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.utils import class_weight

import pickle, os, sys, re, json, gc
from time import time, ctime
from pprint import pprint

import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Conv1D, Conv2D, LSTM, Embedding, Dense, concatenate, MaxPooling2D, Softmax, Flatten
from tensorflow.keras.layers import BatchNormalization, Dropout, Reshape, Activation, Bidirectional, TimeDistributed
from tensorflow.keras.layers import RepeatVector, Multiply, Layer, LeakyReLU, Subtract
from tensorflow.keras.activations import softmax
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.callbacks import *
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint
import tensorflow.keras.backend as K
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.models import save_model, load_model

import tokenizers, transformers
from transformers import *

%matplotlib inline

In [4]:
def jaccard(str1, str2):
    a = set(str1)
    b = set(str2)
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [5]:
seeded_value = 6666
pd.set_option('display.max_colwidth', None)
np.random.seed(seeded_value)

In [6]:
print(ctime(time()))

Fri Jun 12 21:19:40 2020


In [7]:
print([
    tf.__version__,
    transformers.__version__,
    tokenizers.__version__
])

['2.1.0', '2.8.0', '0.5.2']


In [8]:
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)
    
if not os.path.exists(MODEL_DIR+"tokenizers"):
    os.mkdir(MODEL_DIR+"tokenizers")

if not os.path.exists(MODEL_DIR+"tokenizers/roberta_tokenizer"):
    os.mkdir(MODEL_DIR+"tokenizers/roberta_tokenizer")

<a href="https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth"  target="_blank"><h2 id="limiting_gpu_memory_growth" data-text="Limiting GPU memory growth" tabindex="0">Limiting GPU memory growth</h2></a>
<p>By default, TensorFlow maps nearly all of the GPU memory of all GPUs (subject to
<a href="https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars"><code translate="no" dir="ltr">CUDA_VISIBLE_DEVICES</code></a>) visible to the process. This is done to more efficiently use the relatively precious GPU memory resources on the devices by reducing memory fragmentation. To limit TensorFlow to a specific set of GPUs we use the <code translate="no" dir="ltr">tf.config.experimental.set_visible_devices</code> method.</p>

In [9]:
print(tf.config.experimental.list_logical_devices('CPU'))
print(tf.config.experimental.list_logical_devices('GPU'))
print(tf.config.experimental.list_physical_devices('CPU'))
print(tf.config.experimental.list_physical_devices('GPU'))

[LogicalDevice(name='/device:CPU:0', device_type='CPU')]
[LogicalDevice(name='/device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [10]:
physical_devices = tf.config.list_physical_devices('GPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    # Invalid device or cannot modify virtual devices once initialized.
    pass

## Is it okay to exclude neutral text from models and force it later?

In [11]:
df2 = pd.read_csv(DATA_DIR+"train.csv", encoding="utf8")
# cases where neutral's text and selected text columns are the same
neutrals = df2.loc[df2.sentiment=="neutral"].copy()
neutrals = neutrals.reset_index(drop=True)
neutrals["text"] = neutrals["text"].astype(str)
neutrals["selected_text"] = neutrals["selected_text"].astype(str)
print("COVERAGE:", np.sum(np.where((neutrals.text == neutrals.selected_text), 1, 0))/neutrals.shape[0])
print("MEAN JACCARD:", np.mean([jaccard(str1=i, str2=j) for i,j in zip(neutrals.selected_text,neutrals.text)]))

neutrals.loc[neutrals.text != neutrals.selected_text].sample(5)

COVERAGE: 0.5334592552617378
MEAN JACCARD: 0.9808316180492903


Unnamed: 0,textID,text,selected_text,sentiment
5449,c6217bd9cc,"Good morning! Aww, sorry that you were stuck in the airport for 12 hours!!","Good morning! Aww, sorry that you were stuck in the airport for 12 hours!!",neutral
2004,c9a3c66dc8,"Wow, just saw your Tweet about the Proflowers fiasco. That`s so not fun!","Wow, just saw your Tweet about the Proflowers fiasco. That`s so not fun!",neutral
5540,737f01fd34,Dimples was in the preview... are they not on today? It hasn`t aired here yet... via http://twib.es/CPF,Dimples was in the preview... are they not on today? It hasn`t aired here yet..,neutral
1315,f8e02e629d,yessssss wore myself out this weekend planting my garden and working,yessssss wore myself out this weekend planting my garden and working,neutral
9169,56114801d1,i want candy!!!,i want candy!!,neutral


## Import Data

In [12]:
df = pd.read_csv(DATA_DIR+"train.csv", encoding="utf8")

print(pd.concat((df.dtypes, df.isna().sum()), axis=1))
print(df.shape)

# Counts of various columns
print({i:df[i].nunique() for i in df.columns})
print(df.describe())
df.head(2)

                    0  1
textID         object  0
text           object  1
selected_text  object  1
sentiment      object  0
(27481, 4)
{'textID': 27481, 'text': 27480, 'selected_text': 22463, 'sentiment': 3}
            textID  \
count        27481   
unique       27481   
top     47753d6abc   
freq             1   

                                                                                                           text  \
count                                                                                                     27480   
unique                                                                                                    27480   
top       do you know if anyone from the believers never die tour is going on warped? i know  i can`t wait haha   
freq                                                                                                          1   

       selected_text sentiment  
count          27480     27481  
unique         22463         3  
top  

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [13]:
test_df = pd.read_csv(DATA_DIR+"test.csv", encoding="utf8")
print(pd.concat((test_df.dtypes, test_df.isna().sum()), axis=1))
print(test_df.shape)

# Counts of various columns
print({i:test_df[i].nunique() for i in test_df.columns})
print(test_df.describe())
test_df.head(2)

                0  1
textID     object  0
text       object  0
sentiment  object  0
(3534, 3)
{'textID': 3534, 'text': 3534, 'sentiment': 3}
            textID                  text sentiment
count         3534                  3534      3534
unique        3534                  3534         3
top     32e183bbad  I wanna feel my chin   neutral
freq             1                     1      1430


Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -- skyscrapers galore). Good tweeps in China: (SH) (BJ).,positive


In [14]:
df.loc[df['text'].astype('str').apply(lambda x : len(re.findall(pattern="ï¿½", string=x))>0)].head(2)

Unnamed: 0,textID,text,selected_text,sentiment
44,c77717b103,I love to! But I`m only available from 5pm. and where dear? Would love to help convert her vids.ï¿½,I love to!,positive
192,28dbada620,*phew* Will make a note in case anyone else runs into the same issueï¿½,*phew* Will make a note in case anyone else runs into the same issueï¿½,neutral


In [15]:
test_df.loc[test_df['text'].astype('str').apply(lambda x : len(re.findall(pattern="ï¿½", string=x))>0)].head(2)

Unnamed: 0,textID,text,sentiment
145,7223fdccc2,tikcets are only ï¿½91...each...BUT I SO WANT TO GO,positive
618,43ad351369,"AHHH - Whatchu talkinï¿½ baby? HAHAHA I canï¿½t believe youu:O heh, actually I can. Life is worth taking risks... http://tumblr.com/xs81qy54s",positive


In [16]:
df["set"], test_df["set"] = "train", "test"

### Create smaller sample for experimentation

In [17]:
if RUN_ON_SAMPLE:
    df = df.sample(1000).reset_index(drop=True)
    test_df = test_df.sample(1000).reset_index(drop=True)

#### Combine datasets for pretraining using sentiment labels

In [18]:
data = pd.concat((df[["text","set","sentiment"]],
                  test_df[["text","set","sentiment"]]), axis=0)
data["text"] = data["text"].astype(str)
data = data.sample(frac=1.0).reset_index(drop=True)
print(data.shape)

(2000, 3)


#### Sentiment count in combined data

In [19]:
data.groupby(["set","sentiment"])[["text"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
set,sentiment,Unnamed: 2_level_1
test,negative,294
test,neutral,398
test,positive,308
train,negative,297
train,neutral,403
train,positive,300


#### Tokenization

In [20]:
tokenizer = tokenizers.ByteLevelBPETokenizer(vocab_file=EXT_MODEL_DIR+'/vocab.json',
                                             merges_file=EXT_MODEL_DIR+'/merges.txt',                                         
                                             add_prefix_space=True,
                                             lowercase=True)

In [21]:
with open(EXT_MODEL_DIR+"/special_tokens_map.json") as f:
    special_tokens = json.load(f)

tokenizer.add_special_tokens([i for i in special_tokens.values()])

0

#### Preprocessing for sentiment detection

In [22]:
def trim_addspace(text:str) -> str:
    text = text.lower()
    text = " " + text.strip(" ") + " "
    return text

In [23]:
data["text_mod"] = data.apply(lambda x: trim_addspace(x.text), axis=1)
data["text_mod"] = "<s>"  + data["text_mod"] + "</s>"

In [24]:
sentiment_lookup = {"positive":2,"neutral":1,"negative":0}

In [25]:
def preprocess_sentiment(text_series=data.text_mod.tolist(), sentiment_series=data.sentiment):

    X_tokens = tokenizer.encode_batch(text_series)

    X = [i.ids for i in X_tokens]
    MAX_SEQ_LEN = max([len(i) for i in X])
    X = pad_sequences(X, maxlen=MAX_SEQ_LEN, padding="post")

    X_att = [i.attention_mask for i in X_tokens]
    X_att = pad_sequences(X_att, maxlen=MAX_SEQ_LEN, padding="post")

    Y = sentiment_series.apply(lambda x: sentiment_lookup[x]).values

    VOCAB_SIZE = tokenizer.get_vocab_size()

    print({
        "X":X.shape,
        "X_att":X_att.shape,
        "Y":Y.shape,
        "VOCAB_SIZE":VOCAB_SIZE,
        "MAX_SEQ_LEN":MAX_SEQ_LEN
    })
    
    return X_tokens, X, X_att, Y, VOCAB_SIZE, MAX_SEQ_LEN

X_sent_tokens, X_sent, X_sent_att, Y_sent, VOCAB_SIZE, MAX_SEQ_LEN_SENT = preprocess_sentiment(**{
    "text_series" : data.text.tolist(),
    "sentiment_series" : data.sentiment
})

{'X': (2000, 49), 'X_att': (2000, 49), 'Y': (2000,), 'VOCAB_SIZE': 50265, 'MAX_SEQ_LEN': 49}


# Import data for span detection

In [26]:
df_span = pd.read_csv(DATA_DIR+"train.csv", encoding="utf8").fillna('')

print(pd.concat((df_span.dtypes, df_span.isna().sum()), axis=1))
print(df_span.shape)

# Counts of various columns
print({i:df_span[i].nunique() for i in df_span.columns})
print(df_span.describe())
df_span.head(2)

                    0  1
textID         object  0
text           object  0
selected_text  object  0
sentiment      object  0
(27481, 4)
{'textID': 27481, 'text': 27481, 'selected_text': 22464, 'sentiment': 3}
            textID  \
count        27481   
unique       27481   
top     47753d6abc   
freq             1   

                                                                                                           text  \
count                                                                                                     27481   
unique                                                                                                    27481   
top       do you know if anyone from the believers never die tour is going on warped? i know  i can`t wait haha   
freq                                                                                                          1   

       selected_text sentiment  
count          27481     27481  
unique         22464         3  
top  

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [27]:
test_df_span = pd.read_csv(DATA_DIR+"test.csv", encoding="utf8").fillna('')
print(pd.concat((test_df_span.dtypes, test_df_span.isna().sum()), axis=1))
print(test_df_span.shape)

# Counts of various columns
print({i:test_df_span[i].nunique() for i in test_df_span.columns})
print(test_df_span.describe())
test_df_span.head(2)

                0  1
textID     object  0
text       object  0
sentiment  object  0
(3534, 3)
{'textID': 3534, 'text': 3534, 'sentiment': 3}
            textID                  text sentiment
count         3534                  3534      3534
unique        3534                  3534         3
top     32e183bbad  I wanna feel my chin   neutral
freq             1                     1      1430


Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -- skyscrapers galore). Good tweeps in China: (SH) (BJ).,positive


#### Preprocessing for span detection

In [28]:
df_span["text"] = df_span["text"].astype(str)
df_span["selected_text"] = df_span["selected_text"].astype(str)
test_df_span["text"] = test_df_span["text"].astype(str)

In [29]:
def trim_addspace(text:str) -> str:
    text = text.lower()
    text = " " + text.strip(" ") + " "
    return text

In [30]:
def find_indices(text:str, selected_text:str) -> (str, str, int, int):
    
    text, selected_text = text.lower(), selected_text.lower()
    
    text = trim_addspace(text)
    
    substring_ = re.findall(pattern="\\s[^\s]*?"+re.escape(selected_text)+"[^\s]*?\\s", string=text)[0]
    
    return pd.Series([text, " "+substring_.strip(" "), text.find(substring_), len(substring_) + text.find(substring_)])

In [31]:
df_span[["text_mod", "selected_text_mod", "start", "stop"]] = df_span[['text','selected_text']].apply(lambda x: find_indices(x.text, x.selected_text), axis=1)

In [32]:
df_span.iloc[27476].to_dict()

{'textID': '4eac33d1c0',
 'text': ' wish we could come see u on Denver  husband lost his job and can`t afford it',
 'selected_text': 'd lost',
 'sentiment': 'negative',
 'text_mod': ' wish we could come see u on denver  husband lost his job and can`t afford it ',
 'selected_text_mod': ' husband lost',
 'start': 36,
 'stop': 50}

In [33]:
test_df_span['text_mod'] = test_df_span['text'].apply(trim_addspace)

In [34]:
df_span.loc[df_span.text_mod.str.contains("tonight, but no one will go")].to_dict()

{'textID': {12154: 'adfbcc6806'},
 'text': {12154: 'i wanna see `up` tonight, but no one will go with me. whhhyyy'},
 'selected_text': {12154: 'but no one will go with me.'},
 'sentiment': {12154: 'negative'},
 'text_mod': {12154: ' i wanna see `up` tonight, but no one will go with me. whhhyyy '},
 'selected_text_mod': {12154: ' but no one will go with me.'},
 'start': {12154: 26},
 'stop': {12154: 55}}

#### Cleaning for span detection

In [35]:
df_span["sentiment_code"] = df_span["sentiment"].astype("category")
X_sentiments = df_span["sentiment_code"].cat.codes.values

test_df_span["sentiment_code"] = test_df_span["sentiment"].astype("category")
X_sentiments_test = test_df_span["sentiment_code"].cat.codes.values

#### Adding special tokens

In [36]:
{t:tokenizer.encode(" "+t).ids for t in df_span.sentiment.unique()}

{'neutral': [7974], 'negative': [2430], 'positive': [1313]}

In [37]:
df_span["text_mod"] = "<s>" + df_span['text_mod'] + "</s> </s> " + df_span.sentiment + " </s>"
test_df_span["text_mod"] = "<s>" + test_df_span['text_mod'] + "</s> </s> " + test_df_span.sentiment + " </s>"

#### Exclusions for span detection

In [38]:
if EXCLUDE_NEUTRAL_CLASS:
    df_span = df_span.loc[df_span.sentiment!="neutral"].copy()
    df_span = df_span.reset_index(drop=True)
    print("EXCLUDE_NEUTRAL_CLASS:", df_span.shape)


if RUN_ON_SAMPLE:
    df_span = df_span.sample(2000).copy()
    df_span = df_span.reset_index(drop=True)
    print("Train RUN_ON_SAMPLE", df_span.shape)
    test_df_span = test_df_span.sample(2000).copy()
    test_df_span = test_df_span.reset_index(drop=True)
    print("Test  RUN_ON_SAMPLE", test_df_span.shape)

EXCLUDE_NEUTRAL_CLASS: (16363, 9)
Train RUN_ON_SAMPLE (2000, 9)
Test  RUN_ON_SAMPLE (2000, 5)


#### Tokenization for span detection

In [39]:
X_span_tokens = tokenizer.encode_batch(df_span.text_mod.tolist())
Y_span_tokens = tokenizer.encode_batch(df_span.selected_text_mod.tolist())
X_span_tokens_test = tokenizer.encode_batch(test_df_span.text_mod.tolist())

In [40]:
X_span = [i.ids for i in X_span_tokens]
Y_span = [i.ids for i in Y_span_tokens]
X_span_test = [i.ids for i in X_span_tokens_test]

In [41]:
X_span_att = [i.attention_mask for i in X_span_tokens]
Y_span_att = [i.attention_mask for i in Y_span_tokens] # Useless
X_span_att_test = [i.attention_mask for i in X_span_tokens_test]

In [42]:
MAX_SEQ_LEN_SPAN = max([len(i) for i in X_span])

In [43]:
def get_extremities(l_string, s_string, print_it=False):
    len_l = len(l_string)
    len_s = len(s_string)
    
    for i in range(len_l - len_s + 1):
        if (i + len_s) <= len_l:
            substring = l_string[i:i+len_s]
            if substring == s_string:
                if print_it:
                    print(l_string)
                    print(substring)
                    print(i, i+len_s, substring)
                
                start_vector, end_vector = [0] * len_l, [0] * len_l
                start_vector[i], end_vector[i+len_s-1] = 1, 1
                
                return (start_vector, end_vector)

In [44]:
Y_span_starts, Y_span_stops = [], []
anomaly_idx, counter = [], 0
for num, (i,j) in enumerate(zip(X_span_tokens, Y_span_tokens)):
    x,y = i.ids, j.ids
    try:
        s,e = get_extremities(x, y)
        Y_span_starts.append(s)
        Y_span_stops.append(e)
    except TypeError as t:
        counter += 1
        anomaly_idx.append(num)
        Y_span_starts.append([0]*15)
        Y_span_stops.append([0]*15)
print(num + 1, "\t: #Processed")

print(counter,"\t: # of Anomalies")

2000 	: #Processed
0 	: # of Anomalies


In [45]:
check_idx = 758
print(df_span.text[check_idx])
print(df_span.selected_text[check_idx])
print([[i,j,k,l] for i,j,k,l in zip(X_span_tokens[check_idx].tokens,
                                    X_span_tokens[check_idx].ids,
                                    Y_span_starts[check_idx],
                                    Y_span_stops[check_idx])])
print([[i,j] for i,j in zip(Y_span_tokens[check_idx].ids,
                            Y_span_tokens[check_idx].tokens)])

 I haven`t had a good homemade flour tortilla in ages.
haven`t had a good
[['<s>', 0, 0, 0], ['Ġi', 939, 0, 0], ['Ġhaven', 2220, 1, 0], ['`', 12905, 0, 0], ['t', 90, 0, 0], ['Ġhad', 56, 0, 0], ['Ġa', 10, 0, 0], ['Ġgood', 205, 0, 1], ['Ġhomemade', 17798, 0, 0], ['Ġflour', 15039, 0, 0], ['Ġtort', 17082, 0, 0], ['illa', 4699, 0, 0], ['Ġin', 11, 0, 0], ['Ġages', 4864, 0, 0], ['.', 4, 0, 0], ['Ġ', 1437, 0, 0], ['</s>', 2, 0, 0], ['Ġ', 1437, 0, 0], ['</s>', 2, 0, 0], ['Ġnegative', 2430, 0, 0], ['Ġ', 1437, 0, 0], ['</s>', 2, 0, 0]]
[[2220, 'Ġhaven'], [12905, '`'], [90, 't'], [56, 'Ġhad'], [10, 'Ġa'], [205, 'Ġgood']]


#### Padding for span detection

In [46]:
X_span = pad_sequences(X_span, maxlen=MAX_SEQ_LEN_SPAN, padding="post")
X_span_att = pad_sequences(X_span_att, maxlen=MAX_SEQ_LEN_SPAN, padding="post")
Y_span = pad_sequences(Y_span, maxlen=MAX_SEQ_LEN_SPAN, padding="post")

Y_span_starts = pad_sequences(Y_span_starts, maxlen=MAX_SEQ_LEN_SPAN, padding="post")#.argmax(axis=1)
Y_span_stops = pad_sequences(Y_span_stops, maxlen=MAX_SEQ_LEN_SPAN, padding="post")#.argmax(axis=1)

X_span_test = pad_sequences(X_span_test, maxlen=MAX_SEQ_LEN_SPAN, padding="post")
X_span_att_test = pad_sequences(X_span_att_test, maxlen=MAX_SEQ_LEN_SPAN, padding="post")

In [47]:
pprint({
    "X_span" : X_span.shape,
    "X_span_att" : X_span_att.shape,
    "Y_span" : Y_span.shape,
    "Y_span_starts" : Y_span_starts.shape,
    "Y_span_stops" : Y_span_stops.shape,
    "X_span_test" : X_span_test.shape,
    "X_span_att_test" : X_span_att_test.shape,
    "VOCAB_SIZE":VOCAB_SIZE,
    "MAX_SEQ_LEN_SPAN":MAX_SEQ_LEN_SPAN
})

{'MAX_SEQ_LEN_SPAN': 59,
 'VOCAB_SIZE': 50265,
 'X_span': (2000, 59),
 'X_span_att': (2000, 59),
 'X_span_att_test': (2000, 59),
 'X_span_test': (2000, 59),
 'Y_span': (2000, 59),
 'Y_span_starts': (2000, 59),
 'Y_span_stops': (2000, 59)}


#### Cross validation for span detection

In [48]:
keep_flag = np.isin(Y_span_stops.argmax(axis=1),
                    np.unique(Y_span_stops.argmax(axis=1),
                              return_counts=True)[0][np.unique(Y_span_stops.argmax(axis=1),
                                                               return_counts=True)[1]>1])

In [49]:
sum(keep_flag), df_span.shape[0], df_span.shape[0] - sum(keep_flag)

(1998, 2000, 2)

In [50]:
print("\n",
     X_span.shape, "\t: X ", "\n",
     X_span_att.shape, "\t: X_att ", "\n",
     Y_span.shape, "\t: Y ", "\n",
     Y_span_starts.shape, "\t: Y_starts ", "\n",
     Y_span_stops.shape, "\t: Y_stops ", "\n",
     X_span_test.shape, "\t: X_test ", "\n",
     X_span_att_test.shape, "\t: X_att_test ", "\n"
)


 (2000, 59) 	: X  
 (2000, 59) 	: X_att  
 (2000, 59) 	: Y  
 (2000, 59) 	: Y_starts  
 (2000, 59) 	: Y_stops  
 (2000, 59) 	: X_test  
 (2000, 59) 	: X_att_test  



In [51]:
X_span = X_span[keep_flag]
X_span_att = X_span_att[keep_flag]
Y_span = Y_span[keep_flag]
Y_span_starts = Y_span_starts[keep_flag]
Y_span_stops = Y_span_stops[keep_flag]
X_span_test = X_span_test
X_span_att_test = X_span_att_test

In [52]:
print("\n",
     X_span.shape, "\t: X ", "\n",
     X_span_att.shape, "\t: X_att ", "\n",
     Y_span.shape, "\t: Y ", "\n",
     Y_span_starts.shape, "\t: Y_starts ", "\n",
     Y_span_stops.shape, "\t: Y_stops ", "\n",
     X_span_test.shape, "\t: X_test ", "\n",
     X_span_att_test.shape, "\t: X_att_test ", "\n"
)


 (1998, 59) 	: X  
 (1998, 59) 	: X_att  
 (1998, 59) 	: Y  
 (1998, 59) 	: Y_starts  
 (1998, 59) 	: Y_stops  
 (2000, 59) 	: X_test  
 (2000, 59) 	: X_att_test  



In [53]:
Y_span_words = [tokenizer.decode(i) for i in Y_span]

In [54]:
MAX_SEQ_LEN = max(MAX_SEQ_LEN_SENT, MAX_SEQ_LEN_SPAN)
MAX_SEQ_LEN, MAX_SEQ_LEN_SPAN, MAX_SEQ_LEN_SENT

(59, 59, 49)

In [55]:
X_span = pad_sequences(X_span, maxlen=MAX_SEQ_LEN, padding="post")
X_span_att = pad_sequences(X_span_att, maxlen=MAX_SEQ_LEN, padding="post")
Y_span = pad_sequences(Y_span, maxlen=MAX_SEQ_LEN, padding="post")
Y_span_starts = pad_sequences(Y_span_starts, maxlen=MAX_SEQ_LEN, padding="post")
Y_span_stops = pad_sequences(Y_span_stops, maxlen=MAX_SEQ_LEN, padding="post")

X_span_test = pad_sequences(X_span_test, maxlen=MAX_SEQ_LEN, padding="post")
X_span_att_test = pad_sequences(X_span_att_test, maxlen=MAX_SEQ_LEN, padding="post")

X_sent = pad_sequences(X_sent, maxlen=MAX_SEQ_LEN, padding="post")
X_sent_att = pad_sequences(X_sent_att, maxlen=MAX_SEQ_LEN, padding="post")

pprint({
    "X_span" : X_span.shape,
    "X_span_att" : X_span_att.shape,
    "Y_span" : Y_span.shape,
    
    "X_span_test" : X_span_test.shape,
    "X_span_att_test" : X_span_att_test.shape,
    
    "X_sent" : X_sent.shape,
    "X_sent_att" : X_sent_att.shape,
})

{'X_sent': (2000, 59),
 'X_sent_att': (2000, 59),
 'X_span': (1998, 59),
 'X_span_att': (1998, 59),
 'X_span_att_test': (2000, 59),
 'X_span_test': (2000, 59),
 'Y_span': (1998, 59)}


#### Model Specifications

In [56]:
def build_model():
    input_sequences = Input((MAX_SEQ_LEN), dtype=tf.int32, name="words")
    input_att_flags = Input((MAX_SEQ_LEN), dtype=tf.int32, name="att_flags")
    input_token_ids = Input((MAX_SEQ_LEN), dtype=tf.int32, name="token_ids")
    
    config = RobertaConfig.from_pretrained(MODEL_DIR+'config.json')
    roberta_model = TFRobertaModel.from_pretrained(MODEL_DIR+'tf_model.h5', config=config)
    x = roberta_model(inputs=input_sequences, attention_mask=input_att_flags, token_type_ids=input_token_ids)
    
    x1 = tf.keras.layers.Dropout(DROPOUT)(x[0])
    x1 = tf.keras.layers.Conv1D(768, 2,padding='same')(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Dense(1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    output_starts_0 = tf.keras.layers.Activation('softmax', name="starts_0")(x1)
    
    x2 = tf.keras.layers.Dropout(DROPOUT)(x[0]) 
    x2 = tf.keras.layers.Conv1D(768, 2,padding='same')(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Dense(1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    output_stops_0 = tf.keras.layers.Activation('softmax', name="stops_0")(x2)
    
    x3 = tf.keras.layers.Dropout(DROPOUT)(x[0]) 
    x3 = tf.keras.layers.Conv1D(768, 2,padding='same')(x3)
    x3 = tf.keras.layers.LeakyReLU()(x3)
    x3 = tf.keras.layers.Dense(1)(x3)
    x3 = tf.keras.layers.Flatten()(x3)
    x3 = tf.keras.layers.Dense(3)(x3)
    output_sentiment = tf.keras.layers.Activation('softmax', name="output_sentiments")(x3)
    
    sentiment_model = Model([input_att_flags, input_sequences, input_token_ids], [output_sentiment])
    
    output_subtract = tf.keras.layers.Subtract()([output_starts_0, output_stops_0])
    output_flat = concatenate([output_starts_0, output_stops_0, output_subtract, output_sentiment])
    output_starts_1 = Dense(MAX_SEQ_LEN, activation='softmax', name="starts_1")(output_flat)
    output_stops_1 = Dense(MAX_SEQ_LEN, activation='softmax', name="stops_1")(output_flat)
    
    span_detection_model = Model([input_att_flags, input_sequences, input_token_ids],
                                 [output_starts_0, output_stops_0, output_starts_1, output_stops_1])
    
    return sentiment_model, span_detection_model

In [57]:
sentiment_model, span_detection_model = build_model()

In [58]:
sentiment_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words (InputLayer)              [(None, 59)]         0                                            
__________________________________________________________________________________________________
att_flags (InputLayer)          [(None, 59)]         0                                            
__________________________________________________________________________________________________
token_ids (InputLayer)          [(None, 59)]         0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 59, 768), (N 124645632   words[0][0]                      
______________________________________________________________________________________________

In [59]:
span_detection_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words (InputLayer)              [(None, 59)]         0                                            
__________________________________________________________________________________________________
att_flags (InputLayer)          [(None, 59)]         0                                            
__________________________________________________________________________________________________
token_ids (InputLayer)          [(None, 59)]         0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 59, 768), (N 124645632   words[0][0]                      
____________________________________________________________________________________________

### Sentiment Model Fit

In [60]:
def infer_sentiment(x):
    encoded_repr = tokenizer.encode_batch(x.tolist())

    sample_text_ids = pad_sequences([i.ids for i in encoded_repr],
                                    maxlen=MAX_SEQ_LEN,
                                    padding="post")
    sample_text_att = pad_sequences([i.attention_mask for i in encoded_repr],
                                    maxlen=MAX_SEQ_LEN,
                                    padding="post")
    pred = sentiment_model.predict({"att_flags":sample_text_att,
                                    "words":sample_text_ids,
                                    "token_ids":np.zeros_like(sample_text_att)},
                                  batch_size=PREDICT_BATCH_SIZE)
    
    res = pd.DataFrame({"predicted_sentiment":pred.argmax(axis=1)})
    
    return res.predicted_sentiment.apply(lambda x:[k for k,v in sentiment_lookup.items() if v==x][0])

In [61]:
def get_sentiment_results(data):
    data["predicted_sentiment"] = infer_sentiment(x=data.text)

    print(classification_report(y_true=data.sentiment[data.index.isin(tr_index)],
                                y_pred=data.predicted_sentiment[data.index.isin(tr_index)]))

    print(classification_report(y_true=data.sentiment[data.index.isin(va_index)],
                                y_pred=data.predicted_sentiment[data.index.isin(va_index)]))

    print(confusion_matrix(y_true=data.sentiment,
                     y_pred=data.predicted_sentiment,
                     labels=['positive', 'neutral', 'negative']))

    print(confusion_matrix(y_true=data.sentiment,
                     y_pred=data.predicted_sentiment,
                     labels=['positive', 'neutral', 'negative'],
                     normalize="all"))

    data["set2"] = np.where(data.index.isin(tr_index), "train", "valid")
    print(data.groupby("set2").apply(lambda x : accuracy_score(y_true=x.sentiment, y_pred=x.predicted_sentiment)))

    print(data.groupby("set").apply(lambda x : accuracy_score(y_true=x.sentiment, y_pred=x.predicted_sentiment)))

    print(pd.concat({
        "accuracy" : data.groupby(["set", "set2"]).apply(lambda x : accuracy_score(y_true=x.sentiment,
                                                                                   y_pred=x.predicted_sentiment)),
        "count" : data.groupby(["set", "set2"])["sentiment"].count()
    }, axis=1))

In [62]:
p,c = np.unique(Y_sent, return_counts=True)
cw = class_weight.compute_class_weight('balanced', np.unique(Y_sent), Y_sent)

In [63]:
for i in range(NUM_FOLDS):
    if os.path.exists(RESULTS_DIR+MODEL_PREFIX+"_LossLogs_"+str(i)+".csv"):
        os.remove(RESULTS_DIR+MODEL_PREFIX+"_LossLogs_"+str(i)+".csv")

In [64]:
def print_metrics(pred_dict):
    print("[INFO] ","="*15,"Validation for FOLD#", num, "="*15)
    funcs = [accuracy_score, f1_score, precision_score, recall_score, confusion_matrix]
    for f in funcs:
        for data_set in ["train","valid"]:
            for var in ["starts", "stops"]:
                if f in [accuracy_score]:
                    res = f(**pred_dict[data_set][var])
                    print("[INFO] {:.2f}".format(100 * res), "\t||", data_set, "\t||", var, "\t||", f.__name__)
                elif f in [confusion_matrix]:
                    res = f(**pred_dict[data_set][var], labels=np.arange(MAX_SEQ_LEN))
                    np.savetxt(X=res, fmt='%i', delimiter=",",
                               fname=RESULTS_DIR+"ConfusionMatrix_"+MODEL_PREFIX+"_"+data_set+"_"+var+".csv")
                    print("[INFO] \t||", data_set, "\t||", var, "\t||", f.__name__, "\t||", 
                          RESULTS_DIR+"ConfusionMatrix_"+MODEL_PREFIX+"_"+data_set+"_"+var+".csv")
                else:
                    res = f(**pred_dict[data_set][var], average="macro")
                    print("[INFO] {:.2f}".format(100 * res), "\t||", data_set, "\t||", var, "\t||", f.__name__)
        print("=======================================================================")

In [65]:
def post_process(string):
    string = re.sub(pattern=" (negative|positive|neutral)[ ]+$", repl="", string=string)
    string = re.sub(pattern=" +", repl=" ", string=string.strip())
    return string

In [66]:
sentiment_kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=seeded_value)

In [67]:
span_kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=seeded_value)

In [69]:
for (num2, (tr_index, va_index)), (num, (t_index, v_index)) in zip(enumerate(sentiment_kf.split(X_sent, Y_sent)), enumerate(span_kf.split(X_span, Y_span_stops))):
    
    if num > 0:
        del span_history
        del span_detection_model
        del sent_history
        del sentiment_model
        gc.collect()
        K.clear_session()
        
        sentiment_model, span_detection_model = build_model()
    
    print("[INFO] ==================== FOLD#", num, "====================")
    
    sentiment_mcp = ModelCheckpoint(filepath=RESULTS_DIR+"Sentiment_"+MODEL_PREFIX+"BestCheckpoint_"+str(num)+".h5", monitor='val_loss',
                                    verbose=0, save_best_only=False, save_weights_only=True, mode='auto', save_freq='epoch')

    sentiment_csvl = CSVLogger(filename=RESULTS_DIR+"Sentiment_"+MODEL_PREFIX+"_LossLogs_"+str(num)+".csv",
                               separator=",", append=True)

    span_mcp = ModelCheckpoint(filepath=RESULTS_DIR+MODEL_PREFIX+"BestCheckpoint_"+str(num)+".h5", monitor='val_loss',
                               verbose=0, save_best_only=False, save_weights_only=True, mode='auto', save_freq='epoch')
    
    span_csvl = CSVLogger(filename=RESULTS_DIR+MODEL_PREFIX+"_LossLogs_"+str(num)+".csv",
                          separator=",", append=True)
    
    print("[INFO] Training Sentiment only the final layers at higher learning rates.")
    sentiment_model.layers[3].trainable = False
    adam = Adam(learning_rate=SENTIMENT_MAX_LR)
    sentiment_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                            optimizer=adam,
                            metrics=['accuracy'])

    sent_history = sentiment_model.fit(x={"att_flags":X_sent_att[tr_index],
                                          "words":X_sent[tr_index],
                                          "token_ids":np.zeros_like(X_sent_att[tr_index])},
                                       y={"output_sentiments":Y_sent[tr_index]},
                                       shuffle=True,
                                       batch_size=BATCH_SIZE,
                                       epochs=SENTIMENT_NUM_EPOCHS[0],
                                       validation_data=({"att_flags":X_sent_att[va_index],
                                                         "words":X_sent[va_index],
                                                         "token_ids":np.zeros_like(X_sent_att[va_index])},
                                                        {"output_sentiments":Y_sent[va_index]}),
                                       verbose=1,
                                       class_weight=cw,
                                       callbacks=[sentiment_mcp, sentiment_csvl])

    print("[INFO] Training Span only the final layers at higher learning rates.")
    span_detection_model.layers[3].trainable = False
    adam = Adam(learning_rate=MAX_LR)
    span_detection_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=LABEL_SMOOTHING_PARAM),
                                 optimizer=adam,
                                 metrics=['accuracy'],
                                 loss_weights={"starts_0":1.0,"stops_0":1.0,"starts_1":1.0,"stops_1":2.0})
    span_history = span_detection_model.fit(x={"att_flags":X_span_att[t_index],
                                               "words":X_span[t_index],
                                               "token_ids":np.zeros_like(X_span_att[t_index])},
                                            y={"starts_0":Y_span_starts[t_index],
                                               "stops_0":Y_span_stops[t_index], 
                                               "starts_1":Y_span_starts[t_index],
                                               "stops_1":Y_span_stops[t_index]},
                                            shuffle=True,
                                            batch_size=BATCH_SIZE,
                                            epochs=NUM_EPOCHS[0],
                                            validation_data=({"att_flags":X_span_att[v_index],
                                                              "words":X_span[v_index],
                                                              "token_ids":np.zeros_like(X_span_att[v_index])},
                                                             {"starts_0":Y_span_starts[v_index],
                                                              "stops_0":Y_span_stops[v_index], 
                                                              "starts_1":Y_span_starts[v_index],
                                                              "stops_1":Y_span_stops[v_index]}),
                                            verbose=1,
                                            callbacks=[span_mcp, span_csvl])
    
    print("[INFO] Training Sentiment only the final layers at lower learning rates.")
    sentiment_model.layers[3].trainable = False
    adam = Adam(learning_rate=SENTIMENT_MID_LR)
    sentiment_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                            optimizer=adam,
                            metrics=['accuracy'])

    sent_history = sentiment_model.fit(x={"att_flags":X_sent_att[tr_index],
                                          "words":X_sent[tr_index],
                                          "token_ids":np.zeros_like(X_sent_att[tr_index])},
                                       y={"output_sentiments":Y_sent[tr_index]},
                                       shuffle=True,
                                       batch_size=BATCH_SIZE,
                                       epochs=SENTIMENT_NUM_EPOCHS[1],
                                       validation_data=({"att_flags":X_sent_att[va_index],
                                                         "words":X_sent[va_index],
                                                         "token_ids":np.zeros_like(X_sent_att[va_index])},
                                                        {"output_sentiments":Y_sent[va_index]}),
                                       verbose=1,
                                       class_weight=cw,
                                       callbacks=[sentiment_mcp, sentiment_csvl])
    
    print("[INFO] Training Span only the final layers at lower learning rates.")
    adam = Adam(learning_rate=MID_LR)
    span_detection_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=LABEL_SMOOTHING_PARAM),
                                 optimizer=adam,
                                 metrics=['accuracy'],
                                 loss_weights={"starts_0":1.0,"stops_0":1.0,"starts_1":1.0,"stops_1":2.0})
    span_history = span_detection_model.fit(x={"att_flags":X_span_att[t_index],
                                               "words":X_span[t_index],
                                               "token_ids":np.zeros_like(X_span_att[t_index])},
                                            y={"starts_0":Y_span_starts[t_index],
                                               "stops_0":Y_span_stops[t_index], 
                                               "starts_1":Y_span_starts[t_index],
                                               "stops_1":Y_span_stops[t_index]},
                                            shuffle=True,
                                            batch_size=BATCH_SIZE,
                                            epochs=NUM_EPOCHS[1],
                                            validation_data=({"att_flags":X_span_att[v_index],
                                                              "words":X_span[v_index],
                                                              "token_ids":np.zeros_like(X_span_att[v_index])},
                                                             {"starts_0":Y_span_starts[v_index],
                                                              "stops_0":Y_span_stops[v_index], 
                                                              "starts_1":Y_span_starts[v_index],
                                                              "stops_1":Y_span_stops[v_index]}),
                                            verbose=1,
                                            callbacks=[span_mcp, span_csvl])
    
    print("[INFO] Training Sentiment only the final layers at higher learning rates.")
    sentiment_model.layers[3].trainable = True
    adam = Adam(learning_rate=SENTIMENT_MIN_LR)
    sentiment_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                            optimizer=adam,
                            metrics=['accuracy'])

    sent_history_finetuned = sentiment_model.fit(x={"att_flags":X_sent_att[tr_index],
                                                    "words":X_sent[tr_index],
                                                    "token_ids":np.zeros_like(X_sent_att[tr_index])},
                                                 y={"output_sentiments":Y_sent[tr_index]},
                                                 shuffle=True,
                                                 batch_size=BATCH_SIZE,
                                                 epochs=SENTIMENT_NUM_EPOCHS[2],
                                                 validation_data=({"att_flags":X_sent_att[va_index],
                                                                   "words":X_sent[va_index],
                                                                   "token_ids":np.zeros_like(X_sent_att[va_index])},
                                                                  {"output_sentiments":Y_sent[va_index]}),
                                                 verbose=1,
                                                 class_weight=cw,
                                                 callbacks=[sentiment_mcp, sentiment_csvl])
    
    print("[INFO] Unfreezing Span RoBerta layer and training at lowest learning rates.")
    span_detection_model.layers[3].trainable = True
    adam = Adam(learning_rate=MIN_LR)
    span_detection_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=LABEL_SMOOTHING_PARAM),
                                 optimizer=adam,
                                 metrics=['accuracy'],
                                 loss_weights={"starts_0":1.0,"stops_0":1.0,"starts_1":1.0,"stops_1":2.0})
    span_history = span_detection_model.fit(x={"att_flags":X_span_att[t_index],
                                               "words":X_span[t_index],
                                               "token_ids":np.zeros_like(X_span_att[t_index])},
                                            y={"starts_0":Y_span_starts[t_index],
                                               "stops_0":Y_span_stops[t_index], 
                                               "starts_1":Y_span_starts[t_index],
                                               "stops_1":Y_span_stops[t_index]},
                                            shuffle=True,
                                            batch_size=BATCH_SIZE,
                                            epochs=NUM_EPOCHS[2],
                                            validation_data=({"att_flags":X_span_att[v_index],
                                                              "words":X_span[v_index],
                                                              "token_ids":np.zeros_like(X_span_att[v_index])},
                                                             {"starts_0":Y_span_starts[v_index],
                                                              "stops_0":Y_span_stops[v_index], 
                                                              "starts_1":Y_span_starts[v_index],
                                                              "stops_1":Y_span_stops[v_index]}),
                                            verbose=1,
                                            callbacks=[span_mcp, span_csvl])
    
    get_sentiment_results(data)
    
    # Loading best weights per fold
    span_detection_model.load_weights(RESULTS_DIR+MODEL_PREFIX+"BestCheckpoint_"+str(num)+".h5")
    
    pred_train = span_detection_model.predict(x = {"att_flags":X_span_att[t_index],
                                                   "words":X_span[t_index],
                                                   "token_ids":np.zeros_like(X_span_att[t_index])},
                                              batch_size=PREDICT_BATCH_SIZE)

    pred_val = span_detection_model.predict(x = {"att_flags":X_span_att[v_index],
                                                 "words":X_span[v_index],
                                                 "token_ids":np.zeros_like(X_span_att[v_index])},
                                            batch_size=PREDICT_BATCH_SIZE)
    
    pred_starts_train, pred_stops_train = (pred_train[0]/2.0 + pred_train[2]/2.0), (pred_train[1]/2.0 + pred_train[3]/2.0)
    pred_starts_val, pred_stops_val = (pred_val[0]/2.0 + pred_val[2]/2.0), (pred_val[1]/2.0 + pred_val[3]/2.0)
    
    # Accumulate test results after training every fold
    pred_test_fold = span_detection_model.predict(x = {"att_flags":X_span_att_test,
                                                       "words":X_span_test,
                                                       "token_ids":np.zeros_like(X_span_att_test)},
                                                  batch_size=PREDICT_BATCH_SIZE)
    
    if num==0:
        pred_test = []
        pred_test.append(pred_test_fold[0]/2.0 + pred_test_fold[2]/2.0)
        pred_test.append(pred_test_fold[1]/2.0 + pred_test_fold[3]/2.0)
    else:
        pred_test[0] += (pred_test_fold[0]/2.0 + pred_test_fold[2]/2.0)
        pred_test[1] += (pred_test_fold[1]/2.0 + pred_test_fold[3]/2.0)
    
    # Tabulate
    preds = {
        "train":{
            "starts":{
                "y_true":Y_span_starts[t_index].argmax(axis=1),
                "y_pred":pred_train[0].argmax(axis=1)
            },
            "stops":{
                "y_true":Y_span_stops[t_index].argmax(axis=1),
                "y_pred":pred_train[1].argmax(axis=1)
            }
        },
        "valid":{
            "starts":{
                "y_true":Y_span_starts[v_index].argmax(axis=1),
                "y_pred":pred_val[0].argmax(axis=1)
            },
            "stops":{
                "y_true":Y_span_stops[v_index].argmax(axis=1),
                "y_pred":pred_val[1].argmax(axis=1)
            }        
        }
    }
    
    print_metrics(pred_dict=preds)

    print("[INFO] Prediction shape for training data: ", pred_starts_train.shape, pred_stops_train.shape)
    print("[INFO] Prediction shape for validation data: ", pred_starts_val.shape, pred_stops_val.shape)

    print("[INFO] Normal predictions (StartIndex less than EndIndex) for training data: ",
          sum([s<e for s,e in zip(pred_starts_train.argmax(axis=1),
                                  pred_stops_train.argmax(axis=1))]),
          "out of", pred_starts_train.shape[0])
    print("[INFO] Normal predictions (StartIndex less than EndIndex) for validation data: ",
          sum([s<e for s,e in zip(pred_starts_val.argmax(axis=1),
                                  pred_stops_val.argmax(axis=1))]),
          "out of", pred_starts_val.shape[0])

    pred_words_train = [post_process(tokenizer.decode(t[s:e])) if s<e else post_process(tokenizer.decode(t[e:])) for t, s,e in zip(X_span[t_index],
                                                                                                                                   pred_starts_train.argmax(axis=1),
                                                                                                                                   pred_stops_train.argmax(axis=1))]
    
    pred_words_val = [post_process(tokenizer.decode(t[s:e])) if s<e else post_process(tokenizer.decode(t[e:])) for t, s,e in zip(X_span[v_index],
                                                                                                                                 pred_starts_val.argmax(axis=1),
                                                                                                                                 pred_stops_val.argmax(axis=1))]

    print("[INFO] Training Jaccard Score: ",
          np.mean([jaccard(str1=i, str2=j) for i,j in zip([t for n,t in enumerate(Y_span_words) if n in t_index],
                                                          pred_words_train)]))
    print("[INFO] Validation Jaccard Score: ",
          np.mean([jaccard(str1=i, str2=j) for i,j in zip([t for n,t in enumerate(Y_span_words) if n in v_index],
                                                          pred_words_val)]))
    print("[INFO] Training for fold:", num, "finished at", ctime(time()))

print(ctime(time()))

[INFO] Training Sentiment only the final layers at higher learning rates.
Train on 1333 samples, validate on 667 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[INFO] Training Span only the final layers at higher learning rates.
Train on 1332 samples, validate on 666 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[INFO] Training Sentiment only the final layers at lower learning rates.
Train on 1333 samples, validate on 667 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[INFO] Training Span only the final layers at lower learning rates.
Train on 1332 samples, validate on 666 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[INFO] Training Sentiment only the final layers at higher learning rates.
Train on 1333 samples, validate on 667 samples
Epoch 1/2
Epoch 2/2
[INFO] Unfreezing Span RoBerta layer and training at lowest learning rates.
Train on 1332 samples, validate on 666 samples
              precision    recall  f1-score   support

    negative       0.13      0.00      0.01      

  _warn_prf(average, modifier, msg_start, len(result))


[INFO] 53.68 	|| train 	|| starts 	|| accuracy_score
[INFO] 53.90 	|| train 	|| stops 	|| accuracy_score
[INFO] 48.65 	|| valid 	|| starts 	|| accuracy_score
[INFO] 43.69 	|| valid 	|| stops 	|| accuracy_score
[INFO] 44.61 	|| train 	|| starts 	|| f1_score
[INFO] 51.26 	|| train 	|| stops 	|| f1_score
[INFO] 35.57 	|| valid 	|| starts 	|| f1_score
[INFO] 39.54 	|| valid 	|| stops 	|| f1_score
[INFO] 47.10 	|| train 	|| starts 	|| precision_score
[INFO] 49.82 	|| train 	|| stops 	|| precision_score
[INFO] 39.25 	|| valid 	|| starts 	|| precision_score
[INFO] 37.94 	|| valid 	|| stops 	|| precision_score
[INFO] 45.16 	|| train 	|| starts 	|| recall_score
[INFO] 57.20 	|| train 	|| stops 	|| recall_score
[INFO] 36.29 	|| valid 	|| starts 	|| recall_score
[INFO] 45.50 	|| valid 	|| stops 	|| recall_score
[INFO] 	|| train 	|| starts 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V36_train_starts.csv
[INFO] 	|| train 	|| stops 	|| confusion_matrix 	|| ../results/ConfusionMatrix_V36_trai

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[INFO] Training Jaccard Score:  0.5436226589407911
[INFO] Validation Jaccard Score:  0.5167843380680915
[INFO] Training for fold: 0 finished at Fri Jun 12 21:51:27 2020
[INFO] Training Sentiment only the final layers at higher learning rates.
Train on 1333 samples, validate on 667 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[INFO] Training Span only the final layers at higher learning rates.
Train on 1332 samples, validate on 666 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[INFO] Training Sentiment only the final layers at lower learning rates.
Train on 1333 samples, validate on 667 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[INFO] Training Span only the final layers at lower learning rates.
Train on 1332 samples, validate on 666 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[INFO] Training Sentiment only the final layers at higher learning rates.
Train on 1333 samples, validate on 667 samples
Epoch 1/2
  16/1333 [..............................] - ETA: 1:57:30

KeyError: 'accuracy'

#### Validate sentiment model

In [None]:
sample_text = data.text.sample(5).tolist()

encoded_repr = tokenizer.encode_batch(sample_text)

sample_text_ids = pad_sequences([i.ids for i in encoded_repr],
                                maxlen=MAX_SEQ_LEN,
                                padding="post")
sample_text_att = pad_sequences([i.attention_mask for i in encoded_repr],
                                maxlen=MAX_SEQ_LEN,
                                padding="post")
pred = sentiment_model.predict({"att_flags":sample_text_att,
                                "words":sample_text_ids,
                                "token_ids":np.zeros_like(sample_text_att)})

pprint({
    num:{
        "text":i,
        "predicted_sentiment":[k for k,v in sentiment_lookup.items() if v==j][0]
    } for num,(i,j) in enumerate(zip(sample_text, pred.argmax(axis=1)))
})

In [None]:
sentiment_lookup

In [None]:
data.loc[(data.sentiment == "neutral") & (data.predicted_sentiment == "negative")].sample(5) # most incorrect in this cohort

### Span Detection Model Fit

<a href="https://keras.io/guides/transfer_learning/#finetuning" target="_blank"><h2 id="finetuning">Fine-tuning</h2></a>
<p>Once your model has converged on the new data, you can try to unfreeze all or part of
 the base model and retrain the whole model end-to-end with a very low learning rate.</p>
 <p>This is an optional last step that can potentially give you incremental improvements.
 It could also potentially lead to quick overfitting -- keep that in mind.</p>
 <p>It is critical to only do this step <em>after</em> the model with frozen layers has been
trained to convergence. If you mix randomly-initialized trainable layers with
trainable layers that hold pre-trained features, the randomly-initialized layers will
cause very large gradient updates during training, which will destroy your pre-trained
 features.</p>
 <p>It's also critical to use a very low learning rate at this stage, because
you are training a much larger model than in the first round of training, on a dataset
 that is typically very small.
As a result, you are at risk of overfitting very quickly if you apply large weight
 updates. Here, you only want to readapt the pretrained weights in an incremental way.</p>

<a href="https://keras.io/guides/transfer_learning/#finetuning" target="_blank"><p><strong>Important note about <code>compile()</code> and <code>trainable</code></strong></p></a>
<p>Calling <code>compile()</code> on a model is meant to "freeze" the behavior of that model. This
 implies that the <code>trainable</code>
attribute values at the time the model is compiled should be preserved throughout the
 lifetime of that model,
until <code>compile</code> is called again. Hence, if you change any <code>trainable</code> value, make sure
 to call <code>compile()</code> again on your
model for your changes to be taken into account.</p>

## Validation

#### Inference

In [None]:
pred_starts_test, pred_stops_test = pred_test[0]/NUM_FOLDS, pred_test[1]/NUM_FOLDS
print("[INFO] Prediction shape for testing data: ", pred_starts_test.shape, pred_stops_test.shape)

#### Postprocessing

In [None]:
print("Normal predictions (StartIndex less than EndIndex) for testing data:",
      sum([s<e for s,e in zip(pred_starts_test.argmax(axis=1),
                              pred_stops_test.argmax(axis=1))]), 
      "out of",
      pred_starts_test.shape[0])

In [None]:
pred_words_test = [
    post_process(tokenizer.decode(t[s:e+1])) if s<e else post_process(tokenizer.decode(t[e:])) for t,s,e in zip(X_span_test,
                                                                                                              pred_starts_test.argmax(axis=1),
                                                                                                              pred_stops_test.argmax(axis=1))
]

In [None]:
check_idx = 158
#print([[t,i,j,k] for t,i,j,k in zip(tokenizer.decode(),X_test[check_idx],pred_starts_test[check_idx],pred_stops_test[check_idx])])
print(tokenizer.decode(X_span_test[check_idx]))
print(pred_starts_test.argmax(axis=1)[check_idx])
print(pred_stops_test.argmax(axis=1)[check_idx])
print(post_process(tokenizer.decode(X_span_test[check_idx][pred_starts_test.argmax(axis=1)[check_idx]:1+pred_stops_test.argmax(axis=1)[check_idx]])))

## Submission

In [None]:
test_df_span['selected_text'] = pred_words_test

In [None]:
test_df_span["selected_text"] = np.where(test_df_span["sentiment"] == "neutral",
                                         test_df_span["text"],
                                         test_df_span["selected_text"])

In [None]:
test_df_span[["textID", "selected_text"]].to_csv(RESULTS_DIR+"submission.csv", index=False)

In [None]:
test_df_span.loc[test_df_span.sentiment!="neutral"][["text", "sentiment","selected_text"]].sample(25)