In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow.keras.backend as K

from sklearn.model_selection import StratifiedKFold

from transformers import *
import tokenizers

In [2]:
# Fixed sequence length shared by every input/target array in this notebook.
MAX_LEN = 96
# Directory holding the RoBERTa vocabulary, merges, config and weight files.
PATH = "../input/tf-roberta/"

# Byte-level BPE tokenizer built from the vocabulary and merges files under
# PATH, configured with lowercasing and a prefix space.
tokenizer_kwargs = dict(
    vocab_file=PATH + "vocab-roberta-base.json",
    merges_file=PATH + "merges-roberta-base.txt",
    lowercase=True,
    add_prefix_space=True,
)
tokenizer = tokenizers.ByteLevelBPETokenizer(**tokenizer_kwargs)

# Token id appended to every encoded tweet to represent its sentiment label.
# NOTE(review): presumably the vocabulary ids of the words
# "positive" / "negative" / "neutral" - confirm against the vocab file.
sentiment_id = {
    "positive": 1313,
    "negative": 2430,
    "neutral": 7974,
}

## Train data

In [3]:
# Load the training tweets. Missing cells become empty strings, and the two
# text columns are cast to str so they can be split/joined later on.
train = (
    pd.read_csv("../input/tweet-sentiment-extraction/train.csv")
    .fillna("")
    .astype({"text": str, "selected_text": str})
)
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [4]:
ct = train.shape[0]

# Model inputs, one row per training tweet. input_ids is pre-filled with 1 so
# every position after the encoded sequence holds 1.
# NOTE(review): 1 is presumably RoBERTa's <pad> id - confirm against the vocab.
input_ids = np.ones((ct,MAX_LEN),dtype="int32")
attention_mask = np.zeros((ct,MAX_LEN),dtype="int32")
token_type_ids = np.zeros((ct,MAX_LEN),dtype="int32") # left as all zeros for every row
# One-hot targets: position of the first / last token of the selected span.
start_tokens = np.zeros((ct,MAX_LEN),dtype="int32")
end_tokens = np.zeros((ct,MAX_LEN),dtype="int32")

for k in range(ct):
    # FIND OVERLAP
    # Both strings are whitespace-normalised; text1 gets a leading space.
    text1 = " "+" ".join(train.loc[k, "text"].split())
    text2 = " ".join(train.loc[k, "selected_text"].split())
    idx = text1.find(text2)
    # chars[i] == 1 marks character i of text1 as part of the selected text.
    chars = np.zeros((len(text1)))
    # str.find returns -1 when text2 is absent and 0 when text2 is empty.
    # Without these guards, chars[idx:...] and text1[idx-1] would wrap around
    # to the END of the string and mark characters that are not selected.
    if idx >= 0:
        chars[idx:idx+len(text2)] = 1
        # Also mark the space just before the selection so that a token which
        # starts with that space counts as overlapping.
        if idx > 0 and text1[idx-1] == " ":
            chars[idx-1] = 1
    enc = tokenizer.encode(text1)

    # ID_OFFSETS
    # Character span of each token, reconstructed by decoding the tokens one
    # at a time and accumulating their decoded lengths.
    offsets = []
    pos = 0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((pos, pos+len(w)))
        pos += len(w)

    # START END TOKENS
    # Indices of the tokens whose character span touches a marked character.
    toks = [i for i,(a,b) in enumerate(offsets) if np.sum(chars[a:b]) > 0]

    s_tok = sentiment_id[train.loc[k, "sentiment"]]
    # INPUT IDS: 0, tweet tokens, 2, 2, sentiment token, 2
    input_ids[k,:len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
    # ATTENTION MASK: 1 over the filled positions only
    attention_mask[k,:len(enc.ids)+5] = 1
    if len(toks)>0:
        # The +1 skips the leading 0 id placed before the tweet tokens.
        # START TOKENS
        start_tokens[k,toks[0]+1] = 1
        # END TOKENS
        end_tokens[k,toks[-1]+1] = 1

## Test data

In [5]:
# Load the test tweets; missing cells become empty strings and the text
# column is cast to str.
test = (
    pd.read_csv("../input/tweet-sentiment-extraction/test.csv")
    .fillna("")
    .astype({"text": str})
)

In [6]:
ct = test.shape[0]

# Model inputs for the test set, one row per tweet: ids pre-filled with 1,
# attention mask and token type ids pre-filled with 0.
input_ids_t = np.full((ct, MAX_LEN), 1, dtype="int32")
attention_mask_t = np.zeros_like(input_ids_t)
token_type_ids_t = np.zeros_like(input_ids_t)  # stays all zeros for every row

for k in range(ct):
    # Whitespace-normalised tweet with a leading space.
    text1 = " " + " ".join(test.loc[k, "text"].split())
    enc = tokenizer.encode(text1)
    s_tok = sentiment_id[test.loc[k, "sentiment"]]
    # Full id sequence: 0, tweet tokens, 2, 2, sentiment token, 2
    row = [0, *enc.ids, 2, 2, s_tok, 2]
    # INPUT IDS
    input_ids_t[k, :len(row)] = row
    # ATTENTION MASK: 1 over the filled positions only
    attention_mask_t[k, :len(row)] = 1

## Model

In [7]:
def build_model():
    """Build and compile the RoBERTa model with a start head and an end head.

    Three int32 inputs of length MAX_LEN (ids, attention mask, token type ids)
    are passed to a TFRobertaModel loaded from the files under PATH. Two
    identically structured heads are applied to the first element of its
    output; each ends in a softmax over the flattened positions.

    Returns:
        A Keras model with inputs [ids, att, tok] and outputs [start, end],
        compiled with categorical cross-entropy and Adam (learning rate 3e-5).
    """
    ids_in = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att_in = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok_in = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    config = RobertaConfig.from_pretrained(PATH + "config-roberta-base.json")
    bert_model = TFRobertaModel.from_pretrained(PATH + "pretrained-roberta-base.h5", config=config)
    x = bert_model(ids_in, attention_mask=att_in, token_type_ids=tok_in)

    def span_head(features):
        # Dropout -> Conv1D(128) -> LeakyReLU -> Conv1D(64) -> Dense(1)
        # -> Flatten -> softmax. Fresh layers are created on every call, so
        # the two heads do not share weights.
        h = tf.keras.layers.Dropout(0.1)(features)
        h = tf.keras.layers.Conv1D(128, 2, padding="same")(h)
        h = tf.keras.layers.LeakyReLU()(h)
        h = tf.keras.layers.Conv1D(64, 2, padding="same")(h)
        h = tf.keras.layers.Dense(1)(h)
        h = tf.keras.layers.Flatten()(h)
        return tf.keras.layers.Activation("softmax")(h)

    # The start head is built before the end head, as in the original layout.
    start_probs = span_head(x[0])
    end_probs = span_head(x[0])

    model = tf.keras.models.Model(inputs=[ids_in, att_in, tok_in], outputs=[start_probs, end_probs])
    model.compile(
        loss="categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    )

    return model

## Metric

In [8]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lowercased and split on whitespace into sets of words.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        |A & B| / |A | B| as a float, or 0.5 when both word sets are empty.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    # Two empty sets would make the denominator zero; 0.5 is returned instead.
    if not words1 and not words2:
        return 0.5
    shared = words1 & words2
    union_size = len(words1) + len(words2) - len(shared)
    return float(len(shared)) / union_size

In [9]:
def scheduler(epoch):
    """Learning rate for a given epoch index: 3e-5 multiplied by 0.2 per epoch.

    Args:
        epoch: epoch index (0 gives the initial rate of 3e-5).

    Returns:
        The learning rate to use for that epoch.
    """
    initial_lr = 3e-5
    decay_per_epoch = 0.2
    return initial_lr * decay_per_epoch ** epoch

## Train

In [10]:
jac = []  # one mean validation Jaccard score per fold
VER = "0"  # version tag embedded in the checkpoint file names
DISPLAY = 1 # USE display=1 FOR INTERACTIVE

# Out-of-fold predictions: each row is filled by the fold that validates on it.
oof_start = np.zeros((input_ids.shape[0], MAX_LEN))
oof_end = np.zeros((input_ids.shape[0], MAX_LEN))

# 5 folds, stratified on the sentiment label.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)

for fold,(idxT,idxV) in enumerate(skf.split(input_ids,train.sentiment.values)):
    print("#"*25)
    print(f"### FOLD {fold+1}")
    print("#"*25)

    # Start each fold from a fresh Keras session and a freshly built model.
    K.clear_session()
    model = build_model()

    weights_path = f"v{VER}-roberta-{fold}.h5"
    # Keep only the weights of the epoch with the best validation loss.
    sv = tf.keras.callbacks.ModelCheckpoint(
        weights_path, monitor="val_loss", verbose=1, save_best_only=True,
        save_weights_only=True, mode="auto", save_freq="epoch")

    reduce_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

    model.fit([input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]], [start_tokens[idxT,], end_tokens[idxT,]],
        epochs=5, batch_size=8, verbose=DISPLAY, callbacks=[sv, reduce_lr],
        validation_data=([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],
        [start_tokens[idxV,], end_tokens[idxV,]]))

    # Reload the best checkpoint before predicting on the validation rows.
    print("Loading model...")
    model.load_weights(weights_path)

    # Predicting OOF...
    oof_start[idxV,],oof_end[idxV,] = model.predict([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],verbose=DISPLAY)

    # DISPLAY FOLD JACCARD
    # Named fold_scores rather than `all` so the builtin all() is not shadowed.
    fold_scores = []
    for k in idxV:
        a = np.argmax(oof_start[k,])
        b = np.argmax(oof_end[k,])
        if a>b:
            # Predicted start after predicted end: fall back to the whole tweet.
            st = train.loc[k, "text"] # IMPROVE CV/LB with better choice here
        else:
            text1 = " "+" ".join(train.loc[k, "text"].split())
            enc = tokenizer.encode(text1)
            # Model positions are shifted by one relative to enc.ids because a
            # leading 0 id precedes the tweet tokens. The start is clamped to 0
            # so that a == 0 does not become a slice start of -1, which would
            # count from the end of the token list.
            st = tokenizer.decode(enc.ids[max(a-1, 0):b])
        fold_scores.append(jaccard(st,train.loc[k, "selected_text"]))
    fold_mean = np.mean(fold_scores)
    jac.append(fold_mean)
    print(">>>> FOLD %i Jaccard =" %(fold+1),fold_mean)
    print()

#########################
### FOLD 1
#########################
Train on 21984 samples, validate on 5497 samples
Epoch 1/5
Epoch 00001: val_loss improved from inf to 1.70690, saving model to v0-roberta-0.h5
Epoch 2/5
Epoch 00004: val_loss did not improve from 1.59449
Epoch 5/5
Epoch 00005: val_loss did not improve from 1.59449
Loading model...
>>>> FOLD 1 Jaccard = 0.7117579471428798

#########################
### FOLD 2
#########################
Train on 21985 samples, validate on 5496 samples
Epoch 1/5
Epoch 00001: val_loss improved from inf to 1.62888, saving model to v0-roberta-1.h5
Epoch 2/5
Epoch 00002: val_loss improved from 1.62888 to 1.55179, saving model to v0-roberta-1.h5
Epoch 3/5
Epoch 1/5
Epoch 00005: val_loss did not improve from 1.65435
Loading model...
>>>> FOLD 3 Jaccard = 0.7004080105678578

#########################
### FOLD 4
#########################
Train on 21985 samples, validate on 5496 samples
Epoch 1/5
Epoch 00004: val_loss did not improve from 1.61198
Epoch 

In [11]:
# Average of the per-fold validation Jaccard scores collected in `jac`.
overall_jaccard = np.mean(jac)
print(">>>> OVERALL 5Fold CV Jaccard =", overall_jaccard)

>>>> OVERALL 5Fold CV Jaccard = 0.7080173194703552


## Prediction

In [12]:
n_folds = skf.n_splits
n_test = input_ids_t.shape[0]

# Accumulators for the start/end predictions averaged over the fold models.
preds_start = np.zeros((n_test, MAX_LEN))
preds_end = np.zeros((n_test, MAX_LEN))

test_inputs = [input_ids_t, attention_mask_t, token_type_ids_t]

# `model` is the object left over from the last training fold; only its
# weights are swapped for each fold's saved checkpoint.
for fold in range(n_folds):
    print("Loading model...")
    model.load_weights(f"v{VER}-roberta-{fold}.h5")

    preds = model.predict(test_inputs, verbose=DISPLAY)
    fold_start, fold_end = preds[0], preds[1]
    # Each fold contributes 1/n_folds of its prediction to the average.
    preds_start += fold_start/n_folds
    preds_end += fold_end/n_folds

Loading model...
Loading model...
Loading model...
Loading model...
Loading model...


## Submission

In [13]:
# Predicted text span for every test tweet, in row order.
# NOTE: the name `all` shadows the builtin, but it is kept because the cell
# that writes submission.csv reads this list under that name.
all = []
for k in range(input_ids_t.shape[0]):
    # Most likely start / end positions in the model input.
    a = np.argmax(preds_start[k,])
    b = np.argmax(preds_end[k,])
    if a>b:
        # Predicted start after predicted end: fall back to the whole tweet.
        st = test.loc[k, "text"]
    else:
        text1 = " "+" ".join(test.loc[k, "text"].split())
        enc = tokenizer.encode(text1)
        # Model positions are shifted by one relative to enc.ids because a
        # leading 0 id precedes the tweet tokens. The start is clamped to 0 so
        # that a == 0 does not become a slice start of -1, which would count
        # from the end of the token list.
        st = tokenizer.decode(enc.ids[max(a-1, 0):b])
    all.append(st)

In [14]:
# Attach the predicted spans to the test frame (the list `all` comes from the
# decoding cell) and write the two-column submission file.
test["selected_text"] = all
submission = test[["textID", "selected_text"]]
submission.to_csv("submission.csv",index=False)

# Widen the displayed columns and show a random sample of predictions.
pd.set_option("max_colwidth", 60)
test.sample(25)

Unnamed: 0,textID,text,sentiment,selected_text
1358,b34a3fd669,cookies are good,positive,cookies are good
2708,ad0dd05c20,"ali, just like you do!!! have such a wonderful sunday!",positive,wonderful
1476,d72748e39c,u never sent me carrie. ur an ****. but an **** that i ...,negative,miss
2994,a657e300d0,Sunburn is really bad now. Regretting sitting in the sun...,negative,sunburn is really bad
201,a1320a5050,http://twitpic.com/4w67k - Camping at black butte lake,neutral,http://twitpic.com/4w67k - camping at black butte lake
1875,d80d99b28c,Whuuurrrrr - glands really swollen now. Guess the weeken...,negative,whuuurrrrr - glands really swollen now.
1841,845495ffac,But I do know is that I am extremely happy with him and ...,positive,happy
1655,a5e47d75c0,"Please send me those youtube links, Erin watched most o...",negative,didn`t get to see the cowboys
230,6cd35dd82e,I do my humble best Going on a works paintball day soo...,positive,i do my humble best
1721,f60dbc9737,A big welcome to Twitterlandz grrl! Really wish I could...,positive,glad to hear it was a success
