In [None]:
# Root directory under which the `input/...` tokenizer files and pretrained
# weights used by the prediction functions below are looked up.
# Alternative root (Google Drive), kept for reference:
#ROOT_PATH = '/content/drive/My Drive/Qishi/NLP/project'
ROOT_PATH = '/kaggle'

In [None]:
import numpy as np
import pandas as pd
import os
import warnings
import random
import torch 
from torch import nn
import tokenizers
from transformers import *

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dropout, Conv1D, LeakyReLU, Dense, Flatten, Activation, Reshape, Concatenate, Bidirectional, LSTM
from tensorflow.keras.models import Model

import gc



In [None]:
# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Default number of word slots per row in the word-level probability arrays
# returned by the *_prediction_in_word functions (their `len_by_word` default).
TEXT_LEN_BY_WORD = 35

In [None]:
"""
Utility functions
"""

def selected_text_by_word_probs(data, start_word_probs, end_word_probs):
    """Pick, for every row, the words between the most likely start and end word.

    Args:
        data: DataFrame with a `text` column; rows are matched to the
            probability arrays by position (the index is ignored).
        start_word_probs: 2-D array, one row of per-word start scores per tweet.
        end_word_probs: 2-D array, one row of per-word end scores per tweet.

    Returns:
        List of strings: the words from the arg-max start word through the
        arg-max end word (inclusive), joined by single spaces. When the start
        word lies after the end word the entry is an empty string.
    """
    rows = data.reset_index(drop=True)
    picked = []
    for row_no, text in enumerate(rows['text']):
        first_word = np.argmax(start_word_probs[row_no,])
        last_word = np.argmax(end_word_probs[row_no,])
        words = text.split()
        picked.append(" ".join(words[first_word:last_word + 1]))
    return picked

def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Both strings are lower-cased and split on whitespace; the score is
    |intersection| / |union| of the two word sets. Two strings without any
    words score 0.5.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.5
    shared = len(words1 & words2)
    return float(shared) / (len(words1) + len(words2) - shared)

# PyTorch RoBERTa token to word probabilities

In [None]:
"""
**************************************************************
PyTorch token to word probabilities
**************************************************************
"""
PTROBERTA_MAX_LEN = 128

def pt_roberta_prediction_in_word(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=10, model_path='pt-roberta-finetune/'):
    """Ensemble the PyTorch RoBERTa fold checkpoints and return word-level span probabilities.

    Every fold model scores each token as span start / span end. The scores
    are soft-maxed over the sequence, averaged over the folds and collapsed to
    one value per whitespace-separated word (maximum over the word's tokens).
    Models and inputs are moved to CUDA.

    Args:
        test: DataFrame with `text` and `sentiment` columns. Its `text`
            column is cast to `str` in place (on the caller's frame).
        len_by_word: Number of word slots in every returned row.
        n_splits: Number of fold checkpoints to load; files
            `roberta_finetune_fold1.pth` .. `roberta_finetune_fold<n_splits>.pth`.
        model_path: Directory holding the fold checkpoints.

    Returns:
        Tuple `(start_word_probs, end_word_probs)`, two arrays with one row per
        tweet and exactly `len_by_word` columns. Slots past the last word of a
        tweet are 0; words beyond `len_by_word` are dropped; a tweet without
        any words yields an all-zero row.
    """
    class TweetDataset(torch.utils.data.Dataset):
        """Turns DataFrame rows into padded token ids plus a word -> first-token index."""

        def __init__(self, df, max_len=PTROBERTA_MAX_LEN):
            self.df = df
            self.max_len = max_len
            self.tokenizer = tokenizers.ByteLevelBPETokenizer(
                vocab_file=ROOT_PATH+'/input/roberta-base/vocab.json', 
                merges_file=ROOT_PATH+'/input/roberta-base/merges.txt', 
                lowercase=True,
                add_prefix_space=True)

        def __getitem__(self, index):
            row = self.df.iloc[index]
            ids, masks, tweet, offsets, words_to_tokens_index = self.get_input_data(row)
            return {
                'ids': ids,
                'masks': masks,
                'tweet': tweet,
                'offsets': offsets,
                'words_to_tokens_index': words_to_tokens_index,
            }

        def __len__(self):
            return len(self.df)
        
        def get_input_data(self, row):
            """Encode one row; returns (ids, masks, tweet, offsets, words_to_tokens_index)."""
            # Lower-case and collapse whitespace; the leading space makes every
            # word (including the first) start with a space.
            tweet = " " + " ".join(row.text.lower().split())
            encoding = self.tokenizer.encode(tweet)
            sentiment_id = self.tokenizer.encode(row.sentiment).ids
            # Sequence layout: [0] + sentiment ids + [2, 2] + tweet ids + [2].
            ids = [0] + sentiment_id + [2, 2] + encoding.ids + [2]
            # Four zero offsets for the prefix: this (and token_position = 4
            # below) assumes the sentiment encodes to exactly one token.
            offsets = [(0, 0)] * 4 + encoding.offsets + [(0, 0)]

            # For every word, remember the index of its first token. A word
            # ends at char_position; advance past every token that starts
            # before that position (but never past the last entry).
            words_to_tokens_index = []
            char_position = 0
            token_position = 4
            last_token = len(offsets) - 1
            for word in tweet.split():
                words_to_tokens_index.append(token_position)
                char_position += 1 + len(word)
                # Bounds check first so the index access is always guarded.
                while token_position < last_token and offsets[token_position][0] < char_position:
                    token_position += 1 
            
            pad_len = self.max_len - len(words_to_tokens_index)
            if pad_len > 0:
                words_to_tokens_index += [0] * pad_len

            # Pad token ids with 1; the mask below is 0 exactly where ids == 1.
            pad_len = self.max_len - len(ids)
            if pad_len > 0:
                ids += [1] * pad_len
                offsets += [(0, 0)] * pad_len
                    
            ids = torch.tensor(ids)
            masks = torch.where(ids != 1, torch.tensor(1), torch.tensor(0))
            offsets = torch.tensor(offsets)
            words_to_tokens_index = torch.tensor(words_to_tokens_index)

            return ids, masks, tweet, offsets, words_to_tokens_index


    class TweetModel(nn.Module):
        """RoBERTa encoder with a linear head producing start/end logits per token.

        Attribute names (`roberta`, `dropout`, `fc`) are the keys of the
        state dict loaded below and must not be renamed.
        """

        def __init__(self):
            super(TweetModel, self).__init__()
            
            config = RobertaConfig.from_pretrained(
                ROOT_PATH+'/input/roberta-base/config.json', output_hidden_states=True)    
            self.roberta = RobertaModel.from_pretrained(
                ROOT_PATH+'/input/roberta-base/pytorch_model.bin', config=config)
            self.dropout = nn.Dropout(0.5)
            self.fc = nn.Linear(config.hidden_size, 2)
            # These initial values are replaced by load_state_dict() below.
            nn.init.normal_(self.fc.weight, std=0.02)
            nn.init.normal_(self.fc.bias, 0)

        def forward(self, input_ids, attention_mask):
            _, _, hs = self.roberta(input_ids, attention_mask)
             
            # Element-wise maximum over the last three hidden-state layers.
            x = torch.stack([hs[-1], hs[-2], hs[-3]])
            x = torch.max(x, 0)[0]
            x = self.dropout(x)
            x = self.fc(x)
            start_logits, end_logits = x.split(1, dim=-1)
            start_logits = start_logits.squeeze(-1)
            end_logits = end_logits.squeeze(-1)
                    
            return start_logits, end_logits


    def words_from_tokens(token_probs, word_starts, num_words):
        """Collapse one row of token probabilities into exactly len_by_word word slots."""
        row = []
        for j in range(num_words):
            lo = word_starts[j]
            if j < num_words - 1:
                hi = word_starts[j + 1]
                if lo == hi:
                    # Two words mapped to the same token: still look at it.
                    hi += 1
                row.append(max(token_probs[lo:hi]))
            else:
                # The last word takes everything from its first token to the
                # end of the sequence.
                row.append(max(token_probs[lo:]))
        # Fixed width: truncate over-long tweets, zero-pad short ones, so the
        # stacked result is always a rectangular array.
        row = row[:len_by_word]
        return row + [0] * (len_by_word - len(row))


    test['text'] = test['text'].astype(str)
    test_loader = torch.utils.data.DataLoader(
        TweetDataset(test), 
        batch_size=32, 
        shuffle=False, 
        num_workers=2)

    start_word_probs = []
    end_word_probs = []
    models = []
    for fold in range(n_splits):
        model = TweetModel()
        model.cuda()
        model.load_state_dict(torch.load(model_path+'/roberta_finetune_fold%s.pth'%(fold+1)))
        model.eval()
        models.append(model)
        gc.collect()
        
    for data in test_loader:
        ids = data['ids'].cuda()
        masks = data['masks'].cuda()
        tweet = data['tweet']
        words_to_tokens_index = data['words_to_tokens_index'].numpy()

        start_logits = []
        end_logits = []
        with torch.no_grad():
            for model in models:
                output = model(ids, masks)
                start_logits.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
                end_logits.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())

        # Average the per-fold probabilities.
        start_logits = np.mean(start_logits, axis=0)
        end_logits = np.mean(end_logits, axis=0)        

        for i in range(len(ids)):
            num_words = len(tweet[i].split())
            start_word_probs.append(words_from_tokens(start_logits[i], words_to_tokens_index[i], num_words))
            end_word_probs.append(words_from_tokens(end_logits[i], words_to_tokens_index[i], num_words))
                
    del models, test_loader
    gc.collect()
    
    return (np.array(start_word_probs), np.array(end_word_probs))

# Pytorch albert QA tokens to words

In [None]:
"""
**************************************************************
PyTorch ALBERT token to word probabilities
**************************************************************
"""
PTALBERT_MAX_LEN = 168

def pt_albert_prediction_in_word(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=10, model_path='albert-base/'):
    """Ensemble the PyTorch ALBERT question-answering folds and return word-level span probabilities.

    Each tweet is turned into "<question about the sentiment> [SEP] <tweet>".
    Every fold model scores each token as span start / span end; the scores
    are soft-maxed over the sequence, averaged over the folds and collapsed to
    one value per whitespace-separated word of the tweet (maximum over the
    word's tokens). Models and inputs are moved to CUDA.

    Args:
        test: DataFrame with `text` and `sentiment` columns. Its `text`
            column is cast to `str` in place (on the caller's frame).
        len_by_word: Number of word slots in every returned row.
        n_splits: Number of fold checkpoints to load; files
            `albertQA_finetune_fold1.pth` .. `albertQA_finetune_fold<n_splits>.pth`.
        model_path: Directory holding the fold checkpoints.

    Returns:
        Tuple `(start_word_probs, end_word_probs)`, two arrays with one row per
        tweet and exactly `len_by_word` columns. Slots past the last word of a
        tweet are 0; words beyond `len_by_word` are dropped; a tweet without
        any words yields an all-zero row.
    """
    class TweetDataset(torch.utils.data.Dataset):
        """Turns DataFrame rows into padded ALBERT inputs plus a word -> first-token index."""

        # The default is this model's own sequence length (PTALBERT_MAX_LEN),
        # not the RoBERTa constant.
        def __init__(self, df, max_len=PTALBERT_MAX_LEN):
            self.df = df
            self.max_len = max_len
            self.tokenizer = AlbertTokenizer.from_pretrained(ROOT_PATH+'/input/albert-pretrained-models-pytorch/albert-base-v2-spiece.model')

        def __getitem__(self, index):
            row = self.df.iloc[index]
            input_dict = self.get_input_data(row)
            return {
                'input_ids': torch.tensor(input_dict['input_ids'],dtype=torch.long),
                'token_type_ids': torch.tensor(input_dict['token_type_ids'],dtype=torch.long),
                'attention_mask': torch.tensor(input_dict['attention_mask'],dtype=torch.long),
                'tweet': input_dict['tweet'],
                'input_text': input_dict['input_text'],
                'offsets': torch.tensor(input_dict['offsets'],dtype=torch.long),
                'sentiment': row['sentiment'],
                'words_to_tokens_index': torch.tensor(input_dict['words_to_tokens_index'],dtype=torch.long),
            }

        def __len__(self):
            return len(self.df)
        
        def get_input_data(self, row):
            """Build the padded model inputs and word index for one row (returned as a dict)."""
            tweet = " " + " ".join(row.text.lower().split())
            sentiment = row['sentiment']
            # NOTE(review): the stray "f" in front of {sentiment} produces e.g.
            # "fpositive". It looks like a typo, but the text is left untouched
            # because the checkpoints were presumably fine-tuned with exactly
            # this prompt - confirm against the training notebook before fixing.
            question = f" which words actually lead to the f{sentiment} sentiment description?"
            input_text = question + " [SEP] " + tweet
            # NOTE(review): this lower-cases the "[SEP]" marker as well. Confirm
            # the tokenizer still maps it to the separator id; if not,
            # index(sep) below only finds a separator added by encode().
            input_text = " " + " ".join(input_text.lower().split())
            input_ids = self.tokenizer.encode(input_text)
            sep = self.tokenizer.convert_tokens_to_ids("[SEP]")
            unk = self.tokenizer.convert_tokens_to_ids("<unk>")
            # Position of the first separator, looked up once.
            sep_position = input_ids.index(sep)
            token_type_ids = [0 if i <= sep_position else 1 for i in range(len(input_ids))]
            attention_mask = [1] * len(input_ids)

            # Running (start, end) character offsets built from the token
            # string lengths, skipping the first and last id; an unknown token
            # counts as one character.
            offsets = []
            offset_start = 0
            for input_id in input_ids[1:-1]:
                token = self.tokenizer.convert_ids_to_tokens(input_id)
                if input_id == unk: # <unk>
                    token = ' '
                offset_end = offset_start+len(token)
                offsets.append((offset_start, offset_end))
                offset_start = offset_end
            
            # First-token index of every tweet word: start right after the
            # separator and advance by the number of pieces each word
            # tokenizes to on its own.
            words_to_tokens_index = []
            token_position = sep_position+1
            for word in tweet.split():
                words_to_tokens_index.append(token_position)
                token_position += len(self.tokenizer.tokenize(word))            
            
            pad_len = self.max_len - len(words_to_tokens_index)
            if pad_len > 0:
                words_to_tokens_index += [0] * pad_len
            
            pad_len = self.max_len - len(input_ids)
            if pad_len > 0:
                input_ids += [0] * pad_len
                offsets += [(0, 0)] * pad_len
                attention_mask += [0] * pad_len
                token_type_ids += [0] * pad_len

            return {
                'input_ids': input_ids,
                'token_type_ids': token_type_ids,
                'attention_mask': attention_mask,
                'offsets': offsets,
                'input_text': input_text,
                'tweet': tweet,
                'words_to_tokens_index': words_to_tokens_index,
            }
            

    class TweetModel(nn.Module):
        """Wrapper around AlbertForQuestionAnswering returning (start_logits, end_logits).

        `dropout` and `fc` are not used in forward(); they are kept, together
        with the attribute name `roberta`, because load_state_dict() below
        matches checkpoint keys against these attribute names.
        """

        def __init__(self):
            super(TweetModel, self).__init__()

            config = AlbertConfig.from_pretrained(
                ROOT_PATH+'/input/albert-pretrained-models-pytorch/albert-base-v2-config.json', output_hidden_states=True)    
            self.roberta = AlbertForQuestionAnswering.from_pretrained(
                ROOT_PATH+'/input/albert-pretrained-models-pytorch/albert-base-v2-pytorch_model.bin', config=config)
            self.dropout = nn.Dropout(0.5)
            self.fc = nn.Linear(config.hidden_size, 2)
            nn.init.normal_(self.fc.weight, std=0.02)
            nn.init.normal_(self.fc.bias, 0)

        def forward(self, input_ids, attention_mask):
            start_logits, end_logits, hs = self.roberta(input_ids, attention_mask)
            return start_logits, end_logits


    def words_from_tokens(token_probs, word_starts, num_words):
        """Collapse one row of token probabilities into exactly len_by_word word slots."""
        row = []
        for j in range(num_words):
            lo = word_starts[j]
            if j < num_words - 1:
                hi = word_starts[j + 1]
                if lo == hi:
                    # Two words mapped to the same token: still look at it.
                    hi += 1
                row.append(max(token_probs[lo:hi]))
            else:
                # The last word takes everything from its first token to the
                # end of the sequence.
                row.append(max(token_probs[lo:]))
        # Fixed width: truncate over-long tweets, zero-pad short ones, so the
        # stacked result is always a rectangular array.
        row = row[:len_by_word]
        return row + [0] * (len_by_word - len(row))


    test['text'] = test['text'].astype(str)
    test_loader = torch.utils.data.DataLoader(
        TweetDataset(test), 
        batch_size=32, 
        shuffle=False, 
        num_workers=2)

    start_word_probs = []
    end_word_probs = []
    models = []
    for fold in range(n_splits):
        model = TweetModel()
        model.cuda()
        model.load_state_dict(torch.load(model_path+'/albertQA_finetune_fold%s.pth'%(fold+1)))
        model.eval()
        models.append(model)
        gc.collect()
        
    for data in test_loader:
        ids = data['input_ids'].cuda()
        masks = data['attention_mask'].cuda()
        tweet = data['tweet']
        words_to_tokens_index = data['words_to_tokens_index'].numpy()
        
        start_logits = []
        end_logits = []
        with torch.no_grad():
            for model in models:
                output = model(ids, masks)
                start_logits.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
                end_logits.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())

        # Average the per-fold probabilities.
        start_logits = np.mean(start_logits, axis=0)
        end_logits = np.mean(end_logits, axis=0)        

        for i in range(len(ids)):
            num_words = len(tweet[i].split())
            start_word_probs.append(words_from_tokens(start_logits[i], words_to_tokens_index[i], num_words))
            end_word_probs.append(words_from_tokens(end_logits[i], words_to_tokens_index[i], num_words))
                
    del models, test_loader
    gc.collect()
    
    return (np.array(start_word_probs), np.array(end_word_probs))

# Pytorch XLNET tokens to words

In [None]:
"""
**************************************************************
PyTorch token to word probabilities: XLNet
**************************************************************
"""
PTXLNET_MAX_LEN = 108
TOKENIZER = XLNetTokenizer.from_pretrained(ROOT_PATH+'/input/xlnetbasecased', 
                                                        remove_space=False,
                                                        do_lower_case=True)

def pt_xlnet_prediction_in_word(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=10, model_path='xlnet-finetune/'):
    """Ensemble the PyTorch XLNet fold checkpoints and return word-level span probabilities.

    Every fold model scores each token as span start / span end. The scores
    are soft-maxed over the sequence, averaged over the folds and collapsed to
    one value per whitespace-separated word (maximum over the word's tokens).
    Models and inputs are moved to CUDA.

    Args:
        test: DataFrame with `text` and `sentiment` columns. Its `text`
            column is cast to `str` in place (on the caller's frame).
        len_by_word: Number of word slots in every returned row.
        n_splits: Number of fold checkpoints to load; files
            `XLNetmodel_0.bin` .. `XLNetmodel_<n_splits-1>.bin`.
        model_path: Directory holding the fold checkpoints.

    Returns:
        Tuple `(start_word_probs, end_word_probs)`, two arrays with one row per
        tweet and exactly `len_by_word` columns. Slots past the last word of a
        tweet are 0; words beyond `len_by_word` are dropped; a tweet without
        any words yields an all-zero row.
    """
    # First token id of each sentiment word. Built once here instead of being
    # re-encoded for every single row.
    sentiment_id = {s:TOKENIZER.encode(s)[0] for s in ['positive', 'negative', 'neutral']}

    def process_data(tweet, sentiment, tokenizer, max_len):
        """Encode one tweet into fixed-length arrays plus a word -> first-token index."""
        input_ids = np.ones(max_len)
        masks = np.zeros(max_len)
        token_type_ids = np.zeros(max_len)
        text1 = " " + " ".join(tweet.split())
        enc = tokenizer.encode(text1)
        s_tok = sentiment_id[sentiment]
        # Sequence layout: [0] + tweet ids + [2, 2] + sentiment id + [2];
        # everything after it keeps id 1 and mask 0.
        used = len(enc) + 5
        input_ids[:used] = [0] + enc + [2,2] + [s_tok] + [2]
        masks[:used] = 1

        # First-token index of every word: start after the leading id and
        # advance by the number of pieces each word tokenizes to on its own.
        words_to_tokens_index = np.zeros(max_len)
        token_position = 1
        for i, word in enumerate(text1.split()):
            words_to_tokens_index[i] = token_position
            token_position += len(tokenizer.tokenize(word))    
        
        return {
            'ids': input_ids,
            'masks': masks,
            'token_type_ids': token_type_ids,
            'orig_tweet': tweet,
            'sentiment': sentiment,
            'words_to_tokens_index':words_to_tokens_index
        }

    class TweetDataset:
        """Map-style dataset over parallel arrays of tweets and sentiments."""

        def __init__(self, tweet, sentiment):
            self.tweet = tweet
            self.sentiment = sentiment
            self.tokenizer = TOKENIZER
            self.max_len = PTXLNET_MAX_LEN

        def __len__(self):
            return len(self.tweet)

        def __getitem__(self, item):
            data = process_data(
                self.tweet[item], 
                self.sentiment[item],
                self.tokenizer,
                self.max_len
            )

            return {
                'ids': torch.tensor(data["ids"], dtype=torch.long),
                'masks': torch.tensor(data["masks"], dtype=torch.long),
                'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
                'orig_tweet': data["orig_tweet"],
                'sentiment': data["sentiment"],
                'words_to_tokens_index': torch.tensor(data["words_to_tokens_index"], dtype=torch.long)
            }

    class TweetModel(XLNetLMHeadModel):
        """XLNet encoder with a linear head producing start/end logits per token.

        Attribute names (`xlnet`, `drop_out`, `l0`) are matched against the
        checkpoint keys by load_state_dict() and must not be renamed.
        """

        def __init__(self, conf):
            super(TweetModel, self).__init__(conf)
            self.xlnet = XLNetModel.from_pretrained(ROOT_PATH+"/input/xlnetbasecased/",config=conf)
            self.drop_out = nn.Dropout(0.3)
            self.l0 = nn.Linear(768, 2)
            torch.nn.init.normal_(self.l0.weight, std=0.02)

        def forward(self, ids, masks, token_type_ids):
            _, out = self.xlnet(
                ids,
                attention_mask=masks,
                token_type_ids=token_type_ids
            )

            # Mean of the last three hidden-state layers.
            out = torch.stack((out[-1], out[-2], out[-3]),dim=0)
            out = torch.mean(out, 0)
            out = self.drop_out(out)
            logits = self.l0(out)
            start_logits, end_logits = logits.split(1, dim=-1)
            start_logits = start_logits.squeeze(-1)
            end_logits = end_logits.squeeze(-1)
            return start_logits, end_logits


    def words_from_tokens(token_probs, word_starts, num_words):
        """Collapse one row of token probabilities into exactly len_by_word word slots."""
        row = []
        for j in range(num_words):
            lo = word_starts[j]
            if j < num_words - 1:
                hi = word_starts[j + 1]
                if lo == hi:
                    # Two words mapped to the same token: still look at it.
                    hi += 1
                row.append(max(token_probs[lo:hi]))
            else:
                # The last word takes everything from its first token to the
                # end of the sequence.
                row.append(max(token_probs[lo:]))
        # Fixed width: truncate over-long tweets, zero-pad short ones, so the
        # stacked result is always a rectangular array.
        row = row[:len_by_word]
        return row + [0] * (len_by_word - len(row))


    test['text'] = test['text'].astype(str)
    
    model_config = XLNetConfig.from_pretrained(ROOT_PATH+"/input/xlnetbasecased/config.json")
    model_config.output_hidden_states = True
    test_dataset = TweetDataset(
        tweet=test.text.values,
        sentiment=test.sentiment.values)
    test_loader = torch.utils.data.DataLoader(
            test_dataset, 
            batch_size=32, 
            shuffle=False, 
            num_workers=2)
    
    start_word_probs = []
    end_word_probs = []
    models = []
    for fold in range(n_splits):
        model = TweetModel(conf=model_config)
        model.cuda()
        model.load_state_dict(torch.load(model_path+'/XLNetmodel_%s.bin'%(fold)))
        model.eval()
        models.append(model)
        gc.collect()
        
    for data in test_loader:
        ids = data['ids'].cuda()
        masks = data['masks'].cuda()
        tweet = data['orig_tweet']
        token_type_ids = data['token_type_ids'].cuda()
        words_to_tokens_index = data['words_to_tokens_index'].numpy()

        start_logits = []
        end_logits = []
        with torch.no_grad():
            for model in models:
                output = model(ids, masks, token_type_ids)
                start_logits.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
                end_logits.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())

        # Average the per-fold probabilities.
        start_logits = np.mean(start_logits, axis=0)
        end_logits = np.mean(end_logits, axis=0)        

        for i in range(len(ids)):
            num_words = len(tweet[i].split())
            start_word_probs.append(words_from_tokens(start_logits[i], words_to_tokens_index[i], num_words))
            end_word_probs.append(words_from_tokens(end_logits[i], words_to_tokens_index[i], num_words))
                
    del models, test_loader
    gc.collect()
    
    return (np.array(start_word_probs), np.array(end_word_probs))

# Tensorflow ROBERTA token to word probabilities

In [None]:
"""
**************************************************************
Tensorflow token to word probabilities
**************************************************************
"""

TFROBERTA_MAX_LEN = 96
def tf_roberta_prediction_in_word(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=5, model_path='tf-roberta-finetune/', token_probs=False):
    """Ensemble the TensorFlow RoBERTa folds (`v4-roberta-<fold>.h5`) into span probabilities.

    The start/end outputs of every fold model are averaged. Unless
    `token_probs` is set they are then collapsed to one value per
    whitespace-separated word (maximum over the word's tokens).

    Args:
        test: DataFrame with `text` (str) and `sentiment` columns. Rows are
            read by position, so any index is accepted.
        len_by_word: Number of word slots in every word-level row.
        n_splits: Number of fold weight files to load (`fold` = 0 .. n_splits-1).
        model_path: Directory holding the weight files.
        token_probs: When True return the averaged token-level outputs instead.

    Returns:
        `(preds_start, preds_end)` with TFROBERTA_MAX_LEN columns when
        `token_probs` is True, otherwise `(start_word_probs, end_word_probs)`
        with `len_by_word` columns. In the word-level arrays slots past the
        last word are 0, words beyond `len_by_word` are dropped and a tweet
        without any words yields an all-zero row.
    """

    def bert_data_transform(data, max_length=TFROBERTA_MAX_LEN):
        '''
        Transform data into arrays that BERT understands.

        Returns (input_ids, attention_mask, token_type_ids, words_to_tokens_index);
        the last array holds, per row, the index of the first token of every word.
        '''
        tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file=ROOT_PATH + '/input/tf-roberta/vocab-roberta-base.json', 
            merges_file=ROOT_PATH + '/input/tf-roberta/merges-roberta-base.txt', 
            lowercase=True,
            add_prefix_space=True)

        sentiment_id = {s:tokenizer.encode(s).ids[0] for s in ['positive', 'negative', 'neutral']}

        # Plain lists give positional access independent of the frame's index.
        texts = data['text'].tolist()
        sentiments = data['sentiment'].tolist()

        ct = len(texts)
        input_ids = np.ones((ct, max_length),dtype='int32')
        attention_mask = np.zeros((ct,max_length),dtype='int32')
        token_type_ids = np.zeros((ct,max_length),dtype='int32')
        words_to_tokens_index = np.zeros((ct,max_length),dtype='int32')

        for k in range(ct):
            text1 = " "+" ".join(texts[k].split())
            enc = tokenizer.encode(text1)
            s_tok = sentiment_id[sentiments[k]]
            # Sequence layout: [0] + tweet ids + [2, 2] + sentiment id + [2].
            used = len(enc.ids) + 5
            input_ids[k,:used] = [0] + enc.ids + [2,2] + [s_tok] + [2]
            attention_mask[k,:used] = 1

            # Character offsets of every tweet token, from decoded lengths.
            offsets = []; idx=0
            for t in enc.ids:
                w = tokenizer.decode([t])
                offsets.append((idx,idx+len(w)))
                idx += len(w)            

            # token_position is an index into input_ids (tweet tokens start at
            # 1), hence the -1 when looking up `offsets`. A word ends at
            # char_position; skip every token that starts before it.
            char_position = 0
            token_position = 1
            for i, word in enumerate(text1.split()):
                words_to_tokens_index[k, i] = token_position
                char_position += 1 + len(word)
                while token_position < len(offsets) and offsets[token_position-1][0] < char_position:
                    token_position += 1
        return (input_ids, attention_mask, token_type_ids, words_to_tokens_index)
        

    def build_model(max_length=TFROBERTA_MAX_LEN):
        '''
        Add layer on top of BERT: Conv1D heads for start, end and span outputs.
        '''        
        ids = Input((max_length,), dtype=tf.int32)
        att = Input((max_length,), dtype=tf.int32)
        # Not wired into the model; kept so the set of layers created here
        # (and their auto-generated names) is unchanged.
        tok = Input((max_length,), dtype=tf.int32)

        
        config = RobertaConfig.from_pretrained(ROOT_PATH + '/input/tf-roberta/config-roberta-base.json')
        config.output_hidden_states=True
        bert_model = TFRobertaModel.from_pretrained(ROOT_PATH + '/input/tf-roberta/pretrained-roberta-base.h5', config=config)
        
        x = bert_model(ids,attention_mask=att)
        
        def output_layer(bert_output, name='start', activation='softmax'):
            x_bert = Dropout(0.1)(bert_output)
            x_bert = Conv1D(128, 2,padding='same')(x_bert)
            x_bert = LeakyReLU()(x_bert)
            x_bert = Conv1D(64, 2, padding='same')(x_bert)
            x_bert = Dense(1)(x_bert)                      
            x_output = Flatten()(x_bert)
            x_output = Activation(activation, name=name)(x_output)
             
            return x_output
        
        x1_output = output_layer(x[0], name='start')
        x2_output = output_layer(x[0], name='end')
        x_output = output_layer(x[0], name='span', activation='sigmoid')   
    
        model = Model(inputs=[ids, att], outputs=[x1_output,x2_output, x_output])

        return model
    
        
    input_ids, attention_mask, token_type_ids, words_to_tokens_index = bert_data_transform(test)
    
    preds_start = np.zeros((input_ids.shape[0],input_ids.shape[1]))
    preds_end = np.zeros((input_ids.shape[0],input_ids.shape[1]))

    for fold in range(n_splits):
        K.clear_session()
        model = build_model()
        
        model.load_weights(model_path+'/v4-roberta-%i.h5'%(fold))
        preds = model.predict([input_ids,attention_mask])
        # Running average over the folds.
        preds_start += preds[0]/n_splits
        preds_end += preds[1]/n_splits
        
        del model
        gc.collect()

    # Token-level callers do not need the word aggregation at all.
    if token_probs:
        return (preds_start, preds_end)
        
    start_word_probs = np.zeros((input_ids.shape[0],len_by_word))
    end_word_probs = np.zeros((input_ids.shape[0],len_by_word))
    texts = test['text'].tolist()
    for i in range(input_ids.shape[0]):
        # Never write past the fixed number of word slots.
        num_words = min(len(texts[i].split()), len_by_word)
        if num_words == 0:
            continue
        for j in range(num_words-1):
            start_idx = words_to_tokens_index[i, j]
            end_idx = words_to_tokens_index[i, j+1]
            if start_idx==end_idx:
                # Two words mapped to the same token: still look at it.
                end_idx += 1
            start_word_probs[i, j] = max(preds_start[i, start_idx:end_idx])
            end_word_probs[i, j] = max(preds_end[i, start_idx:end_idx])
        # The last word spans from ITS OWN first token to the end of the
        # sequence. Indexing with the loop variable `j` here would pick the
        # previous word's token (or a stale/undefined `j` for one-word tweets).
        last_start = words_to_tokens_index[i, num_words-1]
        start_word_probs[i, num_words-1] = max(preds_start[i, last_start:])
        end_word_probs[i, num_words-1] = max(preds_end[i, last_start:])
    
    return (start_word_probs, end_word_probs)

In [None]:
"""
**************************************************************
Tensorflow token to word probabilities (ALEX)
**************************************************************
"""

TFROBERTA_MAX_LEN = 96
def tf_roberta_prediction_in_word_2(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=5, model_path='tf-roberta-finetune/', token_probs=False):
    """Ensemble the TensorFlow RoBERTa folds (`reg_0_05-roberta-<fold>.h5`) into span probabilities.

    Same pipeline as the other TensorFlow variant in this notebook, but with a
    1x1 Conv1D head per output and `token_type_ids` fed to the model. The
    start/end outputs of every fold model are averaged. Unless `token_probs`
    is set they are then collapsed to one value per whitespace-separated word
    (maximum over the word's tokens).

    Args:
        test: DataFrame with `text` (str) and `sentiment` columns. Rows are
            read by position, so any index is accepted.
        len_by_word: Number of word slots in every word-level row.
        n_splits: Number of fold weight files to load (`fold` = 0 .. n_splits-1).
        model_path: Directory holding the weight files.
        token_probs: When True return the averaged token-level outputs instead.

    Returns:
        `(preds_start, preds_end)` with TFROBERTA_MAX_LEN columns when
        `token_probs` is True, otherwise `(start_word_probs, end_word_probs)`
        with `len_by_word` columns. In the word-level arrays slots past the
        last word are 0, words beyond `len_by_word` are dropped and a tweet
        without any words yields an all-zero row.
    """

    def bert_data_transform(data, max_length=TFROBERTA_MAX_LEN):
        '''
        Transform data into arrays that BERT understands.

        Returns (input_ids, attention_mask, token_type_ids, words_to_tokens_index);
        the last array holds, per row, the index of the first token of every word.
        '''
        tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file=ROOT_PATH + '/input/tf-roberta/vocab-roberta-base.json', 
            merges_file=ROOT_PATH + '/input/tf-roberta/merges-roberta-base.txt', 
            lowercase=True,
            add_prefix_space=True)

        sentiment_id = {s:tokenizer.encode(s).ids[0] for s in ['positive', 'negative', 'neutral']}

        # Plain lists give positional access independent of the frame's index.
        texts = data['text'].tolist()
        sentiments = data['sentiment'].tolist()

        ct = len(texts)
        input_ids = np.ones((ct, max_length),dtype='int32')
        attention_mask = np.zeros((ct,max_length),dtype='int32')
        token_type_ids = np.zeros((ct,max_length),dtype='int32')
        words_to_tokens_index = np.zeros((ct,max_length),dtype='int32')

        for k in range(ct):
            text1 = " "+" ".join(texts[k].split())
            enc = tokenizer.encode(text1)
            s_tok = sentiment_id[sentiments[k]]
            # Sequence layout: [0] + tweet ids + [2, 2] + sentiment id + [2].
            used = len(enc.ids) + 5
            input_ids[k,:used] = [0] + enc.ids + [2,2] + [s_tok] + [2]
            attention_mask[k,:used] = 1

            # Character offsets of every tweet token, from decoded lengths.
            offsets = []; idx=0
            for t in enc.ids:
                w = tokenizer.decode([t])
                offsets.append((idx,idx+len(w)))
                idx += len(w)            

            # token_position is an index into input_ids (tweet tokens start at
            # 1), hence the -1 when looking up `offsets`. A word ends at
            # char_position; skip every token that starts before it.
            char_position = 0
            token_position = 1
            for i, word in enumerate(text1.split()):
                words_to_tokens_index[k, i] = token_position
                char_position += 1 + len(word)
                while token_position < len(offsets) and offsets[token_position-1][0] < char_position:
                    token_position += 1
        return (input_ids, attention_mask, token_type_ids, words_to_tokens_index)
        

    def build_model(max_length=TFROBERTA_MAX_LEN):
        '''
        Add layer on top of BERT: one 1x1 Conv1D + softmax head each for start and end.
        '''        
        ids = Input((max_length,), dtype=tf.int32)
        att = Input((max_length,), dtype=tf.int32)
        tok = Input((max_length,), dtype=tf.int32)

        
        config = RobertaConfig.from_pretrained(ROOT_PATH + '/input/tf-roberta/config-roberta-base.json')
        config.output_hidden_states=True
        bert_model = TFRobertaModel.from_pretrained(ROOT_PATH + '/input/tf-roberta/pretrained-roberta-base.h5', config=config)
        
        
        x = bert_model(ids,attention_mask=att,token_type_ids=tok)
    
        x1 = tf.keras.layers.Dropout(0.1)(x[0]) 
        x1 = tf.keras.layers.Conv1D(1,1)(x1)
        x1 = tf.keras.layers.Flatten()(x1)
        x1 = tf.keras.layers.Activation('softmax')(x1)
    
        x2 = tf.keras.layers.Dropout(0.1)(x[0]) 
        x2 = tf.keras.layers.Conv1D(1,1)(x2)
        x2 = tf.keras.layers.Flatten()(x2)
        x2 = tf.keras.layers.Activation('softmax')(x2)
    
        model = Model(inputs=[ids, att, tok], outputs=[x1, x2])

        return model
    
        
    input_ids, attention_mask, token_type_ids, words_to_tokens_index = bert_data_transform(test)
    
    preds_start = np.zeros((input_ids.shape[0],input_ids.shape[1]))
    preds_end = np.zeros((input_ids.shape[0],input_ids.shape[1]))

    for fold in range(n_splits):
        K.clear_session()
        model = build_model()
        
        model.load_weights(model_path+'/reg_0_05-roberta-%i.h5'%(fold))
        preds = model.predict([input_ids,attention_mask, token_type_ids])
        # Running average over the folds.
        preds_start += preds[0]/n_splits
        preds_end += preds[1]/n_splits
        
        del model
        gc.collect()

    # Token-level callers do not need the word aggregation at all.
    if token_probs:
        return (preds_start, preds_end)
        
    start_word_probs = np.zeros((input_ids.shape[0],len_by_word))
    end_word_probs = np.zeros((input_ids.shape[0],len_by_word))
    texts = test['text'].tolist()
    for i in range(input_ids.shape[0]):
        # Never write past the fixed number of word slots.
        num_words = min(len(texts[i].split()), len_by_word)
        if num_words == 0:
            continue
        for j in range(num_words-1):
            start_idx = words_to_tokens_index[i, j]
            end_idx = words_to_tokens_index[i, j+1]
            if start_idx==end_idx:
                # Two words mapped to the same token: still look at it.
                end_idx += 1
            start_word_probs[i, j] = max(preds_start[i, start_idx:end_idx])
            end_word_probs[i, j] = max(preds_end[i, start_idx:end_idx])
        # The last word spans from ITS OWN first token to the end of the
        # sequence. Indexing with the loop variable `j` here would pick the
        # previous word's token (or a stale/undefined `j` for one-word tweets).
        last_start = words_to_tokens_index[i, num_words-1]
        start_word_probs[i, num_words-1] = max(preds_start[i, last_start:])
        end_word_probs[i, num_words-1] = max(preds_end[i, last_start:])
            
    return (start_word_probs, end_word_probs)

In [None]:
def is_connect(l):
    """Tell whether `l` is one of the word-joining characters: ' - _"""
    joiners = frozenset("'-_")
    return l in joiners

def extend_select(tokenizer, predicted_start, predicted_end, original_text, extend_threshold=0):
    """Decode a predicted token span and grow it to the surrounding word pieces.

    ``original_text`` is re-encoded and the slice
    ``enc.ids[predicted_start-1:predicted_end]`` is decoded. The result is then
    adjusted at both edges:

    * Right edge: if the token just after the span does not start with a space
      and is glued to the span (both touching characters alphanumeric, or
      either one a connector according to ``is_connect``), that token is
      appended.
    * Hashtag: if the token just before the span ends with ``#`` and the span
      starts with an alphanumeric character, ``#`` is prepended and the result
      is returned immediately.
    * Left edge: otherwise, if the token just before the span is glued to it by
      the same rule, that token is prepended.

    Args:
        tokenizer: object exposing ``encode(text)`` (result has ``.ids``) and
            ``decode(ids)``.
        predicted_start: start position. The code treats it as shifted by one
            relative to ``enc.ids`` (presumably because the model input has a
            leading special token -- TODO confirm). A value of 0 is clamped to
            the first token instead of wrapping around to the end of the list.
        predicted_end: end position, used as the exclusive slice end into
            ``enc.ids``.
        original_text: the text to re-encode.
        extend_threshold: a neighbouring token is added only when
            ``len(edge_token) / (len(edge_token) + len(neighbour))`` is
            strictly greater than this value; otherwise the edge token is
            dropped instead.

    Returns:
        The decoded (possibly extended or trimmed) text. May be an empty string
        when the requested span selects no tokens.
    """
    a = predicted_start
    b = predicted_end

    enc = tokenizer.encode(original_text)
    ids = enc.ids

    # Clamp so that predicted_start == 0 does not become a negative slice start
    # (ids[-1:b] would silently select from the END of the sequence).
    first = max(a - 1, 0)
    res = tokenizer.decode(ids[first:b])

    if b < len(ids):
        next_wd = tokenizer.decode(ids[b:(b+1)])
        # res[-1:] (rather than res[-1]) keeps an empty span from raising.
        if next_wd != '' and next_wd[0] != ' ' and (
            (next_wd[0].isalnum() and res[-1:].isalnum())
            or is_connect(next_wd[0])
            or is_connect(res[-1:])
        ):
            last_wd = tokenizer.decode(ids[(b-1):b])

            if len(last_wd)/(len(last_wd) + len(next_wd)) > extend_threshold:
                res = res + next_wd
            else:
                # Drop the partial last token; clamp to avoid a negative end.
                res = tokenizer.decode(ids[first:max(b-1, 0)])

    if a > 1:
        prev_wd = tokenizer.decode(ids[(a-2):(a-1)])

        # Restore a hashtag marker that was left just outside the span.
        if res[:1].isalnum() and prev_wd.endswith('#'):
            return '#' + res

        if prev_wd != '' and prev_wd[-1] != ' ' and res[:1] != ' ' and (
            (prev_wd[-1].isalnum() and res[:1].isalnum())
            or is_connect(prev_wd[-1])
            or is_connect(res[:1])
        ):
            first_wd = tokenizer.decode(ids[(a-1):a])

            if len(first_wd)/(len(first_wd) + len(prev_wd)) > extend_threshold:
                res = prev_wd + res
            else:
                # NOTE(review): this drops the first token AND the last token
                # (end is b-1) and discards any right-edge extension made
                # above -- kept as in the original; confirm this is intended.
                res = tokenizer.decode(ids[a:max(b-1, 0)])

    return res

def predict_decode(tokenizer, text_data, vec_start, vec_end, vec_idx):
    """Turn per-row start/end probabilities into selected-text strings.

    For every ``k`` in ``vec_idx`` the start and end positions are the argmax
    of ``vec_start[k]`` and ``vec_end[k]``. If the start lies after the end the
    row's full ``text`` is returned unchanged; otherwise the text is
    whitespace-normalised, prefixed with a single space, and handed to
    ``extend_select`` together with the two positions.

    Args:
        tokenizer: tokenizer forwarded to ``extend_select``.
        text_data: DataFrame with a ``text`` column, addressed via ``.loc[k]``.
        vec_start: 2-D array of start probabilities, indexed by ``k``.
        vec_end: 2-D array of end probabilities, indexed by ``k``.
        vec_idx: iterable of row labels; each is used both as a ``.loc`` label
            and as a row index into the probability arrays.

    Returns:
        list[str]: one selected text per entry of ``vec_idx``, in order.
    """
    all_selected_text = []
    for k in vec_idx:
        a = np.argmax(vec_start[k,])
        b = np.argmax(vec_end[k,])
        raw_text = text_data.loc[k,'text']
        if a>b:
            # Inconsistent prediction: fall back to the whole tweet.
            st = raw_text
        else:
            # extend_select tokenizes the text itself, so no encode() here.
            text1 = " "+" ".join(raw_text.split())
            st = extend_select(tokenizer, a, b, text1)
        all_selected_text.append(st)
    return all_selected_text

# Validate on train fold

In [None]:
# train_fd = pd.read_csv(ROOT_PATH+'/input/tweet-train-folds-v2/train_folds.csv')

In [None]:
# train_fd.head()

In [None]:
# test = train_fd.sample(3, random_state=777).reset_index(drop=True)
# test
#pt_roberta_start_word_probs, pt_roberta_end_word_probs = pt_roberta_prediction_in_word(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=1, model_path=ROOT_PATH+'/input/ptrobertafinetune716')
#tf_roberta_start_word_probs, tf_roberta_end_word_probs = tf_roberta_prediction_in_word_2(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=1, model_path=ROOT_PATH+'/input/alex-roberta')
#pt_albert_start_word_probs, pt_albert_end_word_probs = pt_albert_prediction_in_word(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=1, model_path=ROOT_PATH+'/input/albert-base')
#pt_xlnet_start_word_probs, pt_xlnet_end_word_probs = pt_xlnet_prediction_in_word(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=1, model_path=ROOT_PATH+'/input/xlnet-base-models')

In [None]:
# start_1, end_1 = tf_roberta_prediction_in_word_2(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=1, model_path=ROOT_PATH+'/input/alex-roberta', token_probs=True)
# start_2, end_2 = tf_roberta_prediction_in_word(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=1, model_path=ROOT_PATH+'/input/tfrobertafinetuneaddspan711', token_probs=True)

In [None]:
# r=0.5
# start_probs = np.zeros((test.shape[0],TFROBERTA_MAX_LEN),dtype=np.float32)
# end_probs = np.zeros((test.shape[0],TFROBERTA_MAX_LEN),dtype=np.float32)

# start_probs += r*start_1
# start_probs += (1-r)*start_2

# end_probs += r*end_1
# end_probs += (1-r)*end_2

In [None]:
# tokenizer = tokenizers.ByteLevelBPETokenizer(
#         vocab_file=ROOT_PATH + '/input/tf-roberta/vocab-roberta-base.json', 
#         merges_file=ROOT_PATH + '/input/tf-roberta/merges-roberta-base.txt', 
#         lowercase=True,
#         add_prefix_space=True)

# predict_decode(tokenizer, test, start_probs, end_probs, test.index)

In [None]:
# train = train_fd[train_fd['kfold']==0].copy().reset_index(drop=True)    
# train = train.loc[27:29, ].reset_index(drop=True)
# pt_albert_start_word_probs, pt_albert_end_word_probs = pt_albert_prediction_in_word(train, len_by_word=TEXT_LEN_BY_WORD, n_splits=1, model_path=ROOT_PATH+'/input/albert-base')

In [None]:
#train.loc[1, 'text']

In [None]:
# # # #for k in range(5):
# k=0
# print('>>>> FOLD %i >>>>>>>'%k)

# train = train_fd[train_fd['kfold']==k].copy().reset_index(drop=True)    
# start_word_probs_1, end_word_probs_1 = tf_roberta_prediction_in_word_2(train, len_by_word=TEXT_LEN_BY_WORD, n_splits=1, model_path=ROOT_PATH+'/input/alex-roberta')
# start_word_probs_2, end_word_probs_2 = tf_roberta_prediction_in_word(train, len_by_word=TEXT_LEN_BY_WORD, n_splits=5, model_path=ROOT_PATH+'/input/tfrobertafinetuneaddspan711')

# selelcted_text_1 = selected_text_by_word_probs(train, start_word_probs_1, end_word_probs_1)
# selelcted_text_2 = selected_text_by_word_probs(train, start_word_probs_2, end_word_probs_2)

# start_word_probs = np.zeros((train.shape[0],TEXT_LEN_BY_WORD),dtype=np.float32)
# end_word_probs = np.zeros((train.shape[0],TEXT_LEN_BY_WORD),dtype=np.float32)

# start_word_probs += r*start_word_probs_1
# start_word_probs += (1-r)*start_word_probs_2

# end_word_probs += r*end_word_probs_1
# end_word_probs += (1-r)*end_word_probs_2

# all_selected_text = selected_text_by_word_probs(train, start_word_probs, end_word_probs)

# jac_1 = []
# jac_2 = []
# ensemble_jac = []
# for i in range(train.shape[0]):
#     jac_1.append(jaccard(train.loc[i, 'selected_text'], selelcted_text_1[i]))
#     jac_2.append(jaccard(train.loc[i, 'selected_text'], selelcted_text_2[i]))
#     ensemble_jac.append(jaccard(train.loc[i, 'selected_text'], all_selected_text[i]))

# print('>>>> jaccard 1 =', np.mean(jac_1))
# print('>>>> jaccard 2 =', np.mean(jac_2))
# print('>>>> ensemble jaccard =', np.mean(ensemble_jac))

In [None]:
# tokenizer = tokenizers.ByteLevelBPETokenizer(
#         vocab_file=ROOT_PATH + '/input/tf-roberta/vocab-roberta-base.json', 
#         merges_file=ROOT_PATH + '/input/tf-roberta/merges-roberta-base.txt', 
#         lowercase=True,
#         add_prefix_space=True)

# # # #for k in range(5):
# k=0
# print('>>>> FOLD %i >>>>>>>'%k)

# train = train_fd[train_fd['kfold']==k].copy().reset_index(drop=True)    
# start_1, end_1 = tf_roberta_prediction_in_word_2(train, len_by_word=TEXT_LEN_BY_WORD, n_splits=1, model_path=ROOT_PATH+'/input/alex-roberta', token_probs=True)
# start_2, end_2 = tf_roberta_prediction_in_word(train, len_by_word=TEXT_LEN_BY_WORD, n_splits=5, model_path=ROOT_PATH+'/input/tfrobertafinetuneaddspan711', token_probs=True)

# selelcted_text_1 = predict_decode(tokenizer,train, start_1, end_1, train.index)
# selelcted_text_2 = predict_decode(tokenizer,train, start_2, end_2, train.index)

# jac_1 = []
# jac_2 = []

# for i in range(train.shape[0]):
#     jac_1.append(jaccard(train.loc[i, 'selected_text'], selelcted_text_1[i]))
#     jac_2.append(jaccard(train.loc[i, 'selected_text'], selelcted_text_2[i]))
    
# print('>>>> jaccard 1 =', np.mean(jac_1))
# print('>>>> jaccard 2 =', np.mean(jac_2))

In [None]:
# r=0.1594

In [None]:
# start_probs = np.zeros((train.shape[0],TFROBERTA_MAX_LEN),dtype=np.float32)
# end_probs = np.zeros((train.shape[0],TFROBERTA_MAX_LEN),dtype=np.float32)

# start_probs += r*start_1
# start_probs += (1-r)*start_2

# end_probs += r*end_1
# end_probs += (1-r)*end_2

# all_selected_text = predict_decode(tokenizer,train, start_probs, end_probs, train.index)

# ensemble_jac = []
# for i in range(train.shape[0]):
#     ensemble_jac.append(jaccard(train.loc[i, 'selected_text'], all_selected_text[i]))

# print('>>>> ensemble jaccard =', np.mean(ensemble_jac))

# Submission

In [None]:
# Blend weight for the weighted-average ensemble (r * model 1 + (1 - r) * model 2)
# used in the commented-out cells; the variance-based selection does not read it.
r = 0.5

In [None]:
# Byte-level BPE tokenizer built from the RoBERTa vocab/merges files;
# it is the tokenizer handed to predict_decode when writing the submission.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=ROOT_PATH + '/input/tf-roberta/vocab-roberta-base.json',
    merges_file=ROOT_PATH + '/input/tf-roberta/merges-roberta-base.txt',
    lowercase=True,
    add_prefix_space=True,
)

In [None]:
# """
# **************************************************************
# Ensemble
# **************************************************************
# """
# test = pd.read_csv(ROOT_PATH+'/input/tweet-sentiment-extraction/test.csv')
# #test = test.sample(3, random_state=SEED).reset_index(drop=True)

# start_1, end_1 = tf_roberta_prediction_in_word_2(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=1, model_path=ROOT_PATH+'/input/alex-roberta', token_probs=True)
# start_2, end_2 = tf_roberta_prediction_in_word(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=5, model_path=ROOT_PATH+'/input/tfrobertafinetuneaddspan711', token_probs=True)

# start_word_probs = np.zeros((test.shape[0],TEXT_LEN_BY_WORD),dtype=np.float32)
# end_word_probs = np.zeros((test.shape[0],TEXT_LEN_BY_WORD),dtype=np.float32)

# start_word_probs += r*start_word_probs_1
# start_word_probs += (1-r)*start_word_probs_2

# end_word_probs += r*end_word_probs_1
# end_word_probs += (1-r)*end_word_probs_2


# all_selected_text = predict_decode(tokenizer,test, start_probs, end_probs, test.index)

# test['selected_text'] = all_selected_text
# test[['textID','selected_text']].to_csv('submission.csv',index=False)

In [None]:
# """
# **************************************************************
# Ensemble
# **************************************************************
# """
# test = pd.read_csv(ROOT_PATH+'/input/tweet-sentiment-extraction/test.csv')
# #test = test.sample(3, random_state=SEED).reset_index(drop=True)

# start_1, end_1 = tf_roberta_prediction_in_word_2(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=1, model_path=ROOT_PATH+'/input/alex-roberta', token_probs=True)
# start_2, end_2 = tf_roberta_prediction_in_word(test, len_by_word=TEXT_LEN_BY_WORD, n_splits=5, model_path=ROOT_PATH+'/input/tfrobertafinetuneaddspan711', token_probs=True)

# start_probs = np.zeros((test.shape[0],TFROBERTA_MAX_LEN),dtype=np.float32)
# end_probs = np.zeros((test.shape[0],TFROBERTA_MAX_LEN),dtype=np.float32)

# start_probs += r*start_1
# start_probs += (1-r)*start_2

# end_probs += r*end_1
# end_probs += (1-r)*end_2


# all_selected_text = predict_decode(tokenizer,test, start_probs, end_probs, test.index)

# test['selected_text'] = all_selected_text
# test[['textID','selected_text']].to_csv('submission.csv',index=False)

In [None]:
"""
**************************************************************
Ensemble
**************************************************************
"""
# Competition test set scored by both models.
test = pd.read_csv(ROOT_PATH+'/input/tweet-sentiment-extraction/test.csv')
#test = test.sample(3, random_state=SEED).reset_index(drop=True)

# Model 1: token_probs=True makes the function return the fold-averaged
# token-level start/end probabilities instead of the word-level ones.
start_1, end_1 = tf_roberta_prediction_in_word_2(
    test,
    len_by_word=TEXT_LEN_BY_WORD,
    n_splits=8,
    model_path=ROOT_PATH+'/input/reg-0-05',
    token_probs=True,
)

# Model 2: same call shape (presumably the same return convention -- confirm
# against tf_roberta_prediction_in_word).
start_2, end_2 = tf_roberta_prediction_in_word(
    test,
    len_by_word=TEXT_LEN_BY_WORD,
    n_splits=5,
    model_path=ROOT_PATH+'/input/tfrobertafinetuneaddspan711',
    token_probs=True,
)

In [None]:
# Per-row ensemble: for every tweet keep the start (and, independently, the end)
# distribution of exactly one model, then decode and write the submission.
start_probs = np.zeros((test.shape[0],TFROBERTA_MAX_LEN),dtype=np.float32)
end_probs = np.zeros((test.shape[0],TFROBERTA_MAX_LEN),dtype=np.float32)


for i in range(test.shape[0]):
    # Model 1 wins the row when 0.8 * var(model 1) >= var(model 2)
    # (presumably variance is used as a proxy for how peaked the distribution is).
    # Assign into row i: rebinding the whole array here would make the LAST
    # row's comparison decide which model is used for every tweet.
    if 0.8*np.var(start_1[i,:]) >= np.var(start_2[i,:]):
        start_probs[i, :] = start_1[i, :]
    else:
        start_probs[i, :] = start_2[i, :]

    if 0.8*np.var(end_1[i,:]) >= np.var(end_2[i,:]):
        end_probs[i, :] = end_1[i, :]
    else:
        end_probs[i, :] = end_2[i, :]


all_selected_text = predict_decode(tokenizer,test, start_probs, end_probs, test.index)

test['selected_text'] = all_selected_text
test[['textID','selected_text']].to_csv('submission.csv',index=False)

In [None]:
# probs_var = np.zeros((test.shape[0],4),dtype=np.float32)
# for i in range(test.shape[0]):
#     probs_var[i, 0] = np.var(start_1[i,])
#     probs_var[i, 1] = np.var(start_2[i,])
#     probs_var[i, 2] = np.var(end_1[i,])
#     probs_var[i, 3] = np.var(end_2[i,])

In [None]:
# start_1[:5,]

In [None]:
# probs_var[:15,]