# Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import warnings
import random
import torch 
from torch import nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
import tokenizers
from transformers import RobertaModel, RobertaConfig

warnings.filterwarnings('ignore')

# Seed

In [None]:
def seed_everything(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch, exports
    ``PYTHONHASHSEED`` and, when CUDA is available, seeds the CUDA
    generators and configures cuDNN for reproducible runs.

    Args:
        seed_value: Seed applied to every generator.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Python reads PYTHONHASHSEED at interpreter start-up, so this only
    # influences processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # benchmark=True lets cuDNN auto-tune and pick a different kernel
        # from run to run, which works against the deterministic flag above.
        torch.backends.cudnn.benchmark = False

# Fixed seed, applied once up front to every generator seed_everything covers.
seed = 42
seed_everything(seed)

# Data Loader

In [None]:
class TweetDataset(torch.utils.data.Dataset):
    """Dataset that turns tweet rows into token-level model inputs.

    Every item is a dict with the token ids (``ids``), the attention mask
    (``masks``), the whitespace-normalised tweet (``tweet``), the per-token
    character offsets (``offsets``), the sentiment string (``sentiment``)
    and the row's ``textID`` (``ID``).  When the frame has a
    ``selected_text`` column the item also holds ``start_idx`` and
    ``end_idx``: the first and last token positions of the labelled span.

    Args:
        df: DataFrame with ``text``, ``sentiment`` and ``textID`` columns
            and, optionally, ``selected_text``.
        max_len: Length that shorter id sequences are padded up to.
            Longer sequences are left untouched (nothing is truncated).
    """

    def __init__(self, df, max_len=105):
        self.df = df
        self.max_len = max_len
        # Targets are only produced when the frame carries the label column.
        self.labeled = 'selected_text' in df
        self.tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file='../input/robertalarge/vocab.json',
            merges_file='../input/robertalarge/merges.txt',
            lowercase=True,
            add_prefix_space=True)

    def __getitem__(self, index):
        """Return the input dict (plus targets when labelled) for one row."""
        row = self.df.iloc[index]

        ids, masks, tweet, offsets, sentiment, ID_ = self.get_input_data(row)
        data = {
            'ids': ids,
            'masks': masks,
            'tweet': tweet,
            'offsets': offsets,
            'sentiment': sentiment,
            'ID': ID_,
        }

        if self.labeled:
            start_idx, end_idx = self.get_target_idx(row, tweet, offsets)
            data['start_idx'] = start_idx
            data['end_idx'] = end_idx

        return data

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def get_input_data(self, row):
        """Tokenise one row.

        Returns:
            ``(ids, masks, tweet, offsets, sentiment, ID_)`` where ``ids``,
            ``masks`` and ``offsets`` are tensors and the rest are taken
            from (or derived from) the row.
        """
        # Lower-case, collapse whitespace runs and prepend a single space.
        tweet = " " + " ".join(row.text.lower().split())
        encoding = self.tokenizer.encode(tweet)
        sentiment_id = self.tokenizer.encode(row.sentiment).ids
        # Layout: 0, sentiment ids, 2, 2, tweet ids, 2.
        # NOTE(review): 0 / 2 / 1 are presumably the <s> / </s> / <pad> ids
        # of this vocabulary -- confirm against vocab.json.
        ids = [0] + sentiment_id + [2, 2] + encoding.ids + [2]
        # Four zero offsets for the prefix: this lines up with ``ids`` only
        # if the sentiment encodes to exactly one token -- TODO confirm.
        offsets = [(0, 0)] * 4 + encoding.offsets + [(0, 0)]

        pad_len = self.max_len - len(ids)
        if pad_len > 0:
            ids += [1] * pad_len
            offsets += [(0, 0)] * pad_len

        ids = torch.tensor(ids)
        # Mask is 1 everywhere except on padding id 1.
        masks = torch.where(ids != 1, torch.tensor(1), torch.tensor(0))
        offsets = torch.tensor(offsets)

        sentiment = row.sentiment
        ID_ = row.textID

        return ids, masks, tweet, offsets, sentiment, ID_

    def get_target_idx(self, row, tweet, offsets):
        """Locate the labelled span in token space.

        Args:
            row: Row providing ``selected_text``.
            tweet: Normalised tweet as returned by ``get_input_data``.
            offsets: Per-token ``(start, end)`` character offsets into
                ``tweet``.

        Returns:
            ``(start_idx, end_idx)``: positions of the first and the last
            token whose character range overlaps the first occurrence of
            the normalised ``selected_text`` in ``tweet``.

        Raises:
            IndexError: If no token overlaps the selected text (it is
                empty, or does not occur in ``tweet``).
        """
        selected = " ".join(row.selected_text.lower().split())
        # str.find returns the first match, like the manual scan it replaces.
        first_char = tweet.find(selected) if selected else -1

        char_targets = [0] * len(tweet)
        if first_char != -1:
            char_targets[first_char:first_char + len(selected)] = [1] * len(selected)

        target_idx = [
            j for j, (offset1, offset2) in enumerate(offsets)
            if any(char_targets[offset1:offset2])
        ]
        if not target_idx:
            # Same exception type as the bare list lookup raised before,
            # but with enough context to find the offending row.
            raise IndexError(
                f"selected_text {selected!r} does not overlap any token "
                f"of tweet {tweet!r}")

        return target_idx[0], target_idx[-1]
        
def get_train_val_loaders(df, train_idx, val_idx, batch_size=8):
    """Build the training and validation loaders for one split.

    Args:
        df: Full DataFrame the positional indices refer to.
        train_idx: Positional row indices of the training part.
        val_idx: Positional row indices of the validation part.
        batch_size: Batch size used by both loaders.

    Returns:
        Dict with a shuffled ``"train"`` loader that drops the last
        incomplete batch and an unshuffled ``"val"`` loader.
    """
    frames = {"train": df.iloc[train_idx], "val": df.iloc[val_idx]}

    # Only the training loader shuffles and drops the trailing partial batch.
    return {
        split: torch.utils.data.DataLoader(
            TweetDataset(frame),
            batch_size=batch_size,
            shuffle=(split == "train"),
            num_workers=2,
            drop_last=(split == "train"))
        for split, frame in frames.items()
    }

def get_test_loader(df, batch_size=32):
    """Wrap ``df`` in a ``TweetDataset`` and return an unshuffled loader.

    Args:
        df: DataFrame handed to ``TweetDataset``.
        batch_size: Number of rows per batch.
    """
    dataset = TweetDataset(df)
    return torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=2)

# Model

In [None]:
class roberta_large_one_layer(nn.Module):
    """Encoder from ``../input/robertalarge`` with one linear span head.

    The last four entries of the encoder's hidden-state output are
    concatenated along the feature axis, passed through dropout and
    projected to two values per token: a start logit and an end logit.
    """

    def __init__(self):
        super(roberta_large_one_layer, self).__init__()

        config = RobertaConfig.from_pretrained(
            '../input/robertalarge/config.json', output_hidden_states=True)
        self.roberta = RobertaModel.from_pretrained(
            '../input/robertalarge/pytorch_model.bin', config=config)
        self.dropout = nn.Dropout(0.5)
        # Four concatenated hidden states in, (start, end) logits out.
        self.fc = nn.Linear(config.hidden_size * 4, 2)

        nn.init.normal_(self.fc.weight, std=0.02)
        # NOTE(review): only the mean is given here, so the bias is drawn
        # with nn.init.normal_'s default std -- confirm this is intended.
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)`` for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.

        Returns:
            Two tensors with the trailing size-1 axis squeezed away
            (presumably ``(batch, seq_len)`` each -- TODO confirm).
        """
        outputs = self.roberta(input_ids, attention_mask)
        # Index rather than tuple-unpack: position 2 is valid both for a
        # plain 3-tuple and for a ModelOutput object, whereas unpacking a
        # ModelOutput iterates over its keys.
        hidden_states = outputs[2]

        x = torch.cat(
            (hidden_states[-1], hidden_states[-2],
             hidden_states[-3], hidden_states[-4]),
            dim=-1)
        x = self.dropout(x)
        x = self.fc(x)

        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [None]:
class roberta_large_three_layer(nn.Module):
    """Encoder from ``../input/robertalarge`` with a two-linear-layer head.

    The last four entries of the encoder's hidden-state output are
    concatenated along the feature axis and passed through dropout, a
    linear layer to 1000 features, LeakyReLU and a final linear layer
    that yields a start logit and an end logit per token.
    """

    def __init__(self):
        super(roberta_large_three_layer, self).__init__()

        config = RobertaConfig.from_pretrained(
            '../input/robertalarge/config.json', output_hidden_states=True)
        self.roberta = RobertaModel.from_pretrained(
            '../input/robertalarge/pytorch_model.bin', config=config)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(config.hidden_size * 4, 1000)
        self.rl = nn.LeakyReLU()
        self.fc2 = nn.Linear(1000, 2)

        nn.init.normal_(self.fc.weight, std=0.02)
        # NOTE(review): only the mean is given here, so the bias is drawn
        # with nn.init.normal_'s default std -- confirm this is intended.
        nn.init.normal_(self.fc.bias, 0)
        # fc2.bias keeps nn.Linear's default initialisation.
        nn.init.normal_(self.fc2.weight, std=0.02)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)`` for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.

        Returns:
            Two tensors with the trailing size-1 axis squeezed away
            (presumably ``(batch, seq_len)`` each -- TODO confirm).
        """
        outputs = self.roberta(input_ids, attention_mask)
        # Index rather than tuple-unpack: position 2 is valid both for a
        # plain 3-tuple and for a ModelOutput object, whereas unpacking a
        # ModelOutput iterates over its keys.
        hidden_states = outputs[2]

        x = torch.cat(
            (hidden_states[-1], hidden_states[-2],
             hidden_states[-3], hidden_states[-4]),
            dim=-1)
        x = self.dropout(x)
        x = self.fc(x)
        x = self.rl(x)
        x = self.fc2(x)

        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [None]:
class robert_2_fc_model(nn.Module):
    """Encoder from ``../input/roberta-base`` with a two-linear-layer head.

    The last four entries of the encoder's hidden-state output are
    concatenated along the feature axis and passed through dropout, a
    linear layer to 1000 features, LeakyReLU and a final linear layer
    that yields a start score and an end score per token.

    NOTE(review): the inference cell in this file calls objects returned
    by ``torch.load(path)`` as ``model(ids, None, masks)``, which matches
    this class's ``forward``.  If those files are pickled instances of
    this class, the attribute names below must stay as they are.
    """

    def __init__(self):
        super(robert_2_fc_model, self).__init__()

        config = RobertaConfig.from_pretrained('../input/roberta-base/config.json')
        config.output_hidden_states = True

        # Use the imported RobertaModel name: the ``transformers`` module
        # itself is never imported in this file, so going through it
        # raised NameError.
        self.roberta = RobertaModel.from_pretrained(
            '../input/roberta-base/pytorch_model.bin', config=config)
        self.drop_out = nn.Dropout(0.5)

        # Four concatenated hidden states in; the width comes from the
        # loaded config instead of a hard-coded 768.
        self.fc1 = nn.Linear(config.hidden_size * 4, 1000)
        self.rl1 = nn.LeakyReLU()
        self.fc2 = nn.Linear(1000, 2)

        torch.nn.init.normal_(self.fc1.weight, std=0.02)
        torch.nn.init.normal_(self.fc2.weight, std=0.02)

        # NOTE(review): only the mean is given here, so the biases are
        # drawn with normal_'s default std -- confirm this is intended.
        torch.nn.init.normal_(self.fc1.bias, 0.0)
        torch.nn.init.normal_(self.fc2.bias, 0.0)

    def forward(self, ids, token_ids, mask):
        """Return ``(st, en)`` start and end scores for a batch.

        Args:
            ids: Token ids passed to the encoder.
            token_ids: Unused; kept so call sites that pass three
                positional arguments keep working.
            mask: Attention mask passed to the encoder.

        Returns:
            Two tensors with the trailing size-1 axis squeezed away
            (presumably ``(batch, seq_len)`` each -- TODO confirm).
        """
        outputs = self.roberta(ids, attention_mask=mask)
        # Index rather than tuple-unpack: position 2 is valid both for a
        # plain 3-tuple and for a ModelOutput object, whereas unpacking a
        # ModelOutput iterates over its keys.
        hidden_states = outputs[2]
        # Concatenate the last four hidden states along the feature axis.
        y = torch.cat(
            (hidden_states[-1], hidden_states[-2],
             hidden_states[-3], hidden_states[-4]),
            dim=-1)

        y = self.drop_out(y)
        y = self.fc1(y)
        y = self.rl1(y)
        y = self.fc2(y)

        st, en = y.split(1, dim=-1)
        st = st.squeeze(-1)  # start scores, turned into probabilities by the caller
        en = en.squeeze(-1)  # end scores, turned into probabilities by the caller
        return st, en
        
        
    

# Loss Function

In [None]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Return the start cross-entropy loss plus the end cross-entropy loss.

    Args:
        start_logits: Scores for the start position.
        end_logits: Scores for the end position.
        start_positions: Target class indices for the start position.
        end_positions: Target class indices for the end position.
    """
    criterion = nn.CrossEntropyLoss()
    heads = ((start_logits, start_positions), (end_logits, end_positions))
    start_loss, end_loss = [criterion(logits, target) for logits, target in heads]
    return start_loss + end_loss

# Evaluation Function

In [None]:
def get_selected_text(text, start_idx, end_idx, offsets):
    """Rebuild the text covered by tokens ``start_idx`` .. ``end_idx``.

    Each token contributes ``text[start:end]`` for its offset pair.  A
    single space is added after a token whenever the next token's start
    offset lies beyond this token's end offset.

    Args:
        text: String the offsets index into.
        start_idx: Position of the first token (inclusive).
        end_idx: Position of the last token (inclusive).
        offsets: Sequence of ``(start, end)`` character offsets per token.
    """
    pieces = []
    token_count = len(offsets)
    for pos in range(start_idx, end_idx + 1):
        begin, end = offsets[pos][0], offsets[pos][1]
        pieces.append(text[begin:end])
        following = pos + 1
        # A gap before the next token means a separator was skipped.
        if following < token_count and end < offsets[following][0]:
            pieces.append(" ")
    return "".join(pieces)

def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    the size of the intersection of the two word sets divided by the
    size of their union.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in ``[0.0, 1.0]``.  When neither string contains a word
        the two (empty) word sets are identical and 1.0 is returned
        instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        # Union is empty: the plain formula would raise ZeroDivisionError.
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    """Score one prediction against its labelled span.

    The predicted span runs from the argmax of ``start_logits`` to the
    argmax of ``end_logits``; when the start lies after the end, the
    whole ``text`` is used as the prediction.

    Args:
        text: String the offsets index into.
        start_idx: Token position where the labelled span starts.
        end_idx: Token position where the labelled span ends.
        start_logits: Scores over token positions for the start.
        end_logits: Scores over token positions for the end.
        offsets: Per-token ``(start, end)`` character offsets.

    Returns:
        Jaccard score between the labelled and the predicted text.
    """
    best_start = np.argmax(start_logits)
    best_end = np.argmax(end_logits)

    # An inverted span is not usable; fall back to the full text.
    if best_start <= best_end:
        predicted = get_selected_text(text, best_start, best_end, offsets)
    else:
        predicted = text

    reference = get_selected_text(text, start_idx, end_idx, offsets)
    return jaccard(reference, predicted)

# Inference

In [None]:
def test_model(model, dataloader, optimizer = None ):
    """Run ``model`` over ``dataloader`` and collect softmax-normalised scores.

    The model is moved to the GPU, switched to eval mode and called as
    ``model(ids, masks)`` on every batch with gradients disabled.

    Args:
        model: Module returning a ``(start, end)`` pair of score tensors.
        dataloader: Iterable of batches providing ``'ids'`` and ``'masks'``
            tensors.
        optimizer: Unused; accepted so existing call sites keep working.

    Returns:
        ``[start_probs, end_probs]``: two NumPy arrays with one row per
        sample, each row being the softmax (over dim 1 of the batch
        output) of the model's start / end scores.
    """
    model.cuda()
    model.eval()
    softmax = nn.Softmax(dim=1)
    iter_ = 0  # samples processed so far

    sum_start_logits = []
    sum_end_logits = []

    for data in dataloader:
        if iter_ % 20 == 0:
            print('iter', iter_)

        ids = data['ids'].cuda()
        masks = data['masks'].cuda()

        with torch.no_grad():
            output = model(ids, masks)
            start_probs = softmax(output[0])
            end_probs = softmax(output[1])

        # Move each batch to host memory before collecting it: NumPy
        # cannot convert tensors that still live on the GPU.
        sum_start_logits.extend(start_probs.cpu().numpy())
        sum_end_logits.extend(end_probs.cpu().numpy())
        iter_ += len(ids)

    return [np.asarray(sum_start_logits), np.asarray(sum_end_logits)]

In [None]:

# Load the test set and build its (unshuffled) loader.
test_df = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
# Cast to str so that TweetDataset.get_input_data can call .lower() on every value.
test_df['text'] = test_df['text'].astype(str)
test_loader = get_test_loader(test_df)

In [None]:

# Ensemble 1: average the softmax outputs of the models stored under
# ../input/robertaallsentiment over the whole test loader.
model_path =  ["../input/robertaallsentiment/model_"+str(i) for i in range(1,6) ]

sum_st_logits = 0.0
sum_en_logits = 0.0
sm = nn.Softmax(dim=1)
for path in model_path:
    # NOTE(review): torch.load unpickles whatever the file contains --
    # only point this at trusted checkpoint files.
    model = torch.load(path)
    model.cuda()
    model.eval()
    iter_ = 0  # samples processed so far for this model

    st = []
    en = []
    for data in test_loader:
        if iter_ % 20 == 0:
            print('iter', iter_)

        ids = data['ids'].cuda()
        masks = data['masks'].cuda()

        with torch.no_grad():
            # Same call shape as robert_2_fc_model.forward(ids, token_ids, mask).
            output = model(ids, None, masks)
            start_logits = sm(output[0])
            end_logits = sm(output[1])

        # One device-to-host copy per batch; rows are collected individually.
        st.extend(start_logits.cpu().numpy())
        en.extend(end_logits.cpu().numpy())
        iter_ += len(ids)

    st, en = np.asarray(st), np.asarray(en)

    sum_st_logits += st
    sum_en_logits += en

# Average over however many models were actually listed.
st_1 = sum_st_logits / len(model_path)
en_1 = sum_en_logits / len(model_path)

                
    





In [None]:
%%time

# Ensemble 2: average the softmax outputs of the roberta_large_three_layer
# fold checkpoints over the whole test loader.
model_list = [ '../input/robertalargethreelayerv1/roberta_fold4.pth' ,'../input/robertalargethreelayerv1/roberta_fold5.pth',
             '../input/robertalargethreelayerv2/roberta_fold2.pth' ,'../input/robertalargethreelayerv2/roberta_fold3.pth',
             '../input/robertalargethreelayerv3/roberta_fold1.pth' ]

sum_st_logits = 0.0
sum_en_logits = 0.0
model = roberta_large_three_layer()

# Not read anywhere in the visible code; kept in case other cells use them.
sum_start_logits = 0.0
sum_end_logits = 0.0
sm = nn.Softmax(dim=1)
for fold, weights_path in enumerate(model_list):
    print("Fold ",fold )
    # The same module instance is reused; only its weights are swapped.
    model.load_state_dict(torch.load(weights_path))
    model.cuda()
    model.eval()
    iter_ = 0  # samples processed so far for this fold

    st = []
    en = []
    for data in test_loader:
        if iter_ % 20 == 0:
            print('iter', iter_)

        ids = data['ids'].cuda()
        masks = data['masks'].cuda()

        with torch.no_grad():
            output = model(ids, masks)
            start_logits = sm(output[0])
            end_logits = sm(output[1])

        # One device-to-host copy per batch; rows are collected individually.
        st.extend(start_logits.cpu().numpy())
        en.extend(end_logits.cpu().numpy())
        iter_ += len(ids)

    st, en = np.asarray(st), np.asarray(en)

    sum_st_logits += st
    sum_en_logits += en

# Average over however many checkpoints were actually listed.
st_2 = sum_st_logits / len(model_list)
en_2 = sum_en_logits / len(model_list)

                
  


In [None]:
%%time

# Ensemble 3: average the softmax outputs of the roberta_large_one_layer
# fold checkpoints over the whole test loader.
model_list = [ '../input/robertalargeonelayerv1/roberta_fold1.pth' ,'../input/robertalargeonelayerv2/roberta_fold2.pth',
             '../input/robertalargeonelayerv2/roberta_fold3.pth' ,'../input/robertalargeonelayerv3/roberta_fold4.pth',
             '../input/robertalargeonelayerv3/roberta_fold5.pth' ]

sum_st_logits = 0.0
sum_en_logits = 0.0
model = roberta_large_one_layer()

# Not read anywhere in the visible code; kept in case other cells use them.
sum_start_logits = 0.0
sum_end_logits = 0.0
sm = nn.Softmax(dim=1)
for fold, weights_path in enumerate(model_list):
    print("Fold ",fold )
    # The same module instance is reused; only its weights are swapped.
    model.load_state_dict(torch.load(weights_path))
    model.cuda()
    model.eval()
    iter_ = 0  # samples processed so far for this fold

    st = []
    en = []
    for data in test_loader:
        if iter_ % 20 == 0:
            print('iter', iter_)

        ids = data['ids'].cuda()
        masks = data['masks'].cuda()

        with torch.no_grad():
            output = model(ids, masks)
            start_logits = sm(output[0])
            end_logits = sm(output[1])

        # One device-to-host copy per batch; rows are collected individually.
        st.extend(start_logits.cpu().numpy())
        en.extend(end_logits.cpu().numpy())
        iter_ += len(ids)

    st, en = np.asarray(st), np.asarray(en)

    sum_st_logits += st
    sum_en_logits += en

# Average over however many checkpoints were actually listed.
st_3 = sum_st_logits / len(model_list)
en_3 = sum_en_logits / len(model_list)

                
  


In [None]:
# Blend the three ensembles per tweet and decode the predicted span.
predictions = []
ID_list = []
it_ = 0  # running row index into st_1/st_2/st_3 (loader order, unshuffled)
weight = [4.0,2.0,3.0]
for data in test_loader:
    tweet = data['tweet']
    offsets = data['offsets'].numpy()
    sentiment = data['sentiment']
    ID_ = data['ID']

    for i, text in enumerate(tweet):
        if sentiment[i] == 'neutral':
            # Neutral tweets leave ensemble 1 out of the blend.
            st_ = weight[1] * st_2[it_] + weight[2] * st_3[it_]
            en_ = weight[1] * en_2[it_] + weight[2] * en_3[it_]
        else:
            st_ = weight[0] * st_1[it_] + weight[1] * st_2[it_] + weight[2] * st_3[it_]
            en_ = weight[0] * en_1[it_] + weight[1] * en_2[it_] + weight[2] * en_3[it_]

        start_pred = np.argmax(st_)
        end_pred = np.argmax(en_)
        # An inverted span falls back to the whole tweet.
        if start_pred <= end_pred:
            pred = get_selected_text(text, start_pred, end_pred, offsets[i])
        else:
            pred = text

        predictions.append(pred)
        ID_list.append(ID_[i])
        it_ += 1

print(len(ID_list), len(predictions))
        


# Submission

In [None]:
# Assemble the submission frame: one row per collected ID/prediction pair,
# in the order they were appended.
dic = {'textID':ID_list, 'selected_text':predictions}
df = pd.DataFrame.from_dict(dic)
df.head()

In [None]:
# Punctuation clean-up for predictions made of a single whitespace-separated
# word, applied in this exact order.  Because '..' is collapsed before
# '...', a lone '...' ends up as '..' rather than '.'.
# NOTE(review): the order is kept as found -- confirm it is intentional.
for old, new in (('!!!!', '!'), ('..', '.'), ('...', '.')):
    df['selected_text'] = df['selected_text'].apply(
        lambda x, old=old, new=new: x.replace(old, new) if len(x.split())==1 else x)
df.to_csv('submission.csv', index=False)
df.head()