In [None]:
import json
import os
import random
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler

from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold
import tokenizers
from transformers import *
from tqdm.autonotebook import tqdm

import utils

In [None]:
SEED = 1337


def set_seed(SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), and switches cuDNN to deterministic mode
    with autotuning disabled.

    Args:
        SEED: integer seed. The parameter keeps its original upper-case name
            so existing keyword callers keep working; inside the function it
            shadows the module-level ``SEED`` constant.
    """
    # NOTE(review): none of the visible cells actually call set_seed(SEED);
    # confirm whether that is intentional.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    np.random.seed(SEED)
    random.seed(SEED)

In [None]:
# Identifier of the pretrained checkpoint passed to `from_pretrained`.
ARCH_NAME = "deepset/roberta-base-squad2"
# Part after the "/"; used below as the local directory / zip name for the
# exported model and tokenizer files.
ARCH = ARCH_NAME.split("/")[1]
# Fixed width (in tokens) of every encoded example array.
MAX_LEN = 128
# Batch size of the training DataLoader (validation uses its own value).
BATCH_SIZE = 64
# Number of passes over the training split per fold.
NUM_EPOCHS = 5
# Learning rate handed to the AdamW optimizer.
LEARNING_RATE = 3e-5

In [None]:
PATH = '../input/tweet-sentiment-extraction/'


def read_train():
    """Load ``train.csv`` from ``PATH`` with its text columns cast to ``str``.

    Returns:
        The DataFrame read from ``<PATH>train.csv`` in which the ``text`` and
        ``selected_text`` columns have been converted with ``astype(str)``.
    """
    frame = pd.read_csv('{}train.csv'.format(PATH))
    for column in ('text', 'selected_text'):
        frame[column] = frame[column].astype(str)
    return frame
    
# Load the training frame once; later cells index it positionally via `.iloc`.
train = read_train()

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the previous index.
train = train.reset_index(drop=True)

In [None]:
# Load the tokenizer that belongs to ARCH_NAME. In this cell it is only used
# to write its files into the export directory; the name `tokenizer` is
# rebound to a `tokenizers.ByteLevelBPETokenizer` in the next cell.
# NOTE(review): confirm that RobertaTokenizer honours `lowercase=True`.
tokenizer = RobertaTokenizer.from_pretrained(ARCH_NAME, lowercase=True,
    add_prefix_space=True)

# Recreate an empty ./{ARCH} directory, save the pretrained model and the
# tokenizer into it, then zip the directory to make it easier to export.
!rm -rf {ARCH}
!mkdir {ARCH}
mod = RobertaForQuestionAnswering.from_pretrained(ARCH_NAME)
mod.save_pretrained("%s/" % (ARCH, ))
tokenizer.save_pretrained("%s/" % (ARCH, ))
!zip -r {ARCH}.zip {ARCH}
print()
print("All configurations stored in %s.zip" % (ARCH))

In [None]:
# Byte-level BPE tokenizer built from vocab.json / merges.txt inside the ARCH
# directory (presumably the files written by the export cell - verify).
# From here on `tokenizer` refers to this object.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    lowercase=True,
    add_prefix_space=True,
    vocab_file=f"{ARCH}/vocab.json",
    merges_file=f"{ARCH}/merges.txt",
)

# Token ids of each sentiment word, encoded without special tokens. The
# preprocessing cell splices these ids in front of every tweet.
sentiment_id = {
    label: tokenizer.encode(label, add_special_tokens=False).ids
    for label in ("positive", "negative", "neutral")
}
print(sentiment_id)

In [None]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are stripped, lower-cased and split on whitespace; the score
    is ``|A & B| / |A | B|`` over the resulting sets of words.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        A float in ``[0, 1]``. When neither string contains a word the two
        (empty) word sets are identical and ``1.0`` is returned; the previous
        implementation raised ``ZeroDivisionError`` in that case.
    """
    a = set(str1.strip().lower().split())
    b = set(str2.strip().lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Nothing to compare on either side: avoid dividing by zero.
        return 1.0
    return len(a & b) / union_size

In [None]:
# Number of training rows; every array below has one row per tweet.
ct = train.shape[0]
# Initialised to 1, the same value that is used for padding further down.
input_ids = np.ones((ct,MAX_LEN),dtype='int32')
token_type_ids = np.zeros((ct,MAX_LEN),dtype='int32')
attention_mask = np.zeros((ct,MAX_LEN),dtype='int32')
# One-hot rows marking the first / last token of the selected span.
start_tokens = np.zeros((ct,MAX_LEN),dtype='int32')
end_tokens = np.zeros((ct,MAX_LEN),dtype='int32')
# (char_start, char_end) for every token position, as given by the tokenizer.
offsets = np.zeros((ct,MAX_LEN, 2),dtype='int32')

# Not populated by this cell; kept so existing references keep working.
incorrect_samples_idx = []
count = 0

for k in range(ct):
    row = train.iloc[k]
    # Collapse whitespace runs and prefix one space on both strings.
    text = " " + " ".join(str(row.text).split())
    selected_text = " " + " ".join(str(row.selected_text).split())
    sentiment = row.sentiment.strip()

    # Character span [idx0, idx1] of the first occurrence of the selection
    # (without its leading space) inside the tweet; idx0 == -1 if absent.
    len_st = len(selected_text) - 1
    idx0 = text.find(selected_text[1:])
    idx1 = idx0 + len_st - 1

    # 1 for every character that belongs to the selection, else 0.
    char_targets = [0] * len(text)
    if idx0 >= 0:
        char_targets[idx0: idx1 + 1] = [1] * len_st

    # Tokenise once and reuse both the ids and the character offsets.
    tok_tweet = tokenizer.encode(text)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets

    # Tokens that overlap at least one selected character.
    target_idx = [
        j for j, (offset1, offset2) in enumerate(tweet_offsets)
        if sum(char_targets[offset1: offset2]) > 0
    ]

    # Layout: [0] sentiment [2] [2] tweet tokens [2]. The "+ 4" below and the
    # four (0, 0) offsets assume the sentiment encodes to exactly one token
    # - TODO confirm against the printed `sentiment_id`.
    sample_input_id = [0] + sentiment_id[sentiment] + [2] + [2] + input_ids_orig + [2]
    sample_token_type_id = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    sample_attention_mask = [1] * len(sample_token_type_id)
    sample_tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]

    # NOTE(review): raises IndexError when the selection was not found in the
    # tweet (target_idx is empty); same behaviour as before.
    start_index = target_idx[0] + 4
    end_index = target_idx[-1] + 4

    # NOTE(review): rows longer than MAX_LEN are not truncated; the array
    # assignments below would fail for them.
    padding_length = MAX_LEN - len(sample_input_id)
    if padding_length > 0:
        sample_input_id = sample_input_id + ([1] * padding_length)
        sample_token_type_id = sample_token_type_id + ([0] * padding_length)
        sample_attention_mask = sample_attention_mask + ([0] * padding_length)
        sample_tweet_offsets = sample_tweet_offsets + ([(0, 0)] * padding_length)

    input_ids[k] = sample_input_id
    token_type_ids[k] = sample_token_type_id
    attention_mask[k] = sample_attention_mask
    start_tokens[k][start_index] = 1
    end_tokens[k][end_index] = 1
    offsets[k] = sample_tweet_offsets


# Merged tokens: 1 on every position from the start token to the end token.
# The row count is taken from the array itself. The previous version reused
# `ct`, which an inner loop variable of the same name had overwritten, so only
# a handful of rows were ever filled in.
merged_tokens = start_tokens | end_tokens
for k in range(merged_tokens.shape[0]):
    ranges = np.flatnonzero(merged_tokens[k] > 0)
    # Exactly two marks means start != end; fill the gap between them.
    if len(ranges) == 2:
        merged_tokens[k, ranges[0]: ranges[1] + 1] = 1

# np.long was removed from NumPy 1.24; int64 is explicit and matches torch.long.
start_tokens = start_tokens.astype(np.int64)
end_tokens = end_tokens.astype(np.int64)
merged_tokens = merged_tokens.astype(np.float32)

In [None]:
def final_post_proc(text, predicted_text):
    """Choose between the full text and the prediction by character coverage.

    The comparison is made on the sets of distinct characters of both strings.

    Args:
        text: the full string.
        predicted_text: the candidate substring / prediction.

    Returns:
        ``text`` when fewer than two of its distinct characters are missing
        from ``predicted_text`` (this includes the case where none are
        missing); otherwise ``predicted_text``.
    """
    missing_chars = set(text) - set(predicted_text)
    return text if len(missing_chars) < 2 else predicted_text

In [None]:
# Sanity check of the labels: decode the tokens between the stored start and
# end positions of every row and compare them with the annotated selection.
test_jacc_arr = []
for example_number in range(input_ids.shape[0]):
    start_hits = np.flatnonzero(start_tokens[example_number] == 1)
    end_hits = np.flatnonzero(end_tokens[example_number] == 1)
    # First marked position, or 0 when the row carries no mark.
    test_start = int(start_hits[0]) if start_hits.size else 0
    test_end = int(end_hits[0]) if end_hits.size else 0

    row_ids = input_ids[example_number]
    if test_start == test_end:
        test_selected_tokens = [row_ids[test_start]]
    else:
        test_selected_tokens = row_ids[test_start:test_end+1]

    selected_text = tokenizer.decode(test_selected_tokens).strip()
    test_jacc_arr.append(jaccard(selected_text, train.iloc[example_number].selected_text))

# Mean Jaccard over all rows.
print(sum(test_jacc_arr) / len(test_jacc_arr))

In [None]:
def calculate_jaccard_score(original_tweet, input_id, sentiment, selected_text, start_index, end_index, tokenizer):
    """Score one predicted token span against the annotated selection.

    Args:
        original_tweet: tweet text, used as the prediction for spans that
            start or end before position 4.
        input_id: sequence of token ids for the example.
        sentiment: unused here; kept for the existing call signature.
        selected_text: annotated selection to compare against.
        start_index: predicted start position in ``input_id``.
        end_index: predicted end position (inclusive) in ``input_id``.
        tokenizer: object whose ``decode`` turns token ids into text.

    Returns:
        Word-level Jaccard similarity (float) between the predicted text and
        ``selected_text``.
    """
    # An end that lies before the start collapses to a one-token span.
    last_index = start_index if end_index < start_index else end_index

    # Positions 0-3 are outside the tweet tokens (presumably the special and
    # sentiment tokens in front of the tweet - verify against preprocessing).
    if start_index >= 4 and last_index >= 4:
        predicted = tokenizer.decode(input_id[start_index: last_index+1]).strip()
    else:
        predicted = original_tweet

    predicted_words = set(predicted.strip().lower().split())
    target_words = set(selected_text.strip().lower().split())
    shared_words = predicted_words.intersection(target_words)
    return float(len(shared_words)) / (len(predicted_words) + len(target_words) - len(shared_words))

In [None]:
class TweetDataset:
    """Map-style dataset over a subset of the pre-encoded training rows.

    The constructor slices the module-level arrays (``input_ids``,
    ``attention_mask``, ``start_tokens``, ``end_tokens``, ``merged_tokens``,
    ``offsets``) with ``idx``, so ``train`` must be row-aligned with them.
    """

    def __init__(self, train, idx):
        """Keep only the rows selected by ``idx`` (positional indices)."""
        self.train = train.iloc[idx].reset_index(drop=True)
        self.offsets = offsets[idx]
        self.merged_tokens = merged_tokens[idx]
        self.end_tokens = end_tokens[idx]
        self.start_tokens = start_tokens[idx]
        self.attention_mask = attention_mask[idx]
        self.input_ids = input_ids[idx]

    def __len__(self):
        """Number of rows in the subset."""
        return self.train.shape[0]

    def __getitem__(self, k):
        """Return example ``k`` as a dict of strings and tensors.

        ``start_tokens`` / ``end_tokens`` are returned as the position of the
        first ``1`` in the stored one-hot row; a row without a ``1`` raises
        ``ValueError``.
        """
        row = self.train.iloc[k]
        start_position = self.start_tokens[k].tolist().index(1)
        end_position = self.end_tokens[k].tolist().index(1)

        item = {
            'text' : " " + str(row.text).strip(),
            'selected_text': " " + str(row.selected_text).strip(),
            'sentiment': row.sentiment,
        }
        item['input_ids'] = torch.tensor(self.input_ids[k], dtype=torch.long)
        item['attention_mask'] = torch.tensor(self.attention_mask[k], dtype=torch.long)
        item['start_tokens'] = torch.tensor(start_position, dtype=torch.long)
        item['end_tokens'] = torch.tensor(end_position, dtype=torch.long)
        item['merged_tokens'] = torch.tensor(self.merged_tokens[k], dtype=torch.float)
        item['offsets'] = torch.tensor(self.offsets[k], dtype=torch.long)
        return item

In [None]:
class TweetModelTripleHead(BertPreTrainedModel):
    """Question-answering backbone with an extra per-token "merged" head.

    The wrapped ``RobertaForQuestionAnswering`` supplies start and end logits.
    A linear head on the concatenation of two hidden-state tensors produces
    one additional logit per token.
    """

    def __init__(self, config):
        """Build the backbone from the ``ARCH`` directory plus the extra head.

        Args:
            config: model configuration. ``forward`` unpacks three outputs
                from the backbone, so the config is expected to enable
                ``output_hidden_states`` (``run`` sets it).
        """
        super(TweetModelTripleHead, self).__init__(config)
        self.bert = RobertaForQuestionAnswering.from_pretrained(ARCH, config=config)
        self.drop_out = nn.Dropout(0.10)
        # Two hidden-state tensors are concatenated on the feature axis, so
        # the head input is twice the hidden size (1536 for hidden size 768).
        # Deriving it from the config keeps other architectures working.
        self.merged_head = nn.Linear(config.hidden_size * 2, 1)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits, merged_logits)``.

        Args:
            input_ids: token ids; assumed (batch, seq_len) - TODO confirm.
            attention_mask: mask passed positionally to the backbone.
        """
        # NOTE(review): unpacking into three values assumes the backbone
        # returns a plain tuple (logits, logits, hidden states) - confirm for
        # the installed transformers version.
        start_logits, end_logits, pooled = self.bert(
            input_ids,
            attention_mask,
        )
        start_logits = self.drop_out(start_logits)
        end_logits = self.drop_out(end_logits)

        # `pooled` is indexed as a sequence; its last two entries are joined.
        pooled = torch.cat((pooled[-1], pooled[-2]), dim=-1)
        merged_logits = self.merged_head(pooled)
        merged_logits = self.drop_out(merged_logits)

        return start_logits, end_logits, merged_logits.squeeze(-1)

In [None]:
def linear_combination(x, y, epsilon):
    """Blend ``x`` and ``y``: weight ``epsilon`` on ``x``, ``1 - epsilon`` on ``y``."""
    weight_x = epsilon
    weight_y = 1 - epsilon
    return weight_x * x + weight_y * y

def reduce_loss(loss, reduction='mean'):
    """Apply the requested reduction to ``loss``.

    ``'mean'`` returns ``loss.mean()``, ``'sum'`` returns ``loss.sum()`` and
    any other value returns ``loss`` unchanged.
    """
    if reduction == 'mean':
        return loss.mean()
    if reduction == 'sum':
        return loss.sum()
    return loss

class LabelSmoothingCrossEntropy(nn.Module):
    """Cross-entropy blended with a uniform-over-classes term.

    The result is ``epsilon * uniform_term + (1 - epsilon) * nll``, where
    ``uniform_term`` is the reduced negative sum of log-probabilities divided
    by the number of classes.
    """

    def __init__(self, epsilon:float=0.1, reduction='mean'):
        """Store the smoothing weight and the reduction mode."""
        super().__init__()
        self.reduction = reduction
        self.epsilon = epsilon

    def forward(self, preds, target):
        """Compute the smoothed loss.

        Args:
            preds: raw scores with the classes on the last axis.
            target: class indices, as accepted by ``F.nll_loss``.
        """
        num_classes = preds.size()[-1]
        log_probs = F.log_softmax(preds, dim=-1)
        summed = reduce_loss(-log_probs.sum(dim=-1), self.reduction)
        target_term = F.nll_loss(log_probs, target, reduction=self.reduction)
        return linear_combination(summed/num_classes, target_term, self.epsilon)

def dice_ce_loss(pred, target):
    """Binary cross-entropy plus soft Dice loss for a per-token mask.

    Args:
        pred: raw logits.
        target: float mask with the same shape as ``pred``; assumed to hold
            values in ``[0, 1]`` - TODO confirm against the caller.

    Returns:
        Scalar tensor ``BCEWithLogits(pred, target) + (1 - dice)``.

    The Dice term is computed on ``sigmoid(pred)``. It used to be computed on
    ``abs(pred)``, which scored a strongly negative logit on a target
    position exactly like a strongly positive one. A small ``eps`` keeps the
    ratio finite when both the prediction and the target are empty.
    """
    eps = 1e-7
    loss_fct = nn.BCEWithLogitsLoss()
    probs = torch.sigmoid(pred)
    numerator = 2 * torch.sum(probs * target)
    denominator = torch.sum(probs) + torch.sum(target)
    dice = (numerator + eps) / (denominator + eps)
    return loss_fct(pred, target) + (1 - dice)

def loss_fn(start_logits, start_positions, end_logits, end_positions, merged_logits, merged_positions):
    """Compute the three training losses.

    Returns:
        Tuple ``(start_loss, end_loss, merged_loss)``: label-smoothed
        cross-entropy for the start and end positions, and ``dice_ce_loss``
        for the merged mask.
    """
    span_criterion = LabelSmoothingCrossEntropy()
    return (
        span_criterion(start_logits, start_positions),
        span_criterion(end_logits, end_positions),
        dice_ce_loss(merged_logits, merged_positions),
    )

In [None]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    """Run one training epoch and show running losses / Jaccard in the bar.

    Args:
        data_loader: iterable of batches shaped like ``TweetDataset`` items.
        model: called as ``model(input_ids=..., attention_mask=...)`` and
            expected to return ``(start, end, merged)`` logits.
        optimizer: stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: optional learning-rate scheduler, stepped once per batch
            when given. ``None`` (the default) is now accepted; previously it
            raised ``AttributeError`` on the first batch.
    """
    model.train()
    # NOTE(review): AverageMeter comes from the project-local `utils` module;
    # only `.update(value, n)` and `.avg` are relied on here.
    start_losses = utils.AverageMeter()
    end_losses = utils.AverageMeter()
    merged_losses = utils.AverageMeter()
    jaccards = utils.AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader))

    for d in tk0:
        text = d["text"]
        selected_text = d["selected_text"]
        sentiment = d["sentiment"]

        input_ids = d["input_ids"].to(device, dtype=torch.long)
        attention_mask = d["attention_mask"].to(device, dtype=torch.long)
        targets_start = d["start_tokens"].to(device, dtype=torch.long)
        targets_end = d["end_tokens"].to(device, dtype=torch.long)
        targets_merged = d["merged_tokens"].to(device, dtype=torch.float)

        model.zero_grad()
        outputs_start, outputs_end, outputs_merged = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        start_loss, end_loss, merged_loss = loss_fn(outputs_start, targets_start,
                                                    outputs_end, targets_end,
                                                    outputs_merged, targets_merged)

        # The merged-mask loss is weighted twice as much as each span loss.
        loss = start_loss + end_loss + 2*merged_loss
        loss.backward()

        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        # Convert once to plain Python ints so the scoring helper slices and
        # decodes ordinary lists rather than lists of 0-d device tensors.
        token_rows = input_ids.tolist()
        pred_starts = torch.argmax(outputs_start, dim=-1).tolist()
        pred_ends = torch.argmax(outputs_end, dim=-1).tolist()
        jaccard_scores = [
            calculate_jaccard_score(text[px],
                                    token_rows[px],
                                    sentiment[px],
                                    selected_text[px],
                                    pred_starts[px],
                                    pred_ends[px],
                                    tokenizer)
            for px in range(len(text))
        ]

        batch_size = input_ids.size(0)
        jaccards.update(np.mean(jaccard_scores), batch_size)
        start_losses.update(start_loss.item(), batch_size)
        end_losses.update(end_loss.item(), batch_size)
        merged_losses.update(merged_loss.item(), batch_size)
        tk0.set_postfix(start_loss=start_losses.avg, end_loss=end_losses.avg,
                        merged_loss=merged_losses.avg, jaccard=jaccards.avg)

In [None]:
def eval_fn(data_loader, model, device):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Args:
        data_loader: iterable of batches shaped like ``TweetDataset`` items.
        model: called as ``model(input_ids=..., attention_mask=...)`` and
            expected to return ``(start, end, merged)`` logits.
        device: device the batch tensors are moved to.

    Returns:
        The running-average Jaccard score over all batches (also printed).
    """
    model.eval()
    # NOTE(review): AverageMeter comes from the project-local `utils` module;
    # only `.update(value, n)` and `.avg` are relied on here.
    start_losses = utils.AverageMeter()
    end_losses = utils.AverageMeter()
    merged_losses = utils.AverageMeter()
    jaccards = utils.AverageMeter()

    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for d in tk0:
            text = d["text"]
            selected_text = d["selected_text"]
            sentiment = d["sentiment"]

            input_ids = d["input_ids"].to(device, dtype=torch.long)
            attention_mask = d["attention_mask"].to(device, dtype=torch.long)
            targets_start = d["start_tokens"].to(device, dtype=torch.long)
            targets_end = d["end_tokens"].to(device, dtype=torch.long)
            targets_merged = d["merged_tokens"].to(device, dtype=torch.float)

            outputs_start, outputs_end, outputs_merged = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            start_loss, end_loss, merged_loss = loss_fn(outputs_start, targets_start,
                                                        outputs_end, targets_end,
                                                        outputs_merged, targets_merged)

            # Convert once to plain Python ints so the scoring helper slices
            # and decodes ordinary lists rather than lists of 0-d tensors.
            token_rows = input_ids.tolist()
            pred_starts = torch.argmax(outputs_start, dim=-1).tolist()
            pred_ends = torch.argmax(outputs_end, dim=-1).tolist()
            jaccard_scores = [
                calculate_jaccard_score(text[px],
                                        token_rows[px],
                                        sentiment[px],
                                        selected_text[px],
                                        pred_starts[px],
                                        pred_ends[px],
                                        tokenizer)
                for px in range(len(text))
            ]

            batch_size = input_ids.size(0)
            jaccards.update(np.mean(jaccard_scores), batch_size)
            start_losses.update(start_loss.item(), batch_size)
            end_losses.update(end_loss.item(), batch_size)
            merged_losses.update(merged_loss.item(), batch_size)
            tk0.set_postfix(start_loss=start_losses.avg, end_loss=end_losses.avg,
                            merged_loss=merged_losses.avg, jaccard=jaccards.avg)

    print(f"Jaccard = {jaccards.avg}")
    return jaccards.avg

In [None]:
# Number of cross-validation folds.
n_splits = 5
# Shuffled, stratified splitter with a fixed seed; `run` asks it for the
# train/validation indices of one fold, stratified on the sentiment column.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=777)

In [None]:
def run(fold, dfx):
    """Train and validate the model on one cross-validation fold.

    Args:
        fold: index of the fold to use as the validation split.
        dfx: training DataFrame. ``TweetDataset`` slices the module-level
            encoded arrays with the fold indices, so ``dfx`` must be
            row-aligned with the frame those arrays were built from.

    After every epoch the validation Jaccard is handed to
    ``utils.EarlyStopping`` together with the path ``model_{fold}.bin``
    (presumably where it stores the best weights - verify in ``utils``).
    """
    # Split the frame that was passed in rather than the global `train`.
    train_idx, valid_idx = list(skf.split(dfx, dfx.sentiment.values))[fold]

    train_dataset = TweetDataset(
        dfx,
        train_idx
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        num_workers=4,
        shuffle=True
    )

    valid_dataset = TweetDataset(
        dfx,
        valid_idx
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=16,
        num_workers=2
    )

    # Fall back to the CPU when no GPU is visible instead of failing.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config = RobertaConfig.from_pretrained(ARCH)
    # The model's forward pass unpacks the hidden states as a third output.
    config.output_hidden_states = True
    model = TweetModelTripleHead(config)
    model.to(device)

    # The scheduler is stepped once per batch, so its horizon is the real
    # number of batches (including a final partial one) times the epochs.
    num_train_steps = len(train_data_loader) * NUM_EPOCHS
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains one of these get no weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    es = utils.EarlyStopping(patience=5, mode="max")
    print(f"Training is Starting for fold={fold}")

    for epoch in range(NUM_EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        # Named so that it does not shadow the module-level `jaccard` function.
        valid_jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {valid_jaccard}")
        es(valid_jaccard, model, model_path=f"model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break

In [None]:
# Train every cross-validation fold in order. Driving the loop from
# `n_splits` keeps the fold ids in step with the splitter configuration
# instead of hard-coding one cell per fold.
for fold in range(n_splits):
    run(fold, train)