In [1]:
import os
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.optim import lr_scheduler

from sklearn import model_selection
from sklearn import metrics
import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.autonotebook import tqdm
#import utils

from pathlib import Path

In [2]:
class config:
    MAX_LEN = 192
    TRAIN_BATCH_SIZE = 16
    VALID_BATCH_SIZE = 8
    GRAD_ACC_STEPS = 2
    EPOCHS = 3 # 5 was useless, earlystopping kicked in
    LEARNING_RATE = 3e-5
    DATA_DIR = Path('')
    MODEL_NAME = "roberta-base"
    TRAINING_FILE = "train_folds.csv"
    TOKENIZER = tokenizers.ByteLevelBPETokenizer( ##explore this
        vocab_file=f"vocab.json", 
        merges_file=f"merges.txt", 
        lowercase=True,
        add_prefix_space=True
    )

In [3]:
class TweetModel(transformers.BertPreTrainedModel):
    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        self.roberta = transformers.RobertaModel.from_pretrained(config.MODEL_NAME, config=conf)
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(768 * 2, 2)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
    
    def forward(self, ids, mask, token_type_ids):
        _, _, out = self.roberta( #
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )

        out = torch.cat((out[-1], out[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l0(out)

        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        print(start_logits, end_logits)

In [4]:
# def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
#     len_st = len(selected_text)
#     idx0 = None
#     idx1 = None
#     for ind in (i for i, e in enumerate(tweet) if e == selected_text[0]):
#         if tweet[ind: ind+len_st] == selected_text:
#             idx0 = ind
#             idx1 = ind + len_st - 1
#             break

#     char_targets = [0] * len(tweet)
#     if idx0 != None and idx1 != None:
#         for ct in range(idx0, idx1 + 1):
#             char_targets[ct] = 1
    
#     tok_tweet = tokenizer.encode(tweet)
#     input_ids_orig = tok_tweet.ids[1:-1]
#     tweet_offsets = tok_tweet.offsets[1:-1]
    
#     target_idx = []
#     for j, (offset1, offset2) in enumerate(tweet_offsets):
#         if sum(char_targets[offset1: offset2]) > 0:
#             target_idx.append(j)
    
#     targets_start = target_idx[0]
#     targets_end = target_idx[-1]

#     sentiment_id = {
#         'positive': 3893,
#         'negative': 4997,
#         'neutral': 8699
#     }
    
#     input_ids = [101] + [sentiment_id[sentiment]] + [102] + input_ids_orig + [102]
#     token_type_ids = [0, 0, 0] + [1] * (len(input_ids_orig) + 1)
#     mask = [1] * len(token_type_ids)
#     tweet_offsets = [(0, 0)] * 3 + tweet_offsets + [(0, 0)]
#     targets_start += 3
#     targets_end += 3

#     padding_length = max_len - len(input_ids)
#     if padding_length > 0:
#         input_ids = input_ids + ([0] * padding_length)
#         mask = mask + ([0] * padding_length)
#         token_type_ids = token_type_ids + ([0] * padding_length)
#         tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)
    
#     return {
#         'ids': input_ids,
#         'mask': mask,
#         'token_type_ids': token_type_ids,
#         'targets_start': targets_start,
#         'targets_end': targets_end,
#         'orig_tweet': tweet,
#         'orig_selected': selected_text,
#         'sentiment': sentiment,
#         'offsets': tweet_offsets
#     }

In [5]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    len_st = len(selected_text) - 1
    idx0 = None
    idx1 = None

    for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
        if " " + tweet[ind: ind+len_st] == selected_text:
            idx0 = ind
            idx1 = ind + len_st - 1
            break

    char_targets = [0] * len(tweet)
    if idx0 != None and idx1 != None:
        for ct in range(idx0, idx1 + 1):
            char_targets[ct] = 1
    
    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets
    
    target_idx = []
    for j, (offset1, offset2) in enumerate(tweet_offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)
    
    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    sentiment_id = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974
    }
    
    input_ids = [0] + [sentiment_id[sentiment]] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]
    targets_start += 4
    targets_end += 4

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)
    
    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }

In [6]:
class TweetDataset:
    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
    
    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        data = process_data(
            self.tweet[item], 
            self.selected_text[item], 
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )

        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'mask': torch.tensor(data["mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }


In [7]:
def calculate_jaccard_score(
    original_tweet, 
    target_string, 
    sentiment_val, 
    idx_start, 
    idx_end, 
    offsets,
    verbose=False):
    
    if idx_end < idx_start:
        idx_end = idx_start
    
    filtered_output  = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]: offsets[ix][1]]
        if (ix+1) < len(offsets) and offsets[ix][1] < offsets[ix+1][0]:
            filtered_output += " "

    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    jac = jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output

In [8]:
# def calculate_jaccard_score(
#     original_tweet, 
#     target_string, 
#     sentiment_val, 
#     idx_start, 
#     idx_end, 
#     offsets,
#     verbose=False):
    
#     if idx_end < idx_start:
#         idx_end = idx_start
    
#     filtered_output  = ""
#     for ix in range(idx_start, idx_end + 1):
#         filtered_output += original_tweet[offsets[ix][0]: offsets[ix][1]]
#         if (ix+1) < len(offsets) and offsets[ix][1] < offsets[ix+1][0]:
#             filtered_output += " "

#     if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
#         filtered_output = original_tweet

#     jac = utils.jaccard(target_string.strip(), filtered_output.strip())
#     return jac, filtered_output

In [9]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [10]:
class AverageMeter(object):
    """Computes and stores the average and current value.

    Examples::
        >>> # Initialize a meter to record loss
        >>> losses = AverageMeter()
        >>> # Update meter after every minibatch update
        >>> losses.update(loss_value, batch_size)
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


In [11]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta

    def __call__(self, val_loss, model, model_path):

        score = val_loss#-val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model, model_path)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model, model_path)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, model_path):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), model_path)
        self.val_loss_min = val_loss

In [12]:
# def eval_fn(data_loader, model, device):
#     model.eval()
#     losses = utils.AverageMeter()
#     jaccards = utils.AverageMeter()
    
#     with torch.no_grad():
#         tk0 = tqdm(data_loader, total=len(data_loader))
#         for bi, d in enumerate(tk0):
#             ids = d["ids"]
#             token_type_ids = d["token_type_ids"]
#             mask = d["mask"]
#             sentiment = d["sentiment"]
#             orig_selected = d["orig_selected"]
#             orig_tweet = d["orig_tweet"]
#             targets_start = d["targets_start"]
#             targets_end = d["targets_end"]
#             offsets = d["offsets"].numpy()

#             ids = ids.to(device, dtype=torch.long)
#             token_type_ids = token_type_ids.to(device, dtype=torch.long)
#             mask = mask.to(device, dtype=torch.long)
#             targets_start = targets_start.to(device, dtype=torch.long)
#             targets_end = targets_end.to(device, dtype=torch.long)

#             outputs_start, outputs_end = model(
#                 ids=ids,
#                 mask=mask,
#                 token_type_ids=token_type_ids
#             )
#             loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
#             outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
#             outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
#             jaccard_scores = []
#             for px, tweet in enumerate(orig_tweet):
#                 selected_tweet = orig_selected[px]
#                 tweet_sentiment = sentiment[px]
#                 jaccard_score, _ = calculate_jaccard_score(
#                     original_tweet=tweet,
#                     target_string=selected_tweet,
#                     sentiment_val=tweet_sentiment,
#                     idx_start=np.argmax(outputs_start[px, :]),
#                     idx_end=np.argmax(outputs_end[px, :]),
#                     offsets=offsets[px]
#                 )
#                 jaccard_scores.append(jaccard_score)

#             jaccards.update(np.mean(jaccard_scores), ids.size(0))
#             losses.update(loss.item(), ids.size(0))
#             tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)
    
#     print(f"Jaccard = {jaccards.avg}")
#     return jaccards.avg

In [13]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    loss_fct = nn.CrossEntropyLoss()
    start_loss = loss_fct(start_logits, start_positions)
    end_loss = loss_fct(end_logits, end_positions)
    total_loss = (start_loss + end_loss)
    return total_loss

In [14]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    losses = AverageMeter()
    jaccards = AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader))
    
    for bi, d in enumerate(tk0):

        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        offsets = d["offsets"]

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.long)
        targets_end = targets_end.to(device, dtype=torch.long)

        model.zero_grad()
        outputs_start, outputs_end = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
        )
        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
        loss = loss / config.GRAD_ACC_STEPS
        loss.backward()
#         optimizer.step()
#         scheduler.step()
        if (bi+1) % config.GRAD_ACC_STEPS == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step() 
                optimizer.zero_grad()

        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
        jaccard_scores = []
        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            jaccard_score, _ = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )
            jaccard_scores.append(jaccard_score)

        jaccards.update(np.mean(jaccard_scores), ids.size(0))
        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)

In [15]:
def eval_fn(data_loader, model, device):
    model.eval()
    losses = AverageMeter()
    jaccards = AverageMeter()
    
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            offsets = d["offsets"].numpy()

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets_start = targets_start.to(device, dtype=torch.long)
            targets_end = targets_end.to(device, dtype=torch.long)

            outputs_start, outputs_end = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
            jaccard_scores = []
            for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                jaccard_score, _ = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selected_tweet,
                    sentiment_val=tweet_sentiment,
                    idx_start=np.argmax(outputs_start[px, :]),
                    idx_end=np.argmax(outputs_end[px, :]),
                    offsets=offsets[px]
                )
                jaccard_scores.append(jaccard_score)

            jaccards.update(np.mean(jaccard_scores), ids.size(0))
            losses.update(loss.item(), ids.size(0))
            tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)
    
    print(f"Jaccard = {jaccards.avg}")
    return jaccards.avg

In [17]:
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.MODEL_NAME)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=config.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=0, 
        num_training_steps=num_train_steps
    )

    es = EarlyStopping(patience=2)#, mode="max")
    print(f"Training is Starting for fold={fold}")
    
    for epoch in range(config.EPOCHS):# for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        es(jaccard, model, model_path=f"model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break

In [18]:
run(fold=0)

Training is Starting for fold=0


HBox(children=(FloatProgress(value=0.0, max=1375.0), HTML(value='')))

tensor([[ 0.9123,  0.3620,  0.3564,  ...,  0.6429,  0.6386,  0.3273],
        [ 0.3949, -0.0938,  0.7475,  ...,  1.5816,  0.5949,  0.6256],
        [ 0.4508,  0.1188,  0.3900,  ...,  0.7170,  0.1505,  1.0884],
        ...,
        [ 0.4142,  0.2713,  0.4085,  ...,  0.5965,  0.7608,  0.4838],
        [ 1.1016,  0.5583,  0.1481,  ...,  0.5951,  0.8918,  0.6371],
        [ 0.3598,  0.2364,  0.7523,  ...,  0.9263,  1.4485,  0.6210]],
       device='cuda:0', grad_fn=<SqueezeBackward1>) tensor([[-0.3769, -1.0391, -0.7663,  ..., -0.6132, -1.0681, -0.0798],
        [-0.6863, -0.4711, -0.1056,  ..., -0.6001, -1.2499, -0.8928],
        [-0.6768, -0.8577, -0.7265,  ..., -0.7517, -0.9358, -0.9979],
        ...,
        [-0.6881, -0.8805, -0.7320,  ..., -0.3080, -0.8769, -0.9246],
        [ 0.3586, -0.5595, -0.7263,  ..., -0.8362, -0.8003, -0.7976],
        [-0.6664, -0.8769, -0.8300,  ..., -0.0274, -0.3073, -0.9599]],
       device='cuda:0', grad_fn=<SqueezeBackward1>)
tensor([[ 0.3817,  0.3639,  

tensor([[ 0.7892, -0.5633,  0.7600,  ..., -0.8201,  0.1220,  0.2437],
        [ 0.7412, -0.6438,  0.7737,  ..., -0.1996,  0.3107, -0.7660],
        [ 0.2263, -0.8166,  0.3678,  ..., -0.9735, -0.7652, -0.6485],
        ...,
        [ 0.2433, -0.8660, -0.1575,  ..., -1.0281, -0.9320, -0.9581],
        [ 0.2314, -0.7518,  0.2647,  ..., -0.6633,  0.3925, -0.0021],
        [ 0.7043, -0.3498,  0.3489,  ..., -0.8499, -0.7463, -1.1032]],
       device='cuda:0', grad_fn=<SqueezeBackward1>) tensor([[-0.4189, -1.8359, -0.4370,  ..., -1.9275, -0.8509, -0.7126],
        [-0.4801, -1.8592, -0.4188,  ..., -1.4136, -0.1268, -1.8519],
        [-0.6901, -1.5863,  0.1303,  ..., -1.6298, -1.6971, -2.0054],
        ...,
        [-0.6855, -1.5683, -0.8781,  ..., -1.4475, -2.0301, -1.6310],
        [-0.7117, -1.4809, -0.7037,  ..., -1.2844,  0.1156, -1.0863],
        [ 0.2625, -1.2143, -0.7221,  ..., -1.6655, -2.0238, -1.5865]],
       device='cuda:0', grad_fn=<SqueezeBackward1>)
tensor([[ 0.2972, -1.1798, -

tensor([[ 0.2196, -1.6787, -0.4628,  ..., -2.3599, -2.5365, -2.0093],
        [ 0.1951, -0.2529, -0.6666,  ..., -3.3556, -2.4381, -3.1400],
        [ 0.2677, -0.7990, -0.0543,  ..., -2.3450, -2.7388, -2.3834],
        ...,
        [ 0.6634, -1.5819, -0.6440,  ..., -2.9575, -2.3824, -2.8615],
        [ 0.1836, -0.3658, -0.3641,  ..., -2.5040, -2.2966, -2.2764],
        [ 0.2164, -1.0264,  0.4614,  ..., -2.0197, -2.9789, -3.0099]],
       device='cuda:0', grad_fn=<SqueezeBackward1>) tensor([[-0.6441, -2.4800, -0.8318,  ..., -3.2458, -2.8876, -2.4092],
        [-0.6398, -0.8399,  0.5344,  ..., -2.3243, -2.2758, -2.9330],
        [-0.5882, -1.9343, -0.0409,  ..., -2.5066, -3.0426, -3.1130],
        ...,
        [-0.3465, -2.7478, -1.3951,  ..., -2.9955, -3.0068, -2.9388],
        [-0.6034, -1.1722, -0.7110,  ..., -2.4898, -2.2870, -1.8141],
        [-0.5997, -1.8256,  0.4251,  ..., -2.9264, -2.9907, -2.6995]],
       device='cuda:0', grad_fn=<SqueezeBackward1>)
tensor([[-0.5440, -1.6453, -

KeyboardInterrupt: 

In [None]:
run(fold=1)

In [None]:
run(fold=2)

In [None]:
run(fold=3)

In [None]:
run(fold=4)

# Eval

In [None]:
df_test = pd.read_csv("test.csv")
df_test.loc[:, "selected_text"] = df_test.text.values

In [None]:
device = torch.device("cuda")
model_config = transformers.RobertaConfig.from_pretrained(config.MODEL_NAME)
model_config.output_hidden_states = True

In [None]:
model1 = TweetModel(conf=model_config)
model1.to(device)
model1.load_state_dict(torch.load("model_0.bin"))
model1.eval()

model2 = TweetModel(conf=model_config)
model2.to(device)
model2.load_state_dict(torch.load("model_1.bin"))
model2.eval()

model3 = TweetModel(conf=model_config)
model3.to(device)
model3.load_state_dict(torch.load("model_2.bin"))
model3.eval()

model4 = TweetModel(conf=model_config)
model4.to(device)
model4.load_state_dict(torch.load("model_3.bin"))
model4.eval()

model5 = TweetModel(conf=model_config)
model5.to(device)
model5.load_state_dict(torch.load("model_4.bin"))
model5.eval()


In [None]:
final_output = []

In [None]:
test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.selected_text.values
    )

data_loader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=config.VALID_BATCH_SIZE,
    num_workers=1
)


with torch.no_grad():
    tk0 = tqdm(data_loader, total=len(data_loader))
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        offsets = d["offsets"].numpy()

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.long)
        targets_end = targets_end.to(device, dtype=torch.long)

#         outputs_start1, outputs_end1 = model1(
#             ids=ids,
#             mask=mask,
#             token_type_ids=token_type_ids
#         )
        
        outputs_start2, outputs_end2 = model2(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
#         outputs_start3, outputs_end3 = model3(
#             ids=ids,
#             mask=mask,
#             token_type_ids=token_type_ids
#         )
        
#         outputs_start4, outputs_end4 = model4(
#             ids=ids,
#             mask=mask,
#             token_type_ids=token_type_ids
#         )
        
#         outputs_start5, outputs_end5 = model5(
#             ids=ids,
#             mask=mask,
#             token_type_ids=token_type_ids
#         )
#         outputs_start = (outputs_start1 + outputs_start2 + outputs_start3 + outputs_start4 + outputs_start5) / 5
#         outputs_end = (outputs_end1 + outputs_end2 + outputs_end3 + outputs_end4 + outputs_end5) / 5
        
        outputs_start = outputs_start2
        outputs_end = outputs_end2
        
        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
        jaccard_scores = []
        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            _, output_sentence = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )
            final_output.append(output_sentence)

In [None]:
sample = pd.read_csv("sample_submission.csv")
sample.loc[:, 'selected_text'] = final_output
sample.to_csv("submission-fold2.csv", index=False)

In [None]:
sample.head()