In [1]:
import os
import numpy as np
import pandas as pd
import random
import warnings

from tokenizers import ByteLevelBPETokenizer
from transformers import RobertaModel, RobertaConfig
from transformers import get_linear_schedule_with_warmup

from sklearn.model_selection import StratifiedKFold

import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset

warnings.filterwarnings('ignore')

In [2]:
# Load the competition test set. The `text` column is cast to `str` so that the
# string methods applied to every tweet during tokenization never meet a
# non-string value.
test_df = pd.read_csv(
    '../input/tweet-sentiment-extraction/test.csv'
).astype({'text': str})

In [3]:
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns (tweet, sentiment) rows into model inputs.

    Each item is a dict with:
      * ``ids``     - token ids laid out as ``[0] sentiment [2, 2] tweet [2]``,
                      right-padded with id ``1`` up to ``max_len``
                      (0 / 2 / 1 are presumably RoBERTa's <s> / </s> / <pad> ids;
                      confirm against the vocab file).
      * ``masks``   - 1 where ``ids != 1``, 0 elsewhere.
      * ``tweet``   - the lower-cased, whitespace-normalised tweet with a
                      leading space.
      * ``offsets`` - per-position ``(start, end)`` character offsets into
                      ``tweet``; ``(0, 0)`` for every non-tweet position.
      * ``start_idx`` / ``end_idx`` - only when the frame has a
                      ``selected_text`` column.

    Sequences longer than ``max_len`` are NOT truncated.
    """

    def __init__(self, df, max_len=96):
        self.df = df
        self.max_len = max_len
        # Targets are only produced when the frame carries the label column.
        self.labeled = 'selected_text' in df
        self.tokenizer = ByteLevelBPETokenizer(
            vocab_file='../input/roberta-base/vocab.json', 
            merges_file='../input/roberta-base/merges.txt',
            lowercase=True,
            add_prefix_space=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        ids, masks, tweet, offsets = self.get_input_data(row)
        sample = {
            'ids': ids,
            'masks': masks,
            'tweet': tweet,
            'offsets': offsets,
        }
        if self.labeled:
            sample['start_idx'], sample['end_idx'] = self.get_target_idx(
                row, tweet, offsets)
        return sample

    def get_input_data(self, row):
        """Tokenize one row and return ``(ids, masks, tweet, offsets)``."""
        # Collapse all whitespace runs and prepend a single space.
        tweet = " " + " ".join(row.text.lower().split())
        tweet_enc = self.tokenizer.encode(tweet)
        sentiment_ids = self.tokenizer.encode(row.sentiment).ids

        ids = [0] + sentiment_ids + [2, 2] + tweet_enc.ids + [2]
        # Four leading (0, 0) entries cover the prefix positions; this lines up
        # with `ids` only if the sentiment encodes to exactly one token
        # (assumption - verify against the tokenizer).
        offsets = [(0, 0)] * 4 + tweet_enc.offsets + [(0, 0)]

        # Right-pad up to max_len; a non-positive gap leaves the lists as-is.
        n_pad = max(self.max_len - len(ids), 0)
        ids = ids + [1] * n_pad
        offsets = offsets + [(0, 0)] * n_pad

        ids = torch.tensor(ids)
        masks = (ids != 1).long()
        offsets = torch.tensor(offsets)

        return ids, masks, tweet, offsets

    def get_target_idx(self, row, tweet, offsets):
        """Return ``(start_idx, end_idx)`` of the tokens covering the selection.

        The selection is located as the first occurrence of the normalised
        ``row.selected_text`` inside ``tweet``; the result is the first and
        last position in ``offsets`` whose character span overlaps it.

        Raises:
            IndexError: if the selected text is empty, or if no position in
                ``offsets`` overlaps the selection (including when it is not
                found in ``tweet``).
        """
        selected_text = " " + " ".join(row.selected_text.lower().split())
        needle = selected_text[1:]
        span_len = len(needle)
        first_char = selected_text[1]

        # First character position at which the selection occurs, if any.
        span_start = next(
            (pos for pos, ch in enumerate(tweet)
             if ch == first_char and tweet[pos:pos + span_len] == needle),
            None,
        )

        # Per-character 0/1 mask marking the selected span.
        char_mask = [0] * len(tweet)
        if span_start is not None:
            char_mask[span_start:span_start + span_len] = [1] * span_len

        covered = [
            j for j, (tok_start, tok_end) in enumerate(offsets)
            if sum(char_mask[tok_start:tok_end]) > 0
        ]

        return covered[0], covered[-1]

In [4]:
def get_test_loader(df, batch_size=32):
    """Build an unshuffled DataLoader over ``TweetDataset(df)``.

    Args:
        df: frame handed straight to ``TweetDataset``.
        batch_size: number of rows per batch.

    Returns:
        A ``torch.utils.data.DataLoader`` that iterates rows in frame order
        (``shuffle=False``) using two worker processes.
    """
    dataset = TweetDataset(df)
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
    )

In [5]:
class TweetModel_43(nn.Module):
    """RoBERTa encoder followed by a linear head that scores span start/end.

    The head maps every encoder output vector to two numbers: a start logit
    and an end logit for that input position.
    """

    def __init__(self):
        super().__init__()

        config = RobertaConfig.from_pretrained(
            '../input/roberta-base/config.json', output_hidden_states=False)
        self.roberta = RobertaModel.from_pretrained(
            '../input/roberta-base/pytorch_model.bin', config=config)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.02)
        # mean=0 with the default std, i.e. the bias is drawn from N(0, 1).
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)``, one score per input position.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: mask passed to the encoder alongside ``input_ids``.
        """
        # Take the first encoder output by index instead of tuple-unpacking:
        # indexing works both for a plain tuple and for a dict-like
        # ModelOutput (whose iteration yields keys, not tensors).
        out = self.roberta(input_ids, attention_mask)[0]

        x = self.dropout(out)
        x = self.fc(x)
        # Last dimension has size 2: split it into the two logit tensors.
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits
    
class TweetModel_52(nn.Module):
    """RoBERTa encoder + dilated Conv1d + ReLU + linear start/end head."""

    def __init__(self):
        super().__init__()

        config = RobertaConfig.from_pretrained(
            '../input/roberta-base/config.json', output_hidden_states=False)
        self.roberta = RobertaModel.from_pretrained(
            '../input/roberta-base/pytorch_model.bin', config=config)
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        # Channel count follows the encoder width (previously hard-coded 768)
        # so it always agrees with `fc` below. With kernel_size=2, dilation=2
        # and padding=1 the convolution keeps the sequence length unchanged.
        self.conv1d = nn.Conv1d(
            config.hidden_size, config.hidden_size,
            kernel_size=2, stride=1, padding=1, dilation=2)
        self.fc = nn.Linear(config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.02)
        # Default arguments: the bias is drawn from N(0, 1).
        nn.init.normal_(self.fc.bias)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)``, one score per input position.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: mask passed to the encoder alongside ``input_ids``.
        """
        # Take the first encoder output by index instead of tuple-unpacking:
        # indexing works both for a plain tuple and for a dict-like
        # ModelOutput (whose iteration yields keys, not tensors).
        out = self.roberta(input_ids, attention_mask)[0]

        x = self.dropout(out)
        # Conv1d convolves over the last axis, so move the sequence axis
        # there and restore the original layout afterwards.
        x = x.permute(0, 2, 1)
        x = self.conv1d(x)
        x = x.permute(0, 2, 1)
        x = self.relu(x)
        x = self.fc(x)

        # Last dimension has size 2: split it into the two logit tensors.
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

class TweetModel_54(nn.Module):
    """RoBERTa encoder + dilated Conv1d + LeakyReLU + linear start/end head."""

    def __init__(self):
        super().__init__()

        config = RobertaConfig.from_pretrained(
            '../input/roberta-base/config.json', output_hidden_states=False)
        self.roberta = RobertaModel.from_pretrained(
            '../input/roberta-base/pytorch_model.bin', config=config)
        self.dropout = nn.Dropout(0.1)
        # Attribute is named `relu` but holds a LeakyReLU activation.
        self.relu = nn.LeakyReLU()
        # Channel count follows the encoder width (previously hard-coded 768)
        # so it always agrees with `fc` below. With kernel_size=2, dilation=2
        # and padding=1 the convolution keeps the sequence length unchanged.
        self.conv1d = nn.Conv1d(
            config.hidden_size, config.hidden_size,
            kernel_size=2, stride=1, padding=1, dilation=2)
        self.fc = nn.Linear(config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.02)
        # Default arguments: the bias is drawn from N(0, 1).
        nn.init.normal_(self.fc.bias)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)``, one score per input position.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: mask passed to the encoder alongside ``input_ids``.
        """
        # Take the first encoder output by index instead of tuple-unpacking:
        # indexing works both for a plain tuple and for a dict-like
        # ModelOutput (whose iteration yields keys, not tensors).
        out = self.roberta(input_ids, attention_mask)[0]

        x = self.dropout(out)
        # Conv1d convolves over the last axis, so move the sequence axis
        # there and restore the original layout afterwards.
        x = x.permute(0, 2, 1)
        x = self.conv1d(x)
        x = x.permute(0, 2, 1)
        x = self.relu(x)
        x = self.fc(x)

        # Last dimension has size 2: split it into the two logit tensors.
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [6]:
def get_selected_text(text, start_idx, end_idx, offsets):
    """Rebuild the text covered by positions ``start_idx..end_idx`` (inclusive).

    Args:
        text: string the offsets index into.
        start_idx: first position to include.
        end_idx: last position to include; an empty string is returned when
            it is smaller than ``start_idx``.
        offsets: sequence of ``(char_start, char_end)`` pairs, one per position.

    Returns:
        The concatenated character slices. A single space is appended after a
        position whenever the next position (even one past ``end_idx``) starts
        strictly after this one ends.
    """
    n_positions = len(offsets)
    pieces = []
    for ix in range(start_idx, end_idx + 1):
        char_start, char_end = offsets[ix][0], offsets[ix][1]
        pieces.append(text[char_start:char_end])
        following = ix + 1
        if following < n_positions and char_end < offsets[following][0]:
            pieces.append(" ")
    return "".join(pieces)

In [7]:
%%time

treshold = 0.5
test_loader = get_test_loader(test_df)
predictions = []
models = []

for fold in range(10):
    model = TweetModel_43()
    model.cuda()
    model.load_state_dict(torch.load(f'../input/pseudo-v43/pseudo_v43_pseudo_fold{fold+1}.pth'))
    model.eval()
    models.append(model)
for fold in range(4):
    model = TweetModel_43()
    model.cuda()
    model.load_state_dict(torch.load(f'../input/pseudo-v43-2/pseudo_v43_2_fold{fold+1}.pth'))
    model.eval()
    models.append(model)
for fold in range(1,4):
    model = TweetModel_52()
    model.cuda()
    model.load_state_dict(torch.load(f'../input/roberta-v52/roberta_v52_fold{fold+1}.pth'))
    model.eval()
    models.append(model)
for fold in range(5):
    model = TweetModel_54()
    model.cuda()
    model.load_state_dict(torch.load(f'../input/roberta-v54/roberta_v54_fold{fold+1}.pth'))
    model.eval()
    models.append(model)

for data in test_loader:
    ids = data['ids'].cuda()
    masks = data['masks'].cuda()
    tweet = data['tweet']
    offsets = data['offsets'].numpy()
    multi_preds = {}
    for k in range(len(tweet)):
        multi_preds[k] = []

    start_prediction = []
    end_prediction = []
    for model in models:
        with torch.no_grad():
            output = model(ids, masks)
            start_prob = torch.softmax(output[0], dim=1).cpu().detach().numpy()
            end_prob = torch.softmax(output[1], dim=1).cpu().detach().numpy()
            start_pred = np.argmax(start_prob, axis=1)
            end_pred = np.argmax(end_prob, axis=1)
            for i in range(len(ids)):
                if start_pred[i] > end_pred[i]:
                    pred = tweet[i]
                else:
                    pred = get_selected_text(tweet[i], start_pred[i], end_pred[i], offsets[i])
                multi_preds[i].append(pred.split())
    for pred in multi_preds.values():
        cnt = {}
        for model_pred in pred:
            for word in model_pred:
                if word in cnt.keys():
                    cnt[word] += 1
                else:
                    cnt[word] = 1
        em_pred = []
        for key,value in cnt.items():
            if value/len(models) >= treshold:
                em_pred.append(key)
        predictions.append(' '.join(em_pred))

CPU times: user 4min 6s, sys: 1min 36s, total: 5min 42s
Wall time: 5min 51s


In [8]:
# Ensemble by probability averaging: the softmax start/end distributions of
# all models are averaged per tweet and the span is read from the argmax of
# each averaged distribution.
test_loader = get_test_loader(test_df)
predictions_n = []
models = []

# (model class, fold indices, checkpoint path builder), in ensemble order.
ensemble_spec = (
    (TweetModel_43, range(10),
     lambda fold: f'../input/pseudo-v43/pseudo_v43_pseudo_fold{fold+1}.pth'),
    (TweetModel_43, range(4),
     lambda fold: f'../input/pseudo-v43-2/pseudo_v43_2_fold{fold+1}.pth'),
    (TweetModel_52, range(1,4),
     lambda fold: f'../input/roberta-v52/roberta_v52_fold{fold+1}.pth'),
    (TweetModel_54, range(5),
     lambda fold: f'../input/roberta-v54/roberta_v54_fold{fold+1}.pth'),
)
for build_model, fold_ids, path_for in ensemble_spec:
    for fold in fold_ids:
        model = build_model()
        model.cuda()
        model.load_state_dict(torch.load(path_for(fold)))
        model.eval()
        models.append(model)

for batch in test_loader:
    ids = batch['ids'].cuda()
    masks = batch['masks'].cuda()
    tweet = batch['tweet']
    offsets = batch['offsets'].numpy()

    # One probability array per model, for the start and the end position.
    start_probs = []
    end_probs = []
    with torch.no_grad():
        for model in models:
            output = model(ids, masks)
            start_probs.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
            end_probs.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())

    # Average over the model axis, then pick the most likely position per tweet.
    start_choice = np.mean(start_probs, axis=0).argmax(axis=1)
    end_choice = np.mean(end_probs, axis=0).argmax(axis=1)
    for i in range(len(ids)):
        start_pred = start_choice[i]
        end_pred = end_choice[i]
        # Inverted span: fall back to the whole tweet.
        pred = (tweet[i] if start_pred > end_pred
                else get_selected_text(tweet[i], start_pred, end_pred, offsets[i]))
        predictions_n.append(pred)

In [9]:
# Attach both ensemble outputs to the test frame:
#   pred   - word-vote ensemble (`predictions`)
#   pred_n - probability-averaged ensemble (`predictions_n`)
# `pred_n` previously received `predictions` as well, which left the
# probability-averaged results unused.
test_df['pred'] = predictions
test_df['pred_n'] = predictions_n

In [10]:
sub_df = pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')

# Neutral tweets take `pred_n`; every other sentiment takes `pred`.
is_neutral = test_df['sentiment'] == 'neutral'
sub_df['selected_text'] = test_df['pred_n'].where(is_neutral, test_df['pred'])

# Punctuation clean-up for single-token answers only. The three replacements
# run one after another in exactly this order, each on the output of the
# previous one.
for old, new in (('!!!!', '!'), ('..', '.'), ('...', '.')):
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace(old, new) if len(x.split())==1 else x)

sub_df.to_csv('submission.csv', index=False)
sub_df.head()

Unnamed: 0,textID,selected_text
0,f87dea47db,last session of the day
1,96d74cb729,exciting
2,eee518ae67,such a shame!
3,01082688c6,happy bday!
4,33987a8ee5,i like it!!
