Thanks to Sazuma for the baseline: https://www.kaggle.com/shoheiazuma/tweet-sentiment-roberta-pytorch

In [None]:
# Report the status of the NVIDIA GPU(s) visible to this runtime.
!nvidia-smi

# Libraries

In [None]:
# Optional TPU (PyTorch/XLA) setup, currently disabled.
#!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
#!python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev

#!export XLA_USE_BF16=1
# Install the tokenizer and model libraries.
# NOTE(review): versions are not pinned, so the newest releases are installed on every run.
!pip install tokenizers
!pip install transformers 

In [None]:
import collections
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pytz
import random
import seaborn as sns
import shutil
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tokenizers

# import torch_xla.core.xla_model as xm
# import torch_xla.distributed.parallel_loader as pl
# import torch_xla.distributed.xla_multiprocessing as xmp

import transformers
import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
from joblib import Parallel, delayed
from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from torch.optim import lr_scheduler
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import RobertaModel, RobertaConfig
from tqdm.autonotebook import tqdm
#import utils

# Variables

In [None]:
# Run timestamp (Europe/Moscow wall-clock time); it names this run's OUTPUT directory on Colab.
TIMESTAMP = datetime.now(tz=pytz.timezone("Europe/Moscow")).strftime("%Y-%m-%dT%H%M%S")

# Skip training: when True the training cell does nothing and existing checkpoints
# under MODELS are kept, so the notebook runs inference only.
IS_MISS_TRAIN = True

In [None]:
# Detect Google Colab by trying to mount Google Drive. Any ordinary failure
# (module not installed, mount refused) selects the local/Kaggle layout.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    COLAB_USE = True
except Exception:
    # Not a bare `except:`: KeyboardInterrupt and SystemExit must still propagate.
    COLAB_USE = False

In [None]:
if COLAB_USE:
    # Colab: inputs and outputs live on the mounted Google Drive.
    INPUT = '/content/drive/My Drive/Colab Notebooks/2020-05_kaggle/input/'

    # Every run writes into its own timestamped directory. exist_ok=False is kept
    # on purpose so an existing run directory is never silently reused.
    OUTPUT = f'/content/drive/My Drive/Colab Notebooks/2020-05_kaggle/output/{TIMESTAMP}/'
    os.makedirs(OUTPUT, exist_ok=False)
    ARCHIVE_FILE = OUTPUT + 'models.zip'

    MODELS = 'models/'
    # MODELS = '/content/drive/My Drive/Colab Notebooks/2020-05_kaggle/models/'
    # Wipe old checkpoints only when this run is going to retrain.
    if not IS_MISS_TRAIN and os.path.exists(MODELS):
        shutil.rmtree(MODELS)
    # One sub-directory per checkpointed epoch (e2 .. e5). exist_ok=True lets the
    # cell be re-run when the directories are still there (e.g. IS_MISS_TRAIN=True),
    # which used to raise FileExistsError.
    for epoch in (2, 3, 4, 5):
        os.makedirs(f'{MODELS}e{epoch}', exist_ok=True)
else:
    # Local / Kaggle layout: datasets and pre-trained checkpoints are under ../input/.
    INPUT = '../input/'
    MODELS = os.path.join(INPUT, '20200615t234256-e4/')
    #MODELS = os.path.join(INPUT,'20200615t234256-e-best/')
    OUTPUT = './'

In [None]:
# List the contents of the MODELS directory.
!ls {MODELS}

In [None]:
# Paths variables
# Project root on the mounted Google Drive (not referenced elsewhere in the visible notebook).
COLAB = '/content/drive/My Drive/Colab Notebooks/2020-05_kaggle/'
# roberta-base files (tokenizer vocabulary/merges, model config and weights), relative to INPUT.
VOCAB_RB = os.path.join(INPUT,'roberta-base/vocab.json')
MERGES_RB = os.path.join(INPUT,'roberta-base/merges.txt')
CONFIG_RB = os.path.join(INPUT,'roberta-base/config.json')
MODEL_RB = os.path.join(INPUT,'roberta-base/pytorch_model.bin')
# Competition data files, relative to INPUT.
TRAIN = os.path.join(INPUT,'tweet-sentiment-extraction/train.csv')
TEST = os.path.join(INPUT,'tweet-sentiment-extraction/test.csv')
SAMPLE_SUBM = os.path.join(INPUT,'tweet-sentiment-extraction/sample_submission.csv')

# Check GPU: prints True when PyTorch can see a CUDA device.
print(torch.cuda.is_available())

In [None]:
# Fail early if the output directory is missing: ps() and the submission are written there.
assert os.path.exists(OUTPUT)

In [None]:
def ps(*args):
    """Print `args` to stdout and append the same line to OUTPUT/output.txt.

    The log file is opened in append mode on every call, so messages from
    successive calls accumulate in the file.
    """
    print(*args)
    log_path = os.path.join(OUTPUT, 'output.txt')
    with open(log_path, 'a') as log_file:
        print(*args, file=log_file)

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on CPU-only runtimes instead of failing later.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')
#fold = 0
#device = xm.xla_device(fold + 1)

# Seed

In [None]:
def seed_everything(seed_value):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and, when available, all
    CUDA devices), and sets PYTHONHASHSEED in the environment.

    Args:
        seed_value: integer seed applied to all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # benchmark mode auto-tunes kernels per run and can pick different
        # algorithms each time, which works against reproducibility.
        torch.backends.cudnn.benchmark = False

seed = 42
seed_everything(seed)

# Data Loader

In [None]:
class TweetDataset(torch.utils.data.Dataset):
    """Dataset that turns tweet rows into model inputs (and span targets).

    Each item is a dict with:
      * 'ids'     - LongTensor of token ids, padded with 1 up to `max_len`
      * 'masks'   - LongTensor, 1 where `ids != 1`, else 0
      * 'tweet'   - normalised tweet text (lower-cased, single spaces, leading space)
      * 'offsets' - LongTensor of (start, end) character offsets into 'tweet' per token
      * 'start_idx' / 'end_idx' - first/last token index of the selected text
        (only when the frame has a 'selected_text' column)

    Args:
        df: DataFrame with 'text' and 'sentiment' columns, optionally 'selected_text'.
        max_len: length that shorter sequences are padded to.
            NOTE(review): longer sequences are NOT truncated, so they would come
            out longer than `max_len` - confirm the data never exceeds it.
    """

    def __init__(self, df, max_len=96):
        self.df = df
        self.max_len = max_len
        self.labeled = 'selected_text' in df
        self.tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file=VOCAB_RB,
            merges_file=MERGES_RB,
            lowercase=True,
            add_prefix_space=True)

    def __getitem__(self, index):
        """Return the input dict (plus targets when labelled) for row `index`."""
        row = self.df.iloc[index]

        ids, masks, tweet, offsets = self.get_input_data(row)
        data = {'ids': ids, 'masks': masks, 'tweet': tweet, 'offsets': offsets}

        if self.labeled:
            start_idx, end_idx = self.get_target_idx(row, tweet, offsets)
            data['start_idx'] = start_idx
            data['end_idx'] = end_idx

        return data

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def get_input_data(self, row):
        """Tokenise one row into `(ids, masks, tweet, offsets)`.

        The id sequence is `[0] + sentiment + [2, 2] + tweet tokens + [2]`
        (0 and 2 are presumably RoBERTa's <s> and </s> - confirm against the vocab),
        padded with 1 up to `max_len`.
        """
        tweet = " " + " ".join(row.text.lower().split())
        encoding = self.tokenizer.encode(tweet)
        sentiment_id = self.tokenizer.encode(row.sentiment).ids
        ids = [0] + sentiment_id + [2, 2] + encoding.ids + [2]
        # Four placeholder offsets for the prefix: this lines up with `ids` only
        # when the sentiment word encodes to exactly one token.
        offsets = [(0, 0)] * 4 + encoding.offsets + [(0, 0)]

        pad_len = self.max_len - len(ids)
        if pad_len > 0:
            ids += [1] * pad_len
            offsets += [(0, 0)] * pad_len

        ids = torch.tensor(ids)
        # Padding positions (id 1) get mask 0, everything else mask 1.
        masks = (ids != 1).long()
        offsets = torch.tensor(offsets)

        return ids, masks, tweet, offsets

    def get_target_idx(self, row, tweet, offsets):
        """Return `(start_idx, end_idx)`: the first and last token overlapping the selected text.

        The selected text is normalised like the tweet and located at its first
        occurrence in `tweet`; every token whose character span overlaps that
        occurrence is a target token.

        Raises:
            IndexError: if the selected text is empty, is not found in `tweet`,
                or overlaps no token.
        """
        selected = " ".join(row.selected_text.lower().split())
        first_char = tweet.find(selected) if selected else -1

        # Character-level mask of the selected span inside the tweet.
        char_targets = [0] * len(tweet)
        if first_char >= 0:
            char_targets[first_char: first_char + len(selected)] = [1] * len(selected)

        target_idx = [
            j for j, (offset1, offset2) in enumerate(offsets)
            if sum(char_targets[offset1: offset2]) > 0
        ]
        if not target_idx:
            raise IndexError(
                f'selected_text {row.selected_text!r} does not align with any token of tweet {tweet!r}')

        return target_idx[0], target_idx[-1]
        
def get_train_val_loaders(df, train_idx, val_idx, batch_size=8):
    """Build the training and validation DataLoaders for one fold.

    Args:
        df: full training DataFrame.
        train_idx: positional row indices of the training split.
        val_idx: positional row indices of the validation split.
        batch_size: batch size used by both loaders.

    Returns:
        dict with keys "train" (shuffled, incomplete last batch dropped) and
        "val" (original order, all rows kept).
    """
    train_set = TweetDataset(df.iloc[train_idx])
    val_set = TweetDataset(df.iloc[val_idx])

    return {
        "train": torch.utils.data.DataLoader(
            train_set,
            batch_size=batch_size,
            shuffle=True,
            num_workers=2,
            drop_last=True),
        "val": torch.utils.data.DataLoader(
            val_set,
            batch_size=batch_size,
            shuffle=False,
            num_workers=2),
    }

def get_test_loader(df, batch_size=32):
    """Return a DataLoader over `df` that keeps the original row order.

    Args:
        df: DataFrame to wrap in a TweetDataset.
        batch_size: number of rows per batch.
    """
    dataset = TweetDataset(df)
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2)

# Model

In [None]:
class TweetModel(nn.Module):
    """RoBERTa encoder with a linear head that scores each token as span start / span end."""

    def __init__(self):
        super(TweetModel, self).__init__()

        # output_hidden_states=True so forward() can read the per-layer hidden states.
        config = RobertaConfig.from_pretrained(
            CONFIG_RB, output_hidden_states=True)
        self.roberta = RobertaModel.from_pretrained(
            MODEL_RB, config=config)
        self.dropout = nn.Dropout(0.9)
        self.fc = nn.Linear(config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.02)
        # NOTE(review): this draws the bias from a normal with mean 0 and the
        # default std of 1, it does not zero it - confirm that is intended.
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        """Return `(start_logits, end_logits)`, each with the head's last dimension squeezed away.

        Args:
            input_ids: token id tensor passed to the encoder.
            attention_mask: mask tensor passed to the encoder.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Index instead of tuple-unpacking: position 2 is the hidden-state tuple
        # both for plain tuple outputs and for dict-style outputs that support
        # integer indexing (unpacking the latter would yield key names).
        hs = outputs[2]

        # Average the last two hidden layers.
        #x = torch.stack([hs[-1], hs[-2], hs[-3]])
        x = torch.stack([hs[-1], hs[-2]])
        x = torch.mean(x, 0)
        x = self.dropout(x)
        x = self.fc(x)
        # Split the two output channels into start and end scores.
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

# Loss Function

In [None]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Sum of the cross-entropy losses of the start and end predictions.

    Args:
        start_logits, end_logits: per-token scores for span start / end.
        start_positions, end_positions: target token indices.

    Returns:
        Scalar tensor: start cross-entropy + end cross-entropy.
    """
    criterion = nn.CrossEntropyLoss()
    return criterion(start_logits, start_positions) + criterion(end_logits, end_positions)

# Evaluation Function

In [None]:
def get_selected_text(text, start_idx, end_idx, offsets):
    """Rebuild the text covered by tokens `start_idx..end_idx` (inclusive).

    Each token contributes `text[start:end]` from its offsets. A single space
    is appended after a token whenever a following token exists and begins
    after this token ends; this also applies to the last selected token.
    """
    total = len(offsets)
    pieces = []
    for pos in range(start_idx, end_idx + 1):
        begin, finish = offsets[pos][0], offsets[pos][1]
        pieces.append(text[begin: finish])
        has_next = (pos + 1) < total
        if has_next and finish < offsets[pos + 1][0]:
            pieces.append(" ")
    return "".join(pieces)

In [None]:
def get_tokens(text, offsets):
    """Return one text piece per entry of `offsets`, extracted with get_selected_text."""
    return [get_selected_text(text, pos, pos, offsets) for pos in range(len(offsets))]

In [None]:
def plot_logits(text, start_scores, end_scores, offsets):
    """Draw a grouped bar chart of the start and end score of every token.

    Args:
        text: tweet text the offsets refer to.
        start_scores: per-token start scores (flattened before plotting).
        end_scores: per-token end scores (flattened before plotting).
        offsets: per-token (start, end) character offsets into `text`.
    """
    tokens = get_tokens(text, offsets)

    # Seaborn styling and a wide default figure.
    sns.set(style='darkgrid')
    #sns.set(font_scale=1.5)
    plt.rcParams["figure.figsize"] = (16,8)

    # Work on flat 1-D score arrays.
    s_scores = start_scores.flatten()
    e_scores = end_scores.flatten()

    # Tokens become x-axis labels; appending the index keeps repeated tokens distinct.
    token_labels = ['{:} - {:>2}'.format(token, idx) for idx, token in enumerate(tokens)]

    # Long-format table: two rows per token (start row first, then end row),
    # told apart by the "marker" column.
    records = [
        {'token_label': label, 'score': series[idx], 'marker': marker}
        for idx, label in enumerate(token_labels)
        for marker, series in (('start', s_scores), ('end', e_scores))
    ]
    df = pd.DataFrame(records)

    # hue="marker" splits each token's bars into the start and end series.
    g = sns.catplot(x="token_label", y="score", hue="marker", data=df,
                    kind="bar", height=6, aspect=4)

    # Vertical tick labels, and a grid to line tokens up with their bars.
    g.set_xticklabels(g.ax.get_xticklabels(), rotation=90, ha="center")
    g.ax.grid(True)

    #plt.title('Start Word Scores')

    plt.show()

In [None]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Both strings are lower-cased and split on whitespace; the score is
    |A ∩ B| / |A ∪ B| over the resulting word sets.

    Returns:
        float in [0, 1]. When neither string contains a word the ratio is 0/0;
        0.5 is returned instead of raising ZeroDivisionError.
        NOTE(review): 0.5 follows a convention seen in public kernels for this
        competition - confirm it is the value you want.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    """Score one prediction against the true span.

    The predicted span runs from argmax(start_logits) to argmax(end_logits).
    When that start lies after the end, the pair (start, end) with
    start <= end that maximises start score + end score is used instead.
    (Earlier alternatives - returning the whole tweet, or extending the span
    to the last token - were tried and dropped.)

    Args:
        text: tweet text the offsets refer to.
        start_idx, end_idx: true first/last token index of the selection.
        start_logits, end_logits: 1-D per-token scores for one sample.
        offsets: per-token (start, end) character offsets into `text`.

    Returns:
        (jaccard score, true text, predicted text, case) where case is
        'kir02' when the argmax pair was already ordered and 'kir03' when the
        best ordered pair had to be searched.
    """
    true = get_selected_text(text, start_idx, end_idx, offsets)

    start_pred = np.argmax(start_logits)
    end_pred = np.argmax(end_logits)
    if start_pred > end_pred:
        # Start runs past the end: search all ordered pairs at once.
        starts = np.asarray(start_logits)
        ends = np.asarray(end_logits)
        rows = np.arange(len(starts))[:, None]
        cols = np.arange(len(ends))[None, :]
        # Invalid pairs (start > end) are set to -inf so they can never win.
        pair_scores = np.where(rows <= cols, starts[:, None] + ends[None, :], -np.inf)
        # argmax scans row-major, i.e. start-major then end: ties resolve to
        # the earliest start, then the earliest end.
        start_pred, end_pred = divmod(int(np.argmax(pair_scores)), pair_scores.shape[1])
        case = 'kir03'
    else:
        case = 'kir02'

    pred = get_selected_text(text, start_pred, end_pred, offsets)
    return jaccard(true, pred), true, pred, case

# Training Function

In [None]:
def train_model(model, dataloaders_dict, criterion, optimizer, num_epochs, filename, threshold):
    """Train and validate `model`, logging metrics and saving a checkpoint per epoch.

    For every epoch a 'train' phase and a 'val' phase are run; loss and Jaccard
    are averaged over the samples actually processed and logged with ps(). The
    train/val Jaccard curves are plotted after each epoch. From the second
    epoch on, the weights are saved to `MODELS/e<epoch>/<basename of filename>`.

    Args:
        model: network returning `(start_logits, end_logits)` for `(ids, masks)`.
        dataloaders_dict: dict with 'train' and 'val' DataLoaders yielding dicts
            with 'ids', 'masks', 'tweet', 'offsets', 'start_idx', 'end_idx'.
        criterion: callable `(start_logits, end_logits, start_idx, end_idx) -> loss`.
        optimizer: optimizer over the model's parameters.
        num_epochs: number of epochs to run.
        filename: checkpoint path; only its file name is used, the directory is
            chosen per epoch.
        threshold: validation samples scoring below this Jaccard are logged and
            plotted (Jaccard is never negative, so a negative value disables this).
    """
    model.to(device)
    # Derive the checkpoint name from the argument instead of a global fold counter.
    checkpoint_name = os.path.basename(filename)

    jacs_dict = collections.defaultdict(list)
    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            loader = dataloaders_dict[phase]
            cases = {}
            start_time = time.time()
            if is_train:
                model.train()
            else:
                model.eval()

            epoch_loss = 0.0
            epoch_jaccard = 0.0
            # Samples actually seen: the train loader may drop its last
            # incomplete batch, so len(dataset) would over-count.
            n_samples = 0

            last_print_time = 0

            for x, data in enumerate(loader):
                # Progress: print the batch index at most every 10 seconds.
                if time.time() - last_print_time > 10:
                    last_print_time = time.time()
                    print(x, end=' ')
                ids = data['ids'].to(device)
                masks = data['masks'].to(device)
                tweet = data['tweet']
                offsets = data['offsets'].numpy()
                start_idx = data['start_idx'].to(device)
                end_idx = data['end_idx'].to(device)

                optimizer.zero_grad()

                # Gradients are tracked only while training.
                with torch.set_grad_enabled(is_train):
                    start_logits, end_logits = model(ids, masks)
                    loss = criterion(start_logits, end_logits, start_idx, end_idx)

                    if is_train:
                        loss.backward()
                        optimizer.step()
                        #xm.optimizer_step(optimizer, barrier=True)

                batch_len = len(ids)
                n_samples += batch_len
                epoch_loss += loss.item() * batch_len

                start_idx = start_idx.cpu().detach().numpy()
                end_idx = end_idx.cpu().detach().numpy()
                # From here on these hold probabilities (softmax over dim 1).
                start_logits = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
                end_logits = torch.softmax(end_logits, dim=1).cpu().detach().numpy()

                for i in range(batch_len):
                    jaccard_score, true, pred, case = compute_jaccard_score(
                        tweet[i],
                        start_idx[i],
                        end_idx[i],
                        start_logits[i],
                        end_logits[i],
                        offsets[i])
                    epoch_jaccard += jaccard_score
                    cases[case] = cases.get(case, 0) + 1
                    if phase == 'val' and jaccard_score < threshold:
                        # Position 1 of `ids` holds the sentiment token; decode a plain int.
                        sentiment = loader.dataset.tokenizer.decode([int(ids[i][1])])
                        ps('\n',jaccard_score, tweet[i], '\nTrue:', true,'\nPred:', pred, '\nSentiment:', sentiment)
                        plot_logits(tweet[i], start_logits[i], end_logits[i], offsets[i])
            ps('')
            # max(..., 1) keeps an empty phase from dividing by zero.
            epoch_loss = epoch_loss / max(n_samples, 1)
            epoch_jaccard = epoch_jaccard / max(n_samples, 1)

            ps('Epoch {}/{} | {:^5} | Loss: {:.4f} | Jaccard: {:.4f} | elapsed sec: {:.1f} | Cases: {}'.format(
                epoch + 1, num_epochs, phase, epoch_loss, epoch_jaccard, time.time() - start_time,
                str(cases)))

            jacs_dict[phase].append(epoch_jaccard)

        # Jaccard curves so far, one point per finished epoch.
        plt.plot(jacs_dict['train'], '-co', label = 'train')
        plt.plot(jacs_dict['val'], '-yv', label = 'val')
        plt.ylim((0.5,1.0))
        plt.legend()
        plt.grid(True)
        plt.show()

        # Checkpoint every epoch except the first, one directory per epoch.
        if epoch > 0:
            checkpoint_dir = os.path.join(MODELS, f'e{epoch+1}')
            os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save(model.state_dict(), os.path.join(checkpoint_dir, checkpoint_name))

# Training

In [None]:
# Training hyper-parameters.
num_epochs = 5
batch_size = 64
# Number of stratified cross-validation folds (one model is trained per fold).
n_splits = 10
# Validation samples with a Jaccard score below this are logged and plotted
# by train_model; -1 never triggers because the score is never negative.
threshold = -1

# Folds are stratified on the sentiment column (see the training cell).
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

# Fast check mode: train on only the first FAST_CHECK_RATIO fraction of the rows.
IS_FAST_CHECK = False
FAST_CHECK_RATIO = 0.01

In [None]:
%%time
if not IS_MISS_TRAIN:
    # Record the run configuration in the log.
    for label, value in (("num_epochs:", num_epochs),
                         ("batch_size:", batch_size),
                         ("n_splits:", n_splits),
                         ("threshold:", threshold)):
        ps(label, value)

    train_df = pd.read_csv(TRAIN)
    if IS_FAST_CHECK:
        # Keep only the leading fraction of the rows for a quick smoke run.
        ps('Fast check mode. Reduce train_df size. Old shape: ', train_df.shape)
        n_keep = int(train_df.shape[0] * FAST_CHECK_RATIO)
        train_df = train_df.iloc[:n_keep, :]
        ps('.. new shape: ', train_df.shape)

    # Force both text columns to str (missing values become the string 'nan').
    for column in ('text', 'selected_text'):
        train_df[column] = train_df[column].astype(str)

    # One freshly initialised model per fold; folds are numbered from 1.
    fold_splits = skf.split(train_df, train_df.sentiment)
    for fold, (train_idx, val_idx) in enumerate(fold_splits, start=1):
        ps(f'Fold: {fold}, Batches in train: {len(train_idx)//batch_size}, Batches in val: {len(val_idx)//batch_size}')

        model = TweetModel()
        optimizer = optim.AdamW(model.parameters(), lr=3e-5, betas=(0.9, 0.999))
        criterion = loss_fn
        dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx, batch_size)

        train_model(
            model,
            dataloaders_dict,
            criterion,
            optimizer,
            num_epochs,
            MODELS + f'roberta_fold{fold}.pth',
            threshold)

In [None]:
#!cp '{MODELS}e3/roberta_fold10.pth' '{MODELS}e_best/'
#!cp '{MODELS}e4/roberta_fold9.pth' '{MODELS}e_best/'
#!cp '{MODELS}e4/roberta_fold8.pth' '{MODELS}e_best/'
#!cp '{MODELS}e4/roberta_fold7.pth' '{MODELS}e_best/'
#!cp '{MODELS}e4/roberta_fold6.pth' '{MODELS}e_best/'
#!cp '{MODELS}e2/roberta_fold5.pth' '{MODELS}e_best/'
#!cp '{MODELS}e2/roberta_fold4.pth' '{MODELS}e_best/'
#!cp '{MODELS}e2/roberta_fold3.pth' '{MODELS}e_best/'
#!cp '{MODELS}e5/roberta_fold2.pth' '{MODELS}e_best/'
#!cp '{MODELS}e4/roberta_fold1.pth' '{MODELS}e_best/'

In [None]:
# Report GPU status again after the training cell.
!nvidia-smi

# Model Archiving

In [None]:
##unzip models
#OUTPUT = '/content/drive/My Drive/Colab Notebooks/2020-05_kaggle/output/2020-06-11T234912/'
#ARCHIVE_FILE = OUTPUT + 'models.zip'
#MODELS = '/content/models/'
#!unzip '{ARCHIVE_FILE}' -d '{MODELS}'

In [None]:
%%time
if COLAB_USE:
    # Zip the epoch-4 checkpoint directory ({MODELS}e4) into ARCHIVE_FILE (Colab only).
    # zip flags: -1 fastest compression, -r recurse into the directory,
    # -j store files without their directory paths.
    # NOTE(review): no -m flag is passed, so the original files are NOT removed
    # after archiving - confirm whether deletion was intended.
    #OUTPUT = '/content/drive/My Drive/Colab Notebooks/2020-05_kaggle/output/2020-06-11T234912/'
    #ARCHIVE_FILE = OUTPUT + 'models_2.zip'
    #MODELS = '/content/drive/My Drive/Colab Notebooks/2020-05_kaggle/output/2020-06-11T234912/models_1/'
    #MODELS = '/content/models/models_2/'
    !zip -1 -r -j '{ARCHIVE_FILE}' '{MODELS}e4'

In [None]:
if COLAB_USE:
    # Keep-alive on Colab: print a timestamp every 5 minutes, forever.
    # This loop never exits, so cells below it do not run on Colab until the
    # cell is interrupted manually.
    last_print_time = 0
    while True:
        if time.time() - last_print_time > 300:
            last_print_time = time.time()
            print(datetime.now(tz=pytz.timezone("Europe/Moscow")).strftime("%Y-%m-%dT%H%M%S"), end='\n')
        # Sleep between checks instead of busy-waiting on a full CPU core.
        time.sleep(1)

# Inference

In [None]:
test_df = pd.read_csv(TEST)
test_df['text'] = test_df['text'].astype(str)
test_loader = get_test_loader(test_df)
predictions = []
models = []
# Load the ensemble. Checkpoint files are numbered from 1, so fold index k maps
# to roberta_fold{k+1}.pth; index 4 (roberta_fold5.pth) is left out of the list.
#for fold in range(skf.n_splits):
for fold in (0,1,2,3,5,6,7,8,9):
    model = TweetModel()
    model.to(device)
    # map_location lets checkpoints saved on a GPU load on whatever `device` is.
    model.load_state_dict(torch.load(MODELS + f'roberta_fold{fold+1}.pth', map_location=device))
    model.eval()
    models.append(model)

for x, data in enumerate(test_loader):
    print(x, end=' ')
    ids = data['ids'].to(device)
    masks = data['masks'].to(device)
    tweet = data['tweet']
    offsets = data['offsets'].numpy()

    # Per-model softmax probabilities for this batch.
    start_logits = []
    end_logits = []
    with torch.no_grad():
        for model in models:
            output = model(ids, masks)
            start_logits.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
            end_logits.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())

    # Ensemble by averaging the probabilities over the models.
    start_logits = np.mean(start_logits, axis=0)
    end_logits = np.mean(end_logits, axis=0)
    for i in range(len(ids)):
        start_pred = np.argmax(start_logits[i])
        end_pred = np.argmax(end_logits[i])
        if start_pred > end_pred:
            # Start lies after the end: take the ordered pair (start <= end)
            # with the highest start + end probability. Invalid pairs are set
            # to -inf; row-major argmax resolves ties to the earliest start,
            # then the earliest end.
            rows = np.arange(len(start_logits[i]))[:, None]
            cols = np.arange(len(end_logits[i]))[None, :]
            pair_scores = np.where(
                rows <= cols,
                start_logits[i][:, None] + end_logits[i][None, :],
                -np.inf)
            start_pred, end_pred = divmod(int(np.argmax(pair_scores)), pair_scores.shape[1])
        pred = get_selected_text(tweet[i], start_pred, end_pred, offsets[i])
        predictions.append(pred)

# Submission

In [None]:
def _collapse_punctuation(selected):
    """Apply the three punctuation replacements in order, each only to single-word strings."""
    for old, new in (('!!!!', '!'), ('..', '.'), ('...', '.')):
        if len(selected.split())==1:
            selected = selected.replace(old, new)
    return selected

# Fill the sample submission with the predictions, post-process single-word
# predictions, and write the result to OUTPUT/submission.csv.
sub_df = pd.read_csv(SAMPLE_SUBM)
sub_df['selected_text'] = predictions
sub_df['selected_text'] = sub_df['selected_text'].apply(_collapse_punctuation)
sub_df.to_csv(os.path.join(OUTPUT, 'submission.csv'), index=False)
sub_df.head()