In [None]:
#Import modules and packages which are needed for the analysis
import numpy as np
import pandas as pd
import os
import re
import string
import random
import warnings
import torch
from torch import nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
import tokenizers
from transformers import RobertaModel, RobertaConfig

warnings.filterwarnings('ignore')

In [None]:
#Set seed to make random input consistent for every run
def seed_everything(seed_num):
    """Seed every random number generator used in this notebook.

    Args:
        seed_num (int): seed applied to ``random``, NumPy and PyTorch
            (CPU and, when CUDA is available, every GPU). It is also
            exported as the ``PYTHONHASHSEED`` environment variable.
    """
    random.seed(seed_num)
    np.random.seed(seed_num)
    torch.manual_seed(seed_num)
    os.environ['PYTHONHASHSEED'] = str(seed_num)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_num)
        torch.cuda.manual_seed_all(seed_num)
        torch.backends.cudnn.deterministic = True
        # benchmark mode lets cuDNN pick algorithms by timing them, which can
        # differ between runs; it has to be off for reproducible results
        torch.backends.cudnn.benchmark = False

In [None]:
# Single seed for the whole notebook: applied to all RNGs here and reused as
# random_state of the StratifiedKFold split in the training cell
seed = 0
seed_everything(seed)

In [None]:
# Directory the competition CSV files (train / test / sample submission) are read from
path = '../input/tweet-sentiment-extraction/'

In [None]:
# train: labelled tweets used for cross-validation training
train = pd.read_csv(path + 'train.csv')
# test: tweets to predict on
test = pd.read_csv(path + 'test.csv')
# sub_df: submission template whose 'selected_text' column is filled in at the end
sub_df = pd.read_csv(path + 'sample_submission.csv')

In [None]:
# Show column dtypes and non-null counts to spot missing values
train.info()

In [None]:
# Per the author's inspection, one training row has a missing value; drop it so the
# string cleaning and tokenisation below never receive NaN.
# Note: this mutates `train` in place and leaves the original index labels (with a gap).
train.dropna(inplace = True)

# **1. DATA CLEANING**

* Tweets are noisy, so they need to be cleaned before modelling. Here I remove URLs, HTML tags and emojis.
* Spelling correction is not performed because of the run-time limit of the competition.

In [None]:
def removeURL(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` URL deleted.

    A URL is matched up to the next whitespace character; the surrounding
    whitespace itself is kept.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
def remove_html(text):
    """Return ``text`` without HTML-like tags.

    Anything from a ``<`` up to the nearest following ``>`` on the same line
    is removed, so plain text such as ``1 <3 you > me`` is affected as well.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, r'', text)

In [None]:
def remove_emoji(text):
    """Return ``text`` with emoji / pictographic characters stripped.

    NOTE(review): the last range (U+24C2 to U+1F251) is very wide. Besides
    enclosed characters it also spans CJK ideographs, Hangul and many other
    scripts, so such text is deleted too. Confirm this is acceptable.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    stripper = re.compile("[" + code_point_ranges + "]+", flags=re.UNICODE)
    return stripper.sub(r'', text)

In [None]:
# Clean both text columns of the training set: cast to pandas 'string' dtype,
# then strip URLs, HTML tags and emoji, in that order.
# NOTE(review): 'text' and 'selected_text' are cleaned independently, so a
# selected span that only partially covers a URL/tag may no longer be a
# substring of the cleaned text -- verify before training.
for col in ['text', 'selected_text']:
    cleaned = train[col].astype('string')
    for cleaner in (removeURL, remove_html, remove_emoji):
        cleaned = cleaned.apply(cleaner)
    train[col] = cleaned

In [None]:
# Apply the same cleaning pipeline to the test tweets:
# 'string' dtype -> remove URLs -> remove HTML tags -> remove emoji
test['text'] = (
    test['text']
    .astype('string')
    .apply(removeURL)
    .apply(remove_html)
    .apply(remove_emoji)
)

# **2. DATA LOADER**

In this project, I need to select the words of a tweet that express its given sentiment. I therefore frame the task like a question-answering problem:
- The question is the sentiment.
- The context is the tweet text.
- The answer is the selected text, which is used as the training target.

The model I am using is RoBERTa. Each input is encoded as `<s> sentiment </s></s> tweet </s>`:
![image.png](attachment:image.png)

So I need to determine:
- The index of the first token of the selected text within the tokenized input.
- The index of the last token of the selected text. Both are found with the tokenizer offsets, which give the character span of every token inside the tweet.

The tokens between these two indices then give the selected text that corresponds to the given sentiment.

In [None]:
#define a class to load data set

class GetDataset(torch.utils.data.Dataset):
    """Dataset that turns tweet rows into RoBERTa span-extraction inputs.

    Every item is a dict with the keys:
        ids      -- token ids: [0] + sentiment ids + [2, 2] + tweet ids + [2],
                    right-padded with 1 up to ``max_len``
        masks    -- 1 where ``ids`` is not the padding id 1, else 0
        tweet    -- the lower-cased, whitespace-normalised tweet (with a leading space)
        offsets  -- (start, end) character span of each token inside ``tweet``;
                    (0, 0) for the special and padding positions
        start_idx, end_idx -- only when ``df`` has a 'selected_text' column:
                    index of the first / last token overlapping the selected text

    Args:
        df: frame with 'text' and 'sentiment' columns (and optionally 'selected_text').
        max_len (int): length sequences are padded to. Longer sequences are
            NOT truncated.
    """

    def __init__(self, df, max_len = 99):
        self.df = df
        self.max_len = max_len
        # targets can only be built when the frame carries the label column
        self.labeled = 'selected_text' in df
        self.tokenizer = tokenizers.ByteLevelBPETokenizer('../input/roberta-base/vocab.json',
                                                         '../input/roberta-base/merges.txt',
                                                         lowercase = True,
                                                         add_prefix_space = True)

    def __getitem__(self, index):
        """Build the input dict (plus targets for labelled data) for row ``index``."""
        data = {}
        row = self.df.iloc[index]

        #get index, mask, tweet, and offsets of selected text on each row
        ids, masks, tweet, offsets = self.get_input_data(row)
        data['ids'] = ids
        data['masks'] = masks
        data['tweet'] = tweet
        data['offsets'] = offsets

        #get start index, end index of selected text on each row
        if self.labeled:
            start_idx, end_idx = self.get_target_idx(row, tweet, offsets)
            data['start_idx'] = start_idx
            data['end_idx'] = end_idx

        return data

    def __len__(self):
        return len(self.df)

    def get_input_data(self, row):
        """Tokenize one row.

        Returns:
            (ids, masks, tweet, offsets) as described in the class docstring;
            ``ids``, ``masks`` and ``offsets`` are torch tensors.
        """
        #Roberta's tokenizer expects a space prefix, so one is added to each text;
        #splitting and re-joining collapses any redundant whitespace
        tweet = ' ' + ' '.join(row.text.lower().split())

        #[0] is the start-of-sentence token id and [2] the end-of-sentence token id
        encoding = self.tokenizer.encode(tweet)
        sentiment_id = self.tokenizer.encode(row.sentiment).ids
        ids = [0] + sentiment_id + [2,2] + encoding.ids + [2]

        #4 leading dummy offsets cover the start token, the sentiment token and 2 stop tokens
        #(this assumes the sentiment encodes to a single token)
        offsets = [(0,0)]*4 + encoding.offsets + [(0,0)]

        #pad with id 1 up to max_len (nothing is done when the row is already longer)
        pad_len = self.max_len - len(ids)
        if pad_len > 0:
            ids += [1] * pad_len
            offsets += [(0,0)] * pad_len

        #convert ids and offset to torch tensor
        ids = torch.tensor(ids)
        offsets = torch.tensor(offsets)

        #attention mask: 0 wherever the id equals the padding id 1
        masks = torch.where(ids != 1, torch.tensor(1), torch.tensor(0))

        return ids, masks, tweet, offsets

    def get_target_idx(self, row, tweet, offsets):
        """Locate the selected text inside ``tweet`` at token level.

        Args:
            row: object exposing ``selected_text`` (str).
            tweet (str): normalised tweet returned by ``get_input_data``.
            offsets: per-token (start, end) character spans (tensor or sequence).

        Returns:
            (start_idx, end_idx): indices of the first and last token whose
            character span overlaps the first occurrence of the selected text.

        Raises:
            ValueError: if the normalised selected text is empty, does not occur
                in ``tweet``, or no token overlaps it.
        """
        #same normalisation as the tweet (lower-case, single spaces)
        selected = ' '.join(row.selected_text.lower().split())

        #character span [char_start, char_end) of the first occurrence in the tweet
        char_start = tweet.find(selected) if selected else -1
        if char_start < 0:
            raise ValueError(
                'selected_text {!r} was not found in tweet {!r}'.format(selected, tweet))
        char_end = char_start + len(selected)

        #a token belongs to the target when its character span intersects the
        #selected span; the (0, 0) spans of special/pad tokens never do
        spans = offsets.tolist() if hasattr(offsets, 'tolist') else offsets
        target_idx = [i for i, (offset1, offset2) in enumerate(spans)
                      if max(offset1, char_start) < min(offset2, char_end)]
        if not target_idx:
            raise ValueError(
                'no token overlaps selected_text {!r} in tweet {!r}'.format(selected, tweet))

        return target_idx[0], target_idx[-1]

In [None]:
#function to get train loader and validation loader

def get_train_val_loaders(df, train_idx, val_idx, batch_size=8):
    """Build the training and validation data loaders for one fold.

    Args:
        df: full training frame.
        train_idx, val_idx: positional row indices (used with ``iloc``) of the
            training and validation parts.
        batch_size (int): batch size for both loaders.

    Returns:
        dict with keys 'train' (shuffled, incomplete last batch dropped) and
        'val' (original order, all rows kept).
    """
    shared_kwargs = dict(batch_size=batch_size, num_workers=2)

    return {
        'train': torch.utils.data.DataLoader(GetDataset(df.iloc[train_idx]),
                                             shuffle=True,
                                             drop_last=True,
                                             **shared_kwargs),
        'val': torch.utils.data.DataLoader(GetDataset(df.iloc[val_idx]),
                                           shuffle=False,
                                           **shared_kwargs),
    }

In [None]:
#function to get test loader

def get_test_loader(df, batch_size=8):
    """Wrap ``df`` in a ``GetDataset`` and return an unshuffled loader over it.

    Row order is preserved so predictions line up with the rows of ``df``.
    """
    test_dataset = GetDataset(df)
    return torch.utils.data.DataLoader(test_dataset,
                                       batch_size=batch_size,
                                       shuffle=False,
                                       num_workers=2)

# **3. MODEL**

In [None]:
#define a class of roberta base model

class TweetModel(nn.Module):
    """RoBERTa encoder with a linear head predicting span start / end logits.

    The head is applied to the mean of the last four hidden-state layers, after
    dropout, and emits two values per token: a start logit and an end logit.
    """

    def __init__(self):

        super(TweetModel, self).__init__()

        # output_hidden_states=True makes the encoder return every layer's
        # hidden states, which forward() averages over
        config = RobertaConfig.from_pretrained('../input/roberta-base/config.json',
                                              output_hidden_states=True)
        self.roberta = RobertaModel.from_pretrained('../input/roberta-base/pytorch_model.bin',
                                                   config=config)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.02)
        # NOTE(review): this draws the bias from a normal distribution with mean 0
        # and the default std; a zero initialisation may have been intended -- confirm
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)``, one value per input token.

        Args:
            input_ids: token ids, assumed to be (batch, seq_len).
            attention_mask: mask with the same shape as ``input_ids``.
        """
        #get output of roberta base model
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Position 2 holds the per-layer hidden states. Integer indexing works for
        # the plain tuple returned by older transformers releases and for the
        # dict-like ModelOutput of newer ones; tuple-unpacking a ModelOutput would
        # yield its keys instead of its values.
        hs = outputs[2]

        # average the last four layers
        x = torch.stack([hs[-1], hs[-2], hs[-3], hs[-4]])
        x = torch.mean(x, 0)
        x = self.dropout(x)
        x = self.fc(x)

        # split the 2-wide head output into separate start / end logits
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

# **4. LOSS FUNCTION**

In [None]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Return the summed cross-entropy of the start and end predictions.

    Each pair (logits, positions) is scored with ``nn.CrossEntropyLoss`` using
    its default settings, and the two losses are added.
    """
    criterion = nn.CrossEntropyLoss()
    return (criterion(start_logits, start_positions)
            + criterion(end_logits, end_positions))

# **5. EVALUATION**

In [None]:
def get_selected_text(text, start_idx, end_idx, offsets):
    """Rebuild the string covered by tokens ``start_idx`` .. ``end_idx`` (inclusive).

    Args:
        text: string the offsets point into.
        start_idx, end_idx: first and last token index of the span.
        offsets: per-token (start, end) character positions in ``text``.

    Returns:
        The concatenated token texts. A single space is appended after a token
        whenever the next token (if any) starts beyond that token's end.
    """
    pieces = []
    n_tokens = len(offsets)

    for ix in range(start_idx, end_idx+1):
        begin, stop = offsets[ix][0], offsets[ix][1]
        pieces.append(text[begin:stop])
        #a gap before the next token means the words were separated in the text
        nxt = ix + 1
        if nxt < n_tokens and stop < offsets[nxt][0]:
            pieces.append(' ')

    return ''.join(pieces)

In [None]:
#functions to compute jaccard score of similarity between 2 strings
def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting word sets.

    Returns:
        float in [0, 1]. Two strings without any word (empty or whitespace
        only) are treated as identical and score 1.0 instead of raising
        ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        # empty union: avoid dividing by zero
        return 1.0
    c = a.intersection(b)

    return float(len(c)/(len(a)+len(b)-len(c)))

def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    """Jaccard score between the predicted span and the true span of one tweet.

    The predicted start / end tokens are the arg-max of the respective score
    vectors. When the predicted start lies after the predicted end, the whole
    ``text`` is used as the prediction.
    """
    start_pred, end_pred = np.argmax(start_logits), np.argmax(end_logits)

    true = get_selected_text(text, start_idx, end_idx, offsets)
    if start_pred <= end_pred:
        pred = get_selected_text(text, start_pred, end_pred, offsets)
    else:
        #inconsistent span: fall back to the full text
        pred = text

    return jaccard(pred, true)

# **6. TRAINING**

In [None]:
def train_model(model, dataloaders_dict, criterion, optimizer, num_epochs, filename):
    """Train ``model`` with a validation pass after every epoch, then save it.

    Args:
        model: network returning ``(start_logits, end_logits)`` for ``(ids, masks)``.
        dataloaders_dict: ``{'train': loader, 'val': loader}``; every batch is a
            dict with 'ids', 'masks', 'tweet', 'offsets', 'start_idx', 'end_idx'.
        criterion: callable ``(start_logits, end_logits, start_idx, end_idx) -> loss``.
        optimizer: optimizer over ``model``'s parameters.
        num_epochs (int): number of train + validation rounds.
        filename: path the final ``state_dict`` is written to.

    Per phase and epoch, the mean loss and mean Jaccard score over the samples
    actually processed are printed.
    """
    # use the GPU when there is one, otherwise stay on CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = 0.0
            epoch_jaccard = 0.0
            # count the samples really seen: a loader may drop its last batch,
            # so len(dataset) would be the wrong divisor
            n_samples = 0

            for data in (dataloaders_dict[phase]):
                ids = data['ids'].to(device)
                masks = data['masks'].to(device)
                # the default collate function returns the tweets as a list of
                # str, which cannot (and need not) be moved to the GPU
                tweet = data['tweet']
                # offsets are only used for string slicing on the CPU
                offsets = data['offsets'].numpy()
                start_idx = data['start_idx'].to(device)
                end_idx = data['end_idx'].to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase=='train'):
                    start_logits, end_logits = model(ids, masks)
                    loss = criterion(start_logits, end_logits, start_idx, end_idx)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                n_batch = len(ids)
                n_samples += n_batch
                # the loss is a batch mean, so weight it by the batch size
                epoch_loss += loss.item() * n_batch

                start_idx = start_idx.cpu().detach().numpy()
                end_idx = end_idx.cpu().detach().numpy()
                start_logits = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
                end_logits = torch.softmax(end_logits, dim=1).cpu().detach().numpy()

                for i in range(n_batch):
                    epoch_jaccard += compute_jaccard_score(tweet[i],
                                                           start_idx[i],
                                                           end_idx[i],
                                                           start_logits[i],
                                                           end_logits[i],
                                                           offsets[i])

            # guard against an empty loader
            epoch_loss = epoch_loss/max(n_samples, 1)
            epoch_jaccard = epoch_jaccard/max(n_samples, 1)

            print('Epoch {}/{} | {:^5} | Loss: {:.4f} | Jaccard: {:.4f}'.format(
                epoch + 1, num_epochs, phase, epoch_loss, epoch_jaccard))

    torch.save(model.state_dict(), filename)

In [None]:
# Training configuration. `skf` is also read by the inference cell
# (skf.n_splits) to know how many fold checkpoints to load.
num_epochs = 5
batch_size = 8
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state = seed)

# Train one fresh model per fold; folds are stratified on the sentiment label
# and split() yields positional indices that get_train_val_loaders uses with iloc
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train.sentiment), start=1):
    print(f'Fold: {fold}')
    
    model = TweetModel()
    # NOTE(review): lr=1e-6 looks very small for fine-tuning -- confirm it is intended
    optimizer = optim.AdamW(model.parameters(), lr=1e-6, betas=(0.9, 0.999))
    criterion = loss_fn
    dataloaders_dict = get_train_val_loaders(train, train_idx, val_idx, batch_size)
    
    # weights of this fold are written to roberta_fold{fold}.pth
    train_model(model, dataloaders_dict, criterion, optimizer, num_epochs, f'roberta_fold{fold}.pth')

# **7. INFERENCE**

In [None]:
test_loader = get_test_loader(test)
predictions = []
models = []
# use the GPU when there is one, otherwise stay on CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load the checkpoint saved for every cross-validation fold
for fold in range(skf.n_splits):
    model = TweetModel()
    model.load_state_dict(torch.load(f'roberta_fold{fold+1}.pth', map_location=device))
    model.to(device)
    model.eval()
    models.append(model)

for data in test_loader:
    ids = data['ids'].to(device)
    masks = data['masks'].to(device)
    # the default collate function returns the tweets as a list of str,
    # which cannot (and need not) be moved to the GPU
    tweet = data['tweet']
    # offsets are only used for string slicing on the CPU
    offsets = data['offsets'].numpy()

    start_logits = []
    end_logits = []

    # collect the per-token probabilities of every fold model
    with torch.no_grad():
        for model in models:
            output = model(ids, masks)
            start_logits.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
            end_logits.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())

    # ensemble: average the probabilities over the fold models
    start_logits = np.mean(start_logits, axis=0)
    end_logits = np.mean(end_logits, axis=0)

    for i in range(len(ids)):
        start_pred = np.argmax(start_logits[i])
        end_pred = np.argmax(end_logits[i])
        if start_pred > end_pred:
            # inconsistent span: fall back to the whole tweet
            pred = tweet[i]
        else:
            pred = get_selected_text(tweet[i], start_pred, end_pred, offsets[i])
        predictions.append(pred)

# **8. MAKE SUBMISSION FILE**

In [None]:
sub_df['selected_text'] = predictions

# Post-process single-word predictions only: collapse repeated punctuation.
# The replacements run one after another in this exact order, each on the
# result of the previous one, so the order affects the outcome.
for old, new in (('!!!!', '!'), ('..', '.'), ('...', '.')):
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x, old=old, new=new: x.replace(old, new) if len(x.split())==1 else x)

sub_df.to_csv('submission.csv', index=False)