In [34]:
import os
import re
import string
import json
import emoji
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW
import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", None)

In [35]:
# df_train = pd.read_csv("../input/emotion-data/emotion_data/train.csv")
# # df_train.head()

# # df_train = pd.read_csv("../input/emotion-data/goemotion.csv")
# df_test = pd.read_csv("../input/emotion-data/emotion_data/test.csv")
# # df_valid = pd.read_csv("../input/emotion-data/emotion_data/train.csv")

In [36]:
df_train = pd.read_csv("../input/voilla/additional_features_train.csv", index_col=["Unnamed: 0"])
df_test = pd.read_csv("../input/voilla/additional_features_test.csv", index_col=["Unnamed: 0"])

In [37]:
df_train.shape

(7724, 21)

In [38]:
import json

In [39]:
df_train["encoded_dependency_features"] = df_train["encoded_dependency_features"].apply(json.loads)
df_train["encoded_noun_features"] = df_train["encoded_noun_features"].apply(json.loads)

In [40]:
# type(.iloc[0])

In [41]:
df_dev = df_train.iloc[:1000]
df_train = df_train.iloc[1000:]

In [42]:
df_train = df_train.reset_index(drop=True)
df_dev = df_dev.reset_index(drop=True)

In [43]:
print(df_train.shape)
print(df_dev.shape)

(6724, 21)
(1000, 21)


In [44]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [45]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 150
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 2e-5
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")

Downloading:   0%|          | 0.00/768 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [46]:
# df_train

In [47]:
target_cols = [ 'anger', 'anticipation', 'disgust', 'fear', 'joy',
       'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust', 'neutral']
target_cols

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust',
 'neutral']

In [48]:
class BERTDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.text = df.Tweet
        self.tokenizer = tokenizer
        self.targets = df[target_cols].values
        self.feature1 = df["encoded_dependency_features"]
        self.feature2 = df["encoded_noun_features"]
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'feature1' : torch.tensor(self.feature1[index], dtype=torch.long),
            'feature2' : torch.tensor(self.feature2[index], dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [49]:
train_dataset = BERTDataset(df_train, tokenizer, MAX_LEN)
valid_dataset = BERTDataset(df_dev, tokenizer, MAX_LEN)

In [50]:
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, 
                          num_workers=4, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, 
                          num_workers=4, shuffle=False, pin_memory=True)

In [51]:
from transformers import AutoModel
_model = AutoModel.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
# sentence-transformers/all-MiniLM-L6-v2

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# _model

In [53]:
# from torch import nn
# _model.classifier.out_proj = torch.nn.Linear(in_features=768, out_features=11, bias=True)

In [54]:
# _model

In [55]:
from torch import nn

In [56]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.roberta = _model
        self.dense_layer_1 = nn.Linear(768+110+110, 256)
        self.dropout = nn.Dropout(0.2)
        self.dense_layer_2 = nn.Linear(256, 128)
        self.dropout_2 = nn.Dropout(0.2) 
        self.cls_layer = nn.Linear(128, 12, bias = True)
    
    def forward(self, input_ids, attention_masks, feature1, feature2):
        
        pooled_output = self.roberta(input_ids=input_ids, attention_mask=attention_masks)["pooler_output"]
        
        concat = torch.cat( (pooled_output, feature1, feature2), dim=1 )
        
        x = self.dense_layer_1(concat)
        x = self.dropout(x)
        x_1 = self.dense_layer_2(x)
        x_2 = self.dropout_2(x_1)
        
        logits = self.cls_layer(x_2)
        
        return logits

model = BERTClass()
model.to(device);

In [57]:
class ContrastiveLoss(torch.nn.Module):
  def __init__(self, m=2.0):
    super(ContrastiveLoss, self).__init__()  # pre 3.3 syntax
    self.m = m  # margin or radius

  def forward(self, y1, y2, d=0):
    # d = 0 means y1 and y2 are supposed to be same
    # d = 1 means y1 and y2 are supposed to be different
    
    euc_dist = torch.nn.functional.pairwise_distance(y1, y2)

    if d == 0:
      return torch.mean(torch.pow(euc_dist, 2))  # distance squared
    else:  # d == 1
      delta = self.m - euc_dist  # sort of reverse distance
      delta = torch.clamp(delta, min=0.0, max=None)
      return torch.mean(torch.pow(delta, 2))  # mean over all rows

loss_fn = nn.BCEWithLogitsLoss()

In [58]:
from torch.optim import Adam

In [59]:
optimizer = AdamW(params =  model.parameters(), betas=(0.9, 0.999), eps=1e-8, lr=2e-5)

In [60]:
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(valid_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            feature1 = data['feature1'].to(device, dtype = torch.long)
            feature2 = data['feature2'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, feature1, feature2)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [61]:
def train(epoch):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        feature1 = data['feature1'].to(device, dtype = torch.long)
        feature2 = data['feature2'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, feature1, feature2)

        loss = loss_fn(outputs, targets)
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        if _%500 == 0:
            
            for _, data in enumerate(valid_loader, 0):
                ids = data['ids'].to(device, dtype = torch.long)
                mask = data['mask'].to(device, dtype = torch.long)
                feature1 = data['feature1'].to(device, dtype = torch.long)
                feature2 = data['feature2'].to(device, dtype = torch.long)
                targets = data['targets'].to(device, dtype = torch.float)
                outputs = model(ids, mask, feature1, feature2)
                loss = loss_fn(outputs, targets)
                
            outputs, targets = validation()
            accuracy = metrics.f1_score(np.array(targets).round().astype(int), np.array(outputs).round().astype(int), average="macro")
            print(f'Epoch: {epoch}, Val F1:  {accuracy}')
            print(f'Epoch: {epoch}, Val Loss:  {loss.item()}')

In [62]:
for epoch in range(30):
    train(epoch)

Epoch: 0, Val F1:  0.20682998586230028
Epoch: 0, Val Loss:  0.7270818948745728
Epoch: 1, Val F1:  0.3672037594260415
Epoch: 1, Val Loss:  0.22946123778820038
Epoch: 2, Val F1:  0.46167119739086054
Epoch: 2, Val Loss:  0.2265312373638153
Epoch: 3, Val F1:  0.48834506610580014
Epoch: 3, Val Loss:  0.20787957310676575
Epoch: 4, Val F1:  0.4840207266871262
Epoch: 4, Val Loss:  0.23711282014846802
Epoch: 5, Val F1:  0.4905962390215013
Epoch: 5, Val Loss:  0.2221367210149765
Epoch: 6, Val F1:  0.525532531493519
Epoch: 6, Val Loss:  0.28976261615753174
Epoch: 7, Val F1:  0.5383975096721595
Epoch: 7, Val Loss:  0.315105676651001
Epoch: 8, Val F1:  0.5483165814138664
Epoch: 8, Val Loss:  0.27668747305870056
Epoch: 9, Val F1:  0.549338740835781
Epoch: 9, Val Loss:  0.3606465756893158
Epoch: 10, Val F1:  0.5258462928385806
Epoch: 10, Val Loss:  0.31350183486938477
Epoch: 11, Val F1:  0.540477624516816
Epoch: 11, Val Loss:  0.3419830799102783
Epoch: 12, Val F1:  0.5420873879140685
Epoch: 12, Val L

KeyboardInterrupt: 

In [63]:
outputs, targets = validation()
outputs = np.array(outputs).round()

outputs = pd.DataFrame(outputs)

accuracy = metrics.accuracy_score(targets, outputs)
f1_score = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score = {f1_score}")

Accuracy Score = 0.227
F1 Score = 0.5440240917620088


<a href="./hero"> Download File </a>

In [None]:
# test

In [65]:
torch.save(model, "hero")

In [None]:
df_test = pd.read_csv("../input/voilla/additional_features_test.csv", index_col=["Unnamed: 0"])
df_test["encoded_dependency_features"] = df_test["encoded_dependency_features"].apply(json.loads)
df_test["encoded_noun_features"] = df_test["encoded_noun_features"].apply(json.loads)

In [None]:
def give_padding(sent):

  var_len = len(sent)
  padding_len = 110-var_len
  padding = [0]*padding_len
  sent = sent + padding
  
  return sent[:110]

In [None]:
df_test['encoded_noun_features'] = df_test['encoded_noun_features'].apply(lambda x : give_padding(x) )
df_test['encoded_dependency_features'] = df_test['encoded_dependency_features'].apply(lambda x : give_padding(x) )

In [None]:
class TestBERTDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.idx = df.index
        self.text = df.Tweet
        self.tokenizer = tokenizer
        self.feature1 = df["encoded_dependency_features"]
        self.feature2 = df["encoded_noun_features"]
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.text[index]
        idx = self.idx[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'idx' : torch.tensor(idx, dtype=torch.long),
            'feature1' : torch.tensor(self.feature1[index], dtype=torch.long),
            'feature2' : torch.tensor(self.feature2[index], dtype=torch.long)
        }

In [None]:
test_dataset = TestBERTDataset(df_test, tokenizer, MAX_LEN)

In [None]:
test_loader = DataLoader(test_dataset, batch_size=1, 
                          num_workers=4, shuffle=False, pin_memory=True)

In [None]:
fin_outputs=[]
with torch.no_grad():
    for _, data in enumerate(test_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        feature1 = data['feature1'].to(device, dtype = torch.long)
        feature2 = data['feature2'].to(device, dtype = torch.long)
        outputs = model(ids, mask, feature1, feature2)
        fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

In [None]:
# fin_outputs

In [None]:
fin_outputs = np.array(fin_outputs) >= 0.5

In [None]:
fin_outputs.astype(int).sum()

In [None]:
columns = pd.read_csv("../input/emotion-data/emotion_data/train.csv").columns

In [None]:
submission = pd.concat([df_test[["ID", "Tweet"]], pd.DataFrame(fin_outputs.astype(int))], axis=1)
# submission["neutral"] = 0
# submission.columns = columns

In [None]:
submission.head()

In [None]:
submission.to_csv("optimism.csv", index=False)

In [None]:
submission.sum()

<a href="./model.bin"> Download File </a>