# Kaggle Competition Code

In [1]:
import pandas as pd

import numpy as np

from tqdm.auto import tqdm

import torch

import torch.nn as nn

from torch.utils.data import Dataset, DataLoader


from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import XLNetTokenizer, XLNetForSequenceClassification


from transformers import InputExample, InputFeatures


from sklearn.model_selection import train_test_split

import emoji

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Pulling in data and taking a look

data_identification = pd.read_csv("kaggle/data_identification.csv")
data_identification.head()

Unnamed: 0,tweet_id,identification
0,0x28cc61,test
1,0x29e452,train
2,0x2b3819,train
3,0x2db41f,test
4,0x2a2acc,train


In [3]:
# Pulling in data and taking a look

emotion = pd.read_csv("kaggle/emotion.csv")
emotion.head()

Unnamed: 0,tweet_id,emotion
0,0x3140b1,sadness
1,0x368b73,disgust
2,0x296183,anticipation
3,0x2bd6e1,joy
4,0x2ee1dd,anticipation


In [4]:
import numpy as np

# Below is some preprocessing, commented out because I exported the output to csv to avoid the long processing time


# Taking the weird nested json format and flattening
# tweets_source = tweets['_source'].tolist()
# for i in range(len(tweets_source)):
#     tweets_source[i] = tweets_source[i]['tweet']
# tweets_source = pd.DataFrame(tweets_source)
# def json_to_series(text):
#     keys, values = zip(*[item for dct in json.loads(json.dumps(text)) for item in dct.items()])
#     return pd.Series(values, index=keys)

# Merging the flattened data and original data
# tweets_expanded = pd.concat([tweets, tweets_source], axis=1)
# tweets_merged = pd.merge(tweets_expanded, data_identification, on="tweet_id", how='left')
# tweets_merged = pd.merge(tweets_merged, emotion, on='tweet_id', how='left')

# Using the emoji module to replace emoji with actual text for BERT
# tweets_merged['text'] = tweets_merged['text'].apply(emoji.demojize, delimiters=("", ""))

# Turned into a csv to save time
tweets_merged = pd.read_csv("tweets_processed.csv")

# Setting up train and test splits
tweets_train_eval = tweets_merged[tweets_merged['identification'] == 'train']
tweets_test = tweets_merged[tweets_merged['identification'] == 'test']

In [5]:
# Export processed tweets to csv to avoid having to reprocess
# tweets_merged.to_csv("tweets_processed.csv")

In [6]:
from sklearn import preprocessing, metrics, decomposition, pipeline, dummy

# Using LabelEncoder to turn emotion labels into numeric representation

mle = preprocessing.LabelEncoder()
mle.fit(tweets_train_eval['emotion'])
mle.classes_
tweets_train_eval['label'] = mle.transform(tweets_train_eval['emotion']).tolist()

# Tried out two ways to split test and eval, I preferred train_test_split because its a bit more clear
# tweets_train, tweets_eval = np.split(tweets_train_eval.sample(frac=1, random_state = 99), [int(.8*len(tweets_train_eval))])
tweets_train, tweets_eval, y_train, y_val = train_test_split(tweets_train_eval.index.values, 
        tweets_train_eval['label'].values, test_size=0.15, random_state=99, stratify = tweets_train_eval['label'].values)

tweets_train_eval['data_type'] = ['not_set']*tweets_train_eval.shape[0]

# Label train and val
tweets_train_eval.loc[tweets_train, 'data_type'] = "train"
tweets_train_eval.loc[tweets_eval, 'data_type'] = "val"

# Taking a look at counts per train and val
tweets_train_eval.groupby(['emotion', 'label', 'data_type']).count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_train_eval['label'] = mle.transform(tweets_train_eval['emotion']).tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_train_eval['data_type'] = ['not_set']*tweets_train_eval.shape[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 0,_score,_index,_source,_crawldate,_type,hashtags,tweet_id,text,identification
emotion,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
anger,0,train,33887,33887,33887,33887,33887,33887,33887,33887,33887,33887
anger,0,val,5980,5980,5980,5980,5980,5980,5980,5980,5980,5980
anticipation,1,train,211595,211595,211595,211595,211595,211595,211595,211595,211595,211595
anticipation,1,val,37340,37340,37340,37340,37340,37340,37340,37340,37340,37340
disgust,2,train,118236,118236,118236,118236,118236,118236,118236,118236,118236,118236
disgust,2,val,20865,20865,20865,20865,20865,20865,20865,20865,20865,20865
fear,3,train,54399,54399,54399,54399,54399,54399,54399,54399,54399,54399
fear,3,val,9600,9600,9600,9600,9600,9600,9600,9600,9600,9600
joy,4,train,438614,438614,438614,438614,438614,438614,438614,438614,438614,438614
joy,4,val,77403,77403,77403,77403,77403,77403,77403,77403,77403,77403


In [7]:
# Looking at overall emotion counts. It is imbalanced, but totals are high so not super worried

tweets_train_eval['emotion'].value_counts()

joy             516017
anticipation    248935
trust           205478
sadness         193437
disgust         139101
fear             63999
surprise         48729
anger            39867
Name: emotion, dtype: int64

In [8]:
# Using pre-trained BERT through the Transformers library
# I tried a few different pretrained models (regular bert, bert large, roberta large, and XLnet)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Setting up the encoding. I followed a few different tutorials on how to do this, but found batch_encode_plus was most convenient

encoded_data_train = tokenizer.batch_encode_plus(
    tweets_train_eval[tweets_train_eval['data_type'] == "train"].text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length', 
    max_length=90, 
    # Pytorch tensor
    return_tensors='pt',
    truncation=True
)

encoded_data_val = tokenizer.batch_encode_plus(
    tweets_train_eval[tweets_train_eval['data_type'] == "val"].text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding="max_length", 
    max_length=90, 
    # Pytorch tensor
    return_tensors='pt',
    truncation=True
)

In [9]:
from torch.utils.data import DataLoader, TensorDataset
from transformers import DataCollatorWithPadding


# include labels in the encoded data

encoded_data_train['labels'] = tweets_train_eval[tweets_train_eval['data_type'] == "train"].label.values
encoded_data_val['labels'] = tweets_train_eval[tweets_train_eval['data_type'] == "val"].label.values

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(tweets_train_eval[tweets_train_eval['data_type'] == "train"].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(tweets_train_eval[tweets_train_eval['data_type'] == "val"].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

# using DataLoader from pytorch to pre-load the data
train_dataloader = DataLoader(dataset_train, shuffle=True, batch_size=256)
val_dataloader = DataLoader(dataset_val, shuffle=True, batch_size=256)







In [10]:
from transformers import get_scheduler, AdamW

# Setting up a dictionary to map label numeric values and actual values

label_dict = dict(zip(mle.classes_, mle.transform(mle.classes_)))

# Setting up the pretrained model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

# Using AdamW for the optimizer
optimizer = AdamW(model.parameters(),
                  lr=1e-05, 
                  eps=1e-08)
                  
# defining the learning rate scheduler
num_epochs = 4
num_training_steps = num_epochs* len(train_dataloader)
scheduler = get_scheduler("linear", optimizer = optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [29]:
import random
from sklearn.metrics import f1_score


seed_val = 1776
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
model.to(device)



def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def evaluate(dataloader_val):
    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    f1 = f1_score_func(predictions, true_vals)
    return f1


print("starting training...")
    
for epoch in (range(num_epochs)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(train_dataloader)
    for batch in progress_bar:
        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_description('Epoch {:1d}'.format(epoch))
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), 'finetuned_BERT_emoji_epoch_{}.model'.format(epoch))
        
    tqdm.write(f'\nEpoch {epoch}')
    
    f1 = evaluate(val_dataloader)
    tqdm.write("epoch {} validation f1 score: {}".format(epoch, f1))



starting training...


Epoch 0:   0%|          | 13/4833 [00:11<1:09:34,  1.15it/s, training_loss=0.367]


KeyboardInterrupt: 

In [31]:
# Getting ready for prediction

# Load model
model.load_state_dict(torch.load('finetuned_BERT_emoji_epoch_0.model', map_location=torch.device('cpu')))

# Encode test data
encoded_data_test = tokenizer.batch_encode_plus(
    tweets_test.text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length', 
    max_length=90, 
    # Pytorch tensor
    return_tensors='pt',
    truncation=True
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [36]:
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']


dataset_test = TensorDataset(input_ids_test, attention_masks_test)

test_dataloader = DataLoader(dataset_test, shuffle=True, batch_size=256)

In [37]:
model.to(device)
def testing(dataloader_test):

    model.eval()
    
    predictions = []
    
    for batch in dataloader_test:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        logits = outputs[0]

        logits = logits.detach().cpu().numpy()
        predictions.append(logits)
    
    
    predictions = np.concatenate(predictions, axis=0)
            
    return predictions

predictions = testing(test_dataloader)

In [51]:
predictions = np.argmax(predictions, axis=1).flatten()

In [50]:
predictions 

array([5, 2, 5, ..., 4, 7, 4], dtype=int64)

In [47]:
tweets_test.shape

(411972, 11)

In [52]:
label_dict_inverse = {v: k for k, v in label_dict.items()}


tweets_test.insert(0, "predictions", predictions)

In [53]:
tweets_test['prediction'] = tweets_test['predictions'].map(label_dict_inverse)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_test['predictions'] = tweets_test['predictions'].map(label_dict_inverse)


In [55]:
submission = tweets_test[['tweet_id', 'predictions']]
submission

Unnamed: 0,tweet_id,predictions
2,0x28b412,sadness
4,0x2de201,disgust
9,0x218443,sadness
30,0x2939d5,fear
33,0x26289a,joy
...,...,...
1867525,0x2913b4,surprise
1867529,0x2a980e,trust
1867530,0x316b80,joy
1867531,0x29d0cb,trust


In [None]:
submission.rename(columns={"predictions":"emotion", "tweet_id":"id"}, inplace=True)
submission.to_csv('bert_emoji_epoch_6.csv', index=False)