# BERTweet fine-tuned feature vector builder


In [1]:
# Probe for a target GPU before installing anything
# (this cell runs before the runtime restart, so it needs its own import).

import torch

# Select CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type != "cuda":
    print('No GPU available, using the CPU instead.')
else:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


# Installs

In [2]:
!pip install transformers
!pip install emoji==0.6.0
!pip install numpy==1.21.5
!pip install torch==1.10.0+cu111 torchvision==0.11.0+cu111 torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html
!pip install scipy==1.5.2

Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 61.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.4 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.10.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torch-1.10.0%2Bcu111-cp37-cp37m-linux_x86_64.whl (2137.6 MB)
[K     |████████████▌                   | 834.1 MB 1.4 MB/s eta 0:15:45tcmalloc: large alloc 1147494400 bytes == 0x3918c000 @  0x7fd49814f615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7
[K     |███████████████▉                | 1055.7 MB 1.3 MB/s eta 0:14:07tcmalloc: large alloc 1434370048 bytes == 0x7d7e2000 @  0x7fd49814f615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x51

Collecting scipy==1.5.2
  Downloading scipy-1.5.2-cp37-cp37m-manylinux1_x86_64.whl (25.9 MB)
[K     |████████████████████████████████| 25.9 MB 45 kB/s 
Installing collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.4.1
    Uninstalling scipy-1.4.1:
      Successfully uninstalled scipy-1.4.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
Successfully installed scipy-1.5.2


# **Restart the runtime after installs!**

# Imports

In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import random
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import emoji
   
from sklearn.metrics import precision_recall_curve 

from transformers import AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
%matplotlib inline

# Check GPU for reproducibility

In [2]:
# setup GPU: `device` is the target every model and batch is moved to later on
cuda_ready = torch.cuda.is_available()

if cuda_ready:
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print(f'There are {gpu_count} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

if not cuda_ready:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


# Load dataset

In [3]:
# Read the training CSV. Later cells take the text from its 'tweet' column and
# the label from the column named by LABEL_TO_CLASSIFY.
training_data = pd.read_csv('/content/balanced_train_En_seed10.csv')
# Bare last expression: display the frame as the cell output.
training_data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,341,341,@Dancruz07 Sureeeeeee you do,1,"I could've replied ""You do not despise me, you...",1.0,0.0,0.0,0.0,0.0,0.0
1,138,138,"@BoardroomBoy Few links for you. To summarise,...",1,"I would omit the ""who'da thunk"" part.",1.0,0.0,0.0,0.0,0.0,1.0
2,241,241,#bbcqt It sure would be nice if Ms May decided...,1,Theresa May just needs to answer the question.,1.0,0.0,0.0,0.0,0.0,0.0
3,1359,1359,Someone in Walmart asked my mom if I was a boy...,0,,,,,,,
4,588,588,Being a mom with an autoimmune disease means I...,1,I wouldn't because sarcasm is the only weapon ...,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1557,1635,1635,i was so bored today that i let my computer re...,0,,,,,,,
1558,2306,2306,It is midnight. On a Thursday. My roommates de...,0,,,,,,,
1559,3149,3149,"when i was taken off anesthesia today, i told ...",0,,,,,,,
1560,1153,1153,@savingfilm @_celia_bedelia_ I have zero tampo...,0,,,,,,,


In [4]:
# Read the extra positive examples used to augment the training set
# ('tweet' and 'label' columns are consumed further down).
# lineterminator='\n' makes the parser break rows on '\n' only
# (presumably because some tweets contain other line-break characters — TODO confirm).
positive_augment_training_data = pd.read_csv('/content/sarcastic_all_no_hashtags.csv', lineterminator='\n')
# Bare last expression: display the frame as the cell output.
positive_augment_training_data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet,label
0,0,0,"@cityofdenver, been driving/sliding all around...",1
1,1,1,This. This is the news that matters: http://t....,1
2,2,2,"@empressivegeek LOL, you guys are lawyers righ...",1
3,3,3,The start of another super busy week. So pumped.,1
4,4,4,I'm gonna start knitting again.. Because it's ...,1
...,...,...,...,...
28023,28023,28006,@dangainor @SuePendleton2 @piersmorgan pahahah...,1
28024,28024,28007,@jennapoole11 wait till the game at White Hous...,1
28025,28025,28008,Well well I didn't know it was snowing with al...,1
28026,28026,28009,Just want to thank all of my friends for showi...,1


In [5]:
# Read the extra negative examples used to augment the training set
# ('tweet' and 'label' columns are consumed further down).
# lineterminator='\n' makes the parser break rows on '\n' only
# (presumably because some tweets contain other line-break characters — TODO confirm).
negative_augment_training_data = pd.read_csv('/content/negative_all.csv', lineterminator='\n')
# Bare last expression: display the frame as the cell output.
negative_augment_training_data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet,label
0,0,0,@ChrisGreenBean I thought you speak about a gu...,0
1,1,1,fancis moves into a different plane at age 32 ...,0
2,2,2,Elmer http://t.co/u8nP9vqu3Z via @comiXology O...,0
3,3,3,[needrestart] https://t.co/HlZA2TDymo Thomas L...,0
4,4,4,Webinar with Landatel for Spanish customers - ...,0
...,...,...,...,...
31747,31747,31747,@blingyeol we should! That'd be awesome :'3 I ...,0
31748,31748,31748,The Polaroid iM1836 copycat camera is now gone...,0
31749,31749,31749,"Woke up around 9, 20 min later smoke like is t...",0
31750,31750,31750,Czech glass beads handmade shamballa bracelet ...,0


In [6]:
# Read the validation CSV. Later cells take the text from its 'tweet' column and
# the label from the column named by LABEL_TO_CLASSIFY.
validation_data = pd.read_csv('/content/balanced_validation_En_seed10.csv')
# Bare last expression: display the frame as the cell output.
validation_data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,2708,2708,The meaning behind Little Talks - @monstersand...,0,,,,,,,
1,3416,3416,Kyle is out of town so i made a baked ziti and...,0,,,,,,,
2,3254,3254,Applying for jobs is so weird and feels fake,0,,,,,,,
3,293,293,@lora__ SHOCKED. How am I supposed to tell the...,1,These people were aware of the risks when they...,1.0,0.0,0.0,0.0,0.0,0.0
4,1,1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
167,656,656,Is spunk a slur?,1,I am worried that I may have used a slur by sa...,0.0,1.0,0.0,0.0,0.0,1.0
168,1653,1653,OH MY GOD I AM IN LOVE WITH MY FIANCÉ. https:/...,0,,,,,,,
169,3074,3074,Just get yer jab and shut yer moof,0,,,,,,,
170,43,43,"I suppose though, we did sign one for the futu...",1,I wish we could have signed a player for the f...,1.0,0.0,0.0,1.0,0.0,0.0


In [7]:
# Read the test CSV. Unlike the train/validation files, the text lives in a
# column named 'text' (see the split cell below); the label column is named by LABEL_TO_CLASSIFY.
test_data = pd.read_csv('/content/task_A_En_test.csv')
# Bare last expression: display the frame as the cell output.
test_data

Unnamed: 0,text,sarcastic
0,"Size on the the Toulouse team, That pack is mo...",0
1,Pinball!,0
2,So the Scottish Government want people to get ...,1
3,villainous pro tip : change the device name on...,0
4,I would date any of these men 🥺,0
...,...,...
1395,I’ve just seen this and felt it deserved a Ret...,0
1396,Omg how an earth is that a pen !!! 🤡,0
1397,Bringing Kanye and drake to a tl near you,0
1398,"I love it when women are referred to as ""girl ...",1


In [8]:
# Set the label to use in binary classification: this is the name of the column
# read as the target from the training, validation and test frames.
LABEL_TO_CLASSIFY = 'sarcastic'

In [9]:
# Reduce the size of the augmentation data to make processing faster:
# draw the same number of rows from the positive and the negative pool.

NUM_TO_SAMPLE = 800

# Both draws use the same fixed random_state so the sample is reproducible.
sampled_positives, sampled_negatives = (
    pool.sample(n=NUM_TO_SAMPLE, random_state=2020)
    for pool in (positive_augment_training_data, negative_augment_training_data)
)

# Stack positives on top of negatives and renumber the rows from 0.
augment_training_data = pd.concat(
    [sampled_positives, sampled_negatives],
    ignore_index=True,
)

In [11]:
# Text and label columns of the sampled augmentation data, cast explicitly to
# str / int before they are concatenated with the main training split in the next cell.
X_aug_train = augment_training_data['tweet'].astype('str')
y_aug_train = augment_training_data['label'].astype('int')

In [12]:
# Get train, val, test

# Validation: text and target straight from the validation frame.
X_val = validation_data['tweet']
y_val = validation_data[LABEL_TO_CLASSIFY]

# Training: the original training rows followed by the sampled augmentation
# rows, renumbered from 0 so text and labels share one contiguous index.
X_train = pd.concat([training_data['tweet'], X_aug_train], ignore_index=True)
y_train = pd.concat([training_data[LABEL_TO_CLASSIFY], y_aug_train], ignore_index=True)

# Test: note that the text column is called 'text' in the test frame.
X_test = test_data['text']
y_test = test_data[LABEL_TO_CLASSIFY]

# Define BERTweet preprocessing
BERTweet-large does not have pre-processing built into its Hugging Face tokenizer, so we apply it ourselves here. This code is from: https://github.com/VinAIResearch/BERTweet/blob/master/TweetNormalizer.py

In [13]:
# Define preprocessing function for BERTweet-large (not needed for base version)

from nltk.tokenize import TweetTokenizer
from emoji import demojize

bertweet_preprocessing_tokenizer = TweetTokenizer()

def normalizeToken(token):
    """Map a single token to its normalised form.

    Rules, applied in this order:
      * tokens starting with "@" become "@USER";
      * tokens starting with "http" or "www" (case-insensitive) become "HTTPURL";
      * the typographic apostrophe "’" becomes "'" and the ellipsis "…" becomes "...";
      * any other single-character token is passed through `demojize`;
      * everything else is returned unchanged.

    @param    token (str): One token of a tokenized tweet.
    @return   (str): The normalised token.
    """
    if token.startswith("@"):
        return "@USER"

    lowercased_token = token.lower()
    if lowercased_token.startswith(("http", "www")):
        return "HTTPURL"

    # These two checks must come before the single-character branch: both
    # characters have length 1, so testing them afterwards would never be
    # reached and they would be handed to `demojize` instead.
    if token == "’":
        return "'"
    if token == "…":
        return "..."

    if len(token) == 1:
        return demojize(token)

    return token

def normalizeTweet(tweet):
    """Normalise a raw tweet string.

    Steps: replace typographic apostrophes/ellipses, tokenize with
    `bertweet_preprocessing_tokenizer`, normalise each token with
    `normalizeToken`, re-join with single spaces, apply a fixed sequence of
    contraction / "a.m."-"p.m." string rewrites, and finally collapse runs of
    whitespace.

    @param    tweet (str): Raw tweet text.
    @return   (str): The normalised tweet, tokens separated by single spaces.
    """
    # Applied strictly in this order: later patterns operate on the output of
    # earlier ones (e.g. "ca n't" only exists after "n't " has been split off).
    rewrites = (
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        (" p . m .", "  p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )

    ascii_punct = tweet.replace("’", "'").replace("…", "...")
    raw_tokens = bertweet_preprocessing_tokenizer.tokenize(ascii_punct)
    text = " ".join(map(normalizeToken, raw_tokens))

    for old, new in rewrites:
        text = text.replace(old, new)

    # split()/join collapses any doubled spaces introduced by the rewrites.
    return " ".join(text.split())

# Tokenize the BERTweet inputs
Normalization is set to false because we are doing it using the 'normalizeTweet' function above.

In [14]:
# Load the tokenizer for the "vinai/bertweet-large" checkpoint.
# normalization=False: tweets are normalised explicitly with `normalizeTweet`
# before tokenization, so the tokenizer must not normalise them again.
# use_fast=False: request the non-"fast" tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large", normalization=False, use_fast=False)

# Define function to do BERT family preprocessing
def preprocessing_for_bert(data, max_len=None):
    """Tokenize texts into fixed-length id and attention-mask tensors.

    Uses the module-level `tokenizer`.

    @param    data (iterable of str): Texts to be processed.
    @param    max_len (int, optional): Length every sequence is truncated or
                  padded to. When omitted, the module-level `MAX_LEN` is used
                  (looked up at call time, so it must be defined by then).
    @return   input_ids (torch.Tensor): Token ids, shape (len(data), max_len).
    @return   attention_masks (torch.Tensor): Mask marking which positions
                  should be attended to by the model, shape (len(data), max_len).
    """
    if max_len is None:
        max_len = MAX_LEN

    input_ids = []
    attention_masks = []

    for sent in data:
        # `encode_plus` tokenizes the sentence, adds the model's special
        # start/end tokens, truncates or pads to `max_len`, maps tokens to ids
        # and builds the attention mask, returning everything in one mapping.
        encoded_sent = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
        )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Explicit dtype + reshape keep the result 2-D integer tensors even when
    # `data` is empty (shape (0, max_len) instead of a 1-D float tensor).
    input_ids = torch.tensor(input_ids, dtype=torch.long).reshape(-1, max_len)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long).reshape(-1, max_len)

    return input_ids, attention_masks

In [15]:
# Specify `MAX_LEN`: every encoded tweet is truncated/padded to this many tokens
MAX_LEN = 100

# Normalize all tweets for BERTweet.
# `Series.map` replaces the former `Series.iteritems()` loops (`iteritems` was
# removed in pandas 2.0). It also returns a new Series instead of assigning
# element by element into the Series taken from the loaded DataFrames.
# NOTE: the names are rebound to the normalised text, so re-run the split cell
# before re-running this one to avoid normalising already-normalised tweets.
X_train = X_train.map(normalizeTweet)
X_val = X_val.map(normalizeTweet)
X_test = X_test.map(normalizeTweet)

# Run function `preprocessing_for_bert` on the train, validation and test sets
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)
test_inputs, test_masks = preprocessing_for_bert(X_test)

# Create dataloaders

In [16]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

# Labels as tensors, in the same row order as the encoded inputs.
train_labels = torch.tensor(y_train.values)
val_labels = torch.tensor(y_val.values)
test_labels = torch.tensor(y_test.values)

batch_size = 32

# Create the DataLoaders
#
# The TensorDatasets carry a `_dataset` suffix so that the `test_data`
# DataFrame loaded earlier is NOT overwritten: rebinding `test_data` to a
# TensorDataset would make the split cell (`test_data['text']`) fail when re-run.
#
# SequentialSampler iterates rows in their stored order, so row i of every
# extracted feature matrix lines up with row i of the corresponding labels.

train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = SequentialSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)


# BERTweet large feature model

In [17]:
class BertweetFeatureModel(nn.Module):
    """Pretrained encoder that returns one feature vector per input sequence.

    There is no classification head: `forward` returns the encoder's hidden
    state at the first token position.
    """
    def __init__(self, freeze_bert=False, model_name="vinai/bertweet-large"):
        """
        @param    freeze_bert (bool): Set `True` to disable gradients for every
                      encoder parameter; leave `False` to keep them trainable.
        @param    model_name (str): Checkpoint name or path handed to
                      `AutoModel.from_pretrained`. Defaults to BERTweet-large.
        """
        super().__init__()

        # Instantiate the pretrained encoder
        self.bert = AutoModel.from_pretrained(model_name)

        # Freeze the encoder
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Feed the input to the encoder and return the first-token hidden state.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that holds attention mask
                      information with shape (batch_size, max_length)
        @return   features (torch.Tensor): a feature tensor with shape
                      (batch_size, hidden_size)
        """
        # Feed input to the encoder
        bert_outputs = self.bert(input_ids=input_ids,
                                 attention_mask=attention_mask)

        # bert_outputs[0] is the last hidden state; position 0 along the
        # sequence axis is the leading special token (the `[CLS]` position).
        last_hidden_state_cls = bert_outputs[0][:, 0, :]

        return last_hidden_state_cls

In [18]:
def initialize_feature_model():
    """Build the frozen BERTweet feature model and place it on `device`.

    @return   (BertweetFeatureModel): the model, created with
                  `freeze_bert=True` and moved to the module-level `device`.
    """
    # `Module.to` moves the parameters and hands back the same module, so the
    # construction and the device transfer can be chained.
    return BertweetFeatureModel(freeze_bert=True).to(device)

In [19]:
# Create the frozen feature model on `device`; its weights are replaced by the
# fine-tuned state dict in a later cell.
feature_model = initialize_feature_model()

Some weights of the model checkpoint at vinai/bertweet-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

# Load state dictionary weights from fine-tuned model

In [29]:
# Mount Google Drive at /content/gdrive (Colab only). This must run before the
# weight-loading cell below, which reads the state dict from under that mount point.
# force_remount=True re-mounts even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [20]:
# load weights - 'IncompatibleKeys' message is normal as we are not using the classifier weights

state_dict_path = "/content/gdrive/MyDrive/573_model_weights/random2020_statedict.pt"

# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine also loads on a CPU-only runtime.
fine_tuned_state_dict = torch.load(state_dict_path, map_location=device)

# strict=False tolerates the extra `classifier.*` entries of the fine-tuned model.
load_result = feature_model.load_state_dict(fine_tuned_state_dict, strict=False)

# Missing keys are a different matter: any encoder weight absent from the file
# would silently keep its non-fine-tuned value, so fail loudly instead.
if load_result.missing_keys:
    raise RuntimeError(
        f"Fine-tuned state dict is missing encoder weights: {load_result.missing_keys}"
    )

# Bare last expression: display which keys were ignored.
load_result

_IncompatibleKeys(missing_keys=[], unexpected_keys=['classifier.0.weight', 'classifier.0.bias', 'classifier.3.weight', 'classifier.3.bias'])

# Build the feature vectors

In [21]:
def bert_create_features(model, current_dataloader):
    """Run `model` over a dataloader and collect its outputs as feature vectors.

    The model is switched to evaluation mode and called without gradient
    tracking. Batches are processed in the order the dataloader yields them.

    @param    model (torch.nn.Module): called as `model(input_ids, attention_mask)`.
    @param    current_dataloader: iterable of `(input_ids, attention_mask, labels)`
                  batches; the labels are ignored.
    @return   (np.ndarray): the per-batch outputs concatenated along axis 0.
    @raises   ValueError: if the dataloader yields no batches.
    """
    # Put the model into the evaluation mode.
    model.eval()

    # Send batches to wherever the model's weights live rather than relying on
    # a global; a parameter-free model is assumed to run on the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    all_vectors = []

    with torch.no_grad():
        # The '_' value is for labels, which we do not need
        for b_input_ids, b_attn_mask, _ in current_dataloader:
            batch_vectors = model(b_input_ids.to(target_device),
                                  b_attn_mask.to(target_device))
            # Move each batch off the accelerator right away so that finished
            # batches do not accumulate in GPU memory until the end.
            all_vectors.append(batch_vectors.cpu())

    if not all_vectors:
        raise ValueError("current_dataloader yielded no batches; there is nothing to encode.")

    # Concatenate vectors from each batch
    return torch.cat(all_vectors, dim=0).numpy()

In [22]:
# Encode the validation split; the result is a NumPy array with one feature row per input row.
val_result = bert_create_features(feature_model, val_dataloader)

In [23]:
# Sanity check: (number of validation rows, feature size).
val_result.shape

(172, 1024)

In [24]:
# Encode the test split; the result is a NumPy array with one feature row per input row.
test_result = bert_create_features(feature_model, test_dataloader)

In [25]:
# Sanity check: (number of test rows, feature size).
test_result.shape

(1400, 1024)

In [26]:
# Encode the (augmented) training split; the result is a NumPy array with one feature row per input row.
train_result = bert_create_features(feature_model, train_dataloader)

In [27]:
# Sanity check: (number of training rows, feature size).
train_result.shape

(3162, 1024)

In [25]:
# Persist the validation features as a .npy file (relative path, so it lands in the current working directory).
np.save("val_feature_vectors.npy", val_result)

In [26]:
# Persist the test features as a .npy file (relative path, so it lands in the current working directory).
np.save("test_feature_vectors.npy", test_result)

In [28]:
# Persist the training features as a .npy file (relative path, so it lands in the current working directory).
np.save("train_feature_vectors.npy", train_result)