In [96]:
import gc
import os
import re
import string

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from torch.utils.data import IterableDataset, DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer, DistilBertForSequenceClassification

In [97]:
# Both files are tab-separated text files.
train_data = pd.read_csv("EI-reg-En-anger-train.txt", sep="\t")
test_data = pd.read_csv("2018-EI-reg-En-anger-test-gold.txt", sep="\t")

In [98]:
# Preview the raw training frame (columns: ID, Tweet, Affect Dimension, Intensity Score).
train_data

Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Score
0,2017-En-10264,@xandraaa5 @amayaallyn6 shut up hashtags are c...,anger,0.562
1,2017-En-10072,it makes me so fucking irate jesus. nobody is ...,anger,0.750
2,2017-En-11383,Lol Adam the Bull with his fake outrage...,anger,0.417
3,2017-En-11102,@THATSSHAWTYLO passed away early this morning ...,anger,0.354
4,2017-En-11506,@Kristiann1125 lol wow i was gonna say really?...,anger,0.438
...,...,...,...,...
1696,2017-En-11110,Got a $20 tip from a drunk Uber passenger. Tod...,anger,0.708
1697,2017-En-11497,@Claymakerbigsi @toghar11 @scott_mulligan_ @Bo...,anger,0.625
1698,2017-En-10539,@vladfucker69 i look rabid,anger,0.472
1699,2017-En-10468,"@m_t_f_72 I'm not surprised, I would be fuming! 😤",anger,0.479


### Preprocessing
This code is reused from the previous assignment, hence mostly uncommented. It applies the basic preprocessing steps: lowercasing and removing emojis, punctuation, special characters, and one-letter words.

In [99]:
def custom_tokenize(text):
    """Split ``text`` into word tokens with ``nltk.word_tokenize``.

    Args:
        text: The string to tokenize. ``None`` and other empty values are
            treated as a blank string.

    Returns:
        A list of token strings; an empty list when there is nothing to
        tokenize.
    """
    if text is None:
        # Only report a missing value when the input really is None.
        print('The text to be tokenized is a None type. Defaulting to blank string.')
        return []
    if not text:
        # A blank string has no tokens; skip the tokenizer call entirely.
        return []
    return nltk.word_tokenize(text)

In [100]:
class Preprocessing():
    """Text-cleaning pipeline for a DataFrame that has a ``'Tweet'`` column.

    The frame handed to the constructor is stored by reference (it is not
    copied), so ``apply_preprocessing`` overwrites the ``'Tweet'`` column of
    the caller's frame as well.
    """

    # Characters stripped by ``remove_punc``.
    exclude = string.punctuation

    # The patterns and the translation table below are built once at class
    # creation instead of on every single tweet.
    _EMOJI_PATTERN = re.compile("["
                                u"\U0001F600-\U0001F64F"  # emoticons
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                "]+", flags=re.UNICODE)
    _PUNC_TABLE = str.maketrans('', '', exclude)
    _SPECIAL_CHARS_PATTERN = re.compile('[^A-Za-z0-9]+')
    # The trailing delimiter is a lookahead so it is NOT consumed: it must
    # stay available as the leading delimiter of the next word, otherwise
    # the second of two adjacent one-letter words ("a b") is never matched.
    _ONE_LETTER_PATTERN = re.compile(r'(?:^| )\w(?= |$)')

    def __init__(self, full_dataset):
        """Store the frame to clean.

        Args:
            full_dataset: DataFrame with a ``'Tweet'`` column of strings.
        """
        self.full_dataset = full_dataset
        # The attributes below are only initialised here; no method of this
        # class fills them.
        self.word_to_ix = {}
        self.ix_to_word = {}
        self.context_dataset = []
        self.vocab_size = None

    def convert_lowercase(self, x):
        """Return ``x`` in lower case."""
        return x.lower()

    def remove_emoji(self, x):
        """Return ``x`` without characters from the emoji code-point ranges."""
        return self._EMOJI_PATTERN.sub(r'', x)

    def remove_punc(self, x):
        """Return ``x`` with every ``string.punctuation`` character deleted."""
        return x.translate(self._PUNC_TABLE)

    def remove_special_chars(self, x):
        """Replace each run of characters outside ``[A-Za-z0-9]`` with one space."""
        return self._SPECIAL_CHARS_PATTERN.sub(' ', x)

    def remove_one_letter_words(self, x):
        """Drop every space-delimited one-character word and strip the result.

        Adjacent one-letter words are all removed, e.g.
        ``"xx a b yy"`` becomes ``"xx yy"``.
        """
        return self._ONE_LETTER_PATTERN.sub('', x).strip()

    # default is to apply all these preprocessing steps
    def apply_preprocessing(self,
                            lowercase=True,
                            remove_emoji=True,
                            remove_punc=True,
                            remove_special_chars=True,
                            remove_one_letter_words=True):
        """Run the enabled cleaning steps on ``full_dataset['Tweet']`` in place.

        Steps always run in the order of the parameters; each flag switches
        the step of the same name on or off.
        """
        steps = (
            (lowercase, self.convert_lowercase),
            (remove_emoji, self.remove_emoji),
            (remove_punc, self.remove_punc),
            (remove_special_chars, self.remove_special_chars),
            (remove_one_letter_words, self.remove_one_letter_words),
        )
        for enabled, step in steps:
            if enabled:
                self.full_dataset['Tweet'] = self.full_dataset['Tweet'].apply(step)

In [101]:
# Clean the training tweets. Preprocessing keeps a reference to the frame it
# is given, so train_data itself is modified and train_data_preprocessed is
# the same object as train_data.
cl1 = Preprocessing(train_data)
cl1.apply_preprocessing()
train_data_preprocessed = cl1.full_dataset

# Same cleaning for the test tweets (test_data is likewise modified in place).
cl2 = Preprocessing(test_data)
cl2.apply_preprocessing()
test_data_preprocessed = cl2.full_dataset

We also drop the `ID` and `Affect Dimension` columns, as they are not needed for this task.

In [102]:
# Keep only the tweet text and its intensity score.
train_data_preprocessed = train_data_preprocessed.drop(columns=['ID', 'Affect Dimension'])
test_data_preprocessed = test_data_preprocessed.drop(columns=['ID', 'Affect Dimension'])

In [103]:
# Preview the cleaned training frame (only Tweet and Intensity Score remain).
train_data_preprocessed

Unnamed: 0,Tweet,Intensity Score
0,xandraaa5 amayaallyn6 shut up hashtags are coo...,0.562
1,it makes me so fucking irate jesus nobody is c...,0.750
2,lol adam the bull with his fake outrage,0.417
3,thatsshawtylo passed away early this morning i...,0.354
4,kristiann1125 lol wow was gonna say really hah...,0.438
...,...,...
1696,got 20 tip from drunk uber passenger today get...,0.708
1697,claymakerbigsi toghar11 scottmulligan boxingfa...,0.625
1698,vladfucker69 look rabid,0.472
1699,mtf72 im not surprised would be fuming,0.479


Here, we load the pretrained BERT tokenizer and model and specify a single output label (i.e., a regression task).

In [104]:
# Seed torch before building the model: the single-output head is not part of
# the pretrained checkpoint and is initialised randomly, so without a fixed
# seed every fresh kernel would start fine-tuning from different weights.
torch.manual_seed(42)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# num_labels=1 requests a single output, i.e. a regression head.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           num_labels=1,
                                                           ignore_mismatched_sizes=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_ac

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/pytorch_model.bin
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a

Let's apply the tokenizer to the train and test datasets and bring the datasets into a format our Trainer can take. Then we can already train and evaluate the model.

In [105]:
# Tokenise each tweet separately. The tokenizer is called with its default
# arguments, so every encoding keeps its own length.
tokenized_data_train = train_data_preprocessed['Tweet'].apply(tokenizer)
tokenized_data_test = test_data_preprocessed['Tweet'].apply(tokenizer)

In [106]:
# Each element is one tokenizer output; the Series repr lists only its field names.
tokenized_data_train

0       [input_ids, token_type_ids, attention_mask]
1       [input_ids, token_type_ids, attention_mask]
2       [input_ids, token_type_ids, attention_mask]
3       [input_ids, token_type_ids, attention_mask]
4       [input_ids, token_type_ids, attention_mask]
                           ...                     
1696    [input_ids, token_type_ids, attention_mask]
1697    [input_ids, token_type_ids, attention_mask]
1698    [input_ids, token_type_ids, attention_mask]
1699    [input_ids, token_type_ids, attention_mask]
1700    [input_ids, token_type_ids, attention_mask]
Name: Tweet, Length: 1701, dtype: object

In [107]:
# Attach the regression target to every training encoding. Rows are paired
# by position with zip: looking the score up via series[idx] is a label-based
# lookup and only agrees with a positional counter while the frame still has
# its default 0..n-1 index.
for item, score in zip(tokenized_data_train,
                       train_data_preprocessed['Intensity Score'].to_numpy()):
    item['labels'] = score

In [108]:
# Attach the regression target to every test encoding. Rows are paired by
# position with zip: looking the score up via series[idx] is a label-based
# lookup and only agrees with a positional counter while the frame still has
# its default 0..n-1 index.
for item, score in zip(tokenized_data_test,
                       test_data_preprocessed['Intensity Score'].to_numpy()):
    item['labels'] = score

Create a list of dictionaries, one per tweet. Each dictionary holds the `input_ids`, `token_type_ids`, `attention_mask`, and `labels` as torch tensors.

In [109]:
# One dict of tensors per training tweet: the target ('labels') is stored as
# float64, every other field as int64.
data_train_full = [
    {
        field: torch.tensor(
            encoding[field],
            dtype=torch.double if field == 'labels' else torch.long,
        )
        for field in encoding.keys()
    }
    for encoding in tokenized_data_train
]

In [110]:
# One dict of tensors per test tweet: the target ('labels') is stored as
# float64, every other field as int64.
data_test_full = [
    {
        field: torch.tensor(
            encoding[field],
            dtype=torch.double if field == 'labels' else torch.long,
        )
        for field in encoding.keys()
    }
    for encoding in tokenized_data_test
]

In [111]:
# Inspect the first converted training example.
data_train_full[0]

{'input_ids': tensor([  101,  1060, 29159, 11057,  2629, 25933,  3148,  3973,  2078,  2575,
          3844,  2039, 23325, 15900,  2015,  2024,  4658, 15807,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor(0.5620, dtype=torch.float64)}

After finishing all preprocessing steps, we can finally train and evaluate the model. This is easily done using the Hugging Face `Trainer` API.

In [112]:
batch_size = 32

# Training-only configuration: no evaluation set and no compute_metrics are
# given to the Trainer, so no "best model" metric is configured (the previous
# 'accuracy' setting was never computed and does not apply to regression).
# A checkpoint is written to output_dir after every epoch.
args = TrainingArguments(
    output_dir='results',
    logging_dir='logs',
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    report_to="none",
)

# The tokenizer is passed along so the Trainer can batch the variable-length
# examples and save the tokenizer files next to each checkpoint.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=data_train_full,
    tokenizer=tokenizer,
)

PyTorch: setting up devices


In [113]:
# Fine-tune the model on data_train_full with the settings in args.
trainer.train()

***** Running training *****
  Num examples = 1701
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 540
  Number of trainable parameters = 109483009
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.0127


Saving model checkpoint to results/checkpoint-54
Configuration saved in results/checkpoint-54/config.json
Model weights saved in results/checkpoint-54/pytorch_model.bin
tokenizer config file saved in results/checkpoint-54/tokenizer_config.json
Special tokens file saved in results/checkpoint-54/special_tokens_map.json
Saving model checkpoint to results/checkpoint-108
Configuration saved in results/checkpoint-108/config.json
Model weights saved in results/checkpoint-108/pytorch_model.bin
tokenizer config file saved in results/checkpoint-108/tokenizer_config.json
Special tokens file saved in results/checkpoint-108/special_tokens_map.json
Saving model checkpoint to results/checkpoint-162
Configuration saved in results/checkpoint-162/config.json
Model weights saved in results/checkpoint-162/pytorch_model.bin
tokenizer config file saved in results/checkpoint-162/tokenizer_config.json
Special tokens file saved in results/checkpoint-162/special_tokens_map.json
Saving model checkpoint to result

TrainOutput(global_step=540, training_loss=0.012124849635141868, metrics={'train_runtime': 178.7179, 'train_samples_per_second': 95.178, 'train_steps_per_second': 3.022, 'total_flos': 310933322900790.0, 'train_loss': 0.012124849635141868, 'epoch': 10.0})

In [114]:
# Run the fine-tuned model on the test set; the result carries both the
# predictions and the true labels (label_ids) used for scoring below.
preds = trainer.predict(data_test_full)

***** Running Prediction *****
  Num examples = 1002
  Batch size = 8


In [115]:
# Coefficient of determination (R^2) of the predicted intensity scores on the test set.
r2_score(preds.label_ids, preds.predictions)

0.5113854691311741

Get the length of the longest tweet so that we can pad all tweets that contain fewer words. We also define the function needed to create the padding. This is necessary because PyTorch needs the inputs to be of the same length.