In [1]:
!pip install tqdm
!pip install transformers
!pip install torch



In [0]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [0]:
# Load the annotated tweets. Column names are supplied explicitly, and the
# tweet id becomes the index so rows can later be addressed by id.
df = pd.read_csv(
    '/content/smile-annotations-final.csv',
    names=['id', 'tweet', 'label'],
).set_index('id')

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,tweet,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
614877582664835073,@Sofabsports thank you for following me back. ...,happy
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [5]:
# Number of tweets per label value.
df.label.value_counts()

nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|angry               2
sad|disgust             2
sad|disgust|angry       1
Name: label, dtype: int64

In [0]:
# Tweets labelled 'nocode' are removed - this data is not handled yet.
df = df[df.label != 'nocode']

# Remove tweets carrying several labels, which are joined with '|'
# (e.g. 'happy|sad'). The '|' is matched literally (regex=False) instead of
# through the '\|' escape, which is an invalid escape sequence in a non-raw
# Python string. .copy() makes the result an independent frame so that adding
# columns to it later does not raise SettingWithCopyWarning.
df = df[~df.label.str.contains('|', regex=False)].copy()

In [7]:
# Number of tweets per label value in the current (filtered) frame.
df.label.value_counts()

happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: label, dtype: int64

In [0]:
# Distinct label strings, in order of first appearance in the frame.
labels = df.label.unique()

# Encode each unique label as an integer: its position in `labels`.
labels_dict = {name: code for code, name in enumerate(labels)}

In [9]:
# Show the label -> integer code mapping.
print(labels_dict)

{'happy': 0, 'not-relevant': 1, 'angry': 2, 'disgust': 3, 'sad': 4, 'surprise': 5}


In [0]:
# Integer class code for every row. Series.map performs a direct dictionary
# lookup and yields an integer column; Series.replace with a dict on a string
# column depends on implicit downcasting, which pandas has deprecated.
df['code'] = df.label.map(labels_dict)

In [11]:
# First ten rows, now including the 'code' column.
df.head(10)

Unnamed: 0_level_0,tweet,label,code
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0
614499696015503361,Lucky @FitzMuseum_UK! Good luck @MirandaStearn...,happy,0
613601881441570816,Yr 9 art students are off to the @britishmuseu...,happy,0
613696526297210880,@RAMMuseum Please vote for us as @sainsbury #s...,not-relevant,1
610746718641102848,#AskTheGallery Have you got plans to privatise...,not-relevant,1
612648200588038144,@BarbyWT @britishmuseum so beautiful,happy,0


In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Split the tweet ids 80/20 into train and validation sets. Stratifying on
# the class code keeps the label proportions the same in both sets.
# random_state pins the shuffle so the same split is produced on every run.
x_train, x_val, y_train, y_val = train_test_split(df.index.values,
                                                 df.code.values,
                                                 test_size=0.20,
                                                 stratify=df.code.values,
                                                 random_state=42)

In [0]:
# Placeholder column that will hold each row's split name ('train' / 'valid').
df['data_type'] = 'not_set'

In [15]:
# First ten rows, now including the 'data_type' column.
df.head(10)

Unnamed: 0_level_0,tweet,label,code,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0,not_set
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0,not_set
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0,not_set
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0,not_set
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0,not_set
614499696015503361,Lucky @FitzMuseum_UK! Good luck @MirandaStearn...,happy,0,not_set
613601881441570816,Yr 9 art students are off to the @britishmuseu...,happy,0,not_set
613696526297210880,@RAMMuseum Please vote for us as @sainsbury #s...,not-relevant,1,not_set
610746718641102848,#AskTheGallery Have you got plans to privatise...,not-relevant,1,not_set
612648200588038144,@BarbyWT @britishmuseum so beautiful,happy,0,not_set


In [0]:
# Record, for every tweet id, which split it was assigned to.
for split_name, split_ids in (('train', x_train), ('valid', x_val)):
    df.loc[split_ids, 'data_type'] = split_name

In [17]:
# First ten rows with their assigned split in 'data_type'.
df.head(10)

Unnamed: 0_level_0,tweet,label,code,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0,train
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0,train
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0,train
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0,train
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0,valid
614499696015503361,Lucky @FitzMuseum_UK! Good luck @MirandaStearn...,happy,0,train
613601881441570816,Yr 9 art students are off to the @britishmuseu...,happy,0,train
613696526297210880,@RAMMuseum Please vote for us as @sainsbury #s...,not-relevant,1,train
610746718641102848,#AskTheGallery Have you got plans to privatise...,not-relevant,1,train
612648200588038144,@BarbyWT @britishmuseum so beautiful,happy,0,train


In [18]:
# Hierarchical grouping: count the rows for every (label, code, split)
# combination to see how each class is distributed over train and valid.
df.groupby(['label', 'code', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tweet
label,code,data_type,Unnamed: 3_level_1
angry,2,train,45
angry,2,valid,12
disgust,3,train,5
disgust,3,valid,1
happy,0,train,909
happy,0,valid,228
not-relevant,1,train,171
not-relevant,1,valid,43
sad,4,train,26
sad,4,valid,6


In [0]:
from torch.utils.data import TensorDataset
from transformers import BertTokenizer

In [0]:
# Tokenizer belonging to the 'bert-base-uncased' checkpoint, the same
# checkpoint name the classification model is loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

In [0]:

def _encode_tweets(texts, max_length=256):
    """Tokenize a sequence of tweets into fixed-length PyTorch tensors.

    Args:
        texts: iterable of tweet strings.
        max_length: length every sequence is padded or truncated to.
            256 tokens is far more than a tweet is expected to need.

    Returns:
        The tokenizer's batch encoding; 'input_ids' and 'attention_mask'
        are the entries used below.
    """
    return tokenizer.batch_encode_plus(
        list(texts),                 # plain list of strings rather than a numpy array
        add_special_tokens=True,
        padding='max_length',        # replaces the deprecated pad_to_max_length=True
        truncation=True,             # guarantees no sequence exceeds max_length
        max_length=max_length,
        return_attention_mask=True,  # 'return_attention_masks' is not a recognised keyword
        return_tensors='pt',
    )


# Select each split once and reuse it for both the text and the class codes.
train_rows = df[df.data_type == 'train']
valid_rows = df[df.data_type == 'valid']

train_encode = _encode_tweets(train_rows.tweet.values)
valid_encode = _encode_tweets(valid_rows.tweet.values)

input_ids = train_encode['input_ids']
attention_mask = train_encode['attention_mask']
# NOTE: from here on `labels` holds the training class codes as a tensor.
labels = torch.tensor(train_rows.code.values)

valid_input = valid_encode['input_ids']
valid_attention = valid_encode['attention_mask']
valid_labels = torch.tensor(valid_rows.code.values)

In [0]:
# Bundle token ids, attention masks and class codes so that each dataset item
# is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(input_ids, attention_mask, labels)
valid_data = TensorDataset(valid_input, valid_attention, valid_labels)

In [23]:
# Number of examples in the training and validation datasets.
print(len(train_data), len(valid_data))

1184 297


In [0]:
from transformers import BertForSequenceClassification

# Fine-tune the pretrained BERT model with a sentence-classification objective.
# num_labels is the number of distinct labels in labels_dict; attention
# weights and hidden states are not requested in the model outputs.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels = len(labels_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

In [0]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [0]:
BATCH_SIZE = 32

# Training batches are drawn in random order (shuffle=True makes DataLoader
# use a RandomSampler over the dataset).
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

# Validation batches keep the dataset order (shuffle=False makes DataLoader
# use a SequentialSampler).
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False)

In [0]:
from transformers import get_cosine_schedule_with_warmup

#Works better than torch.optim.Adam
# torch.optim.AdamW is used here: the AdamW class that transformers used to
# export was deprecated and is no longer available in recent releases, so
# importing it from transformers fails on a fresh install.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-6,
                              weight_decay=1e-1)
#chosen after setting = 3 (f1 -> 0.71), 5 (f1 -> 0.77) but acpc for many classes is 0
#10 Epochs helps with generalization
EPOCHS = 10

#Works better than torch.optim.CosineAnnealingLR and transformers.get_linear_schedule_with_warmup
# The scheduler is stepped once per batch, so the total number of training
# steps is (batches per epoch) * (epochs).
scheduler = get_cosine_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps=len(train_dataloader)*EPOCHS)

In [0]:
import numpy as np
from sklearn.metrics import f1_score

In [0]:
def f1(preds, labels):
  """Weighted F1 score of the arg-max predictions.

  Args:
    preds: 2-D array of per-class scores, one row per example.
    labels: array of true class codes.

  Returns:
    The value of sklearn's f1_score with average='weighted'.
  """
  y_pred = np.argmax(preds, axis=1).reshape(-1)
  y_true = labels.reshape(-1)
  return f1_score(y_true, y_pred, average='weighted')

In [0]:
#Per Class Accuracy
def acpc(preds, labels):
  """Print the accuracy obtained on each class that occurs in `labels`.

  Args:
    preds: 2-D array of per-class scores, one row per example.
    labels: array of true class codes.

  Class names are looked up through the module-level `labels_dict`
  (label name -> code). Nothing is returned; results are printed.
  """
  # Invert labels_dict so a class code can be turned back into its name.
  names_by_code = dict((code, name) for name, code in labels_dict.items())

  predicted = np.argmax(preds, axis=1).reshape(-1)
  expected = labels.reshape(-1)

  for class_id in np.unique(expected):
    in_class = expected == class_id
    n_total = int(np.count_nonzero(in_class))
    # Of the examples that truly belong to this class, how many were predicted as it.
    n_correct = int(np.count_nonzero(predicted[in_class] == class_id))
    print(f'Class: {names_by_code[class_id]}')
    print(f'Accuracy: {n_correct / n_total}\n')

In [48]:
# Use the GPU when torch reports one as available, otherwise the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

print(device)

cuda


In [0]:
def evaluate(valid_dataloader):
  """Run the module-level `model` over every batch of `valid_dataloader`.

  Each batch is expected to be indexable as (input_ids, attention_mask,
  labels). The model is switched to eval mode and called without gradient
  tracking.

  Returns:
    (avg_eval_loss, y_hat, y): the mean of the per-batch losses, the logits
    of all batches concatenated along axis 0, and the matching true labels,
    the latter two as numpy arrays.
  """
  model.eval()

  loss_sum = 0
  logit_chunks, label_chunks = [], []

  for batch in valid_dataloader:
    moved = [tensor.to(device) for tensor in batch]
    ids, mask, targets = moved[0], moved[1], moved[2]

    with torch.no_grad():
      outputs = model(input_ids=ids, attention_mask=mask, labels=targets)

    # With labels supplied, the first output is the loss and the second the logits.
    loss_sum += outputs[0].item()
    logit_chunks.append(outputs[1].detach().cpu().numpy())
    label_chunks.append(targets.cpu().numpy())

  avg_eval_loss = loss_sum / len(valid_dataloader)

  y_hat = np.concatenate(logit_chunks, axis=0)
  y = np.concatenate(label_chunks, axis=0)

  return avg_eval_loss, y_hat, y

In [51]:
# Fine-tuning loop: one pass over train_dataloader per epoch, followed by a
# checkpoint save and an evaluation on the validation set.
for epoch in tqdm(range(1, EPOCHS+1)):
  model.train()  # evaluate() leaves the model in eval mode, so re-enable training mode

  total_loss = 0

  progress_bar = tqdm(train_dataloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
  for batch in progress_bar:
    model.zero_grad()

    batch = tuple(b.to(device) for b in batch)

    inputs = {'input_ids' : batch[0],
              'attention_mask': batch[1],
              'labels' : batch[2]
              }
    outputs = model(**inputs)
    loss = outputs[0]  # with labels supplied, the first output is the loss
    total_loss += loss.item()
    loss.backward()

    optimizer.step()
    scheduler.step()  # the learning-rate schedule advances once per batch

    # Report the batch loss as returned by the model. It was previously
    # divided by len(batch), but `batch` is the tuple of three tensors, so
    # that divided by 3 rather than by the number of examples.
    progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

  # Keep a checkpoint of the weights after every epoch.
  torch.save(model.state_dict(), f'BERT_semantic_epoch_{epoch}.pt')

  tqdm.write(f'\nEpoch {epoch}')

  avg_training_loss = total_loss/len(train_dataloader)
  tqdm.write(f'Training loss: {avg_training_loss}')

  val_loss, predictions, actual = evaluate(valid_dataloader)
  score_f1 = f1(predictions, actual)
  tqdm.write(f'Validation loss: {val_loss}')
  tqdm.write(f'F1 Score (Weighted): {score_f1}')

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=37.0, style=ProgressStyle(description_width…


Epoch 1
Training loss: 1.0081276796959542
Validation loss: 0.7710063368082046
F1 Score (Weighted): 0.6667821067821067


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=37.0, style=ProgressStyle(description_width…


Epoch 2
Training loss: 0.7598029471732475
Validation loss: 0.6699674025177955
F1 Score (Weighted): 0.7163313963313963


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=37.0, style=ProgressStyle(description_width…


Epoch 3
Training loss: 0.6017240656388773
Validation loss: 0.4739010468125343
F1 Score (Weighted): 0.7939647284732121


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=37.0, style=ProgressStyle(description_width…


Epoch 4
Training loss: 0.4490174425614847
Validation loss: 0.41884562373161316
F1 Score (Weighted): 0.827895182312853


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=37.0, style=ProgressStyle(description_width…


Epoch 5
Training loss: 0.35502408928162343
Validation loss: 0.42015796191990373
F1 Score (Weighted): 0.8484994888659191


HBox(children=(FloatProgress(value=0.0, description='Epoch 6', max=37.0, style=ProgressStyle(description_width…


Epoch 6
Training loss: 0.298729965614306
Validation loss: 0.4058902654796839
F1 Score (Weighted): 0.8488515841704237


HBox(children=(FloatProgress(value=0.0, description='Epoch 7', max=37.0, style=ProgressStyle(description_width…


Epoch 7
Training loss: 0.25519115920807867
Validation loss: 0.39857896193861964
F1 Score (Weighted): 0.8516860771953001


HBox(children=(FloatProgress(value=0.0, description='Epoch 8', max=37.0, style=ProgressStyle(description_width…


Epoch 8
Training loss: 0.24235258130608378
Validation loss: 0.3953212328255177
F1 Score (Weighted): 0.8581589386970183


HBox(children=(FloatProgress(value=0.0, description='Epoch 9', max=37.0, style=ProgressStyle(description_width…


Epoch 9
Training loss: 0.2211077579775372
Validation loss: 0.3954137580469251
F1 Score (Weighted): 0.8531663984254404


HBox(children=(FloatProgress(value=0.0, description='Epoch 10', max=37.0, style=ProgressStyle(description_widt…


Epoch 10
Training loss: 0.21579906384687167
Validation loss: 0.39553908463567494
F1 Score (Weighted): 0.8531663984254404



In [52]:
# Build a fresh classifier from the pretrained checkpoint with the same
# number of labels as before, replacing the model trained above.
# NOTE(review): presumably done so the saved fine-tuned weights can be loaded
# into a clean instance in the next cell - confirm.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(labels_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

# model.to(device) is the last expression, so the cell displays the module structure.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [56]:
# Load the weights saved after the final epoch. The file name mirrors the
# relative path used by torch.save in the training loop (instead of a
# hard-coded absolute '/content/...' path), and the tensors are mapped
# straight onto the device the model lives on.
model.load_state_dict(torch.load(f'BERT_semantic_epoch_{EPOCHS}.pt', map_location=device))

<All keys matched successfully>

In [0]:
# Re-run validation with the loaded weights; the average loss is discarded
# and only the logits and true labels are kept.
_, predictions, actual = evaluate(valid_dataloader)

In [58]:
# Per-class accuracy on the validation set.
acpc(predictions, actual)

Class: happy
Accuracy: 0.9517543859649122

Class: not-relevant
Accuracy: 0.6511627906976745

Class: angry
Accuracy: 0.8333333333333334

Class: disgust
Accuracy: 0.0

Class: sad
Accuracy: 0.0

Class: surprise
Accuracy: 0.2857142857142857

