<a href="https://colab.research.google.com/github/d-maniatakos/vaccine-sentiment-classifier/blob/master/vaccine_sentiment_classifier_using_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Vaccine Sentiment Classifier using fine-tuned BERT Model**

In [16]:
!pip install transformers
import torch
from torch import nn
from torch.utils.data import DataLoader

import pandas as pd
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, BertForSequenceClassification

from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, confusion_matrix, ConfusionMatrixDisplay, roc_curve, RocCurveDisplay, roc_auc_score



In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [18]:
# read datasets and ignore the first (index) column
train_data =  pd.read_csv('vaccine_train_set.csv').iloc[:, 1:]
validation_data = pd.read_csv('vaccine_validation_set.csv').iloc[:, 1:]    # <--- replace with test set path

train_data.head()

Unnamed: 0,tweet,label
0,Sip N Shop Come thru right now #Marjais #Popul...,0
1,I don't know about you but My family and I wil...,1
2,@MSignorile Immunizations should be mandatory....,2
3,President Obama spoke in favor of vaccination ...,0
4,"""@myfoxla: Arizona monitoring hundreds for mea...",0


In [22]:
y_train = torch.tensor(train_data['label'].tolist())
y_validation = torch.tensor(validation_data['label'].tolist())

encoded_train = tokenizer.batch_encode_plus(train_data['tweet'].tolist(), add_special_tokens = True, truncation=True, max_length=64, padding='longest', return_attention_mask=True, return_tensors='pt')
encoded_validation = tokenizer.batch_encode_plus(validation_data['tweet'].tolist(), add_special_tokens = True, truncation=True, max_length=64, padding='longest',pad_to_max_length=True, return_attention_mask=True, return_tensors='pt')

ids_train = encoded_train['input_ids']
ids_validation = encoded_validation['input_ids']

masks_train = encoded_train['attention_mask']
masks_validation = encoded_validation['attention_mask']

train_dataset = torch.utils.data.TensorDataset(ids_train, masks_train, y_train)
validation_dataset = torch.utils.data.TensorDataset(ids_validation, masks_validation, y_validation)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=8, shuffle=True)

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if torch.cuda.is_available:
  device = torch.device('cuda')
  torch.cuda.empty_cache()
else:
  device = torch.device('cpu')


model = model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 3)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [24]:
optimizer = AdamW(model.parameters(), lr = 1e-5)



In [25]:
model.train()
for epoch in range(2):
  for batch in train_dataloader:
      model.zero_grad()
      batch = tuple(b.to(device) for b in batch)
      out = model(input_ids=batch[0], attention_mask=batch[1], labels=batch[2])
      loss = out['loss']

      loss.backward()
      optimizer.step()
      
    
      print(loss)


tensor(1.3386, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3456, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1211, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2252, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3123, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1221, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2416, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0449, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0349, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8969, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2988, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9173, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0554, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1202, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0438, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2386, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1262, device='cuda:0', grad_fn=

In [26]:
model.eval()

real_values = []
predictions = []

for batch in validation_dataloader:
  real_values = real_values + batch[2].tolist()
  batch = tuple(b.to(device) for b in batch)
  predictions = predictions + torch.argmax(model(input_ids=batch[0], attention_mask=batch[1], labels=batch[2])['logits'], dim=1).tolist()

print(real_values)
print(predictions)

[2, 2, 2, 2, 0, 2, 1, 2, 0, 1, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 2, 1, 2, 0, 0, 1, 1, 0, 2, 0, 2, 0, 1, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 1, 2, 2, 2, 1, 2, 0, 1, 2, 2, 0, 0, 0, 0, 0, 2, 0, 1, 2, 2, 0, 0, 2, 2, 2, 2, 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 1, 0, 2, 0, 0, 0, 1, 0, 2, 2, 2, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0, 2, 0, 0, 2, 1, 0, 2, 2, 2, 1, 1, 2, 0, 0, 1, 0, 0, 0, 1, 0, 2, 0, 0, 2, 0, 2, 1, 0, 2, 2, 1, 0, 2, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 1, 1, 2, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 1, 2, 2, 2, 2, 1, 1, 0, 2, 0, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 0, 1, 2, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 1, 1, 0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 1, 1, 2, 2, 0, 0, 1, 2, 2, 0, 2, 2, 2, 0, 2, 0, 1, 0, 2, 2, 0, 2, 0, 2, 1, 0, 0, 2, 2, 1, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 1, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1, 

In [69]:
precision_score(real_values, predictions, average='weighted')


0.7824112451963035

In [63]:
def predict_sentiment(text):
  encoded_text = tokenizer.batch_encode_plus([text], add_special_tokens = True, truncation=True, max_length=32, padding='longest', return_attention_mask=True, return_tensors='pt')
  id_text = encoded_train['input_ids']
  mask_text = encoded_train['attention_mask']
  text_dataset = torch.utils.data.TensorDataset(id_text, mask_text)
  dataloader = torch.utils.data.DataLoader(text_dataset, batch_size=1, shuffle=True)
  for batch in dataloader:
    batch = tuple(b.to(device) for b in batch)

  print(model(input_ids=batch[0], attention_mask=batch[1]))
  
  prediction = torch.argmax(model(input_ids=batch[0], attention_mask=batch[1])['logits'], dim=1)
  print(prediction)
  print('Text: \'' + text + '\'' + '  Sentiment: ', end=' ')
  if prediction == 0:
    print('Neutral Sentiment')
  elif prediction == 1:
    print('Anti-vaccinist')
  elif prediction == 2:
    print('Pro-vaccinist')

predict_sentiment(',Thank you @realDonaldTrump for speaking out against vaccines/autism. It needs to be done more. Something is wrong. MDs are trained to push')

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.6763, -2.6665, -0.0274]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([0], device='cuda:0')
Text: ',Thank you @realDonaldTrump for speaking out against vaccines/autism. It needs to be done more. Something is wrong. MDs are trained to push'  Sentiment:  Neutral Sentiment
