<a href="https://colab.research.google.com/github/d-maniatakos/vaccine-sentiment-classifier/blob/master/vaccine_sentiment_classifier_using_neural_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Module Imports**

In [71]:
import pandas as pd
import numpy as np
import torch
import torchtext
from torch import nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## **Download GloVe**

In [2]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2021-12-08 13:42:55--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-12-08 13:42:55--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-12-08 13:42:55--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-1

In [3]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [4]:
# Build the word -> embedding lookup table from the 50-dimensional GloVe file.
# Each line of the file is "<word> <v1> <v2> ..." separated by single spaces.
glove = {}

# explicit utf-8 so decoding does not depend on the platform's default encoding;
# the file is streamed line by line instead of being materialized with readlines()
with open('glove.6B.50d.txt', encoding='utf-8') as f:
  for line in f:
    if not line.strip():
      # ignore blank lines rather than storing an empty vector under a junk key
      continue
    word, *dims = line.split(' ')
    # float() tolerates the trailing newline on the last component
    glove[word] = np.array([float(dim) for dim in dims])

## **Data Import**

In [5]:
# load each CSV split and drop its first column, which only holds the index
def _load_split(csv_path):
  """Read one dataset CSV and return every column except the first one."""
  frame = pd.read_csv(csv_path)
  return frame.iloc[:, 1:]


train_data = _load_split('vaccine_train_set.csv')
validation_data = _load_split('vaccine_validation_set.csv')    # <--- replace with test set path

train_data.head()

Unnamed: 0,tweet,label
0,Sip N Shop Come thru right now #Marjais #Popul...,0
1,I don't know about you but My family and I wil...,1
2,@MSignorile Immunizations should be mandatory....,2
3,President Obama spoke in favor of vaccination ...,0
4,"""@myfoxla: Arizona monitoring hundreds for mea...",0


## **Pre-processing & Vectorization**

In [88]:
# normalize a tweet's text prior to vectorization
def preprocess(text):
  """Lowercase `text`, delete a fixed set of punctuation characters and lemmatize every word.

  Returns the lemmatized words in their original order, each one (including
  the last) followed by a single space. Text without any word yields ''.
  """
  unwanted_chars = ['@', '#', '!', '(', ')', '*', ':', ',']
  # a translation table mapping each unwanted character to None deletes all of them in one pass
  deletion_table = dict.fromkeys(map(ord, unwanted_chars))
  cleaned = text.lower().translate(deletion_table)

  wordnet_lemmatizer = WordNetLemmatizer()
  return ''.join(wordnet_lemmatizer.lemmatize(token) + ' ' for token in cleaned.split())


# vectorize tweets using glove word embeddings
def glove_vectorize(corpus, representation = 'average', embeddings=None):
  """Turn every document of `corpus` into one fixed-size embedding vector.

  Args:
    corpus: iterable of whitespace-tokenizable strings.
    representation: 'average' divides the summed token vectors by the number
      of tokens in the document (unknown tokens count too); 'sum' keeps the sum.
    embeddings: mapping token -> 1-D numeric vector. Defaults to the
      module-level `glove` table.

  Returns:
    A float32 tensor of shape (number of documents, embedding dimension).
    Tokens missing from `embeddings`, and documents without any token,
    contribute / produce a zero vector.

  Raises:
    ValueError: if `representation` is neither 'average' nor 'sum'.
  """
  if representation not in ('average', 'sum'):
    raise ValueError("representation must be 'average' or 'sum', got " + repr(representation))
  if embeddings is None:
    embeddings = glove

  # infer the dimension from the table instead of assuming 50, so other GloVe
  # sizes work; 50 is only the fallback for an empty table
  dim = len(next(iter(embeddings.values()))) if len(embeddings) else 50
  zero_vector = torch.zeros(dim, dtype=torch.float64)

  vectors = []
  for document in corpus:
    tokens = document.split()
    if not tokens:
      # torch.stack() rejects an empty list, so an empty document maps to zeros
      vectors.append(zero_vector)
      continue

    token_vectors = []
    for token in tokens:
      try:
        token_vectors.append(torch.as_tensor(embeddings[token], dtype=torch.float64))
      except KeyError:
        # only out-of-vocabulary tokens are replaced; any other error propagates
        token_vectors.append(zero_vector)

    document_vector = torch.stack(token_vectors, dim=0).sum(dim=0)
    if representation == 'average':
      document_vector = document_vector.div(len(tokens))
    vectors.append(document_vector)

  if not vectors:
    return torch.empty((0, dim), dtype=torch.float32)
  return torch.stack(vectors, dim=0).to(torch.float32)

# create a tf-idf or bow vectorizer (using the training set's tweets)
def create_vectorizer(train_corpus, method='tf-idf', max_features=1000, ngram_range=(1, 2)):
  """Fit and return a text vectorizer on `train_corpus`.

  Args:
    train_corpus: iterable of documents the vocabulary is learned from.
    method: 'tf-idf' for a TfidfVectorizer, 'bow' for a CountVectorizer.
    max_features: forwarded to the vectorizer's `max_features`.
    ngram_range: forwarded to the vectorizer's `ngram_range`.

  Returns:
    The fitted vectorizer.

  Raises:
    ValueError: if `method` is not one of the supported names.
  """
  # validate first: an unknown method used to fall through both branches and
  # fail later with an UnboundLocalError on `vectorizer`
  if method not in ('tf-idf', 'bow'):
    raise ValueError("method must be 'tf-idf' or 'bow', got " + repr(method))

  if method == 'tf-idf':
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
  else:
    vectorizer = CountVectorizer(max_features=max_features, ngram_range=ngram_range)
  return vectorizer.fit(train_corpus)

# turn a corpus into a dense float32 feature tensor
def vectorize(vectorizer, corpus):
  """Transform `corpus` with `vectorizer` and return the dense result as a float32 tensor."""
  dense_features = vectorizer.transform(corpus).toarray()
  feature_tensor = torch.from_numpy(dense_features)
  return feature_tensor.to(torch.float32)


# Preprocess into new Series instead of overwriting the 'tweet' columns, so that
# re-running this cell never applies preprocess() to already-preprocessed text.
train_corpus = train_data['tweet'].apply(preprocess)
validation_corpus = validation_data['tweet'].apply(preprocess)

# tf-idf vectorization approach (the vectorizer is fitted on the training tweets only)
tf_idf_vectorizer = create_vectorizer(train_corpus)
train_vector = vectorize(tf_idf_vectorizer, train_corpus)
validation_vector = vectorize(tf_idf_vectorizer, validation_corpus)

# glove vectorization approach
# train_vector = glove_vectorize(train_corpus)
# validation_vector = glove_vectorize(validation_corpus)

# features and integer class labels for both splits
x_train = train_vector
y_train = torch.tensor(train_data['label'].values, dtype=torch.long)

x_validation = validation_vector
y_validation = torch.tensor(validation_data['label'].values, dtype=torch.long)

## **Model Creation, Training & Evaluation**

In [91]:
class NeuralNetwork(nn.Module):
  """Feed-forward classifier: linear layer, ReLU activations, linear output layer.

  The forward pass returns the raw (un-normalized) scores of the output layer,
  one value per class.
  """

  def __init__(self, x_size, num_of_classes, hidden_size):
    super().__init__()
    # the attribute names l1..l4 double as the state_dict keys, so they are kept
    self.l1 = nn.Linear(in_features=x_size, out_features=hidden_size)
    # NOTE(review): l2 and l3 are two consecutive ReLUs; applying ReLU twice equals
    # applying it once. Possibly a second hidden Linear layer was intended - confirm.
    self.l2 = nn.ReLU()
    self.l3 = nn.ReLU()
    self.l4 = nn.Linear(in_features=hidden_size, out_features=num_of_classes)

  def forward(self, x):
    hidden = self.l2(self.l1(x))
    return self.l4(self.l3(hidden))

# fix the RNG so weight initialization and batch shuffling are repeatable across runs
seed = 42
torch.manual_seed(seed)

# take the input width from the features themselves, so switching the
# vectorization approach (e.g. tf-idf -> glove) needs no manual edit here
input_size = x_train.shape[1]
hidden_size = 100
output_size = 3
num_of_epochs = 100
# 64 is the batch size the loader was actually built with; the previously
# declared value of 50 was never passed to it
batch_size = 64
learning_rate = 0.005
loss_function = nn.CrossEntropyLoss()
model = NeuralNetwork(input_size, output_size, hidden_size)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_dataset = torch.utils.data.TensorDataset(x_train, y_train)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_of_epochs):
  batch_losses = []

  for index, (x, y) in enumerate(train_dataloader):
    # forward pass and loss for the current mini-batch
    y_pred = model(x)
    loss = loss_function(y_pred, y)
    batch_losses.append(loss.item())

    # clear stale gradients, back-propagate, then update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # report the mean of the per-batch losses for this epoch
  print('Epoch: ' + str(epoch+1) + '/' + str(num_of_epochs) + ' Loss: ' + str(sum(batch_losses)/len(train_dataloader)))

  


Epoch: 1/100 Loss: 0.7315863567590714
Epoch: 2/100 Loss: 0.6121771919727326
Epoch: 3/100 Loss: 0.5555927259922028
Epoch: 4/100 Loss: 0.48131192004680634
Epoch: 5/100 Loss: 0.3892309994697571
Epoch: 6/100 Loss: 0.29420304387807844
Epoch: 7/100 Loss: 0.21003246492147445
Epoch: 8/100 Loss: 0.14696762196719645
Epoch: 9/100 Loss: 0.10388402876257896
Epoch: 10/100 Loss: 0.07897730526328087
Epoch: 11/100 Loss: 0.06688874853402377
Epoch: 12/100 Loss: 0.05500690709799528
Epoch: 13/100 Loss: 0.05300597612932324
Epoch: 14/100 Loss: 0.047061178939417005
Epoch: 15/100 Loss: 0.04574764510430396
Epoch: 16/100 Loss: 0.04270966698974371
Epoch: 17/100 Loss: 0.041656162206083536
Epoch: 18/100 Loss: 0.04115706650167704
Epoch: 19/100 Loss: 0.03915246226638556
Epoch: 20/100 Loss: 0.04008368455246091
Epoch: 21/100 Loss: 0.041263415371999144
Epoch: 22/100 Loss: 0.03656896366085857
Epoch: 23/100 Loss: 0.03435428424365818
Epoch: 24/100 Loss: 0.0364202398378402
Epoch: 25/100 Loss: 0.034807802985887974
Epoch: 26/

In [92]:
# Predict the whole validation set in a single batched forward pass.
# eval() switches the module to inference mode and no_grad() skips building
# the autograd graph, which is not needed for prediction.
model.eval()
with torch.no_grad():
  validation_scores = model(x_validation)

# predicted class = index of the highest score in each row
predictions = torch.argmax(validation_scores, dim=1).tolist()

# precision per class, averaged with weights proportional to each class's support
precision = precision_score(y_validation.tolist(), predictions, average='weighted')

precision

0.6755810737654179