<a href="https://colab.research.google.com/github/d-maniatakos/vaccine-sentiment-classifier/blob/master/vaccine_sentiment_classifier_using_neural_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Modules Import**

In [1]:
import pandas as pd
import numpy as np
import torch
import torchtext
from torch import nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, confusion_matrix, ConfusionMatrixDisplay

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## **Download Glove**

In [2]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2021-12-08 18:34:30--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-12-08 18:34:30--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-12-08 18:34:30--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2021

In [3]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
replace glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: glove.6B.50d.txt        
replace glove.6B.100d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: yes
  inflating: glove.6B.100d.txt       
replace glove.6B.200d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: glove.6B.200d.txt       
replace glove.6B.300d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: glove.6B.300d.txt       


In [4]:
glove = {}

with open('glove.6B.50d.txt') as f:
    lines = f.readlines()
for line in lines:
  vector = []
  for dim in line.split(' ')[1:]:
    vector.append(float(dim))
  glove[line.split(' ')[0]] = np.array(vector)

## **Data Import**

In [5]:
# read datasets and ignore the first (index) column
train_data =  pd.read_csv('vaccine_train_set.csv').iloc[:, 1:]
validation_data = pd.read_csv('vaccine_validation_set.csv').iloc[:, 1:]    # <--- replace with test set path

train_data.head()

Unnamed: 0,tweet,label
0,Sip N Shop Come thru right now #Marjais #Popul...,0
1,I don't know about you but My family and I wil...,1
2,@MSignorile Immunizations should be mandatory....,2
3,President Obama spoke in favor of vaccination ...,0
4,"""@myfoxla: Arizona monitoring hundreds for mea...",0


## **Pre-processing & Vectorization**

In [6]:
# preprocess tweets texts before vectorization
def preprocess(text):
  text = text.lower()
  unwanted_chars = ['@', '#', '!', '(', ')', '*', ':', ',']

  for char in unwanted_chars:
    text = text.replace(char, '')
  lemmatizer = WordNetLemmatizer()

  lemmatized_text = ''
  for word in text.split():
    lemmatized_text += lemmatizer.lemmatize(word) + ' '
  return lemmatized_text


# vectorize a tweet using glove word embedding
def glove_vectorize(corpus, representation = 'average'):
  vectors = []
  for document in corpus:
    glove_vectors = []
    for token in document.split():
      try:
        glove_vectors.append(torch.from_numpy(glove[token]))
      except:
        glove_vectors.append(torch.from_numpy(np.zeros((50))))
    if representation == 'average':
      vectors.append(torch.stack(glove_vectors, dim=0).sum(dim=0).div(len(document.split())))
    elif representation == 'sum':
      vectors.append(torch.stack(glove_vectors, dim=0).sum(dim=0))
  return torch.stack(vectors, dim=0).to(torch.float32)

# create a tf-idf or bow vectorizer (using the training set's tweets)
def create_vectorizer(train_corpus, method='tf-idf', max_features=1000, ngram_range=(1, 2)):
  if method == 'tf-idf':
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
  elif method == 'bow':
    vectorizer = CountVectorizer(max_features=max_features, ngram_range=ngram_range)
  return vectorizer.fit(train_corpus)

# vectorize a dataset
def vectorize(vectorizer, corpus):
  return torch.from_numpy(vectorizer.transform(corpus).toarray()).to(torch.float32)


train_data['tweet'] = train_data['tweet'].apply(preprocess)
validation_data['tweet'] = validation_data['tweet'].apply(preprocess)

train_corpus = train_data['tweet']
validation_corpus = validation_data['tweet']

# tf-idf vectorization approach
tf_idf_vectorizer = create_vectorizer(train_data['tweet'])
train_vector = vectorize(tf_idf_vectorizer, train_corpus)
validation_vector = vectorize(tf_idf_vectorizer, validation_corpus)

# glove vectorization approach
# train_vector = glove_vectorize(train_corpus)
# validation_vector = glove_vectorize(validation_corpus)

x_train = train_vector
y_train = torch.tensor(train_data['label'].values, dtype=torch.long)

x_validation = validation_vector
y_validation = torch.tensor(validation_data['label'].values, dtype=torch.long)

## **Model Creation, Training & Evaluation**

In [25]:
class NeuralNetwork(nn.Module):
  def __init__(self, x_size, num_of_classes, hidden_size):
    super(NeuralNetwork, self).__init__()
    self.l1 = nn.Linear(x_size, hidden_size)
    self.l2 = nn.ReLU()
    self.l3 = nn.Linear(hidden_size, num_of_classes)

  def forward(self, x):
    out = self.l1(x)
    out = self.l2(out)
    out = self.l3(out)
    return out

input_size = 1000
hidden_size = 300
output_size = 3
num_of_epochs = 50
batch_size = 10000
learning_rate = 0.001
loss_function = nn.CrossEntropyLoss()
model = NeuralNetwork(input_size, output_size, hidden_size)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)



train_dataset = torch.utils.data.TensorDataset(x_train, y_train)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)



for epoch in range(num_of_epochs):
  batch_losses = []

  for index, (x, y) in enumerate(train_dataloader):
    y_pred = model(x)
    loss = loss_function(y_pred, y)
    batch_losses.append(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  predictions = []

  for i in range(validation_vector.size()[0]):
    prediction = model.forward(validation_vector[i])
    predictions.append(prediction.tolist())

  preds = []

  for i in range(validation_vector.size()[0]):
    prediction = model.forward(validation_vector[i])
    preds.append(torch.argmax(prediction).item())

  predictions
  y_validation.tolist()

  precision = precision_score(y_validation.tolist(), preds, average='weighted')

  print(precision)
  

  validation_loss = log_loss(y_validation.tolist(), predictions)

  
  print('Epoch: ' + str(epoch+1) + '/' + str(num_of_epochs) + ' Training Loss: ' + str(sum(batch_losses)/len(train_dataloader)) + ' Validation Loss: ' + str(validation_loss))

  



0.6901163166728581
Epoch: 1/50 Training Loss: 0.7658413174152374 Validation Loss: 5.525486824159947
0.6961393723905571
Epoch: 2/50 Training Loss: 0.6406563619375228 Validation Loss: 5.274774885667301
0.7029443882112819
Epoch: 3/50 Training Loss: 0.6237607787847519 Validation Loss: 5.226679959747042
0.6961541502994107
Epoch: 4/50 Training Loss: 0.6127710593938828 Validation Loss: 5.4767363150233175
0.6965970787518068
Epoch: 5/50 Training Loss: 0.6083814537525177 Validation Loss: 5.36236419327047
0.700936576284288
Epoch: 6/50 Training Loss: 0.6034710947275161 Validation Loss: 5.462709636979239
0.6893195940375518
Epoch: 7/50 Training Loss: 0.602828196644783 Validation Loss: 5.623967661468575
0.6927511530364829
Epoch: 8/50 Training Loss: 0.5997678434848785 Validation Loss: 5.319826366201879
0.694639157296395
Epoch: 9/50 Training Loss: 0.5987647057771682 Validation Loss: 5.301644878029933
0.6940373742032948
Epoch: 10/50 Training Loss: 0.5970548651218415 Validation Loss: 5.285565727336307


KeyboardInterrupt: ignored

In [None]:
predictions = []

for i in range(validation_vector.size()[0]):
  prediction = model.forward(validation_vector[i])
  predictions.append(torch.argmax(prediction).item())

predictions
y_validation.tolist()

precision = precision_score(y_validation.tolist(), predictions, average='weighted')

predictions
y_validation.tolist()