<a href="https://colab.research.google.com/github/d-maniatakos/vaccine-sentiment-classifier/blob/master/vaccine_sentiment_classifier_using_neural_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Modules Import**

In [60]:
import pandas as pd
import numpy as np
import torch
import torchtext
from torch import nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## **Download GloVe**

In [61]:
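# One-time setup: uncomment to download the GloVe 6B archive (skip if glove.6B.zip is already present)
#!wget http://nlp.stanford.edu/data/glove.6B.zip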

In [62]:
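# One-time setup: uncomment to extract the archive, which should provide the glove.6B.50d.txt file read below
#!unzip glove.6B.zip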

In [86]:
# Lookup table: token -> embedding vector (float64 numpy array) parsed from
# the 50-dimensional GloVe text file. Each line of that file is expected to be
# "<token> <v1> <v2> ... <vN>" separated by single spaces.
glove = {}

# Decode explicitly as UTF-8 rather than with the platform default encoding,
# since the vocabulary may contain non-ASCII tokens, and stream the file line
# by line instead of materialising every raw line in memory first.
with open('glove.6B.50d.txt', encoding='utf-8') as f:
  for line in f:
    token, *values = line.rstrip('\n').split(' ')
    if not values:
      continue  # blank or malformed line: no vector to store
    glove[token] = np.array([float(value) for value in values])

## **Data Import**

In [64]:
# Load the train / validation splits. The first column of each CSV is the
# saved row index, so it is sliced off right after reading.
def _read_split(csv_path):
  """Read one dataset CSV and return it without its first (index) column."""
  frame = pd.read_csv(csv_path)
  return frame.iloc[:, 1:]


train_data = _read_split('vaccine_train_set.csv')
validation_data = _read_split('vaccine_validation_set.csv')    # <--- replace with test set path

train_data.head()

Unnamed: 0,tweet,label
0,Sip N Shop Come thru right now #Marjais #Popul...,0
1,I don't know about you but My family and I wil...,1
2,@MSignorile Immunizations should be mandatory....,2
3,President Obama spoke in favor of vaccination ...,0
4,"""@myfoxla: Arizona monitoring hundreds for mea...",0


## **Pre-processing & Vectorization**

In [130]:
# clean a single tweet so it can be vectorized
def preprocess(text):
  """Lower-case `text`, drop a fixed set of symbols and lemmatize each word.

  The characters @ # ! ( ) * : , are deleted outright (not replaced by a
  space). The remaining text is split on whitespace and every word is passed
  through a WordNet lemmatizer.

  Returns the lemmatized words, each followed by a single space, so a
  non-empty result always ends with a trailing space.
  """
  strip_table = str.maketrans('', '', '@#!()*:,')
  cleaned = text.lower().translate(strip_table)

  lemmatizer = WordNetLemmatizer()
  return ''.join(lemmatizer.lemmatize(word) + ' ' for word in cleaned.split())


# vectorize tweets using glove word embeddings
def glove_vectorize(corpus, representation = 'average', embeddings = None):
  """Turn each document of `corpus` into one fixed-size embedding vector.

  Every whitespace-separated token is looked up in the embedding table;
  tokens missing from the table contribute a zero vector. The token vectors
  of a document are then combined according to `representation`.

  Args:
    corpus: iterable of already pre-processed strings.
    representation: 'average' divides the summed token vectors by the number
      of tokens in the document (unknown tokens included in the count);
      'sum' returns the plain sum.
    embeddings: mapping token -> 1-D vector. Defaults to the module-level
      `glove` dictionary when omitted.

  Returns:
    A float32 tensor of shape (number of documents, embedding dimension).
    Documents without any token map to an all-zero row, and an empty corpus
    yields a tensor with zero rows.

  Raises:
    ValueError: if `representation` is neither 'average' nor 'sum'.
  """
  # Validate up front: an unknown mode used to skip every document silently.
  if representation not in ('average', 'sum'):
    raise ValueError(
        "representation must be 'average' or 'sum', got %r" % (representation,))

  if embeddings is None:
    embeddings = glove

  # Infer the vector width from the table instead of hard-coding it; fall back
  # to 50 (the width of the GloVe file this notebook loads) for an empty table.
  dim = len(next(iter(embeddings.values()))) if embeddings else 50
  zero_vector = torch.zeros(dim, dtype=torch.float64)

  vectors = []
  for document in corpus:
    tokens = document.split()
    if not tokens:
      # torch.stack cannot stack an empty list, so handle token-less tweets
      # (e.g. a tweet made only of stripped symbols) explicitly.
      vectors.append(zero_vector)
      continue

    token_vectors = [
        torch.as_tensor(embeddings[token], dtype=torch.float64)
        if token in embeddings else zero_vector
        for token in tokens
    ]
    document_vector = torch.stack(token_vectors, dim=0).sum(dim=0)
    if representation == 'average':
      document_vector = document_vector.div(len(tokens))
    vectors.append(document_vector)

  if not vectors:
    return torch.zeros((0, dim), dtype=torch.float32)
  return torch.stack(vectors, dim=0).to(torch.float32)

# create a tf-idf or bow vectorizer (using the training set's tweets)
def create_vectorizer(train_corpus, method='tf-idf', max_features=1000, ngram_range=(1, 2)):
  """Build a text vectorizer and fit it on `train_corpus`.

  Args:
    train_corpus: iterable of training documents used to fit the vectorizer.
    method: 'tf-idf' selects TfidfVectorizer, 'bow' selects CountVectorizer.
    max_features: forwarded to the vectorizer's `max_features` argument.
    ngram_range: forwarded to the vectorizer's `ngram_range` argument.

  Returns:
    Whatever the vectorizer's `fit` call returns (the fitted vectorizer).

  Raises:
    ValueError: if `method` is not one of the supported names. Previously an
      unknown name fell through both branches and failed with an
      UnboundLocalError on the return line.
  """
  vectorizer_classes = {'tf-idf': TfidfVectorizer, 'bow': CountVectorizer}
  if method not in vectorizer_classes:
    raise ValueError(
        "method must be one of %s, got %r" % (sorted(vectorizer_classes), method))

  vectorizer = vectorizer_classes[method](max_features=max_features, ngram_range=ngram_range)
  return vectorizer.fit(train_corpus)

# vectorize a dataset
def vectorize(vectorizer, corpus):
  """Transform `corpus` with a fitted vectorizer into a dense float32 tensor.

  Args:
    vectorizer: fitted object whose `transform(corpus)` result exposes
      `toarray()` (as the vectorizers built by create_vectorizer do).
    corpus: iterable of documents to transform.

  Returns:
    A float32 tensor holding the dense feature matrix. The cast matters: the
    dense array comes back as float64 (tf-idf) or int64 (counts), which the
    float32 nn.Linear layers of the model reject, and glove_vectorize already
    returns float32.
  """
  dense_features = vectorizer.transform(corpus).toarray()
  return torch.from_numpy(dense_features).to(torch.float32)


# Clean the tweet text of both splits in place; everything below works on the
# pre-processed text.
for frame in (train_data, validation_data):
  frame['tweet'] = frame['tweet'].apply(preprocess)

train_corpus, validation_corpus = train_data['tweet'], validation_data['tweet']

# Alternative feature extraction: tf-idf fitted on the training tweets only.
# tf_idf_vectorizer = create_vectorizer(train_data['tweet'])
# train_vector = vectorize(tf_idf_vectorizer, train_corpus)
# validation_vector = vectorize(tf_idf_vectorizer, validation_corpus)

# Active feature extraction: one GloVe-based vector per tweet.
train_vector = glove_vectorize(train_corpus)
validation_vector = glove_vectorize(validation_corpus)

# Features (x_*) and integer class labels (y_*) for each split.
x_train, x_validation = train_vector, validation_vector
y_train = torch.tensor(train_data['label'].values, dtype=torch.long)
y_validation = torch.tensor(validation_data['label'].values, dtype=torch.long)

## **Model Creation, Training & Evaluation**

In [138]:
class NeuralNetwork(nn.Module):
  """Feed-forward classifier with one hidden layer: Linear -> ReLU -> Linear.

  `forward` returns the output of the last linear layer directly; no softmax
  is applied inside the model.
  """

  def __init__(self, x_size, num_of_classes, hidden_size):
    """Create the layers.

    Args:
      x_size: number of input features per sample.
      num_of_classes: number of output scores per sample.
      hidden_size: width of the hidden layer.
    """
    super().__init__()
    # The attribute names l1 / l2 / l3 are kept as-is: they are the keys used
    # in the module's state_dict.
    self.l1 = nn.Linear(in_features=x_size, out_features=hidden_size)
    self.l2 = nn.ReLU()
    self.l3 = nn.Linear(in_features=hidden_size, out_features=num_of_classes)

  def forward(self, x):
    """Map a batch of feature vectors to per-class scores."""
    hidden = self.l2(self.l1(x))
    return self.l3(hidden)

# --- Hyper-parameters ---------------------------------------------------------
seed = 42                       # fixes weight initialisation and batch shuffling
input_size = x_train.shape[1]   # follows the active vectorization (50 for the GloVe vectors)
hidden_size = 50
output_size = 3                 # one score per label value (0, 1, 2 in the data)
num_of_epochs = 100
# This variable was previously declared as 50 but never used: the DataLoader
# hard-coded 64. It now drives the DataLoader and holds the value that
# actually ran.
batch_size = 64
learning_rate = 0.0005

# Seed before building the model and the shuffling DataLoader so that a fresh
# "Restart & Run All" reproduces the same training run.
torch.manual_seed(seed)

# CrossEntropyLoss consumes the raw scores returned by the model together
# with integer class labels.
loss_function = nn.CrossEntropyLoss()
model = NeuralNetwork(input_size, output_size, hidden_size)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_dataset = torch.utils.data.TensorDataset(x_train, y_train)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_of_epochs):
  batch_losses = []

  for x, y in train_dataloader:
    y_pred = model(x)
    loss = loss_function(y_pred, y)
    batch_losses.append(loss.item())

    # standard update step: clear old gradients, back-propagate, apply
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # mean training loss over this epoch's batches
  print(sum(batch_losses)/len(train_dataloader))

  


0.9721210684776306
0.8980456585884095
0.8645398151874543
0.8470092039108277
0.8352774384021759
0.8276990504264832
0.8214411511421204
0.8160589730739594
0.8112691435813904
0.8071184015274048
0.8036636371612549
0.8002009191513062
0.7970786125659942
0.7938336899280548
0.7907508478164673
0.7873194839954376
0.7847209961414338
0.7819648029804229
0.7805574004650115
0.7778748118877411
0.775200325012207
0.7738631582260131
0.7710979471206665
0.7694821798801422
0.7681685435771942
0.7658142895698548
0.7636613645553589
0.7626716482639313
0.760807318687439
0.7593283529281616
0.7579209942817688
0.7559967212677002
0.7552576777935028
0.7532679607868195
0.7520756351947785
0.7510281281471253
0.7494487960338593
0.7488963658809662
0.7477855932712555
0.7468670768737793
0.7455000548362732
0.744374038696289
0.7429556584358216
0.7425107533931732
0.7417153072357178
0.7397462162971497
0.7391671404838562
0.7385078086853027
0.7372233324050903
0.7372061212062836
0.7354315669536591
0.7351475973129272
0.7343865342140