<a href="https://colab.research.google.com/github/d-maniatakos/vaccine-sentiment-classifier/blob/master/vaccine_sentiment_classifier_using_neural_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Modules Import**

In [88]:
import pandas as pd
import torch
import torchtext
from torch import nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## **Data Import**

In [69]:
# load the training and validation splits; the positional slice drops each
# file's first column
# (swap 'vaccine_validation_set.csv' for the test set path to evaluate on it)
train_data, validation_data = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in ('vaccine_train_set.csv', 'vaccine_validation_set.csv')
)

train_data.head()

Unnamed: 0,tweet,label
0,Sip N Shop Come thru right now #Marjais #Popul...,0
1,I don't know about you but My family and I wil...,1
2,@MSignorile Immunizations should be mandatory....,2
3,President Obama spoke in favor of vaccination ...,0
4,"""@myfoxla: Arizona monitoring hundreds for mea...",0


## **Pre-processing & Vectorization**

In [102]:
# preprocess tweets texts before vectorization
def preprocess(text):
  """Lower-case a tweet, strip selected punctuation and lemmatize each word.

  The characters ``@ # ! ( ) * : ,`` are removed, the text is split on
  whitespace and every word is passed through ``WordNetLemmatizer``.
  Each lemmatized word is followed by a single space, so a non-empty result
  always ends with a trailing space.
  """
  # a translation table that deletes every unwanted character in one pass
  strip_table = str.maketrans('', '', '@#!()*:,')
  cleaned = text.lower().translate(strip_table)

  lemmatizer = WordNetLemmatizer()
  return ''.join(lemmatizer.lemmatize(token) + ' ' for token in cleaned.split())

# GloVe embeddings ("6B" set, 50 dimensions) loaded through torchtext;
# used by glove_vectorize to look up a vector per token
glove = torchtext.vocab.GloVe(
    name="6B",
    dim=50,
)

# vectorize a tweet using glove word embedding
def glove_vectorize(corpus, representation = 'average'):
  """Turn every document of ``corpus`` into a single GloVe-based vector.

  Each document is split on whitespace, every token is looked up in the
  module-level ``glove`` embedding and the token vectors are combined.

  Args:
    corpus: iterable of strings (one document per item).
    representation: ``'average'`` to take the mean of the token vectors,
      ``'sum'`` to add them up.

  Returns:
    A tensor with one row per document. Documents without any token are
    mapped to an all-zero row, and an empty corpus yields a tensor with
    zero rows, so the output always lines up with the input.

  Raises:
    ValueError: if ``representation`` is neither ``'average'`` nor ``'sum'``.
  """
  # fail early: an unknown mode would otherwise silently produce no rows
  if representation not in ('average', 'sum'):
    raise ValueError(
        "representation must be 'average' or 'sum', got %r" % (representation,))

  vectors = []
  for document in corpus:
    tokens = document.split()

    # torch.stack cannot handle an empty list, so a tweet that is empty after
    # preprocessing gets a zero vector of the embedding size instead
    if not tokens:
      vectors.append(torch.zeros(glove.dim))
      continue

    summed = torch.stack([glove[token] for token in tokens], dim=0).sum(dim=0)
    if representation == 'average':
      vectors.append(summed.div(len(tokens)))
    else:
      vectors.append(summed)

  # keep a well-formed (0, dim) result for an empty corpus
  if not vectors:
    return torch.empty((0, glove.dim))
  return torch.stack(vectors, dim=0)

# create a tf-idf or bow vectorizer (using the training set's tweets)
def create_vectorizer(train_corpus, method='tf-idf', max_features=1000, ngram_range=(1, 2)):
  """Build a text vectorizer and fit it on the training corpus.

  Args:
    train_corpus: iterable of training documents used to fit the vectorizer.
    method: ``'tf-idf'`` for a ``TfidfVectorizer`` or ``'bow'`` for a
      ``CountVectorizer``.
    max_features: forwarded to the vectorizer's ``max_features`` argument.
    ngram_range: forwarded to the vectorizer's ``ngram_range`` argument.

  Returns:
    The result of calling ``fit(train_corpus)`` on the new vectorizer.

  Raises:
    ValueError: if ``method`` is not one of the supported names.
  """
  if method == 'tf-idf':
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
  elif method == 'bow':
    vectorizer = CountVectorizer(max_features=max_features, ngram_range=ngram_range)
  else:
    # without this branch an unknown method left `vectorizer` unbound and
    # surfaced as a confusing UnboundLocalError
    raise ValueError("method must be 'tf-idf' or 'bow', got %r" % (method,))
  return vectorizer.fit(train_corpus)

# vectorize a dataset
def vectorize(vectorizer, corpus):
  """Transform ``corpus`` with a fitted vectorizer into a float32 tensor.

  Args:
    vectorizer: fitted object whose ``transform(corpus)`` result exposes
      ``toarray()`` (e.g. what ``create_vectorizer`` returns).
    corpus: iterable of documents to transform.

  Returns:
    A dense ``torch.float32`` tensor built from the transformed corpus.
  """
  dense = vectorizer.transform(corpus).toarray()
  # the numpy array is float64 (tf-idf) or int64 (bow); cast to float32 so the
  # features match the default dtype of nn.Linear weights and the dtype that
  # glove_vectorize produces
  return torch.from_numpy(dense).float()


# clean the tweet text of both splits (the 'tweet' columns are overwritten)
train_data['tweet'] = train_data['tweet'].apply(preprocess)
validation_data['tweet'] = validation_data['tweet'].apply(preprocess)

train_corpus, validation_corpus = train_data['tweet'], validation_data['tweet']

# alternative: tf-idf vectorization (uncomment to use instead of glove)
# tf_idf_vectorizer = create_vectorizer(train_data['tweet'])
# train_vector = vectorize(tf_idf_vectorizer, train_corpus)
# validation_vector = vectorize(tf_idf_vectorizer, validation_corpus)

# active approach: one glove-based vector per tweet
train_vector, validation_vector = (
    glove_vectorize(split_corpus)
    for split_corpus in (train_corpus, validation_corpus)
)

train_vector

tensor([[ 0.0571,  0.1024,  0.0599,  ..., -0.1544,  0.1259, -0.0059],
        [ 0.3693,  0.0891,  0.0584,  ..., -0.0749, -0.1401,  0.3206],
        [ 0.0855, -0.0778,  0.0221,  ..., -0.0894, -0.0012,  0.2162],
        ...,
        [ 0.3168,  0.0027,  0.1453,  ..., -0.1099, -0.0154, -0.1261],
        [ 0.4963, -0.0600,  0.2502,  ...,  0.5715,  0.2085,  0.2717],
        [ 0.4493, -0.1072,  0.1023,  ...,  0.1274, -0.0652,  0.0026]])

In [2]:
class NeuralNetwork(nn.Module):
  """Feed-forward classifier: Linear -> ReLU -> Linear.

  Args:
    x_size: number of input features per sample.
    num_of_classes: number of output scores produced per sample.
    hidden_size: width of the single hidden layer.
  """

  def __init__(self, x_size, num_of_classes, hidden_size):
    super().__init__()
    # layers are stored (and applied) in the order l1 -> l2 -> l3
    self.l1 = nn.Linear(in_features=x_size, out_features=hidden_size)
    self.l2 = nn.ReLU()
    self.l3 = nn.Linear(in_features=hidden_size, out_features=num_of_classes)

  def forward(self, x):
    """Return the output of the final linear layer for ``x``."""
    hidden = self.l2(self.l1(x))
    return self.l3(hidden)

# size the input layer from the feature matrix so the network always matches
# the active vectorization; a fixed value such as 10 fits neither the
# 50-dimensional glove vectors nor the tf-idf / bow features
x_size = train_vector.shape[1]
num_of_classes = 3   # the dataset's labels are 0, 1 and 2
hidden_size = 5      # width of the single hidden layer

model = NeuralNetwork(x_size, num_of_classes, hidden_size)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())