# **CNN Twitter Sentiment Analysis**

In [1]:
# Mount Google Drive so the notebook can read the dataset stored there.
# NOTE: this cell only works inside Google Colab.
from google.colab import drive # for Google Colab
# force_remount=True re-mounts even when the drive is already mounted
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import time
import spacy
import random
from pathlib import Path
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.legacy import data 
import torchtext
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Seed the Python, NumPy and PyTorch RNGs so the stochastic steps of this notebook
# (sampling, splitting, weight initialisation, shuffling) are repeatable after
# Restart & Run All.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to prefer deterministic kernels
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


## **1. Dataset Preparation**
The first column contains the sentiments and the last column contains the tweets.

In [3]:
# Load the raw tweet CSV from Google Drive; the file has no header row
tweet_csv_path = "/content/drive/MyDrive/Colab Notebooks/tweet-dataset.csv"
df = pd.read_csv(tweet_csv_path, header=None, engine="python")

# Preview the first five rows
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Count the number of tweets per sentiment (column 0 holds the sentiment label)
df[0].value_counts()

4    800000
0    800000
Name: 0, dtype: int64

In [5]:
# Recode the polarity labels as binary classes: 0 stays negative, 4 becomes 1 (positive)
df[0] = df[0].replace({4: 1})
df[0].value_counts()

1    800000
0    800000
Name: 0, dtype: int64

In [6]:
# Save a random 100,000-tweet subset as the smaller dataset used for training.
# A fixed random_state makes the subset identical on every run, so downstream
# outputs stay reproducible after Restart & Run All.
df.sample(100000, random_state=42).to_csv("sentiment140-small.csv", header=False, index=False)

## **2. Data Preprocessing**

In [87]:
# Declare fields for tweets and labels: tweets are spaCy-tokenized and lower-cased,
# labels are numericalized as floats
TEXT = data.Field(tokenize='spacy', lower=True, batch_first = True)
LABEL = data.LabelField(dtype=torch.float)

# Map data to fields: one entry per CSV column, in file order.
# Columns mapped to None are not loaded.
fields = [('label', LABEL), ('id', None), ('date', None), ('query', None),
          ('name', None), ('text', TEXT)]

# Apply field definition to create torch dataset
dataset = data.TabularDataset(
        path="sentiment140-small.csv",
        format="CSV",
        fields=fields,
        skip_header=False)

# Split data into train, validation and test sets.
# split_ratio is given as [train, test, valid], but split() RETURNS the datasets
# in (train, valid, test) order, so unpack in that order.
train_data, valid_data, test_data = dataset.split(split_ratio=[0.8, 0.1, 0.1])

print("Number of train data: {}".format(len(train_data)))
print("Number of test data: {}".format(len(test_data)))
print("Number of validation data: {}".format(len(valid_data)))

Number of train data: 80000
Number of test data: 10000
Number of validation data: 10000


In [88]:
# An example from the training set: vars() exposes the example's attributes
# (the label and the tokenized tweet) as a dict
print(vars(train_data.examples[0]))

{'label': '1', 'text': ['ca', "n't", 'believe', 'ruben', 'likes', 'me', 'to', ',', 'but', 'it', 'is', 'true', '.', 'anyway', ',', 'i', "'m", 'have', 'to', 'make', 'my', 'homework', 'now', '.', 'ciao', 'you', 'guys', ' ', '@drakebell', 'write', 'write', '!', ':-d', ':-d']}


### **Build Vocabulary**
Build the vocabulary from the training set and attach pre-trained GloVe embeddings to it. The `glove.6B.100d` vectors were trained on 6 billion tokens and are 100-dimensional.

In [89]:
MAX_VOCAB_SIZE = 25000

# Build the token vocabulary from the training split, capped at MAX_VOCAB_SIZE
# tokens, and attach the 100-d GloVe vectors. unk_init supplies the initial vector
# (drawn from a normal distribution) for vocabulary tokens that GloVe does not cover.
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)

# Build the label vocabulary - maps the label strings to integers
LABEL.build_vocab(train_data)

# Ten most frequent tokens in the training split
TEXT.vocab.freqs.most_common(10)

[('i', 50045),
 ('!', 45331),
 ('.', 40455),
 (' ', 29285),
 ('to', 28320),
 ('the', 26188),
 (',', 24256),
 ('a', 19024),
 ('my', 15915),
 ('and', 15231)]

In [102]:
BATCH_SIZE = 128

# Build one batch iterator per split, placing the batches on `device`.
# sort=False disables sorting of the examples; sort_within_batch is left at its default.
split_datasets = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    split_datasets,
    sort=False,
    device=device,
    batch_size=BATCH_SIZE,
)

## **3. Architecture**

In [103]:
class CNN(nn.Module):
    """Convolutional sentence classifier over word embeddings.

    Every filter size ``fs`` gets its own ``Conv2d`` with a ``(fs, embedding_dim)``
    kernel, so each filter looks at ``fs`` consecutive tokens. Each feature map is
    max-pooled over the sentence, the pooled vectors are concatenated, passed
    through dropout and projected to ``output_dim`` values by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        n_filters: number of output channels per filter size.
        filter_sizes: sequence of filter heights (in tokens).
        output_dim: number of outputs per example.
        dropout: dropout probability applied to the concatenated pooled features.
        pad_idx: index of the padding token (passed to ``nn.Embedding`` as
            ``padding_idx``).
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()

        # Plain attributes (not parameters/buffers, so the state_dict is unchanged).
        # forward() uses them to right-pad inputs shorter than the largest filter.
        self.pad_idx = pad_idx
        self.min_sent_len = max(filter_sizes, default=0)
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        """Return the model outputs for a batch of token-index sequences.

        Args:
            text: integer tensor of shape [batch size, sent len].

        Returns:
            Tensor of shape [batch size, output_dim].
        """
                
        #text = [batch size, sent len]

        # A convolution cannot be applied to a sentence shorter than its kernel,
        # so pad short batches on the right with the padding index.
        shortfall = self.min_sent_len - text.shape[1]
        if shortfall > 0:
            fill = 0 if self.pad_idx is None else self.pad_idx
            padding = text.new_full((text.shape[0], shortfall), fill)
            text = torch.cat([text, padding], dim = 1)
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

### **Create Model**

In [104]:
# Hyperparameters of the CNN
INPUT_DIM = len(TEXT.vocab)                   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100                           # matches the 100-d GloVe vectors
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]     # vocabulary index of the padding token

model = CNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [105]:
# Sample from the training set, accessed through the iterator's underlying dataset
print(vars(train_iterator.dataset[0]))

{'label': '1', 'text': ['ca', "n't", 'believe', 'ruben', 'likes', 'me', 'to', ',', 'but', 'it', 'is', 'true', '.', 'anyway', ',', 'i', "'m", 'have', 'to', 'make', 'my', 'homework', 'now', '.', 'ciao', 'you', 'guys', ' ', '@drakebell', 'write', 'write', '!', ':-d', ':-d']}


In [106]:
# Fetch the pre-trained vectors attached to the vocabulary
# (they are copied into the embedding layer in a later cell)
pretrained_embeddings = TEXT.vocab.vectors

# [vocab size, embedding dim]
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [107]:
# Replace the initial weights of the embedding layer with the pre-trained embeddings.
# copy_ writes in place; the cell displays the tensor it returns.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.3755,  0.0550, -0.1225,  ..., -0.9230, -0.8510, -0.9215],
        [-0.7454, -0.8616, -0.6293,  ...,  0.6587,  0.1071, -0.7181],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        ...,
        [ 1.0402,  0.6146, -0.2181,  ...,  1.9460, -1.5685, -0.2946],
        [-0.1222, -0.7869,  0.2676,  ...,  1.6120,  0.1865, -0.8650],
        [ 0.2206, -0.0253, -0.0070,  ...,  0.4942,  0.0057,  0.1924]])

In [108]:
# Zero the embedding rows of the <unk> and <pad> tokens
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Overwrite each special token's row of the embedding matrix, addressed by token index
zero_vector = torch.zeros(EMBEDDING_DIM)
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = zero_vector

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        ...,
        [ 1.0402,  0.6146, -0.2181,  ...,  1.9460, -1.5685, -0.2946],
        [-0.1222, -0.7869,  0.2676,  ...,  1.6120,  0.1865, -0.8650],
        [ 0.2206, -0.0253, -0.0070,  ...,  0.4942,  0.0057,  0.1924]])


## **4. Training**

In [109]:
# Use GPU: move the model to the target device BEFORE building the optimizer,
# as the torch.optim documentation recommends.
model = model.to(device)

# Adam optimizer used to update the weights
optimizer = optim.Adam(model.parameters(), lr=2e-3)

# Binary cross entropy computed on the raw model outputs (logits)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [110]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in a batch that match the targets,
    e.g. 8 correct out of 10 gives 0.8 (not 8).

    preds are passed through a sigmoid and rounded to the nearest integer
    before being compared element-wise with y.
    """
    hard_preds = torch.sigmoid(preds).round()
    # cast the boolean matches to float so they can be summed and divided
    hits = hard_preds.eq(y).float()
    return hits.sum() / len(hits)

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and whole seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [111]:
def train(model, iterator, optimizer, criterion):
    """
    Run one training pass over iterator, updating the model after every batch.

    Returns the loss and the accuracy, each averaged over the number of batches.
    """
    model.train()  # switch the model to training mode

    loss_total, acc_total = 0, 0

    for batch in iterator:
        # clear gradients left over from the previous batch
        optimizer.zero_grad()

        # drop dimension 1 of the model output so it lines up with batch.label
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [112]:
def evaluate(model, iterator, criterion):
    """
    Score the model on every batch of iterator without updating its weights.

    Returns the loss and the accuracy, each averaged over the number of batches.
    """
    model.eval()  # switch the model to evaluation mode

    loss_total, acc_total = 0, 0

    # no gradients are needed for scoring
    with torch.no_grad():
        for batch in iterator:
            # drop dimension 1 of the model output so it lines up with batch.label
            logits = model(batch.text).squeeze(1)

            loss_total += criterion(logits, batch.label).item()
            acc_total += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

### **CNN Training**

In [113]:
N_EPOCHS = 5

# Lowest validation loss seen so far; any first epoch beats infinity
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Time covers both the training pass and the validation pass
    epoch_mins, epoch_secs = epoch_time(start_time, time.time())

    # Checkpoint the weights whenever the validation loss improves
    improved = valid_loss < best_valid_loss
    if improved:
        torch.save(model.state_dict(), 'model-small.pt')
        best_valid_loss = valid_loss

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 55s
	Train Loss: 0.504 | Train Acc: 75.00%
	 Val. Loss: 0.441 |  Val. Acc: 79.51%
Epoch: 02 | Epoch Time: 1m 56s
	Train Loss: 0.413 | Train Acc: 81.52%
	 Val. Loss: 0.435 |  Val. Acc: 80.43%
Epoch: 03 | Epoch Time: 1m 55s
	Train Loss: 0.359 | Train Acc: 84.44%
	 Val. Loss: 0.440 |  Val. Acc: 80.36%
Epoch: 04 | Epoch Time: 1m 55s
	Train Loss: 0.308 | Train Acc: 86.95%
	 Val. Loss: 0.475 |  Val. Acc: 79.89%
Epoch: 05 | Epoch Time: 1m 55s
	Train Loss: 0.261 | Train Acc: 89.27%
	 Val. Loss: 0.550 |  Val. Acc: 78.82%


## **5. Prediction**

In [114]:
# Restore the checkpoint saved at the epoch with the lowest validation loss
best_state = torch.load('model-small.pt')
model.load_state_dict(best_state)

# Score the restored model on the test iterator
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print("Test Loss: {} | Test Acc: {}%".format(round(test_loss, 2), round(test_acc*100, 2)))

Test Loss: 0.45 | Test Acc: 79.47%
