In [88]:
from collections import defaultdict
import math
import time
import copy
import itertools
import warnings
from datetime import datetime
warnings.filterwarnings('ignore')

import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data 
import torchtext
from pathlib import Path
import pandas as pd
import spacy

from torch.utils.tensorboard import SummaryWriter
# default `log_dir` is "runs" - we'll be more specific here
writer = SummaryWriter('../runs/lstm_{}'.format(datetime.now()))

In [75]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


# Embeddings

In [3]:
# test embeddings
cat_embed = nn.Embedding(5, 2)
cat_tensor = torch.LongTensor([1])
cat_embed.forward(cat_tensor)

tensor([[-0.5321,  1.0474]], grad_fn=<EmbeddingBackward>)

# Sentiment data

In [4]:
df = pd.read_csv("../dataset/sentiment/training.1600000.processed.noemoticon.csv", header=None, 
                 engine='python', names=['polarity', 'id', 'date', 'query', 'user', 'tweet'],
                )

In [5]:
df.head()

Unnamed: 0,polarity,id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
df.polarity.value_counts()

4    800000
0    800000
Name: polarity, dtype: int64

In [7]:
# create binary encoding 
df['sentiment'] = df.polarity.map(lambda x: 1 if x == 4 else 0)
df.head()

Unnamed: 0,polarity,id,date,query,user,tweet,sentiment
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,0
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,0
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,0
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",0


In [8]:
df.describe()

Unnamed: 0,polarity,id,sentiment
count,1600000.0,1600000.0,1600000.0
mean,2.0,1998818000.0,0.5
std,2.000001,193576100.0,0.5
min,0.0,1467810000.0,0.0
25%,0.0,1956916000.0,0.0
50%,2.0,2002102000.0,0.5
75%,4.0,2177059000.0,1.0
max,4.0,2329206000.0,1.0


In [9]:
# save processed data
df.to_csv('train-processed.csv', index=None)

In [119]:
df.sample(10000).to_csv('train-processed-sample.csv', index=None)

# Creating dataset

In [148]:
LABEL = data.LabelField()
TWEET = data.Field(tokenize='spacy', lower=True)
fields = [('polarity', None), ('id', None), ('date', None),
          ('query', None), ('user', None), ('tweet', TWEET), ('sentiment', LABEL)]
twitter_dataset = data.TabularDataset(
    path='train-processed-sample.csv',format='CSV', fields=fields, skip_header=True)

In [149]:
# split dataset
train, val, test = twitter_dataset.split(split_ratio=[0.8, 0.1, 0.1])
print(len(train), len(val), len(test))

8000 1000 1000


In [150]:
# view sample
vars(train.examples[7])

{'tweet': ['@nickymcb',
  'i',
  "'m",
  'good',
  'thank',
  'you',
  ',',
  'how',
  's',
  'you',
  '?',
  'i',
  'have',
  'bad',
  'hayfever',
  'today',
  ',',
  'i',
  'ca',
  "n't",
  'stop',
  'sneezing',
  'its',
  'such',
  'a',
  'nightmare'],
 'sentiment': '0'}

## build a vocabulary

In [151]:
vocab_size = 5000
TWEET.build_vocab(train, max_size=vocab_size)
LABEL.build_vocab(train)

In [152]:
# how big is our vocabulary
(len(TWEET.vocab), len(LABEL.vocab))

(5002, 2)

In [153]:
# most common words
TWEET.vocab.freqs.most_common(10)

[('i', 4921),
 ('!', 4311),
 ('.', 4108),
 (' ', 2982),
 ('to', 2879),
 ('the', 2565),
 (',', 2382),
 ('a', 1926),
 ('my', 1613),
 ('it', 1537)]

In [154]:
# create a dataloader
train_loader, val_loader, test_loader = data.BucketIterator.splits(
    (train, val, test), batch_size = 32, device = device,
    sort_key = lambda x: len(x.tweet),
    sort_within_batch = False
)

# Create LSTM model

In [155]:
# we use embedding and LSTM modules

class LSTM(nn.Module):
    
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1)
        self.predictor = nn.Linear(hidden_size, 2)
    
    def forward(self, seq):
        output = self.embedding(seq)
        output, (hidden, _) = self.encoder(output)
        preds = self.predictor(hidden.squeeze(0))
        return preds

In [156]:
model = LSTM(100, 300, len(TWEET.vocab))

In [157]:
model

LSTM(
  (embedding): Embedding(5002, 300)
  (encoder): LSTM(300, 100)
  (predictor): Linear(in_features=100, out_features=2, bias=True)
)

## Training our model

In [158]:
def train_model(model, optimizer, criterion, data_loader, epochs):
    model = model.to(device)
    # define training variables
    since = time.time()
    best_weights = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    
    for epoch in range(1, epochs + 1):
        print('\nEpoch {}/{}'.format(epoch, epochs))
        print('-' * 60)    
        
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            
            for batch in data_loader[phase]:
                tweet, target = batch.tweet.to(device), batch.sentiment.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # make predictions
                    outputs = model(tweet)
                    loss = criterion(outputs, target)

                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * tweet.size(0)
                running_corrects += preds.eq(target.view_as(preds)).cpu().sum()
                
            batch_size = len(data_loader[phase].dataset)
            epoch_loss = running_loss / batch_size
            epoch_acc = running_corrects.double().item() / batch_size

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            # ...log the running loss to tensorboard
            writer.add_scalar('{}/loss'.format(phase), epoch_loss, epoch)
            writer.add_scalar('{}/accuracy'.format(phase), epoch_acc, epoch)
                        
            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_weights = copy.deepcopy(model.state_dict())
            

    time_elapsed = time.time() - since
    print('-' * 60)
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.3f} %'.format(100 * best_acc))
    print('=' * 60, '\n')
    # load best weights
    model.load_state_dict(best_weights)
    model = model.to(device)
    return model

In [134]:
data_loader = {'train': train_loader, 'val': val_loader}
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [105]:
m = train_model(model, optimizer, criterion, data_loader, 5)


Epoch 1/5
------------------------------------------------------------
train Loss: 0.5627 Acc: 0.7267
val Loss: 0.2472 Acc: 0.7857

Epoch 2/5
------------------------------------------------------------
train Loss: 0.4240 Acc: 0.8239
val Loss: 0.2360 Acc: 0.8011

Epoch 3/5
------------------------------------------------------------
train Loss: 0.3484 Acc: 0.8618
val Loss: 0.2454 Acc: 0.8014

Epoch 4/5
------------------------------------------------------------
train Loss: 0.2812 Acc: 0.8922
val Loss: 0.2635 Acc: 0.7966

Epoch 5/5
------------------------------------------------------------
train Loss: 0.2253 Acc: 0.9162
val Loss: 0.2788 Acc: 0.7930
------------------------------------------------------------
Training complete in 10m 27s
Best val Acc: 80.140 %



## Test accuracy

In [106]:
def test_model(model, test_loader):
    all_targets = []
    all_preds = []
    model.eval()
    
    for batch in test_loader:
        tweets, targets = batch.tweet.to(device), batch.sentiment.to(device)
        
        outputs = model(tweets)
        _, preds = torch.max(outputs, 1)
        
        all_targets.extend(targets.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())
        
    return all_targets, all_preds

In [107]:
y_true, y_preds = test_model(m, test_loader)

In [108]:
from sklearn.metrics import accuracy_score, plot_confusion_matrix, classification_report
categories = {0: 'Negative', 1: "Positive"}

In [109]:
score = accuracy_score(y_true, y_preds)
print("Test classification accuracy: {:.3f} %".format(score * 100))

Test classification accuracy: 79.330 %


In [110]:
print(classification_report(y_true, y_preds, labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.77      0.83      0.80      4983
           1       0.82      0.76      0.79      5017

    accuracy                           0.79     10000
   macro avg       0.79      0.79      0.79     10000
weighted avg       0.79      0.79      0.79     10000



## classify tweets

In [111]:
def classify_tweet(model, tweet):
    
    processed = TWEET.process([TWEET.preprocess(tweet)])
    
    return categories[model(processed).argmax().item()]

In [112]:
example = test.examples[5]
vars(example)

{'tweet': ['keep', 'it', 'classy'], 'sentiment': '1'}

In [113]:
classify_tweet(m, example.tweet)

'Positive'