In [3]:
import os
import pandas as pd
import numpy as np
import spacy
import string
import matplotlib.pyplot as plt
import seaborn as sns
import timeit
import scattertext as st
import collections
from IPython.display import HTML, IFrame
from textblob import TextBlob
from w3lib.html import remove_tags
from wordcloud import WordCloud
from tqdm import tqdm_notebook
from torchtext import data
import torch
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import KernelPCA

In [4]:
tokenizer = lambda x: str(x).translate(str.maketrans('', '', string.punctuation)).strip().split()

# Step one defination of our fields. 
TEXT = data.Field(sequential=True, lower=True, tokenize=tokenizer, fix_length=100)
LABEL = data.Field(sequential=False, use_vocab=False)

print("loading from csv ...")
tv_datafields = [("review", TEXT), ("label", LABEL)]

# Step two construction our dataset.
train, valid, test = data.TabularDataset.splits(path='',
                                                train="train.csv", validation="valid.csv",
                                                test="test_dataset.csv", format="csv",
                                                skip_header=True, fields=tv_datafields)
print(train[0].__dict__.keys())

loading from csv ...
dict_keys(['review', 'label'])


In [5]:
MAX_VOCAB_SIZE = 25_000

In [7]:
TEXT.build_vocab(train, max_size = MAX_VOCAB_SIZE)

print("build vocab success...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step four construct our iterator to our dataset. 
train_iter = data.BucketIterator(train, device=device, batch_size=32, sort_key=lambda x: len(x.text),
                                 sort_within_batch=False, repeat=False)
valid_iter = data.BucketIterator(valid, device=device, batch_size=32, sort_key=lambda x: len(x.text),
                                 sort_within_batch=False, repeat=False)
test_iter = data.BucketIterator(test, device=device, batch_size=32, sort_key=lambda x: len(x.text),
                                 sort_within_batch=False, repeat=False)
print("construct iterator success...")



build vocab success...
construct iterator success...


In [9]:
# most common words and their frequencies.
print(TEXT.vocab.freqs.most_common(20))

# top ten index to words transform.
print(TEXT.vocab.itos[:10])

[('the', 233929), ('and', 113202), ('a', 113038), ('of', 101442), ('to', 94437), ('is', 74279), ('in', 65120), ('it', 53784), ('i', 53168), ('this', 52561), ('that', 48504), ('br', 39942), ('was', 33534), ('as', 32463), ('for', 30754), ('with', 30625), ('movie', 29142), ('but', 29032), ('film', 26302), ('on', 23257)]
['<unk>', '<pad>', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']


In [29]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim,bidirectional=True)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):

        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

In [12]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [13]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,592,105 trainable parameters


In [14]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)


In [15]:
criterion = nn.BCEWithLogitsLoss()

In [16]:
model = model.to(device)
criterion = criterion.to(device)

In [17]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [26]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.review).squeeze(1)
        
        loss = criterion(predictions, batch.label.float())
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [27]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.review).squeeze(1)
            
            loss = criterion(predictions, batch.label.float())
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [20]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [28]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 5s
	Train Loss: 0.696 | Train Acc: 50.65%
	 Val. Loss: 0.695 |  Val. Acc: 51.07%
Epoch: 02 | Epoch Time: 0m 52s
	Train Loss: 0.695 | Train Acc: 50.99%
	 Val. Loss: 0.695 |  Val. Acc: 51.34%
Epoch: 03 | Epoch Time: 0m 41s
	Train Loss: 0.694 | Train Acc: 51.38%
	 Val. Loss: 0.694 |  Val. Acc: 50.93%
Epoch: 04 | Epoch Time: 0m 42s
	Train Loss: 0.693 | Train Acc: 51.70%
	 Val. Loss: 0.694 |  Val. Acc: 51.22%
Epoch: 05 | Epoch Time: 0m 43s
	Train Loss: 0.693 | Train Acc: 51.90%
	 Val. Loss: 0.694 |  Val. Acc: 51.72%


In [30]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 39s
	Train Loss: 0.692 | Train Acc: 52.18%
	 Val. Loss: 0.694 |  Val. Acc: 51.41%
Epoch: 02 | Epoch Time: 0m 42s
	Train Loss: 0.692 | Train Acc: 52.49%
	 Val. Loss: 0.694 |  Val. Acc: 51.56%
Epoch: 03 | Epoch Time: 0m 42s
	Train Loss: 0.691 | Train Acc: 52.44%
	 Val. Loss: 0.694 |  Val. Acc: 51.49%
Epoch: 04 | Epoch Time: 0m 42s
	Train Loss: 0.691 | Train Acc: 52.59%
	 Val. Loss: 0.694 |  Val. Acc: 51.48%
Epoch: 05 | Epoch Time: 0m 41s
	Train Loss: 0.691 | Train Acc: 52.79%
	 Val. Loss: 0.693 |  Val. Acc: 51.38%
