In [1]:
# dataframe and series 
import pandas as pd
import numpy as np

# sklearn imports for modeling part
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,balanced_accuracy_score
from sklearn.model_selection import train_test_split

from mlxtend.evaluate import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from mlxtend.plotting import plot_decision_regions

from sklearn.metrics import confusion_matrix

# To plot
import matplotlib.pyplot as plt  
%matplotlib inline    
import matplotlib as mpl
import seaborn as sns

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

import torch
from torchtext import data
from torchtext import datasets
import random

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time

import random

In [26]:
# pip install spacy

In [2]:
import spacy

In [3]:
nlp = spacy.load('en')

# import en_core_web_sm
# nlp = en_core_web_sm.load()

In [7]:
# from spacy.lang.en import English

In [8]:
# spacy.load('en_core_web_sm')

In [4]:
df = pd.read_csv('cleaned_data.csv')

My aim is to compare products and determine less seller products with giving importance to negative reviewed books to take action. So, to focus on less ratings, I will divide my target to two-class as positive and negative where 1 and 2 rating values counted as negative and others are positive.

In [5]:
def calc_two_sentiment(overall):
    '''This function encodes the rating 1 and 2 as 0, others as 1'''
    if overall >= 3:
        return 1
    else:
        return 0

In [6]:
df['sentiment'] = df['overall'].apply(calc_two_sentiment)

In [7]:
df['sentiment'].value_counts()

1    2031419
0     109546
Name: sentiment, dtype: int64

In [7]:
df_torch = df.head(100000)

To write codes easily and keep less data in memory, I will just only choose the columns which I need for modeling.

In [8]:
df_torch= df_torch.loc[:, ['review_clean', 'sentiment']]

To use more easily I will divide train-test splits and write them csv files.

In [9]:
train_data, test_data = train_test_split(df_torch, test_size=0.2)

In [10]:
train_data.to_csv('train.csv', index = False)

In [11]:
test_data.to_csv('test.csv', index = False)

In [13]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 80000
Number of testing examples: 20000


In [13]:
# X = df_torch['review_clean']
# y = df_torch['sentiment']

In [14]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Preparing Data to Torch Model

One of the important concepts of TorchText is the Field function, which defines how the data should be processed. 

I will use TEXT to define how the reviews will be processed and use LABEL field to process the target. As a preprocessing technique, I will use bi-grams. It creates a set of co-occuring words.

In [12]:
def generate_bigrams(text):
    '''creating set of co-occuring words'''
    bi_grams = set(zip(*[text[i:] for i in range(2)]))
    for bi_gram in bi_grams:
        text.append(' '.join(bi_gram))
    return text

In [13]:
# To check bi-gram function is working proporly or not
generate_bigrams(['I', 'love', 'this', 'book'])

['I', 'love', 'this', 'book', 'I love', 'this book', 'love this']

My bi-gram function is working properly, I can see two-words couples.

I will define my model to preprocess with bi-grams, SpaCy tokenizer and LabelField to handle the target.

In [14]:
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy', preprocessing = generate_bigrams)
TARGET = data.LabelField(dtype = torch.float)

In [None]:
# SEED = 1234

# torch.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True

# TEXT = data.Field(tokenize = 'spacy')
# LABEL = data.LabelField(dtype = torch.float)

In [15]:
fields_train = [('review_clean', TEXT),('sentiment', TARGET)]

With using TabularDataset, we will take our train, test splits easily each time and preprocessed with bi-grams. 

In [16]:
# Taking training data from train.csv
train_data = data.TabularDataset(path = 'train.csv',
                                 format = 'csv',
                                 fields = fields_train,
                                 skip_header = True)

In [17]:
# Taking test data from test.cvs
test_data = data.TabularDataset(path = 'test.csv',
                                 format = 'csv',
                                 fields = fields_train,
                                 skip_header = True)

In [36]:
# # To check the first elements in train and test
# print(vars(train_data[0]))
# print(vars(test_data[0]))

Now, I want to split a validation data from my train data, to make sure my model is doing good. I will use default for split sizes and define my random seed to get same data each time.

### Building Validation Set 

In [18]:
# Creating validation set from train data

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [21]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 56000
Number of validation examples: 24000
Number of testing examples: 20000


Now, I need to build a vocabulary. There are lots of words so I will define maximum top words sizes. Then, I will load the pre-trained word embeddings.

### Building Vocabulary with Pre-Trained Embeddings

In [19]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

TARGET.build_vocab(train_data)

I only build vocabulary on train set. Because, in machine learning models test set must not be seen before to test it well. I do not add validation set also, because I want it to reflect the test set as much as possible.

In [41]:
print(f"# of unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"# of unique tokens in TARGET vocabulary: {len(TARGET.vocab)}")

# of unique tokens in TEXT vocabulary: 25002
# of unique tokens in TARGET vocabulary: 2


I chose my max vocabulary size 25000, it means there is two additional tokens like <...> default. Because all sentences in the batches must be at same size. To make each sentence equal in the batch, it padded longer or shorter batches.

In [46]:
print(TEXT.vocab.freqs.most_common(25)) # to see most common words in the vocabulary with their frequencies

[('the', 239106), ('and', 150173), ('a', 139894), ('i', 135734), ('to', 130913), ('of', 98771), (' ', 88659), ('is', 78752), ('this', 76517), ('it', 74223), ('in', 65221), ('was', 60491), ('that', 56178), ('book', 52295), ('for', 44131), ('story', 38653), ('but', 37886), ('her', 37795), ('with', 37778), ('read', 35331), ('you', 34217), ('nt', 33620), ('\n\n', 32602), ('she', 28880), ('not', 28241)]


### Setting Iterators

Now, I create my vocabulary using pre-trained embeddings. The final step of preparing data to Torch model is creating iterators. I will iterate train and evaluation loop and get a batch of examples which indexed and converted into tensors for each iteration. I will use Iterator function of torch. Also, I need to keep the tensors which returned by iterators in GPU so I will use torch.device function.

In [20]:
# To set batch size and iterators for train and validation data 

BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator = data.Iterator(dataset = train_data, batch_size = BATCH_SIZE,device = device, 
                               shuffle = None, train = True, sort_key = lambda x: len(x.review_clean), 
                               sort_within_batch = False)
valid_iterator = data.Iterator(dataset = valid_data, batch_size = BATCH_SIZE,device = device, 
                               shuffle = None, train = False, sort_key = lambda x: len(x.review_clean), 
                               sort_within_batch = False)

In [21]:
test_iterator = data.Iterator(dataset = test_data, batch_size = BATCH_SIZE,device = device, 
                               shuffle = None, train = False, sort_key = lambda x: len(x.review_clean), 
                               sort_within_batch = False)

## Building the Model

There are many ready classes to building a model. I prefer to use FastText class for baseline model, because gets comparable results significantly faster and using around half of the parameters. The details about this class can be found in [Bag of Tricks for Efficient Text Classification paper](https://arxiv.org/abs/1607.01759). 

In [22]:
class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, text):
        
        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
                
        #embedded = [sent len, batch size, emb dim]
        
        embedded = embedded.permute(1, 0, 2)
        
        #embedded = [batch size, sent len, emb dim]
        
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) 
        
        #pooled = [batch size, embedding_dim]
                
        return self.fc(pooled)

This model only has 2 layers that have any parameters, the linear and the embedding layer. There in no RNN layer. It will calculate the word embedding by using embedding layer, and taking average of them feeds the linear layer. Now, I will create my FastText class with defining dimensions and tokens.

In [23]:
INPUT_DIM = len(TEXT.vocab) #vocabulary size 
EMBEDDING_DIM = 100 # embedding dimension
OUTPUT_DIM = 1 # our output has only 2 classes - 0/1. So, it is one-dimensional.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # string to integer method on padding tokens

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

To compare trainable parameters in different models, count parameters function will be used. 

In [24]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters.')

The model has 2,500,301 trainable parameters.


Now I will copy pre-trained vectors to my embedding layers.

In [25]:
class WordEmbedding:
    __author__ = "Edward Ma"
    __copyright__ = "Copyright 2018, Edward Ma"
    __credits__ = ["Edward Ma"]
    __license__ = "Apache"
    __version__ = "2.0"
    __maintainer__ = "Edward Ma"
    __email__ = "makcedward@gmail.com"

    def __init__(self, verbose=0):
        self.verbose = verbose
        
        self.model = {}
        
    def convert(self, source, ipnut_file_path, output_file_path):
        if source == 'glove':
            input_file = datapath(ipnut_file_path)
            output_file = get_tmpfile(output_file_path)
            glove2word2vec(input_file, output_file)
        elif source == 'word2vec':
            pass
        elif source == 'fasttext':
            pass
        else:
            raise ValueError('Possible value of source are glove, word2vec, fasttext')
        
    def load(self, source, file_path):
        print(datetime.datetime.now(), 'start: loading', source)
        if source == 'glove':
            self.model[source] = gensim.models.KeyedVectors.load_word2vec_format(file_path)
        elif source == 'word2vec':
            self.model[source] = gensim.models.KeyedVectors.load_word2vec_format(file_path, binary=True)
        elif source == 'fasttext':
            self.model[source] = gensim.models.wrappers.FastText.load_fasttext_format(file_path)
        else:
            raise ValueError('Possible value of source are glove, word2vec, fasttext')
            
        print(datetime.datetime.now(), 'end: loading', source)
            
        return self

In [26]:
word_embedding = WordEmbedding()
word_embedding.load(source='word2vec', file_path=word2vec_file_path)

# model.embedding.weight.data.copy_(pretrained_embeddings)

NameError: name 'word2vec_file_path' is not defined

In [28]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 1.1952, -1.7452, -1.4624,  ..., -0.6250, -0.0424, -0.2210],
        [-0.8913, -1.5787, -1.2862,  ..., -0.6323,  0.9110,  0.9361],
        [-2.0606, -0.3205, -0.4881,  ...,  0.1854,  0.3752, -0.1491]])

I must assign zero for initial weight for unknown and padding tokens. I have already defined padding token before as PAD_IDX. So, I will define unknows as UNK_IDX and set initials to zeros.

In [29]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

## Training the Model

To train the model, firstly I will create optimizer and criterion. Optimizer updates parameters of module. I will use SGD and Adam as optimizer. SGD is a variant of gradient descent. It does not perform on whole dataset, it computes on a small subset or random selection. It performs good when the learning rate is low. Optimizer needs two parameters, one is optimizer type and second is learning rate. Adam optimizer is a technique which implementing adaptive learning rate. 

I tried both optimizers one by one with uncommenting the cell below.

In [30]:
optimizer = optim.SGD(model.parameters(), lr=1e-3)
# optimizer = optim.Adam(model.parameters())

Now, I will define loss function. My target contains binary labels, so I will choose binary loss function as criterion.
Cross-entropy loss is commonly used for classification porblems. Also, BCEWithLogitsLoss is contains one sigmoid layer and binary cross-entropy loss. So, I will use this one.

In [31]:
criterion = nn.BCEWithLogitsLoss()

# keeping model and criterion in GPU
model = model.to(device)
criterion = criterion.to(device)

The loss will be calculated by using criterion but I want to see accuracy to compare models. This function turn the values to 0-1 with rounding them in sigmoid layer. Then, it calculates the rounded predictions equal actual labels and take the mean of the batch.

In [32]:
def binary_accuracy(pred, target):
      
    # rounding predictions to the closest integer
    rounded_pred = torch.round(torch.sigmoid(pred))
    true = (rounded_pred == target).float() # convert into float for taking mean 
    accuracy = true.sum() / len(true)
    return accuracy

In [33]:
# setting the train method

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0 # 
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator: # for each batch
        
        optimizer.zero_grad() # zero gradient
        # PyTorch does not automatically zero the gradients calculated from the last gradient calculation
        
        predictions = model(batch.review_clean).squeeze(1) # with feeding batch with reviews no need to .forward
        
        '''squeeze for removing dimension in the list and taking only batch size bec. torch wants
        predictions input as batch size'''
        
        loss = criterion(predictions, batch.sentiment) # calculating loss
        
        acc = binary_accuracy(predictions, batch.sentiment) # calculating accuracy with taking mean
        
        loss.backward() #gradient of each parameter
        
        optimizer.step() #update the optimizer algorithm
        
        # loss and accuracy by epoches
        epoch_loss += loss.item() 
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator) # returning loss and acc avg across epoch

I will do same function for evaluate validation part below.

In [34]:
def evaluate(model, iterator, criterion):
    '''Evaluating validation set'''
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.review_clean).squeeze(1)
            
            loss = criterion(predictions, batch.sentiment)
            
            acc = binary_accuracy(predictions, batch.sentiment)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

I also use a function which informs that how long each epoch takes.

In [35]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# Training the Model for Baseline

In [36]:
# with Adam optimizer
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # to keep model for test set
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 2m 39s
	Train Loss: 0.431 | Train Acc: 91.61%
	 Val. Loss: 0.748 |  Val. Acc: 91.67%
Epoch: 02 | Epoch Time: 2m 24s
	Train Loss: 0.313 | Train Acc: 91.61%
	 Val. Loss: 0.422 |  Val. Acc: 91.60%
Epoch: 03 | Epoch Time: 2m 11s
	Train Loss: 0.235 | Train Acc: 91.80%
	 Val. Loss: 0.449 |  Val. Acc: 92.43%
Epoch: 04 | Epoch Time: 2m 31s
	Train Loss: 0.192 | Train Acc: 92.48%
	 Val. Loss: 0.597 |  Val. Acc: 92.25%
Epoch: 05 | Epoch Time: 2m 28s
	Train Loss: 0.171 | Train Acc: 93.15%
	 Val. Loss: 0.739 |  Val. Acc: 92.18%


In [37]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.408 | Test Acc: 91.81%


# Changing Optimizer

In [61]:
# with SGD optimizer

N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # to keep model for test set
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 2m 17s
	Train Loss: 0.652 | Train Acc: 73.05%
	 Val. Loss: 0.526 |  Val. Acc: 91.41%
Epoch: 02 | Epoch Time: 2m 32s
	Train Loss: 0.535 | Train Acc: 91.63%
	 Val. Loss: 0.415 |  Val. Acc: 91.42%
Epoch: 03 | Epoch Time: 2m 14s
	Train Loss: 0.462 | Train Acc: 91.63%
	 Val. Loss: 0.357 |  Val. Acc: 91.44%
Epoch: 04 | Epoch Time: 2m 7s
	Train Loss: 0.414 | Train Acc: 91.63%
	 Val. Loss: 0.326 |  Val. Acc: 91.44%
Epoch: 05 | Epoch Time: 2m 7s
	Train Loss: 0.382 | Train Acc: 91.63%
	 Val. Loss: 0.309 |  Val. Acc: 91.44%


In [63]:
model.load_state_dict(torch.load('tut2-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.299 | Test Acc: 91.96%


# Adding Tri-Gram Function

In [12]:
def generate_trigrams(text):
    '''creating set of 3 co-occuring words'''
    tri_grams = set(zip(*[text[i:] for i in range(3)]))
    for tri_gram in tri_grams:
        text.append(' '.join(tri_gram))
    return text

In [13]:
# To check tri-gram function is working proporly or not
generate_trigrams(['I', 'love', 'this', 'book'])

['I', 'love', 'this', 'book', 'I love this', 'love this book']

In [14]:
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy', preprocessing = generate_trigrams)
TARGET = data.LabelField(dtype = torch.float)

In [15]:
fields_train = [('review_clean', TEXT),('sentiment', TARGET)]

In [16]:
# Taking training data from train.csv
train_data = data.TabularDataset(path = 'train.csv',
                                 format = 'csv',
                                 fields = fields_train,
                                 skip_header = True)

In [17]:
# Taking test data from test.cvs
test_data = data.TabularDataset(path = 'test.csv',
                                 format = 'csv',
                                 fields = fields_train,
                                 skip_header = True)

In [18]:
# print(vars(train_data[0])) # to check tri-grams

In [19]:
# Creating validation set from train data

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [20]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

TARGET.build_vocab(train_data)

In [21]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator = data.Iterator(dataset = train_data, batch_size = BATCH_SIZE,device = device, 
                               shuffle = None, train = True, sort_key = lambda x: len(x.review_clean), 
                               sort_within_batch = False)
valid_iterator = data.Iterator(dataset = valid_data, batch_size = BATCH_SIZE,device = device, 
                               shuffle = None, train = False, sort_key = lambda x: len(x.review_clean), 
                               sort_within_batch = False)

In [22]:
test_iterator = data.Iterator(dataset = test_data, batch_size = BATCH_SIZE,device = device, 
                               shuffle = None, train = False, sort_key = lambda x: len(x.review_clean), 
                               sort_within_batch = False)

In [26]:
INPUT_DIM = len(TEXT.vocab) #vocabulary size 
EMBEDDING_DIM = 100 # embedding dimension
OUTPUT_DIM = 1 # our output has only 2 classes - 0/1. So, it is one-dimensional.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # string to integer method on padding tokens

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

In [28]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.6060, -0.3566,  0.7984,  ..., -0.5360, -1.6375, -0.1762],
        [-0.0916, -1.7693,  0.2442,  ..., -1.3095, -0.1623, -2.5278],
        [ 0.4193, -1.6199, -0.2880,  ...,  1.2291,  0.9729,  0.9876]])

In [29]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [29]:
# optimizer = optim.Adam(model.parameters())

In [30]:
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [31]:
criterion = nn.BCEWithLogitsLoss()

# keeping model and criterion in GPU
model = model.to(device)
criterion = criterion.to(device)

### Results with Adam Optimizer

In [35]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # to keep model for test set
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 2m 48s
	Train Loss: 0.452 | Train Acc: 90.65%
	 Val. Loss: 0.705 |  Val. Acc: 91.75%
Epoch: 02 | Epoch Time: 2m 40s
	Train Loss: 0.327 | Train Acc: 91.64%
	 Val. Loss: 0.431 |  Val. Acc: 91.88%
Epoch: 03 | Epoch Time: 2m 44s
	Train Loss: 0.249 | Train Acc: 91.74%
	 Val. Loss: 0.411 |  Val. Acc: 92.38%
Epoch: 04 | Epoch Time: 2m 36s
	Train Loss: 0.206 | Train Acc: 92.14%
	 Val. Loss: 0.556 |  Val. Acc: 92.13%
Epoch: 05 | Epoch Time: 2m 28s
	Train Loss: 0.183 | Train Acc: 92.63%
	 Val. Loss: 0.685 |  Val. Acc: 92.33%


In [36]:
model.load_state_dict(torch.load('tut3-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.418 | Test Acc: 92.06%


### Results with SGD Optimizer

In [36]:
# SGD
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # to keep model for test set
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 2m 2s
	Train Loss: 0.593 | Train Acc: 91.67%
	 Val. Loss: 0.505 |  Val. Acc: 91.73%
Epoch: 02 | Epoch Time: 2m 5s
	Train Loss: 0.498 | Train Acc: 91.67%
	 Val. Loss: 0.409 |  Val. Acc: 91.73%
Epoch: 03 | Epoch Time: 2m 4s
	Train Loss: 0.438 | Train Acc: 91.67%
	 Val. Loss: 0.356 |  Val. Acc: 91.73%
Epoch: 04 | Epoch Time: 2m 2s
	Train Loss: 0.398 | Train Acc: 91.67%
	 Val. Loss: 0.325 |  Val. Acc: 91.73%
Epoch: 05 | Epoch Time: 2m 0s
	Train Loss: 0.371 | Train Acc: 91.67%
	 Val. Loss: 0.308 |  Val. Acc: 91.73%


In [37]:
model.load_state_dict(torch.load('tut4-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.312 | Test Acc: 91.48%


# Do not forget resources

https://github.com/bentrevett/pytorch-sentiment-analysis

https://www.kaggle.com/lalwaniabhishek/abhishek-lalwani-bits-twitter-text