In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np
import spacy
import string
import matplotlib.pyplot as plt
import seaborn as sns
import collections
from IPython.display import HTML, IFrame
from textblob import TextBlob
from wordcloud import WordCloud
from tqdm import tqdm_notebook
from torchtext import data
import torch

In [3]:
# tokenizer = lambda x: str(x).translate(str.maketrans('', '', string.punctuation)).strip().split()

# Step one defination of our fields. 
TEXT = data.Field(sequential=True, lower=True, tokenize='spacy',fix_length=250)
LABEL = data.Field(sequential=False, use_vocab=False)

print("loading from csv ...")
tv_datafields = [("review", TEXT), ("label", LABEL)]

# Step two construction our dataset.
train, valid, test = data.TabularDataset.splits(path='/content/drive/MyDrive/SENTIMENT',
                                                train="train.csv", validation="valid.csv",
                                                test="test_dataset.csv", format="csv",
                                                skip_header=True, fields=tv_datafields)
print(train[0].__dict__.keys())

loading from csv ...
dict_keys(['review', 'label'])


In [4]:
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(train, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train)
print("build vocab success...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

build vocab success...


In [5]:
print(len(TEXT.vocab))
# most common words and their frequencies.
print(TEXT.vocab.freqs.most_common(20))

# top ten index to words transform.
print(TEXT.vocab.itos[:10])

25002
[('the', 229888), (',', 191498), ('.', 165465), ('and', 113644), ('a', 112867), ('of', 101635), ('to', 94665), ('is', 76637), ('it', 65408), ('in', 64967), ('i', 58068), ('this', 51308), ('that', 51082), ('"', 44013), ("'s", 43406), ('-', 36944), ('/><br', 35620), ('was', 35166), ('as', 32146), ('for', 30732)]
['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [6]:
# Step four construct our iterator to our dataset. 

print("construct iterator success...")
train_iter , valid_iter , test_iter = data.BucketIterator.splits((train,valid,test), device=device, 
                                                 batch_sizes=(32,32,32), sort_key=lambda x: len(x.review),
                                                 sort_within_batch=True, repeat=False)

construct iterator success...


In [7]:
from torch import nn
import torch.nn.functional as F
class DefaultTransformer(nn.Module):
    def __init__(self, input_dim, emb_dim, n_heads, hid_dim, n_layers, output_dim, dropout, max_length, pad_idx):
        super().__init__()

        self.tok_embedding = nn.Embedding(input_dim, emb_dim, padding_idx = pad_idx)
        self.pos_embedding = nn.Embedding(max_length, emb_dim)
        self.layer_norm = nn.LayerNorm(emb_dim)
        transformer_layer = nn.TransformerEncoderLayer(emb_dim, n_heads, hid_dim, activation = 'gelu')
        norm = nn.LayerNorm(emb_dim)
        self.transformer = nn.TransformerEncoder(transformer_layer, n_layers, norm)
        self.fc = nn.Linear(emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):

        # text = [seq len, batch size]

        seq_len, batch_size = text.shape
        
        pos = torch.arange(0, seq_len).unsqueeze(-1).repeat(1, batch_size).to(device)
        
        # pos = [seq len, batch size]
        
        embedded_pos = self.pos_embedding(pos)
#         embedded_pos = nn.Embedding(seq_len, emb_dim)(pos)
        embedded_tok = self.tok_embedding(text)
        
        embedded = self.dropout(embedded_pos + embedded_tok)

        # embedded = [seq len, batch size, emb dim]

        emedded = self.layer_norm(embedded)
        
        # embedded = [seq len, batch size, emb dim]
        
        transformed = self.transformer(embedded)

        # transformed = [seq len, batch size, emb dim]

        sos_transformed = transformed[0]

        # sos_transformed = [batch size, emb dim]

        prediction = self.fc(self.dropout(sos_transformed))

        # prediction = [batch size, output dim]

        return prediction

In [None]:
for batch in train_iter:
    text = batch.review
    seq_len, batch_size = text.shape
    pos = torch.arange(0, seq_len).unsqueeze(-1).repeat(1, batch_size).to(device)
    print(text.shape)
    nn.Embedding(5000, 100)(pos)

In [8]:
from torch.functional import F
input_dim = len(TEXT.vocab)
emb_dim = 100
n_heads = 10
hid_dim = 1024
n_layers = 3
output_dim = 2
dropout = 0.1
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
ff_layer_activation = F.gelu
max_length = 250

# encoder_norm = nn.LayerNorm(emb_dim)


model = DefaultTransformer(input_dim, emb_dim, n_heads,hid_dim, n_layers, output_dim, dropout, max_length, pad_idx)

In [9]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,265,974 trainable parameters


In [10]:
def initialize_parameters(m):
    if isinstance(m, nn.Embedding):
        nn.init.normal_(m.weight, std = 0.02)
    elif isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, std = 0.02)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LayerNorm):
        nn.init.ones_(m.weight)
        nn.init.zeros_(m.bias)

In [11]:
model.apply(initialize_parameters)

DefaultTransformer(
  (tok_embedding): Embedding(25002, 100, padding_idx=1)
  (pos_embedding): Embedding(250, 100)
  (layer_norm): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1024, out_features=100, bias=True)
        (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=100, out_

In [17]:
from torch import optim
optimizer = optim.Adam(model.parameters(),lr=0.0001)
criterion = nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)
def calculate_accuracy(predictions, labels):
    top_predictions = predictions.argmax(1, keepdim = True)
    correct = top_predictions.eq(labels.view_as(top_predictions)).sum()
    accuracy = correct.float() / labels.shape[0]
    return accuracy

In [18]:
def train(model, iterator, optimizer, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:

#         labels = labels.to(device)
#         text = text.to(device)

        optimizer.zero_grad()
        
        predictions = model(batch.review)
        
        loss = criterion(predictions, batch.label)
        
        acc = calculate_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [19]:
def evaluate(model, iterator, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

#             labels = labels.to(device)
#             text = text.to(device)
            
            predictions = model(batch.review)
            
            loss = criterion(predictions, batch.label)
            
            acc = calculate_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [20]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [21]:
n_epochs = 10
import time
best_valid_loss = float('inf')

for epoch in range(n_epochs):

    start_time = time.monotonic()
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion,device)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion,device)
    
    end_time = time.monotonic()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'transformer-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 38s
	Train Loss: 0.468 | Train Acc: 81.10%
	 Val. Loss: 0.580 |  Val. Acc: 76.85%
Epoch: 02 | Epoch Time: 0m 38s
	Train Loss: 0.392 | Train Acc: 84.67%
	 Val. Loss: 0.574 |  Val. Acc: 78.16%
Epoch: 03 | Epoch Time: 0m 38s
	Train Loss: 0.346 | Train Acc: 86.13%
	 Val. Loss: 0.518 |  Val. Acc: 78.75%
Epoch: 04 | Epoch Time: 0m 38s
	Train Loss: 0.271 | Train Acc: 89.48%
	 Val. Loss: 0.483 |  Val. Acc: 81.20%
Epoch: 05 | Epoch Time: 0m 38s
	Train Loss: 0.232 | Train Acc: 91.07%
	 Val. Loss: 0.466 |  Val. Acc: 80.90%
Epoch: 06 | Epoch Time: 0m 38s
	Train Loss: 0.225 | Train Acc: 91.49%
	 Val. Loss: 0.450 |  Val. Acc: 81.94%
Epoch: 07 | Epoch Time: 0m 38s
	Train Loss: 0.215 | Train Acc: 91.98%
	 Val. Loss: 0.450 |  Val. Acc: 83.01%
Epoch: 08 | Epoch Time: 0m 38s
	Train Loss: 0.207 | Train Acc: 92.24%
	 Val. Loss: 0.438 |  Val. Acc: 83.48%
Epoch: 10 | Epoch Time: 0m 38s
	Train Loss: 0.200 | Train Acc: 92.70%
	 Val. Loss: 0.449 |  Val. Acc: 83.39%
