In [1]:
from torchtext.vocab import GloVe
embedding_glove = GloVe(name='6B', dim=50)

# Self-Attention

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
class SelfAttention(nn.Module):
    def __init__(self, num_heads, num_dim):
        super().__init__()

        self.num_heads = num_heads
        self.num_dim = num_dim

        self.toqueries = nn.Linear(self.num_dim, self.num_heads*self.num_dim, bias=False)
        self.tokeys = nn.Linear(self.num_dim, self.num_heads*self.num_dim, bias=False)
        self.tovalues = nn.Linear(self.num_dim, self.num_heads*self.num_dim, bias=False)
        self.unify_heads = nn.Linear(self.num_heads*self.num_dim, self.num_dim)

    def forward(self, x):
        num_batch, num_words, num_dim = x.shape
        num_heads = self.num_heads

        queries = self.toqueries(x).view(num_batch, num_words, num_heads, num_dim)
        keys = self.tokeys(x).view(num_batch, num_words, num_heads, num_dim)
        values = self.tovalues(x).view(num_batch, num_words, num_heads, num_dim)

        queries = queries.transpose(1, 2).contiguous().view(num_batch*num_heads, num_words, num_dim)
        keys = keys.transpose(1, 2).contiguous().view(num_batch*num_heads, num_words, num_dim)
        values = values.transpose(1, 2).contiguous().view(num_batch*num_heads, num_words, num_dim)

        queries = queries/(num_dim**(1/4))
        keys = keys/(num_dim**(1/4))

        raw_weights = torch.bmm(queries, keys.transpose(1, 2))
        weights = torch.softmax(raw_weights, dim=2)

        out = torch.bmm(weights, values).view(num_batch, num_heads, num_words, num_dim)
        out = out.transpose(1, 2).contiguous().view(num_batch, num_words, num_heads*num_dim)
        out = self.unify_heads(out)
        return out

# Transformer Block

In [3]:
class Transformer(nn.Module):
    def __init__(self, num_dim, num_heads):
        super().__init__()
        
        self.num_heads = num_heads
        self.num_dim = num_dim
        self.sa = SelfAttention(self.num_heads, self.num_dim)
        self.norm1 = nn.LayerNorm(self.num_dim)
        self.mlp = nn.Sequential(
                        nn.Linear(self.num_dim, 4*self.num_dim),
                        nn.ReLU(),
                        nn.Linear(4*self.num_dim, self.num_dim)
                        )
        self.norm2 = nn.LayerNorm(self.num_dim)

    def forward(self, x):
        out1 = self.sa(x)
        out2 = self.norm1(out1+x)
        out3 = self.mlp(out2)
        final = self.norm2(out3+out2)
        return final

# IMDB data pre-processing

In [107]:
!head 'aclImdb/train/pos/45_10.txt'

Home Room was a great movie if you've ever had drama in your life. It keeps you wanting to see more. Wondering what the secret Alicia is hiding. I think I watched that movie 6 times in a row and never lost interest. Plus I usually don't cry over movies but this one made me cry each time. I wish I could find more movies like that one. All in All I thought it was a great movie. The more you watch of it the more you become part of it. The very end is the part that really got me when she cried when getting her diploma, because it had her daughter's name on it. My heart felt as if it had shattered just then. And how her new friend came to comfort her when she hadn't gotten hers yet. I loved it so much.

In [108]:
from os import listdir
from os.path import isfile, join
trainposfiles = [join('aclImdb/train/pos/', f) for f in listdir('aclImdb/train/pos/') if isfile(join('aclImdb/train/pos/', f))]
trainnegfiles = [join('aclImdb/train/neg/', f) for f in listdir('aclImdb/train/neg/') if isfile(join('aclImdb/train/neg/', f))]
testposfiles = [join('aclImdb/test/pos/', f) for f in listdir('aclImdb/test/pos/') if isfile(join('aclImdb/test/pos/', f))]
testnegfiles = [join('aclImdb/test/neg/', f) for f in listdir('aclImdb/test/neg/') if isfile(join('aclImdb/test/neg/', f))]

In [109]:
train = []
test = []
for x in trainposfiles:
    f = open(x, "r")
    train.append({"review": f.read(), "sentiment": 1})
    f.close()
for x in trainnegfiles:
    f = open(x, "r")
    train.append({"review": f.read(), "sentiment": 0})
    f.close()
for x in testposfiles:
    f = open(x, "r")
    test.append({"review": f.read(), "sentiment": 1})
    f.close()
for x in testnegfiles:
    f = open(x, "r")
    test.append({"review": f.read(), "sentiment": 0})
    f.close()

In [45]:
import pandas as pd
df1 = pd.DataFrame(train)
df2 = pd.DataFrame(test)
df1.to_csv('train.csv', index=False)
df2.to_csv('test.csv', index=False)

In [32]:
import json
with open("train.json", "w") as trainfile:  
    json.dump(train, trainfile) 

with open("test.json", "w") as testfile:  
    json.dump(test, testfile) 

In [4]:
import spacy
spacy_en = spacy.load('en')
def tokenize(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
from torchtext.data import Field, TabularDataset, BucketIterator
review = Field(sequential=True, tokenize=tokenize, use_vocab=True, lower=True, batch_first=True)
sentiment = Field(sequential=False, use_vocab=False)

In [6]:
fields = {"review": ("review", review), "sentiment": ("sentiment", sentiment)}
train_data, test_data = TabularDataset.splits(
    path='/home/chirag_17bit012/Attention-Is-All-You-Get/data',
    format='csv',
    train='train.csv',
    test='test.csv',
    fields=[('review', review), ('sentiment', sentiment)]
)

In [7]:
review.build_vocab(train_data, vectors="glove.6B.50d")

In [23]:
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=16,
    sort_key=lambda x: len(x.Text),
    device='cuda'
)

In [24]:
cnt = 0
for b in train_iterator:
    cnt+=1
    print(b.review.size())
    if cnt==10:
        break

torch.Size([16, 873])
torch.Size([16, 1154])
torch.Size([16, 365])
torch.Size([16, 642])
torch.Size([16, 1120])
torch.Size([16, 804])
torch.Size([16, 882])
torch.Size([16, 392])
torch.Size([16, 420])
torch.Size([16, 398])


In [25]:
vocab = review.vocab

In [26]:
len(vocab)

101513

In [27]:
embed = nn.Embedding(len(vocab), 50).to('cuda')
embed.weight.data.copy_(vocab.vectors)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
        ...,
        [-0.3564, -0.8063,  0.2048,  ..., -0.0914,  0.2320,  0.7523],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')

In [28]:
embed(b.review).shape

torch.Size([16, 398, 50])

In [29]:
import math

In [33]:
class Embedding(nn.Module):
    def __init__(self, embed, num_dim, device, max_len=20000):
        super().__init__()
        self.embed = embed
        self.device = device
        pe = torch.zeros(max_len, num_dim)
        for pos in range(max_len):
            for i in range(0, num_dim, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/num_dim)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/num_dim)))
                
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    def forward(self, x):
        num_words = x.shape[1]
        word_embedding = self.embed(x).to(self.device)
        postional_encoding = self.pe[ :, :num_words]
        postional_encoding.requires_grad = False
        postional_encoding = postional_encoding.to(self.device)
        return word_embedding + postional_encoding

# Classification Transformer
![image](http://peterbloem.nl/files/transformers/classifier.svg)

In [34]:
class ClassificationTransformer(nn.Module):
    def __init__(self, embed, device, num_dim=50, num_heads=8, num_classes=2):
        super().__init__()
        self.embbeding = Embedding(embed, num_dim, device)
        self.transformer = Transformer(num_dim, num_heads)
        self.out = nn.Linear(num_dim, num_classes)
    def forward(self, x):
        embedded_input = self.embbeding(x)
        #print(embedded_input.get_device())
        output = self.transformer(embedded_input)
        output = self.out(output.mean(dim=1))
        return F.log_softmax(output, dim=1)
    

In [35]:
import tqdm
import torch.optim as optim

num_heads = 8
num_classes = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
embed = embed
print(embed)
model = ClassificationTransformer(embed, device, num_heads=num_heads, num_classes=num_classes).to(device)
opt = optim.Adam(model.parameters(), lr=1e-2)
loss = nn.NLLLoss()

epochs = 80

for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train()
    for obj in tqdm.tqdm(train_iterator): 
        x = obj.review.to(device)
        y = obj.sentiment.to(device)
        opt.zero_grad()

        preds = model(x)
        error = loss(preds, y)
        error.backward()
        opt.step()

        running_loss += error.data * x.shape[0]

    epoch_loss = running_loss / len(train_iterator)

    val_loss = 0.0
    model.eval() 
    for test_obj in test_iterator:
        x = test_obj.review.to(device)
        y = test_obj.sentiment.to(device)
        preds = model(x)
        error = loss(preds, y)
        val_loss += error.data * x.shape[0]

    val_loss /= len(test_iterator)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

print("Done")

Embedding(101513, 50)
  3%|▎         | 43/1563 [00:01<00:42, 35.54it/s]


RuntimeError: CUDA out of memory. Tried to allocate 3.71 GiB (GPU 0; 31.74 GiB total capacity; 26.74 GiB already allocated; 2.45 GiB free; 27.94 GiB reserved in total by PyTorch)

In [64]:
 a = torch.randn(1, 6, 4)
 print(a)

tensor([[[ 1.1633,  0.6877,  0.3475, -0.9342],
         [ 1.2643, -0.3659, -0.2325,  3.3317],
         [-0.1619, -1.1090,  0.5910, -0.3541],
         [ 1.0268,  0.3485, -0.6662,  0.9243],
         [ 0.6598,  0.0377, -0.3637, -0.6380],
         [-1.0930, -0.6696, -0.2064,  1.2736]]])


In [9]:
torch.cuda.empty_cache()

NameError: name 'torch' is not defined