In [1]:
import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
import math

In [2]:
class Attn(nn.Module):
    def __init__(self, emb_dim, q_dim, k_dim):
        super().__init__()
        self.Wq = nn.Linear(emb_dim, q_dim)
        self.Wk = nn.Linear(emb_dim, k_dim)
        self.Wv = nn.Linear(emb_dim, k_dim)
        
    def forward(self, x, z):
        Q = self.Wq(x)
        K = self.Wk(z)
        V = self.Wv(z)
#         print(K.shape)
        S = Q.bmm(K.transpose(1,2))
        # uhh masking here
#         print(S.shape)
        sm = F.softmax(torch.div(S,torch.sqrt(torch.tensor(x.shape[-1]))),dim=-1)
#         print(sm.shape)
#         print(V.shape)
        return sm.bmm(V)
    
class MHAttn(nn.Module):
    def __init__(self, num_heads, emb_dim, q_dim, k_dim):
        super().__init__()
        self.heads = nn.ModuleList([Attn(emb_dim, q_dim, k_dim) for _ in range(num_heads)])
        self.Wo = nn.Linear(num_heads * k_dim, emb_dim)
        
    def forward(self, x, z):
        subAttns = torch.cat([h(x,z) for h in self.heads], dim=-1)
        return self.Wo(subAttns)
        
# encoder is definitionally self-attn
class EncoderBlock(nn.Module):
    def __init__(self, emb_dim, z_dim, heads):
        super().__init__()
        self.attn = MHAttn(heads, emb_dim, z_dim, z_dim)
        self.ln1 = nn.LayerNorm(emb_dim)
        self.ln2 = nn.LayerNorm(emb_dim)
        self.ff1 = nn.Linear(emb_dim,emb_dim)
        self.ff2 = nn.Linear(emb_dim,emb_dim)
        
    def forward(self, z):
        z = z + self.attn(z,z)
        z = self.ln1(z)
        z = z + self.ff2(F.relu(self.ff1(z)))
        return self.ln2(z)
    
class DecoderBlock(nn.Module):
    def __init__(self, emb_dim, x_dim, z_dim, heads):
        super().__init__()
        self.attn = MHAttn(heads, emb_dim, x_dim, z_dim)
        self.ln1 = nn.LayerNorm(emb_dim)
        self.ln2 = nn.LayerNorm(emb_dim)
        self.ln3 = nn.LayerNorm(emb_dim)
        self.ff1 = nn.Linear(emb_dim,emb_dim)
        self.ff2 = nn.Linear(emb_dim,emb_dim)
        
    def forward(self, x, z):
        x = x + self.attn(x,x)
        x = self.ln1(x)
        x = x + self.attn(x,z)
        x = self.ln2(x)
        x = x + self.ff2(F.relu(self.ff1(x)))
        return self.ln3(x)
        
# review this

class EDTransformer(nn.Module):
    def __init__(self, embs, pos, emb_dim, x_dim, z_dim, heads, enc_blocks, dec_blocks, out_dim):
        # needs embedding matrix
        # garbage for now
        # positional embedding scheme
        # softmax FF at the end
        super().__init__()
        self.addpos = pos
        self.emb = embs
        self.enc_blocks = enc_blocks
        self.dec_blocks = dec_blocks
        self.encoderBlocks = nn.ModuleList(
            [EncoderBlock(emb_dim, x_dim, heads) for _ in range(enc_blocks)]
        )
        self.decoderBlocks = nn.ModuleList(
            [DecoderBlock(emb_dim, x_dim, z_dim, heads) for _ in range(dec_blocks)]
        )
        self.ff1 = nn.Linear(emb_dim, 1)
        self.ff2 = nn.Linear(emb_dim, out_dim)

    def forward(self, x,z):
        # embed + pos
        # if emb is not None, x better be an embedding already
        if self.emb is not None:
            x = self.emb(x)
            z = self.emb(z)
        # ! positions are encoded s, b, e
        print("after emb layer")
        print(x.shape)
        x = self.addpos(x.permute(1,0,2)).permute(1,0,2)
        z = self.addpos(z.permute(1,0,2)).permute(1,0,2)
        # loop through encoder blocks
        for i in range(self.enc_blocks):
            z = self.encoderBlocks[i](z)
        # loop through decoder blocks
        for i in range(self.dec_blocks):
            x = self.decoderBlocks[i](x, z)
        # FF and softmax
        print("after decoder")
        print(x.shape)
        x = self.ff1(x)
        print("after ff1 (emb->1)")
        print(x.shape)
        x = torch.flatten(x, start_dim=1)
        print("after flatten")
        print(x.shape)
        return F.softmax(self.ff2(x), dim=-1)
    
class posEncoding(nn.Module):
    def __init__(self, model_dim, dropout, maxlen):
        super().__init__()
        self.dropout = dropout
        position = torch.arange(maxlen).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, model_dim, 2) * (-math.log(10000.0)/model_dim))
#         print("position.shape")
#         print(position.shape)
#         print("div_term.shape")
#         print(div_term.shape)
#         print("model_dim")
#         print(model_dim)
        pe = torch.zeros(maxlen, 1, model_dim)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe)
#         print(self.pe.shape)
    
    def forward(self, x):
        print("x.shape")
        print(x.shape)
        print("self.pe[:x.shape[0]].shape")
        print(self.pe[:x.shape[0]].shape)
        x = x + self.pe[:x.shape[0]]
        return x #self.dropout(x)
        
        


In [3]:
exAttn = Attn(256, 32, 32)

In [4]:
# sequence of 8 words
x = torch.randn(1,8,256)
# seq of 10
z = torch.randn(1,10,256)

In [5]:
# three seqs of 8 words
x = torch.randn(3,8,256)
# three seqs of 10
z = torch.randn(3,10,256)

In [6]:
res = exAttn(x,x)
res.shape

torch.Size([3, 8, 32])

In [7]:
exMHA = MHAttn(8, 256, 32, 32)

In [8]:
mhres = exMHA(x,x)
mhres.shape

torch.Size([3, 8, 256])

In [9]:
exEB = EncoderBlock(256, 32, 8)

In [10]:
ebres = exEB(x)

In [11]:
ebres.shape

torch.Size([3, 8, 256])

In [12]:
exDB = DecoderBlock(256, 32, 32, 8)

In [13]:
dbres = exDB(x, z)

In [14]:
dbres.shape

torch.Size([3, 8, 256])

In [15]:
exPE2 = posEncoding(256, 0, 10000)

In [16]:
exEDT = EDTransformer(None, exPE2, 256, 32, 32, 8, 3,3,1)

In [21]:
edtres = exEDT(x,z)

after emb layer
torch.Size([3, 8, 256])
x.shape
torch.Size([8, 3, 256])
self.pe[:x.shape[0]].shape
torch.Size([8, 1, 256])
x.shape
torch.Size([10, 3, 256])
self.pe[:x.shape[0]].shape
torch.Size([10, 1, 256])
after decoder
torch.Size([3, 8, 256])
after ff1 (emb->1)
torch.Size([3, 8, 1])
after flatten
torch.Size([3, 8])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (3x8 and 256x1)

In [18]:
edtres.shape

NameError: name 'edtres' is not defined

In [22]:
from torchsummary import summary
summary(exEDT, [(8,256),(10,256)])


after emb layer
torch.Size([2, 8, 256])
x.shape
torch.Size([8, 2, 256])
self.pe[:x.shape[0]].shape
torch.Size([8, 1, 256])
x.shape
torch.Size([10, 2, 256])
self.pe[:x.shape[0]].shape
torch.Size([10, 1, 256])
after decoder
torch.Size([2, 8, 256])
after ff1 (emb->1)
torch.Size([2, 8, 1])
after flatten
torch.Size([2, 8])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (2x8 and 256x1)

In [3]:
import torchtext
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torchvision

In [None]:
!python --version

#### mnist

In [None]:
mnist = torchvision.datasets.MNIST("./mnist", download=True)
mnistVal = torchvision.datasets.MNIST("./mnist", train=False, download=True)


#### imdb

In [None]:
from torchtext.datasets import IMDB
from collections import Iterable
train_iter = IMDB(split='train')
val_iter = IMDB(split='test')

#### imdb csv

In [4]:
import pandas as pd
df = pd.read_csv('imdb_processed.csv')
df.head()

Unnamed: 0,processed,label
0,One reviewer mentioned watching Oz episode hoo...,1
1,A wonderful little production . The filming te...,1
2,I thought wonderful way spend time hot summer ...,1
3,Basically family little boy Jake think zombie ...,0
4,Petter Mattei Love Time Money visually stunnin...,1


In [5]:
# df['token_length'] = df.processed.apply(lambda x: len(x.split()))
df

Unnamed: 0,processed,label
0,One reviewer mentioned watching Oz episode hoo...,1
1,A wonderful little production . The filming te...,1
2,I thought wonderful way spend time hot summer ...,1
3,Basically family little boy Jake think zombie ...,0
4,Petter Mattei Love Time Money visually stunnin...,1
...,...,...
49995,I thought movie right good job . It creative o...,1
49996,"Bad plot , bad dialogue , bad acting , idiotic...",0
49997,I Catholic taught parochial elementary school ...,0
49998,I going disagree previous comment side Maltin ...,0


In [6]:
reviews = df.processed.values
words = ' '.join(reviews)
words = words.split()
words[:10]

['One',
 'reviewer',
 'mentioned',
 'watching',
 'Oz',
 'episode',
 'hooked',
 '.',
 'They',
 'right']

In [7]:
from collections import Counter
ctr = Counter(words)
vocab = sorted(ctr, key=ctr.get, reverse=True)
int2word = dict(enumerate(vocab, 1))
int2word[0] = '<PAD>'
word2int = {word: id for id, word in int2word.items()}

In [8]:
len(vocab)

121300

In [9]:
exEMB = nn.Embedding(len(vocab), 256)

In [10]:
exEMB(torch.ones(1, dtype=torch.long)).shape

torch.Size([1, 256])

In [11]:
from tqdm import tqdm
rev_enc = [[word2int[word] for word in review.split()] for review in tqdm(reviews)]


100%|██████████| 50000/50000 [00:01<00:00, 27679.15it/s]


In [12]:
import numpy as np
def pad_features(reviews, pad_id, seq_length=128):
    # features = np.zeros((len(reviews), seq_length), dtype=int)
    features = np.full((len(reviews), seq_length), pad_id, dtype=int)

    for i, row in enumerate(reviews):
        # if seq_length < len(row) then review will be trimmed
        features[i, :len(row)] = np.array(row)[:seq_length]

    return features

seq_length = 256
features = pad_features(rev_enc, pad_id=word2int['<PAD>'], seq_length=seq_length)

assert len(features) == len(rev_enc)
assert len(features[0]) == seq_length

features[:10, :10]

array([[  191,  1083,   930,    81,  3724,   186,  3030,     1,   118,
          114],
       [   47,   328,    59,   244,     1,     7,  1267,  1608, 17875,
            4],
       [    3,    95,   328,    30,  1041,    13,   845,  1774,  2633,
            2],
       [ 2408,   136,    59,   241,  3230,    37,   650,  4298,   583,
          882],
       [70982, 10566,  1081,  1941,  7538,  2280,  1313,     6,    46,
            1],
       [ 2795,     4,    13,   368,     5,     2,    17, 42503,     2,
         2952],
       [    3,   197,    18,    10,    21, 12517,  1904, 55922,   121,
         5112],
       [   14,    31,   427,     2,  1342,  4021,   165,    34,  2960,
            1],
       [47893,   980,   359,     6,     3,   180,   776,    81,     6,
            1],
       [   64,    10,   125,  2269,  5606,  1980,    10,     5,     1,
           64]])

In [13]:
labels = df.label.to_numpy()
labels

array([1, 1, 1, ..., 0, 0, 0])

In [14]:
train_size = .75   # we will use 80% of whole data as train set
val_size = 1      # and we will use 50% of test set as validation set

# make train set
split_id = int(len(features) * train_size)
train_x, remain_x = features[:split_id], features[split_id:]
train_y, remain_y = labels[:split_id], labels[split_id:]

# make val and test set
split_val_id = int(len(remain_x) * val_size)
val_x, test_x = remain_x[:split_val_id], remain_x[split_val_id:]
val_y, test_y = remain_y[:split_val_id], remain_y[split_val_id:]

# print out the shape
print('Feature Shapes:')
print('===============')
print('Train set: {}'.format(train_x.shape))
print('Validation set: {}'.format(val_x.shape))
print('Test set: {}'.format(test_x.shape))

Feature Shapes:
Train set: (37500, 256)
Validation set: (12500, 256)
Test set: (0, 256)


#### for mnist

In [None]:
transform = transforms.Compose([
    transforms.PILToTensor(),
])

traindata, trainlabels = zip(*[(transform(x[0])/255.0, x[1]) for x in mnist])
valdata, vallabels = zip(*[(transform(x[0])/255.0, x[1]) for x in mnistVal])


In [None]:
from PIL import Image
import torchvision.transforms as transforms
import math

In [None]:
type(mnist)

In [None]:
transform = transforms.Compose([
    transforms.PILToTensor(),
])

traindata, trainlabels = zip(*[(transform(x[0])/255.0, x[1]) for x in mnist])
valdata, vallabels = zip(*[(transform(x[0])/255.0, x[1]) for x in mnistVal])


In [None]:
print(len(trainlabels))
len(traindata)

In [None]:
# print(trainlabels[0])
# print(traindata[0])
valdata = torch.cat(valdata)
vallabels = torch.cat([torch.tensor([y]) for y in vallabels])

In [None]:
print(trainlabels[0])
print(traindata[0])
traindata = torch.cat(traindata)
trainlabels = torch.cat([torch.tensor([y]) for y in trainlabels])

In [None]:
traindata.shape

In [None]:
train_ds = TensorDataset(traindata, trainlabels)
val_ds = TensorDataset(valdata, vallabels)

In [None]:
train_dl = DataLoader(train_ds, batch_size=32, shuffle=False)
val_dl = DataLoader(val_ds, batch_size=32, shuffle=False)

In [None]:
train_x[2]

#### back to imdb

In [15]:
# define batch size
batch_size = 128

# create tensor datasets
trainset = TensorDataset(torch.from_numpy(train_x[:128*8]), torch.from_numpy(train_y[:128*8]).to(torch.float32))
validset = TensorDataset(torch.from_numpy(val_x[:128*4]), torch.from_numpy(val_y[:128*4]).to(torch.float32))
# testset = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# create dataloaders
trainloader = DataLoader(trainset, shuffle=False, batch_size=batch_size)
valloader = DataLoader(validset, shuffle=False, batch_size=batch_size)
# testloader = DataLoader(testset, shuffle=False, batch_size=batch_size)

In [16]:
exEMB = nn.Embedding(len(vocab), 256)

In [19]:
exPE2 = posEncoding(256, 0, 10000)
lmaoOhLawd = EDTransformer(exEMB, exPE2, 256, 4, 4, 7, 3,3,1)

In [None]:
# s, b, e order
# exPE2(torch.ones(28,1,28)).shape

### todo: train with text data (quick)
### next: LC babyyy

In [20]:
n_epochs = 5
opt = optim.Adagrad(lmaoOhLawd.parameters())
loss_func = F.binary_cross_entropy

for i in range(n_epochs):
    for xtb, ytb in trainloader:
        print(xtb.shape)
        preds = lmaoOhLawd(xtb, xtb)
        print(preds.shape)
        print(ytb[0])
        print(preds[0])
        loss = loss_func(preds.squeeze().to(torch.float32), ytb.to(torch.float32))
        loss.backward()
        opt.step()
        opt.zero_grad()
    with torch.no_grad():
        currvalloss = sum([loss_func(lmaoOhLawd(xtb, xtb).squeeze(), ytb) for xtb, ytb in valloader])
        print("val loss epoch ", i, ":", currvalloss)

torch.Size([128, 256])
after emb layer
torch.Size([128, 256, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shape[0]].shape
torch.Size([256, 1, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shape[0]].shape
torch.Size([256, 1, 256])
after decoder
torch.Size([128, 256, 256])
after ff1 (emb->1)
torch.Size([128, 256, 1])
after flatten
torch.Size([128, 256])
torch.Size([128, 1])
tensor(1.)
tensor([1.], grad_fn=<SelectBackward0>)
torch.Size([128, 256])
after emb layer
torch.Size([128, 256, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shape[0]].shape
torch.Size([256, 1, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shape[0]].shape
torch.Size([256, 1, 256])
after decoder
torch.Size([128, 256, 256])
after ff1 (emb->1)
torch.Size([128, 256, 1])
after flatten
torch.Size([128, 256])
torch.Size([128, 1])
tensor(1.)
tensor([1.], grad_fn=<SelectBackward0>)
torch.Size([128, 256])
after emb layer
torch.Size([128, 256, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shap

after decoder
torch.Size([128, 256, 256])
after ff1 (emb->1)
torch.Size([128, 256, 1])
after flatten
torch.Size([128, 256])
torch.Size([128, 1])
tensor(1.)
tensor([1.], grad_fn=<SelectBackward0>)
after emb layer
torch.Size([128, 256, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shape[0]].shape
torch.Size([256, 1, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shape[0]].shape
torch.Size([256, 1, 256])
after decoder
torch.Size([128, 256, 256])
after ff1 (emb->1)
torch.Size([128, 256, 1])
after flatten
torch.Size([128, 256])
after emb layer
torch.Size([128, 256, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shape[0]].shape
torch.Size([256, 1, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shape[0]].shape
torch.Size([256, 1, 256])
after decoder
torch.Size([128, 256, 256])
after ff1 (emb->1)
torch.Size([128, 256, 1])
after flatten
torch.Size([128, 256])
after emb layer
torch.Size([128, 256, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shape[0]].shape
torch

torch.Size([128, 256])
after emb layer
torch.Size([128, 256, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shape[0]].shape
torch.Size([256, 1, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shape[0]].shape
torch.Size([256, 1, 256])
after decoder
torch.Size([128, 256, 256])
after ff1 (emb->1)
torch.Size([128, 256, 1])
after flatten
torch.Size([128, 256])
torch.Size([128, 1])
tensor(0.)
tensor([1.], grad_fn=<SelectBackward0>)
torch.Size([128, 256])
after emb layer
torch.Size([128, 256, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shape[0]].shape
torch.Size([256, 1, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shape[0]].shape
torch.Size([256, 1, 256])
after decoder
torch.Size([128, 256, 256])
after ff1 (emb->1)
torch.Size([128, 256, 1])
after flatten
torch.Size([128, 256])
torch.Size([128, 1])
tensor(0.)
tensor([1.], grad_fn=<SelectBackward0>)
torch.Size([128, 256])
after emb layer
torch.Size([128, 256, 256])
x.shape
torch.Size([256, 128, 256])
self.pe[:x.shap