In [1]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset

In [3]:
from torchdata.datapipes.iter import IterableWrapper, FileOpener
import os

def get_data(raw_iter):
    return zip(*[(data[3], int(data[4])) for data in raw_iter])
def yield_X(raw_iter):
    for data in raw_iter:
        yield data[3]

def yield_y(raw_iter):
    for data in raw_iter:
        yield data[4]

def get_name(path_and_stream):
    return os.path.basename(path_and_stream[0]), path_and_stream[1]

def get_csv_dataset(csv_path):
    datapipe1 = IterableWrapper([csv_path])
    datapipe2 = FileOpener(datapipe1, mode="b")
    datapipe3 = datapipe2.map(get_name)
    csv_parser_dp = datapipe3.parse_csv(skip_lines=1)
    train_iter = csv_parser_dp
    X, y = get_data(train_iter)
    return X,y

X, y = get_csv_dataset("data/train.csv")
# X_eval, y_eval = get_csv_dataset("data/test.csv")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


def pad_TextSequence(batch):
  return torch.nn.utils.rnn.pad_sequence(batch,batch_first=True, padding_value=0)

# train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, X), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

def x_data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor."""
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
    return pad_TextSequence(data) #torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

In [5]:
def data_process(X, y):
    X = x_data_process(X)
    y = torch.tensor(y, dtype=torch.int).unsqueeze(1)
    return X.to(device), y.to(device)

X, y = data_process(X, y)
X, X_eval = X[:6000], X[6000:]
y, y_eval = y[:6000], y[6000:]
# ``train_iter`` was "consumed" by the process of building the vocab,
# so we have to create it again
#train_iter, val_iter, test_iter = WikiText2()
#val_data = data_process(val_iter)
#test_data = data_process(test_iter)
X.shape, y.shape

(torch.Size([6000, 74]), torch.Size([6000, 1]))

In [6]:
bptt = 100
def get_batch(X: Tensor, y:Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """
    Args:
        source: Tensor, shape ``[full_seq_len, batch_size]``
        i: int

    Returns:
        tuple (data, target), where data has shape ``[seq_len, batch_size]`` and
        target has shape ``[seq_len * batch_size]``
    """
    seq_len = min(bptt, len(X) - 1 - i)
    data = X[i:i+seq_len]
    target = y[i:i+seq_len]
    return data, target

In [7]:
drop_prob = 0.2
ntokens = len(vocab)  # size of vocabulary
emsize = 200  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in ``nn.TransformerEncoder``
nlayers = 4  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability

In [8]:
embeds_size = emsize
num_heads = nhead
vocab_size = ntokens
block_size = nlayers
num_classes = 2

class transformer_block(nn.Module):
   def __init__(self):
       super(transformer_block, self).__init__()
       self.attention = nn.MultiheadAttention(embeds_size, num_heads, batch_first=True)
       self.ffn = nn.Sequential(
           nn.Linear(embeds_size, 4 * embeds_size),
           nn.LeakyReLU(),
           nn.Linear(4 * embeds_size, embeds_size),
       )
       self.drop1 = nn.Dropout(drop_prob)
       self.drop2 = nn.Dropout(drop_prob)
       self.ln1 = nn.LayerNorm(embeds_size, eps=1e-6)
       self.ln2 = nn.LayerNorm(embeds_size, eps=1e-6)
   def forward(self, hidden_state):
       attn, _ = self.attention(hidden_state, hidden_state, hidden_state, need_weights=False)
       attn = self.drop1(attn)
       out = self.ln1(hidden_state + attn)
       observed = self.ffn(out)
       observed = self.drop2(observed)
       return self.ln2(out + observed)

In [9]:
class transformer(nn.Module):
   def __init__(self):
       super(transformer, self).__init__()
       self.tok_embs = nn.Embedding(vocab_size, embeds_size)
       self.pos_embs = nn.Embedding(block_size, embeds_size)
       self.block = transformer_block()
       self.ln1 = nn.LayerNorm(embeds_size)
       self.ln2 = nn.LayerNorm(embeds_size)
       self.classifier_head = nn.Sequential(
           nn.Linear(embeds_size, embeds_size),
           nn.LeakyReLU(),
           nn.Dropout(drop_prob),
           nn.Linear(embeds_size, embeds_size),
           nn.LeakyReLU(),
           nn.Linear(embeds_size, num_classes),
           nn.Softmax(dim=1),
       )
       print("number of parameters: %.2fM" % (self.num_params()/1e6,))
   def num_params(self):
       n_params = sum(p.numel() for p in self.parameters())
       return n_params
   def forward(self, seq):
       B,T = seq.shape
       embedded = self.tok_embs(seq)
       embedded = embedded + self.pos_embs(torch.arange(T, device=device))
       output = self.block(embedded)
       output = output.mean(dim=1)
       output = self.classifier_head(output)
       return output

In [12]:
model = transformer().to(device)

number of parameters: 5.27M


In [13]:
model(X[:1])

../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [103,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [103,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [103,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [103,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [103,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [103,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [103,

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
