In [1]:
pip install portalocker


Collecting portalocker
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118

Looking in indexes: https://download.pytorch.org/whl/nightly/cu118
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
import torchtext
from torchtext.data import get_tokenizer
from torchinfo import summary
import numpy as np
import collections


In [4]:
# Select the compute device by asking PyTorch which backend is actually usable,
# instead of guessing from the host platform: a Kaggle session without a GPU or
# a Mac without Metal support would otherwise fail later at `.to(torch_device)`.
import os,platform
if torch.cuda.is_available():
    torch_device = 'cuda'   # NVIDIA GPU (e.g. a Kaggle GPU session)
elif torch.backends.mps.is_available():
    torch_device = 'mps'    # Apple Silicon GPU
else:
    torch_device = 'cpu'
torch_device

'cuda'

In [5]:
# Load the AG_NEWS train/test splits, storing the data under ./data.
train_dataset,test_dataset = torchtext.datasets.AG_NEWS(root='./data')
# Materialise both splits as lists so they can be indexed and iterated repeatedly.
train_dataset,test_dataset = list(train_dataset),list(test_dataset)
# Human-readable class names. The collate functions in this notebook use
# `label - 1`, so labels are presumably 1-based indices into this list.
# 'Sci/Tech' uses a forward slash: a backslash followed by 'T' is an invalid
# escape sequence in a Python string literal.
classes = ['World','Sports','Business','Sci/Tech']

# Tokenizer shared by the vocabulary builder and the encoder.
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [6]:
def build_vocab(train_dataset,ngrams=1,min_freq=1):
    """Build a torchtext vocabulary from the tokens of a labelled text dataset.

    Args:
        train_dataset: iterable of (label, text) pairs; only the text is used.
        ngrams: passed through to ``torchtext.data.utils.ngrams_iterator``.
        min_freq: minimum number of occurrences a token needs in order to be
            kept in the vocabulary.

    Returns:
        The vocabulary object created by ``torchtext.vocab.vocab``.
    """
    counter = collections.Counter()
    for (label,line) in train_dataset:
        counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line),ngrams=ngrams))
    # Forward the caller's threshold instead of a fixed value of 1.
    vocab = torchtext.vocab.vocab(counter,min_freq=min_freq)
    return vocab
vocab = build_vocab(train_dataset, ngrams=1, min_freq=1)

In [7]:
def encode(x,voc=None,tokenizer=tokenizer):
    """Convert a text string into a list of vocabulary indices.

    Args:
        x: the text to encode.
        voc: vocabulary to use; when None, the notebook-level ``vocab`` is used.
        tokenizer: callable that splits ``x`` into tokens.

    Returns:
        A list with one integer index per token.

    Raises:
        KeyError: if a token is not present in the vocabulary.
    """
    # Compare against None explicitly so that a vocabulary passed by the
    # caller is never silently swapped for the global one just because it is falsy.
    v = vocab if voc is None else voc
    # Fetch the token -> index mapping once per call rather than once per token.
    stoi = v.get_stoi()
    return [stoi[s] for s in tokenizer(x)]

def padify(b):
    """Collate a minibatch into a label tensor and a zero-padded index matrix.

    Args:
        b: list of tuples of length batch_size, where the first element of
            each tuple is the label and the second is the feature (text).

    Returns:
        A pair ``(labels, features)``: ``labels`` is a LongTensor holding
        ``label - 1`` for every sample; ``features`` stacks the encoded texts,
        each right-padded with 0 up to the length of the longest one in ``b``.
    """
    # Build the vectorized sequences. The dtype is explicit because
    # torch.tensor([]) would otherwise be a float tensor for a text that
    # yields no tokens.
    v = [torch.tensor(encode(x[1]), dtype=torch.long) for x in b]
    # Length of the longest sequence in this minibatch.
    l = max(map(len, v))
    labels = torch.LongTensor([t[0]-1 for t in b])
    # NOTE(review): the padding value 0 presumably coincides with a real
    # vocabulary index, since no padding token is reserved — confirm.
    features = torch.stack([
        torch.nn.functional.pad(t, (0, l-len(t)), mode='constant', value=0)
        for t in v
    ])
    return labels, features
    
    

For each tuple in `b`, `padify` takes the second element (`x[1]`), which is the feature (the text sequence), and applies the `encode` function to turn it into a list of vocabulary indices. The result, `v`, is a list of vectorized sequences, one per sample in the minibatch.

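It then computes the maximum length (`l`) among all the vectorized sequences in the minibatch: `map(len, v)` applies the `len` function to each vectorized sequence in `v`, and `max` picks the largest of those lengths. Finally, every sequence is right-padded with zeros up to length `l`, so that all sequences have the same length and can be stacked into a single tensor.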

In [8]:
# Take the text of the first two training samples and encode both; the token
# lists are reused by later cells.
first_sentence, second_sentence = train_dataset[0][1], train_dataset[1][1]
f_tokens, s_tokens = encode(first_sentence), encode(second_sentence)

# 'Length' here is the character count of the raw text, not the token count.
print(f'First sentence in dataset:\n{first_sentence}')
print('Length:', len(first_sentence), '\n')
print(f'\nSecond sentence in dataset:\n{second_sentence}')
print('Length:', len(second_sentence), '\n')

First sentence in dataset:
Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
Length: 144 


Second sentence in dataset:
Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.
Length: 266 



In [None]:
# Number of entries in the vocabulary; used later to size the embedding table.
vocab_size = len(vocab)

# Collate the whole training set as one batch to inspect the padded result.
labels, features = padify(train_dataset)
print(f'features:{features}')

# Compare the token counts of the two sample sentences with the padded shape.
print(
    f'\nlength of first sentence: {len(f_tokens)}',
    f'length of second sentence: {len(s_tokens)}',
    f'size of features: {features.size()}',
    sep='\n',
)

In [None]:
class EmbedClassifier(torch.nn.Module):
    """Bag-of-embeddings text classifier: an EmbeddingBag followed by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector.
        num_class: number of output classes.
        sparse: forwarded to ``torch.nn.EmbeddingBag``. Defaults to False
            because sparse gradients are only accepted by a few optimizers
            and ``torch.optim.Adam`` is not one of them.
    """
    def __init__(self, vocab_size, embed_dim, num_class, sparse=False):
        super().__init__()
        self.embedding = torch.nn.EmbeddingBag(vocab_size, embed_dim, sparse=sparse)
        self.fc = torch.nn.Linear(embed_dim, num_class)

    def forward(self, x, off=None):
        """Return class scores of shape (number of bags, num_class).

        Args:
            x: either a 1-D tensor of concatenated token indices (then ``off``
                is required) or a 2-D tensor where every row is one bag.
            off: 1-D tensor with the start position of each bag inside ``x``;
                leave as None for 2-D input.
        """
        # EmbeddingBag already reduces each bag to one vector (mean by
        # default), so its output goes to the linear layer without any
        # further averaging.
        pooled = self.embedding(x, off)
        return self.fc(pooled)

In [None]:
def offsetify(b):
    """Collate a minibatch into labels, one flat index tensor and bag offsets.

    Args:
        b: list of (label, text) tuples.

    Returns:
        A triple ``(labels, data, offsets)``: ``labels`` is a LongTensor of
        ``label - 1`` per sample, ``data`` is the concatenation of all encoded
        texts, and ``offsets[i]`` is the position in ``data`` where the i-th
        text starts.
    """
    # Explicit integer dtype: torch.tensor([]) would otherwise be a float
    # tensor for a text that yields no tokens.
    encoded = [torch.tensor(encode(t[1]), dtype=torch.long) for t in b]
    # Each text starts where the previous ones end: cumulative sum of the
    # lengths, shifted right by one position with a leading 0.
    starts = [0] + [len(t) for t in encoded[:-1]]
    offsets = torch.tensor(starts, dtype=torch.long).cumsum(dim=0)

    return(
    torch.LongTensor([t[0]-1 for t in b]),
    torch.cat(encoded),
    offsets
    )


# Shuffled minibatches of 16 samples, collated into (labels, data, offsets) by offsetify.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=offsetify,
)


In [None]:
# Collate the whole training set at once to inspect the offset representation.
labels, features, offset = offsetify(train_dataset)
print(f'offset:{offset}')

# Token counts of the two sample sentences next to the sizes of the flat
# data vector and of the offset vector.
print(
    f'\nlength of first sentence: {len(f_tokens)}',
    f'length of second sentence: {len(s_tokens)}',
    f'size of data vector: {features.size()}',
    f'size of offset vector: {offset.size()}',
    sep='\n',
)

In [None]:
# Classifier with 32-dimensional embeddings and one output per class,
# moved to the selected device.
net = EmbedClassifier(
    vocab_size=vocab_size,
    embed_dim=32,
    num_class=len(classes),
).to(torch_device)


In [None]:
def train_epoch_emb(net, dataloader, lr=0.01, optimizer=None, loss_fn=torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
    """Train ``net`` for one pass over ``dataloader`` (or until ``epoch_size`` samples).

    Args:
        net: model called as ``net(text, off)``.
        dataloader: iterable yielding ``(labels, text, off)`` batches.
        lr: learning rate for the default optimizer.
        optimizer: optimizer to use; when None an Adam optimizer over
            ``net.parameters()`` is created.
        loss_fn: loss module applied to ``(output, labels)``.
        epoch_size: if set, stop once at least this many samples were processed.
        report_freq: print running statistics every ``report_freq`` batches.

    Returns:
        ``(loss, accuracy)`` as Python floats: the sum of the per-batch loss
        values divided by the number of samples seen, and the fraction of
        correct predictions. ``(0.0, 0.0)`` if the dataloader yields no batch.
    """
    if optimizer is None:
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    # Run on the device the model lives on, so the batches always match it;
    # fall back to the notebook-wide device for a model without parameters.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device(torch_device)
    loss_fn = loss_fn.to(device)
    net.train()
    total_loss, acc, count, i = 0.0, 0, 0, 0
    for labels, text, off in dataloader:
        optimizer.zero_grad()
        labels, text, off = labels.to(device), text.to(device), off.to(device)
        output = net(text, off)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()
        # Accumulate plain Python numbers: summing the loss tensor itself
        # would keep every batch's autograd graph reachable.
        total_loss += loss.item()
        predicted = output.argmax(dim=1)  # index of the highest score per sample
        acc += (predicted == labels).sum().item()
        count += len(labels)
        i += 1
        if i%report_freq == 0:
            print(f'iteration {count}, loss {total_loss/count}, accuracy {acc/count}')
        if epoch_size and count >= epoch_size:
            break
    if count == 0:
        # Nothing was processed; avoid dividing by zero.
        return 0.0, 0.0
    return total_loss/count, acc/count

# Train until at least 1000 samples have been processed; the returned
# (loss, accuracy) pair is displayed as the cell output.
train_epoch_emb(
    net,
    train_loader,
    lr=4,
    epoch_size=1000,
)