In [None]:
import torch 
from torch import nn 
import numpy as np 

In [None]:
torch.manual_seed(1400)

In [None]:
x = torch.rand(2, 3) # , 5)
x

In [None]:
torch.sigmoid(x)

In [None]:
torch.relu(x)

In [None]:
x.shape

In [None]:
x_out = torch.sum(x, dim=0) # dim 1 , 2 , ..? 
x_out

In [None]:
x_out.shape

$$ \mathrm{softmax}(x)_i = \frac{\exp(x_i)}{\sum_{j} \exp(x_j)} $$

<br>

In [None]:
torch.softmax(x, dim=0)

In [None]:
l1 = nn.Linear(3, 2)

In [None]:
result_l1 = l1(x)
result_l1

In [None]:
result_l1.shape


# ⚠️⚠️⚠️

####   • torch.nn only supports mini-batches. The entire torch.nn package only supports inputs that are a mini-batch of samples, and not a single sample.

<br>

- pip install torchtext --user -U

In [None]:
from torchtext.datasets import AG_NEWS

- AG is a collection of more than 1 million news articles
- Labels are class ids 1-4: 1 = World, 2 = Sports, 3 = Business, 4 = Sci/Tech

In [None]:
import os

# Python does not expand shell variables inside string literals, so
# '${HOME}/.data' would create a directory literally named "${HOME}" in the
# current working directory. Resolve the home directory explicitly instead.
train_iter = AG_NEWS(root=os.path.expanduser('~/.data'), split='train')

In [None]:
# Peek at the first (label, text) sample. Wrapping in iter() works whether
# the dataset object is itself an iterator or only an iterable.
next(iter(train_iter))

In [None]:
from torchtext.data.utils import get_tokenizer

In [None]:
from collections import Counter

- Also look at nltk, spacy

In [None]:
sample_data = ['a', 'a', 'b', 'c', 'a', 'c', 'd', 'd']
sample_counter = Counter(sample_data)

In [None]:
sample_counter

In [None]:
# Each sample is indexed as (label, text); keep only the text.
train_x = [text for _label, text in AG_NEWS(split='train')]

In [None]:
# Each sample is indexed as (label, text); keep only the label.
train_y = [label for label, _text in AG_NEWS(split='train')]

In [None]:
print(train_x[0])

In [None]:
print(train_y[0])

In [None]:
tokenizer = get_tokenizer('basic_english')

In [None]:
tokenizer(train_x[0])

## NLTK

- pip install nltk
- python shell for downloading data:

    \>>> import nltk 

    \>>> nltk.download()

In [None]:
from nltk.corpus import stopwords

In [None]:
stopwords.words('english')

In [None]:
# Collect every distinct token produced by the tokenizer over the training texts.
all_words = {token for text in train_x for token in tokenizer(text)}

In [None]:
len(all_words)

In [None]:
# sorted() accepts any iterable and always returns a new list.
sorted(all_words)

In [None]:
import string

In [None]:
string.punctuation

In [None]:
from scipy import sparse

In [None]:
from typing import Callable, Iterable, List, Optional
class CountEncoder:
    """Bag-of-words encoder that maps texts to sparse word-count vectors.

    The vocabulary is built once from ``string_data``: every text is
    lower-cased, tokenized, stripped of ``string.punctuation`` characters,
    and counted. Stop words and words seen fewer than ``min_count`` times
    are removed; each remaining word gets a column index.

    Attributes:
        all_words: dict mapping each vocabulary word to its column index.
        word_count: number of words in the vocabulary (the output width).
        trans_dict: ``str.translate`` table that deletes punctuation.
    """

    def __init__(self, string_data: List[str], min_count = 12,
                 tokenize: Optional[Callable[[str], List[str]]] = None,
                 stop_words: Optional[Iterable[str]] = None):
        """Build the vocabulary from ``string_data``.

        Args:
            string_data: texts used to build the vocabulary.
            min_count: words occurring fewer times than this are discarded.
            tokenize: callable turning one string into a list of tokens.
                Defaults to the notebook-level ``tokenizer``.
            stop_words: words to exclude from the vocabulary. Defaults to
                ``stopwords.words('english')`` from nltk.
        """
        # Fall back to the notebook-level objects only when not supplied,
        # so existing calls such as CountEncoder(train_x) are unchanged.
        self._tokenize = tokenizer if tokenize is None else tokenize
        if stop_words is None:
            stop_words = stopwords.words('english')
        self.trans_dict = str.maketrans('', '', string.punctuation)

        counts = Counter()
        for line in string_data:
            counts.update(self._clean_tokens(line))
        for word in stop_words:
            # Counter.__delitem__ silently ignores keys that are absent.
            del counts[word]

        # Insertion order of the counter fixes the column order.
        kept = [word for word, freq in counts.items() if freq >= min_count]
        self.word_count = len(kept)
        self.all_words = {word: index for index, word in enumerate(kept)}

    def _clean_tokens(self, text):
        """Lower-case, tokenize and strip punctuation; drop emptied tokens."""
        stripped = (tok.translate(self.trans_dict)
                    for tok in self._tokenize(text.lower()))
        # Pure-punctuation tokens become '' after stripping; they are not words.
        return [tok for tok in stripped if tok]

    def translate(self, strings):
        """Encode ``strings`` as a ``(len(strings), word_count)`` count matrix.

        Texts are normalized exactly as in ``__init__`` so that tokens such
        as ``"hello,"`` match the vocabulary entry ``"hello"``. Words outside
        the vocabulary are ignored.

        Returns:
            A ``scipy.sparse.dok_matrix`` of dtype ``float32`` whose entry
            ``[i, j]`` is how often vocabulary word ``j`` occurs in
            ``strings[i]``.
        """
        output = sparse.dok_matrix((len(strings), self.word_count), dtype=np.float32)
        for index, text in enumerate(strings):
            counter = Counter(tok for tok in self._clean_tokens(text)
                              if tok in self.all_words)
            for word, freq in counter.items():
                output[index, self.all_words[word]] = freq
        return output


In [None]:
count_encoder = CountEncoder(train_x)

In [None]:
count_encoder.word_count

In [None]:
train_x_sparse = count_encoder.translate(train_x)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
device

In [None]:
class CounterModel(nn.Module):
    """Two-layer feed-forward classifier: Linear -> ReLU -> Linear.

    Args:
        input_dim: size of each input vector.
        hidden_dim: width of the hidden layer.
        output_dim: number of output scores per sample.
    """

    def __init__(self, input_dim, hidden_dim = 48, output_dim = 4):
        super().__init__()
        self.w1 = nn.Linear(input_dim, hidden_dim)
        self.w2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw output of the second linear layer for ``x``.

        No softmax is applied here: the scores are returned as logits.
        nn.CrossEntropyLoss expects unnormalized logits and applies
        log-softmax itself.
        """
        hidden = torch.relu(self.w1(x))
        logits = self.w2(hidden)
        return logits

In [None]:
model = CounterModel(count_encoder.word_count).to(device)

In [None]:
batch_size = 256
epoches = 2

In [None]:
loss_fn = nn.CrossEntropyLoss()

In [None]:
optimizer = torch.optim.Adam(model.parameters())

In [None]:
# train loop
model.train()

# Slicing rows out of a DOK matrix is slow; convert once to CSR, which
# supports efficient row slicing and keeps the same dtype and values.
train_x_csr = train_x_sparse.tocsr()
n_samples = len(train_x)

# Log every 25th batch. The previous check `i % 100 == 0` on the sample
# offset fired at exactly this cadence, but only because batch_size is 256.
log_every = 25

for e in range(epoches):
    for batch_idx, i in enumerate(range(0, n_samples, batch_size)):
        batch_x = torch.from_numpy(train_x_csr[i:i+batch_size].toarray()).to(device)
        # Dataset labels are 1..4; CrossEntropyLoss expects class indices 0..3.
        batch_y = torch.tensor(train_y[i: i+batch_size], dtype=torch.int64).to(device) - 1
        pred = model(batch_x)
        loss_val = loss_fn(pred, batch_y)
        optimizer.zero_grad()
        loss_val.backward()
        optimizer.step()
        if batch_idx % log_every == 0:
            loss = loss_val.item()
            print(f"loss: {loss:>7f}  epoch: {e}, batch_percent: [{i/len(train_x)*100}%]")


In [None]:
torch.tensor(train_y, dtype=torch.int64).unique()

In [None]:
import gc

In [None]:
gc.collect()

In [None]:
# Each sample is indexed as (label, text); keep only the text.
test_x = [text for _label, text in AG_NEWS(split='test')]

In [None]:
# Each sample is indexed as (label, text); keep only the label.
test_y = [label for label, _text in AG_NEWS(split='test')]

In [None]:
len(test_x)

In [None]:
test_x_sparse = count_encoder.translate(test_x)

In [None]:
test_x_sparse[:100].toarray().shape

In [None]:
# Switch to inference mode before evaluating.
model.eval()
with torch.no_grad():
    test_batch = 1000
    test_x_batch = torch.from_numpy(test_x_sparse[:test_batch].toarray()).to(device)
    # Dataset labels are 1..4, but the model was trained on labels shifted to
    # 0..3 and argmax below yields indices 0..3. Apply the same shift here,
    # otherwise predictions and targets are compared in different label spaces.
    test_y_batch = torch.tensor(test_y[:test_batch], dtype=torch.int64).to(device) - 1
    outputs = model(test_x_batch)
    # torch.max over dim=1 returns (values, indices); the index is the predicted class.
    _, predict = torch.max(outputs, dim=1)
    correct = (predict == test_y_batch).sum().item()