In [None]:
!pip install torch==1.12.1  torchtext==0.13.1 torchdata==0.4.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
'''
Puebla: Equipo 2

Alejandro López Hernández
Fernando Jiménez Pereyra
Daniel Flores Rodríguez
Daniel Munive Meneses

'''

In [None]:
import numpy as np
#PyTorch libraries
import torch
from torchtext.datasets import AG_NEWS
from torch import nn
from torch.nn import functional as F
# Dataset and dataloader
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
# Libraries to prepare the data
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset

In [None]:
# Check whether PyTorch can see a CUDA-capable GPU in this runtime.
torch.cuda.is_available()

True

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [None]:
# Load the AG_NEWS dataset; the call yields the training and test splits.
train_dataset,  test_dataset = AG_NEWS()

In [None]:
# Convert both splits to map-style datasets so they can be measured with len()
# and indexed/sliced in the cells below.
train_dataset, test_dataset = to_map_style_dataset(train_dataset), to_map_style_dataset(test_dataset)

In [None]:
# Number of samples in the training and test splits.
len(train_dataset), len(test_dataset)

(120000, 7600)

In [None]:
# Preview the first ten (label, text) samples.
train_dataset[:10]

[(3,
  "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."),
 (3,
  'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.'),
 (3,
  "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums."),
 (3,
  'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.'),
 (3,
  'Oil prices soar to all-time record, 

In [None]:
# torchtext's 'basic_english' tokenizer, shared by the vocabulary builder and the collate function.
tokeniser = get_tokenizer('basic_english')
def yield_tokens(data):
  """Lazily yield the token list of every text in `data`.

  `data` is iterated as (label, text) pairs; the label is ignored.
  """
  yield from (tokeniser(text) for _label, text in data)

In [None]:
# Build the vocabulary from the tokenised training texts, reserving '<unk>' as a special token.
vocab = build_vocab_from_iterator(yield_tokens(train_dataset), specials=['<unk>'])
# Tokens that are not in the vocabulary are mapped to the '<unk>' index.
vocab.set_default_index(vocab['<unk>'])

In [None]:
# Vocabulary size (number of distinct entries, including '<unk>').
len(vocab)

95811

In [None]:
# Show the tokens stored at the first 20 vocabulary indices.
for i in range(20):
  print(i, vocab.lookup_token(i))

0 <unk>
1 .
2 the
3 ,
4 to
5 a
6 of
7 in
8 and
9 s
10 on
11 for
12 #39
13 (
14 )
15 -
16 '
17 that
18 with
19 as


In [None]:
# Sanity check: tokenise a sample sentence and look up each token's vocabulary index.
tokens = tokeniser('Welcome to TE3007B')
print(tokens, vocab(tokens))

['welcome', 'to', 'te3007b'] [3314, 4, 0]


In [None]:
# Keep 90% of the original training samples for training and use the remainder for validation.
NUM_TRAIN = int(len(train_dataset) * 0.9)
NUM_VAL =len(train_dataset) - NUM_TRAIN

In [None]:
# Size of the validation split.
NUM_VAL

12000

In [None]:
# Split the original training data into train/validation subsets.
# A seeded generator makes the partition reproducible across kernel restarts.
# NOTE: this rebinds train_dataset, so re-running this cell on its own would try
# to split the already-reduced subset; re-run from the dataset-loading cell instead.
train_dataset, val_dataset = random_split(
    train_dataset,
    [NUM_TRAIN, NUM_VAL],
    generator=torch.Generator().manual_seed(42))

In [None]:
# Sizes of the train, validation and test sets after the split.
len(train_dataset), len(val_dataset), len(test_dataset)

(108000, 12000, 7600)

In [None]:
# Fixed sequence length: collate_batch pads or truncates every text to this many tokens.
max_tokens = 50

In [None]:
def collate_batch(batch):
  """Turn a list of (label, text) samples into a pair of tensors.

  Every text is tokenised and mapped to vocabulary indices, then right-padded
  with 0 (also the '<unk>' index in this vocabulary) or truncated so that each
  row holds exactly `max_tokens` entries. Labels are shifted down by one.

  Returns:
    (token_ids, targets): two `torch.long` tensors.
  """
  raw_labels, texts = list(zip(*batch))
  rows = []
  for text in texts:
    ids = vocab(tokeniser(text))
    missing = max_tokens - len(ids)
    if missing > 0:
      # too short: pad on the right
      ids = ids + [0] * missing
    else:
      # long enough: clip to the fixed length
      ids = ids[:max_tokens]
    rows.append(ids)
  token_ids = torch.tensor(rows, dtype=torch.long)
  targets = torch.tensor(raw_labels, dtype=torch.long) - 1
  return token_ids, targets

In [None]:
# Human-readable class names.
# NOTE(review): presumably indexed by the zero-based label produced by collate_batch — confirm.
labels = ['World', 'Sports', 'Business', 'Sci/Tech']
# Number of samples per batch in every DataLoader.
BATCH_SIZE = 1024

In [None]:
# DataLoaders
# Only the training loader shuffles. Evaluation does not benefit from shuffling,
# and a fixed order keeps validation/test passes deterministic.
train_loader = DataLoader(train_dataset,
                          batch_size=BATCH_SIZE,
                          collate_fn=collate_batch,
                          shuffle=True)
val_loader = DataLoader(val_dataset,
                        batch_size=BATCH_SIZE,
                        collate_fn=collate_batch,
                        shuffle=False)
test_loader = DataLoader(test_dataset,
                         batch_size=BATCH_SIZE,
                         collate_fn=collate_batch,
                         shuffle=False)

In [None]:
# Iterate over the test loader once and print each batch's input and label tensor shapes.
for i, (x, y) in enumerate(test_loader):
  print(i, x.shape, y.shape)

0 torch.Size([1024, 50]) torch.Size([1024])
1 torch.Size([1024, 50]) torch.Size([1024])
2 torch.Size([1024, 50]) torch.Size([1024])
3 torch.Size([1024, 50]) torch.Size([1024])
4 torch.Size([1024, 50]) torch.Size([1024])
5 torch.Size([1024, 50]) torch.Size([1024])
6 torch.Size([1024, 50]) torch.Size([1024])
7 torch.Size([432, 50]) torch.Size([432])


In [None]:
# Let us build our RNN: an embedding layer, a stacked GRU, and a linear classifier.

In [None]:
# Model hyperparameters passed to GRU_Model.
EMBEDDING_SIZE = 300  # dimension of each token embedding
NEURONS = 400  # GRU hidden-state size
LAYERS = 4  # number of stacked GRU layers
NUM_CLASSES = 4  # number of output classes

In [None]:
class GRU_Model(nn.Module):
  """Text classifier: token embedding -> stacked GRU -> linear layer.

  Args:
    embed_size: Dimension of each token embedding vector.
    hidden: Number of features in the GRU hidden state.
    layers: Number of stacked GRU layers.
    num_classes: Number of output classes (size of the logits vector).
    vocab_size: Number of rows in the embedding table. When omitted, falls
      back to ``len(vocab)`` (the notebook-level vocabulary), which is the
      original behaviour.
  """
  def __init__(self, embed_size, hidden, layers, num_classes, vocab_size=None):
    super().__init__()
    if vocab_size is None:
      # Keep the original default: size the table from the global vocabulary.
      vocab_size = len(vocab)
    self.embedding_layer = nn.Embedding(num_embeddings=vocab_size,
                                        embedding_dim=embed_size)

    self.rnn = nn.GRU(input_size=embed_size,
                      hidden_size = hidden,
                      num_layers = layers,
                      batch_first = True)

    self.fc = nn.Linear(in_features=hidden, out_features= num_classes)

  def forward(self, x):
    """Return class logits for a batch of token-id sequences.

    Args:
      x: Integer tensor of token ids laid out as (batch, sequence), since the
        GRU is built with ``batch_first=True``.

    Returns:
      Tensor of shape (batch, num_classes) computed from the GRU output at
      the last time step.
    """
    vector_embs = self.embedding_layer(x)
    # y holds the top layer's output at every time step; the final hidden
    # state returned alongside it is not needed.
    y, _ = self.rnn(vector_embs)
    # NOTE(review): collate_batch pads on the right, so for short texts the
    # last time step is a padding position — confirm this is intended.
    return self.fc(y[:, -1])
    

In [None]:
# Instantiate the model to inspect its architecture (a fresh instance is created again before training).
rnn_model = GRU_Model(EMBEDDING_SIZE, NEURONS, LAYERS, NUM_CLASSES)

In [None]:
# Display the module structure.
rnn_model

GRU_Model(
  (embedding_layer): Embedding(95811, 300)
  (rnn): GRU(300, 400, num_layers=4, batch_first=True)
  (fc): Linear(in_features=400, out_features=4, bias=True)
)

In [None]:
def accuracy(model, loader):
    """Return the fraction of samples in `loader` that `model` classifies correctly.

    The model is switched to eval mode (and left in it) and moved to the
    global `device`. Predictions are the arg-max over dimension 1 of the
    model output, computed with gradients disabled.
    """
    model.eval()
    model = model.to(device=device)
    hits = 0
    seen = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device=device, dtype=torch.long)
            targets = targets.to(device=device, dtype=torch.long)
            # .max(dim=1) returns (values, indices); the indices are the predicted classes
            predicted = model(inputs).max(dim=1)[1]
            hits += (predicted == targets).sum()
            seen += predicted.size(0)
    return float(hits) / seen

In [None]:
def train(model, optimiser, epochs=100):
  """Train `model` on the global `train_loader` for `epochs` epochs.

  After every epoch, prints the cross-entropy of the last training batch and
  the accuracy on the global `val_loader`.

  Args:
    model: Module to optimise; it is moved to the global `device`.
    optimiser: Optimiser that updates the model's parameters.
    epochs: Number of full passes over `train_loader`.
  """
  model = model.to(device=device)
  for epoch in range(epochs):
    # accuracy() leaves the model in eval mode, so re-enable training mode each epoch
    model.train()
    for inputs, targets in train_loader:
      inputs = inputs.to(device=device, dtype=torch.long)
      targets = targets.to(device=device, dtype=torch.long)
      # forward pass and loss
      logits = model(inputs)
      cost = F.cross_entropy(logits, targets)
      # clear stale gradients, back-propagate, then update the parameters
      optimiser.zero_grad()
      cost.backward()
      optimiser.step()
    acc = accuracy(model, val_loader)
    print(f'Epoch {epoch}, costo {cost.item():.4f}, val acc {acc:.4f}')


In [None]:
# Training configuration.
epochs = 20  # number of passes over the training set
lr = 0.0001  # learning rate given to Adam

# Start from a freshly initialised model and an Adam optimiser over its parameters.
rnn_model = GRU_Model(EMBEDDING_SIZE, NEURONS, LAYERS, NUM_CLASSES)
optimiser = torch.optim.Adam(rnn_model.parameters(), lr=lr)

In [None]:
# Run training; each epoch prints the last batch loss and the validation accuracy.
train(rnn_model, optimiser, epochs)

Epoch 0, costo 0.8305, val acc 0.6753
Epoch 1, costo 0.4335, val acc 0.8275
Epoch 2, costo 0.3122, val acc 0.8610
Epoch 3, costo 0.3008, val acc 0.8751
Epoch 4, costo 0.3747, val acc 0.8850
Epoch 5, costo 0.2491, val acc 0.8893
Epoch 6, costo 0.2731, val acc 0.8912
Epoch 7, costo 0.2330, val acc 0.8948
Epoch 8, costo 0.1811, val acc 0.8977
Epoch 9, costo 0.1675, val acc 0.8973
Epoch 10, costo 0.1677, val acc 0.8938
Epoch 11, costo 0.1317, val acc 0.8983
Epoch 12, costo 0.1267, val acc 0.8988
Epoch 13, costo 0.1315, val acc 0.8971
Epoch 14, costo 0.0915, val acc 0.9000
Epoch 15, costo 0.0822, val acc 0.8929
Epoch 16, costo 0.1042, val acc 0.9015
Epoch 17, costo 0.0964, val acc 0.8982
Epoch 18, costo 0.1134, val acc 0.8956
Epoch 19, costo 0.1135, val acc 0.8988


In [None]:
# Final evaluation: accuracy of the trained model on the test set.
print(f'{accuracy(rnn_model, test_loader):.4f}')

0.8962
