## Setup

In [148]:
# load the necessary libraries
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import random

import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

In [149]:
# set seeds
SEED = 42
def set_seeds(seed):
  '''Sets seed for reproduciability'''
  np.random.seed(seed)
  random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

set_seeds(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # set device


In [150]:
# Load data
url = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/news.csv"
df = pd.read_csv(url, header=0) # load
df = df.sample(frac=1).reset_index(drop=True) # shuffle
df.head()

Unnamed: 0,title,category
0,"BBC set for major shake-up, claims newspaper",Business
1,Marsh averts cash crunch,Business
2,"Jeter, Yankees Look to Take Control (AP)",Sports
3,Flying the Sun to Safety,Sci/Tech
4,Stocks Seen Flat as Nortel and Oil Weigh,Business


### Preprocessing

In [151]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [152]:
nltk.download("stopwords")
STOPWORDS = stopwords.words("english")
print (STOPWORDS[:5])
porter = PorterStemmer()

['i', 'me', 'my', 'myself', 'we']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [153]:
def preprocess(text, stopwords=STOPWORDS):
    """Conditional preprocessing on our text unique to our task."""
    # Lower
    text = text.lower()

    # Remove stopwords
    pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
    text = pattern.sub("", text)

    # Remove words in parenthesis
    text = re.sub(r"\([^)]*\)", "", text)

    # Spacing and filters
    text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)  # separate punctuation tied to words
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()

    return text

In [154]:
# test preprocessing
text  = 'BBC set for major shake-up, claims newspaper'
print(preprocess(text))

bbc set major shake claims newspaper


In [155]:
# Now apply preprocessing to the whole dataframe
df.title = df.title.apply(preprocess)
df.head()

Unnamed: 0,title,category
0,bbc set major shake claims newspaper,Business
1,marsh averts cash crunch,Business
2,jeter yankees look take control,Sports
3,flying sun safety,Sci/Tech
4,stocks seen flat nortel oil weigh,Business


### Spliting the data

In [156]:
from sklearn.model_selection import train_test_split

In [157]:
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15

def get_data_splits(X, y, train_size = TRAIN_SIZE):
  X_train, X_ , y_train, y_ = train_test_split(X, y, train_size = train_size, shuffle = True, stratify= y)
  X_val, X_test, y_val, y_test = train_test_split(X, y, train_size = 0.5, stratify= y)
  return X_train, X_val, X_test, y_train, y_val, y_test

In [158]:
# create datasplits
X , y= df.title.values , df.category.values
X_train, X_val, X_test, y_train, y_val, y_test = get_data_splits(X, y)
print (f'X_train: {X_train.shape}, y_train: {y_train.shape}')
print (f'X_val: {X_val.shape}, y_val: {y_val.shape}')
print (f'X_test: {X_test.shape}, y_test: {y_test.shape}')
print (f'Sample point: {X_train[0]} → {y_train[0]}')

X_train: (84000,), y_train: (84000,)
X_val: (60000,), y_val: (60000,)
X_test: (60000,), y_test: (60000,)
Sample point: ibm pay 320m partially settle pension case → Sci/Tech


### Encoding

here we would build a custom encoder class

In [159]:
import json

In [160]:
class LabelEncoder(object):
  ''' Encodes and decodes class labels as required'''

  def __init__(self, class_to_index = None):
      self.class_to_index = {} if class_to_index is None else class_to_index
      self.index_to_class = {v:k for k,v in self.class_to_index.items()}
      self.classes = list(self.class_to_index.keys())

  def fit(self, y):
      classes = np.unique(y)
      for i, value in enumerate(classes):
        self.class_to_index[value] = i
      self.index_to_class = {v:k for k,v in self.class_to_index.items()}
      self.classes = list(self.class_to_index.keys())
      return self

  def encode(self, y):
      labels = np.vectorize(self.class_to_index.get)(y)
      return np.asarray(labels)

  def decode(self, y):
    labels = np.vectorize(self.index_to_class.get)(y)
    return labels

  def __str__(self) -> str:
      return f'<LabelEncoder(), number of classes - {len(self)}'

  def __len__(self):
      return len(self.class_to_index)

  def save(self, fp):
      with open(fp, 'w') as fp:
        contents = {'class_to_index': self.class_to_index}
        json.dump(contents, fp, indent=4, sort_keys=False)
  
  @classmethod
  def load(cls, fp):
      with open(fp, 'r') as fp:
        kwargs = json.load(fp=fp)
      return cls(**kwargs)

In [161]:

# test Encoder
y = ['paris', 'paris', 'tokyo', 'amsterdam']
encoder = LabelEncoder()
encoder.fit(y)
print(encoder.classes)
print(encoder.encode(y))
print(encoder.decode(encoder.encode(y)))

['amsterdam', 'paris', 'tokyo']
[1 1 2 0]
['paris' 'paris' 'tokyo' 'amsterdam']


In [162]:
# Encode
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
NUM_CLASSES = len(label_encoder)
label_encoder.class_to_index

{'Business': 0, 'Sci/Tech': 1, 'Sports': 2, 'World': 3}

In [163]:
# Convert labels to tokens
print (f'y_train[0]: {y_train[0]}')
y_train = label_encoder.encode(y_train)
y_val = label_encoder.encode(y_val)
y_test = label_encoder.encode(y_test)
print (f'y_train[0]: {y_train[0]}')

y_train[0]: Sci/Tech
y_train[0]: 1


### Tokenizer

We would also build our custom tokenizer

In [164]:
from collections import Counter
from more_itertools import take

In [165]:
class Tokenizer(object):

  """Tokenize texts to indexes using ) as Padindex and 1 as UNK index"""

  def __init__(self, type : str = 'word', num_tokens = None, 
               pad_token='<PAD>', oov_token='<UNK>',token_to_index = None) -> None:
      self.type = type
      if num_tokens: num_tokens -= 2 # pad + unk tokens
      self.num_tokens = num_tokens
      self.pad_token = pad_token
      self.oov_token = oov_token
      if token_to_index is None:
        token_to_index = {pad_token: 0, oov_token: 1}
      self.token_to_index = token_to_index
      self.index_to_token = {v:k for k, v in self.token_to_index.items()}

  def __len__(self):
      return len(self.token_to_index)

  def __str__(self) -> str:
      return f'<Tokenizer(num_tokens={len(self)})>'

  def fit_on_texts(self, texts):
      all_texts = []
      for text in texts:
        tokens = [*text] if self.type == 'char' else text.split()
        all_texts.extend(tokens)
      counts = Counter(all_texts).most_common(self.num_tokens)
      self.min_token_freq = counts[-1][1]
      for word, _ in counts:
        self.token_to_index[word] = len(self.token_to_index)
      self.index_to_token = {v:k for k, v in self.token_to_index.items()}
      return self

  def texts_to_sequences(self, texts):
      sequences = []
      for text in texts:
        tokens = [*text] if self.type == 'char' else text.split()
        indices = []
        for token in tokens:
          indices.append(self.token_to_index.get(token, self.token_to_index[self.oov_token]))
        sequences.append(np.asarray(indices))
      return sequences

  def sequences_to_texts(self, sequence):
      texts = []
      for seq in sequence:
        tokens = []
        for i in seq:
          tokens.append(self.index_to_token.get(i, self.num_tokens))
        texts.append(tokens)
      return texts

  def save(self, fp):
      with open(fp, 'w') as fp:
        contents = {'type': self.type, 'oov_token': self.oov_token,
                    'token_to_index': self.token_to_index}
        json.dump(contents, fp, indent=4, sort_keys=False)
  
  @classmethod
  def load(cls, fp):
      with open(fp, 'r') as fp:
        kwargs = json.load(fp=fp)
      return cls(**kwargs)

In [166]:
# test tokenizer
texts = X_train[0:3]
print(texts[0])
tokenizer = Tokenizer(type = 'word',num_tokens= 5)
tokenizer.fit_on_texts(texts)
sequence = tokenizer.texts_to_sequences(texts)
print(sequence[0])
new_texts = tokenizer.sequences_to_texts(sequence)
print(new_texts[0])

ibm pay 320m partially settle pension case
[2 3 4 1 1 1 1]
['ibm', 'pay', '320m', '<UNK>', '<UNK>', '<UNK>', '<UNK>']


In [167]:
# Tokenize
tokenizer = Tokenizer(num_tokens=500)
tokenizer.fit_on_texts(texts=X_train)
VOCAB_SIZE = len(tokenizer)
print (tokenizer)

<Tokenizer(num_tokens=500)>


In [168]:
# Sample of tokens
print (take(5, tokenizer.token_to_index.items()))
print (f"least freq token's freq: {tokenizer.min_token_freq}") # use this to adjust num_tokens

[('<PAD>', 0), ('<UNK>', 1), ('39', 2), ('b', 3), ('gt', 4)]
least freq token's freq: 165


In [169]:
# Convert texts to sequences of indices
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)
preprocessed_text = tokenizer.sequences_to_texts([X_train[0]])[0]
print ('Text to indices:\n'
    f'  (preprocessed) → {preprocessed_text}\n'
    f'  (tokenized) → {X_train[0]}')


Text to indices:
  (preprocessed) → ['ibm', 'pay', '<UNK>', '<UNK>', '<UNK>', '<UNK>', 'case']
  (tokenized) → [ 30 187   1   1   1   1 103]


In [170]:
# pad sequences to ensure consistency in shape
def pad_sequences(sequences, max_seq_len=0):
  size = len(sequences)
  max_seq_len= max(max_seq_len , max(len(sequence) for sequence in sequences))
  padded_sequences = np.zeros((size, max_seq_len), dtype= int)
  for i, row in enumerate(sequences):
    padded_sequences[i,:len(row)] = row

  return padded_sequences

In [171]:
# test padding
padded_train= pad_sequences(X_train[0:3])
print(padded_train.shape)
print(f'Unpadded at index 0 -> {X_train[1]}')
print(f'padded at index 0 -> {padded_train[1]}')

(3, 7)
Unpadded at index 0 -> [  1 347   2   7   1]
padded at index 0 -> [  1 347   2   7   1   0   0]


### Build Dataset With Loader

In [172]:
class Dataset(torch.utils.data.Dataset):

  '''Build a pytorch Dataset that batchs and load data for our model'''

  def __init__(self, X, y) -> None:
      super().__init__()
      self.X = X
      self.y = y

  def __len__(self):
      return len(self.y)

  def __str__(self):
      return f'<Dataset(N={len(self)})>'

  def __getitem__(self, index):
      X = self.X[index]
      y = self.y[index]
      return [X, len(X), y]

  def collate_fn(self, batch):
      batch = np.array(batch)
      X = batch[:, 0]
      seq_lens = batch[:, 1]
      y = batch[:, 2]
      X = pad_sequences(X)

      # Cast
      X = torch.LongTensor(X.astype(np.int32))
      seq_lens = torch.LongTensor(seq_lens.astype(np.int32))
      y = torch.LongTensor(y.astype(np.int32))

      return X, seq_lens, y

  def create_dataloader(self, batch_size, shuffle=False, drop_last=False):
      return torch.utils.data.DataLoader(dataset = self, batch_size= batch_size, 
                                         collate_fn=self.collate_fn,shuffle=shuffle, drop_last=drop_last,
                                         pin_memory=True)

In [173]:
# Create datasets for embedding
train_dataset = Dataset(X=X_train, y=y_train)
val_dataset = Dataset(X=X_val, y=y_val)
test_dataset = Dataset(X=X_test, y=y_test)
print ("Datasets:\n"
    f'  Train dataset:{train_dataset.__str__()}\n'
    f'  Val dataset: {val_dataset.__str__()}\n'
    f'  Test dataset: {test_dataset.__str__()}\n'
    'Sample point:\n'
    f'  X: {test_dataset[0][0]}\n'
    f'  y: {test_dataset[0][1]}')

Datasets:
  Train dataset:<Dataset(N=84000)>
  Val dataset: <Dataset(N=60000)>
  Test dataset: <Dataset(N=60000)>
Sample point:
  X: [1 1]
  y: 2


In [174]:
# Create dataloaders
batch_size = 64
train_dataloader = train_dataset.create_dataloader(
    batch_size=batch_size)
val_dataloader = val_dataset.create_dataloader(
    batch_size=batch_size)
test_dataloader = test_dataset.create_dataloader(
    batch_size=batch_size)
batch_X, batch_seq_lens, batch_y = next(iter(train_dataloader))
print ('Sample batch:\n'
    f'  X: {list(batch_X.size())}\n'
    f'  seq_lens: {list(batch_seq_lens.size())}\n'
    f'  y: {list(batch_y.size())}\n'
    'Sample point:\n'
    f'  X: {batch_X[0]}\n'
    f' seq_len: {batch_seq_lens[0]}\n'
    f'  y: {batch_y[0]}')

Sample batch:
  X: [64, 9]
  seq_lens: [64]
  y: [64]
Sample point:
  X: tensor([ 30, 187,   1,   1,   1,   1, 103,   0,   0])
 seq_len: 7
  y: 1


In [175]:
# test embedding
embed = nn.Embedding(num_embeddings=5000, embedding_dim= 100,
                     padding_idx= 0)
batch_X,seq_len, batch_y = next(iter(test_dataloader))
embed(batch_X).shape

torch.Size([64, 13, 100])

In [176]:
def gather_last_relevant_hidden(hiddens, seq_lens):
    """Extract and collect the last relevant
    hidden state based on the sequence length."""
    seq_lens = seq_lens.long().detach().cpu().numpy() - 1
    out = []
    for batch_index, column_index in enumerate(seq_lens):
        out.append(hiddens[batch_index, column_index])
    return torch.stack(out)

In [177]:
BATCH_SIZE = 64
EMBEDDING_DIM = 100
RNN_HIDDEN_DIM = 128
DROPOUT_P = 0.1
sequence_size = 8

In [178]:
x = torch.rand((BATCH_SIZE, sequence_size, EMBEDDING_DIM))
rnn = nn.RNN(EMBEDDING_DIM, RNN_HIDDEN_DIM, batch_first=True)
seq_lens = torch.randint(high=sequence_size, size=(BATCH_SIZE, ))
out, h_n = rnn(x) # h_n is the last hidden state
print ("out: ", out.shape)
print ("h_n: ", h_n.shape)

out:  torch.Size([64, 8, 128])
h_n:  torch.Size([1, 64, 128])


In [179]:
gather_last_relevant_hidden(hiddens=out, seq_lens=seq_lens).squeeze(0).shape

torch.Size([64, 128])

In [180]:
a = torch.randn(size = (5, 7, 30))
b = torch.randint(low = 2, high = 6, size = (5,))
print(b)
gather_last_relevant_hidden(hiddens=a, seq_lens=b).squeeze(0).shape

tensor([3, 5, 5, 5, 4])


torch.Size([5, 30])

In [181]:
class RNN(nn.Module):
  '''RNN model for sequence classification'''
  def __init__(self, vocab_size, embedding_dim, hidden_size,
               num_classes, dropout_prob
               ) -> None:
      super(RNN, self).__init__()
      self.num_classes = num_classes
      self.embedding = nn.Embedding(num_embeddings= vocab_size, embedding_dim= embedding_dim,
                                    padding_idx= 0)
      self.rnn = nn.RNN(input_size = embedding_dim, hidden_size= hidden_size,
                        num_layers = 2, batch_first = True)
      self.dropout = nn.Dropout(dropout_prob)
      self.relu = nn.ReLU()
      self.fc1 = nn.Linear(in_features= hidden_size, out_features = hidden_size*2)
      self.fc2 = nn.Linear(in_features= hidden_size*2, out_features = num_classes)

  def forward(self, inputs):
      x_in, seq_lens = inputs
      x = self.embedding(x_in)
      out, h_n = self.rnn(x)
      z = gather_last_relevant_hidden(hiddens=out, seq_lens=seq_lens)
      z = self.dropout(z)
      z = self.relu(self.fc1(z))
      z = self.dropout(z)
      z = self.fc2(z)
      return z

In [182]:
EMBEDDING_DIM = 100
VOCAB_SIZE = 5000
HIDDEN_DIM = 128
NUM_CLASSES = 4
DROPOUT_P = 0.1

model = RNN(
    embedding_dim=EMBEDDING_DIM, vocab_size=VOCAB_SIZE,hidden_size=HIDDEN_DIM,
    dropout_prob=DROPOUT_P, num_classes=NUM_CLASSES)

In [183]:
from sklearn.metrics import f1_score
import torch.nn.functional as F

In [184]:
class Trainer():
  ''' Custom Trainer'''
  def __init__(self, model, device,loss_fn=None, optimizer=None, scheduler=None, compute_metrics = None, 
               train_dataloader = None,val_dataloader =None, num_epochs = None, patience = None) -> None:
      self.train_dataloader = train_dataloader
      self.val_dataloader = val_dataloader
      self.model = model.to(device)
      self.num_epochs = num_epochs
      self.loss_fn = loss_fn
      self.patience = patience
      self.patient = patience
      self.optimizer = optimizer
      self.scheduler = scheduler
      self.compute_metrics = compute_metrics
      self.device = device

  def train_step(self):
      '''Perform single training step accross all batches'''
      self.model.train()
      running_loss = []
      y_probs = []
      y_trues = []
      for i, batch in enumerate(self.train_dataloader):
        batch = [item.to(self.device) for item in batch]
        inputs, labels = batch[:-1], batch[-1]
        self.optimizer.zero_grad()
        outputs = self.model(inputs)
        loss = self.loss_fn(outputs,labels)
        loss.backward()
        self.optimizer.step()
        running_loss.append(loss.item())
        y_prob = F.softmax(outputs.detach(), dim = -1).cpu().numpy()
        y_probs.extend(y_prob)
        y_trues.extend(labels.cpu().numpy())

      loss = np.asarray(running_loss).mean()
      return loss, np.vstack(y_trues), np.vstack(y_probs)

  def eval_step(self, dataloader = None):
      '''Perform single evaluation step accross all batches'''
      dataloader = self.val_dataloader if dataloader is None else dataloader 
      self.model.eval()
      running_loss = []
      y_probs = []
      y_trues = []
      with torch.inference_mode():
        for i, batch in enumerate(self.val_dataloader):
          batch = [item.to(self.device) for item in batch]
          inputs, labels = batch[:-1], batch[-1]
          outputs = self.model(inputs)
          loss = self.loss_fn(outputs,labels)
          running_loss.append(loss.item())
          y_prob = F.softmax(outputs.detach(), dim = -1).cpu().numpy()
          y_probs.extend(y_prob)
          y_trues.extend(labels.cpu().numpy())

      loss = np.asarray(running_loss).mean()
      return loss, np.vstack(y_trues), np.vstack(y_probs)


  def predict(self, dataloader):
      self.model.eval()
      y_probs = []
      model = self.model.to('cpu')
      with torch.inference_mode():
        for i, batch in enumerate(dataloader):
          inputs, labels = batch[:-1], batch[-1]
          outputs = model(inputs)
          probs = F.softmax(outputs, dim = -1)
          y_probs.extend(probs)
      return np.vstack(y_probs)

  def train(self):
      best_score = np.inf
      best_model = self.model
      for epoch in range(self.num_epochs):
        if self.patience == 0 :
           print('Early Stopping Reached')
           break
        train_loss, train_y_trues, train_y_probs = self.train_step()
        train_score = self.compute_metrics(train_y_trues, train_y_probs)
        val_loss, val_y_trues, val_y_probs = self.eval_step()
        val_score = self.compute_metrics(train_y_trues, train_y_probs)
        self.scheduler.step(val_loss)
        if val_loss < best_score:
          best_score = val_loss
          best_model = self.model
          self.patience = self.patient
        else:
          self.patience -= 1

        print(
                f'Epoch: {epoch+1} | '
                f'train_loss: {train_loss:.5f}, '
                f'val_loss: {val_loss:.5f}, '
                f'train_{list(train_score.keys())[0]}: {list(train_score.values())[0]:.5f}, '
                f'val_{list(val_score.keys())[0]}: {list(val_score.values())[0]:.5f}, '
            )

      return best_model 

In [185]:
def compute_metrics(y_trues, y_probs):
  y_preds = np.argmax(y_probs, axis = -1)
  score = f1_score(y_trues,y_preds, average = 'macro')
  return {'F1_score': score}

In [186]:
LEARNING_RATE = 1e-4
PATIENCE = 10
NUM_EPOCHS = 50

optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=3)
citerion = nn.CrossEntropyLoss(reduction='mean')

In [187]:
# initialize trainer with arguments
trainer = Trainer(train_dataloader= train_dataloader, model = model,
                optimizer= optimizer,scheduler = scheduler, compute_metrics = compute_metrics,device = device,
                val_dataloader = val_dataloader, loss_fn = citerion,num_epochs = NUM_EPOCHS, patience = PATIENCE)

In [188]:
# train and return best model
best_model = trainer.train()

Epoch: 1 | train_loss: 1.19800, val_loss: 1.02977, train_F1_score: 0.46144, val_F1_score: 0.46144, 
Epoch: 2 | train_loss: 0.98806, val_loss: 0.92975, train_F1_score: 0.59402, val_F1_score: 0.59402, 
Epoch: 3 | train_loss: 0.91517, val_loss: 0.87903, train_F1_score: 0.62883, val_F1_score: 0.62883, 
Epoch: 4 | train_loss: 0.87289, val_loss: 0.84346, train_F1_score: 0.64960, val_F1_score: 0.64960, 
Epoch: 5 | train_loss: 0.84457, val_loss: 0.82232, train_F1_score: 0.66188, val_F1_score: 0.66188, 
Epoch: 6 | train_loss: 0.82388, val_loss: 0.80434, train_F1_score: 0.67057, val_F1_score: 0.67057, 
Epoch: 7 | train_loss: 0.81048, val_loss: 0.79312, train_F1_score: 0.67669, val_F1_score: 0.67669, 
Epoch: 8 | train_loss: 0.79976, val_loss: 0.78435, train_F1_score: 0.68031, val_F1_score: 0.68031, 
Epoch: 9 | train_loss: 0.79009, val_loss: 0.77773, train_F1_score: 0.68413, val_F1_score: 0.68413, 
Epoch: 10 | train_loss: 0.78210, val_loss: 0.77181, train_F1_score: 0.68774, val_F1_score: 0.68774, 

In [189]:
best_model = trainer.train()

Epoch: 1 | train_loss: 0.64819, val_loss: 0.69017, train_F1_score: 0.73837, val_F1_score: 0.73837, 
Epoch: 2 | train_loss: 0.64512, val_loss: 0.68256, train_F1_score: 0.73941, val_F1_score: 0.73941, 
Epoch: 3 | train_loss: 0.64147, val_loss: 0.68510, train_F1_score: 0.74057, val_F1_score: 0.74057, 
Epoch: 4 | train_loss: 0.63679, val_loss: 0.68137, train_F1_score: 0.74348, val_F1_score: 0.74348, 
Epoch: 5 | train_loss: 0.63456, val_loss: 0.67768, train_F1_score: 0.74371, val_F1_score: 0.74371, 
Epoch: 6 | train_loss: 0.63027, val_loss: 0.68268, train_F1_score: 0.74558, val_F1_score: 0.74558, 
Epoch: 7 | train_loss: 0.62733, val_loss: 0.68096, train_F1_score: 0.74570, val_F1_score: 0.74570, 
Epoch: 8 | train_loss: 0.62326, val_loss: 0.67187, train_F1_score: 0.74765, val_F1_score: 0.74765, 
Epoch: 9 | train_loss: 0.62054, val_loss: 0.67408, train_F1_score: 0.74869, val_F1_score: 0.74869, 
Epoch: 10 | train_loss: 0.61740, val_loss: 0.66718, train_F1_score: 0.74976, val_F1_score: 0.74976, 

In [190]:
best_model = trainer.train()

Epoch: 1 | train_loss: 0.56557, val_loss: 0.63634, train_F1_score: 0.77070, val_F1_score: 0.77070, 
Epoch: 2 | train_loss: 0.56553, val_loss: 0.63630, train_F1_score: 0.77042, val_F1_score: 0.77042, 
Epoch: 3 | train_loss: 0.56457, val_loss: 0.63628, train_F1_score: 0.77032, val_F1_score: 0.77032, 
Epoch: 4 | train_loss: 0.56495, val_loss: 0.63628, train_F1_score: 0.77088, val_F1_score: 0.77088, 
Epoch: 5 | train_loss: 0.56546, val_loss: 0.63626, train_F1_score: 0.77054, val_F1_score: 0.77054, 
Epoch: 6 | train_loss: 0.56452, val_loss: 0.63627, train_F1_score: 0.77037, val_F1_score: 0.77037, 
Epoch: 7 | train_loss: 0.56398, val_loss: 0.63626, train_F1_score: 0.77150, val_F1_score: 0.77150, 
Epoch: 8 | train_loss: 0.56570, val_loss: 0.63625, train_F1_score: 0.77118, val_F1_score: 0.77118, 
Epoch: 9 | train_loss: 0.56528, val_loss: 0.63624, train_F1_score: 0.76986, val_F1_score: 0.76986, 
Epoch: 10 | train_loss: 0.56459, val_loss: 0.63624, train_F1_score: 0.77050, val_F1_score: 0.77050, 

## Evaluation

In [191]:
import json
from sklearn.metrics import precision_recall_fscore_support

In [192]:
def get_metrics(y_true, y_pred, classes):
    """Per-class performance metrics."""
    # Performance
    performance = {"overall": {}, "class": {}}

    # Overall performance
    metrics = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    performance["overall"]["precision"] = metrics[0]
    performance["overall"]["recall"] = metrics[1]
    performance["overall"]["f1"] = metrics[2]
    performance["overall"]["num_samples"] = np.float64(len(y_true))

    # Per-class performance
    metrics = precision_recall_fscore_support(y_true, y_pred, average=None)
    for i in range(len(classes)):
        performance["class"][classes[i]] = {
            "precision": metrics[0][i],
            "recall": metrics[1][i],
            "f1": metrics[2][i],
            "num_samples": np.float64(metrics[3][i]),
        }

    return performance

In [193]:
# Get predictions
test_loss, y_true, y_prob = trainer.eval_step(dataloader=test_dataloader)
y_pred = np.argmax(y_prob, axis=1)

In [194]:
# Determine performance
performance = get_metrics(
    y_true=y_true, y_pred=y_pred, classes=label_encoder.classes)
print (json.dumps(performance["overall"], indent=2))

{
  "precision": 0.7728983536971095,
  "recall": 0.75235,
  "f1": 0.7534082475175906,
  "num_samples": 60000.0
}


In [195]:
from pathlib import Path

In [196]:
# Save artifacts
dir = Path('artifacts')
dir.mkdir(parents=True, exist_ok=True)
label_encoder.save(fp=Path(dir, 'label_encoder.json'))
tokenizer.save(fp=Path(dir, 'tokenizer.json'))
torch.save(best_model.state_dict(), Path(dir, 'model.pt'))
with open(Path(dir, 'performance.json'), 'w') as fp:
    json.dump(performance, indent=2, sort_keys=False, fp=fp)


## Inference

In [197]:
def get_probability_distribution(y_prob, classes):
    """Create a dict of class probabilities from an array."""
    results = {}
    for i, class_ in enumerate(classes):
        results[class_] = np.float64(y_prob[i])
    sorted_results = {k: v for k, v in sorted(
        results.items(), key=lambda item: item[1], reverse=True)}
    return sorted_results


In [198]:
# Load artifacts
device = torch.device('cpu')
label_encoder = LabelEncoder.load(fp=Path(dir, 'label_encoder.json'))
tokenizer = Tokenizer.load(fp=Path(dir, 'tokenizer.json'))
model = RNN(
    embedding_dim=EMBEDDING_DIM, vocab_size=VOCAB_SIZE,
    hidden_size=HIDDEN_DIM,
    dropout_prob=DROPOUT_P, num_classes=NUM_CLASSES)
model.load_state_dict(torch.load(Path(dir, 'model.pt'), map_location=device))
model.to(device)

RNN(
  (embedding): Embedding(5000, 100, padding_idx=0)
  (rnn): RNN(100, 128, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (relu): ReLU()
  (fc1): Linear(in_features=128, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=4, bias=True)
)

In [199]:
# Initialize trainer
trainer = Trainer(model=model, device=device)

In [200]:
# Dataloader
text = 'The final tennis tournament starts next week.'
X = tokenizer.texts_to_sequences([preprocess(text)])
print (tokenizer.sequences_to_texts(X))
y_filler = label_encoder.encode([label_encoder.classes[0]]*len(X))
dataset = Dataset(X=X, y=y_filler)
dataloader = dataset.create_dataloader(batch_size=batch_size)


[['final', '<UNK>', '<UNK>', '<UNK>', 'next', 'week']]


In [201]:
# Inference
y_prob = trainer.predict(dataloader)
y_pred = np.argmax(y_prob,axis = 1)
print(y_pred)
label_encoder.decode(y_pred)


[2]


array(['Sports'], dtype='<U6')

In [202]:
# Class distributions
prob_dist = get_probability_distribution(y_prob=y_prob[0], classes=label_encoder.classes)
print (json.dumps(prob_dist, indent=2))


{
  "Sports": 0.45584022998809814,
  "World": 0.3801567852497101,
  "Sci/Tech": 0.14259421825408936,
  "Business": 0.021408775821328163
}
