In [None]:
import numpy as np
import pickle
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
from matplotlib import pyplot as plt
from tqdm.auto import tqdm, trange
from typing import Callable, Type
from matplotlib import pyplot as plt

## Revision

**Goal**: We want to classify the sentiment of a document.

**Problem**: How do we convert a document into a set of numbers?

- tokenize the text
- build a vocabulary
- convert each word to its ID
- use the IDs to build a bag-of-words representation of each document

In [None]:
from torch.utils.data import DataLoader, TensorDataset

def acc(y_pred, y_true):
    """Return the fraction of predictions that match the labels.

    A prediction counts as class 1 when its score is strictly greater
    than 0.5, otherwise as class 0. The result is a 0-d tensor.
    """
    predicted_labels = (y_pred > 0.5).int()
    is_correct = predicted_labels == y_true
    return is_correct.float().mean()

def _tail_mean(values):
    """Return the mean of the last tenth of ``values`` (rounded up, so at least one entry).

    An empty history yields NaN instead of a numpy empty-slice warning.
    """
    if not values:
        return float("nan")
    tail_len = -(-len(values) // 10)  # ceil(len / 10)
    return float(np.mean(values[-tail_len:]))


def _plot_history(losses, training_accs, validation_accs):
    """Plot the loss curve and the training/validation accuracy curves side by side."""
    plt.figure(figsize=(15, 5), dpi=200)

    # Left panel: training loss per epoch
    plt.subplot(1, 2, 1)
    plt.plot(losses, label='Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title(f'Training Loss (last_avg: {_tail_mean(losses):.6f})')
    plt.legend()

    # Right panel: training vs. validation accuracy per epoch
    plt.subplot(1, 2, 2)
    plt.plot(training_accs, label='Training Accuracy', color='orange')
    plt.plot(validation_accs, label='Validation Accuracy', color='green')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title(f'Accuracy (Training last_avg: {_tail_mean(training_accs):.6f}, Validation last_avg: {_tail_mean(validation_accs):.6f})')
    plt.legend()

    plt.tight_layout()
    plt.show()


def train(
    model: torch.nn.Module,
    X: torch.Tensor,
    Y: torch.Tensor,
    X_valid: torch.Tensor,
    Y_valid: torch.Tensor,
    loss_function: Callable = torch.nn.MSELoss(),
    optimizer: "torch.optim.Optimizer | Type[torch.optim.Optimizer]" = torch.optim.SGD,
    epochs: int = 200,
    batch_size: int = 32,
    lr: float = 0.01,
) -> tuple[torch.nn.Module, list[float]]:
    """Train ``model`` with mini-batches, track metrics per epoch, and plot them.

    Args:
        model: Module to optimise; it is updated in place.
        X, Y: Training inputs and targets, batched together via ``TensorDataset``.
        X_valid, Y_valid: Validation inputs and targets, evaluated once per epoch
            in a single forward pass.
        loss_function: Callable taking ``(predictions, targets)`` and returning a
            scalar loss tensor.
        optimizer: Either an already constructed optimizer instance, or an
            optimizer class. A class is instantiated as
            ``optimizer(model.parameters(), lr=lr)``.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size for the training ``DataLoader``.
        lr: Learning rate, used only when ``optimizer`` is given as a class.

    Returns:
        ``(model, losses)`` where ``losses`` holds the mean training loss of each epoch.

    Raises:
        ValueError: If the training data produces no batches.
    """
    # The default (and the `Type` hint) is an optimizer *class*; the loop below
    # needs an instance bound to this model's parameters.
    if isinstance(optimizer, type):
        optimizer = optimizer(model.parameters(), lr=lr)

    dataloader = DataLoader(TensorDataset(X, Y), batch_size=batch_size, shuffle=True)
    n_batches = len(dataloader)
    if n_batches == 0:
        raise ValueError("Training data is empty: the DataLoader produced no batches.")

    losses: list[float] = []
    training_accs: list[float] = []
    validation_accs: list[float] = []

    # 1-based so exactly `epochs` passes are made and the bar ends at "epochs/epochs".
    for epoch in (pbar := tqdm(range(1, epochs + 1))):
        epoch_loss = 0.0
        epoch_training_acc = []
        model.train()
        for X_batch, Y_batch in dataloader:
            # One optimisation step
            optimizer.zero_grad()
            Y_pred = model(X_batch)
            loss = loss_function(Y_pred, Y_batch)
            loss.backward()
            optimizer.step()

            # Accumulate per-batch metrics
            epoch_loss += loss.item()
            epoch_training_acc.append(acc(Y_pred, Y_batch).item())

        # Validation metrics: eval mode, no gradient tracking
        model.eval()
        with torch.no_grad():
            # .item() keeps the history as plain floats so numpy can average it
            # regardless of the device the tensors live on.
            avg_vlacc = acc(model(X_valid), Y_valid).item()

        # Log results
        avg_loss = epoch_loss / n_batches
        avg_tracc = sum(epoch_training_acc) / len(epoch_training_acc)
        losses.append(avg_loss)
        training_accs.append(avg_tracc)
        validation_accs.append(avg_vlacc)

        pbar.set_description(f"Epoch {epoch}/{epochs} - Loss: {avg_loss:.6f} - Tracc: {avg_tracc:.3f} - Vlacc: {avg_vlacc:.3f}")

    # Visualize loss and accuracy
    _plot_history(losses, training_accs, validation_accs)

    return model, losses

In [None]:
## lets load up word ids and vocab

data_dir = Path('..') / 'resources' / 'datasets' / 'imdb' / 'proc'
assert data_dir.exists(), f"Processed dataset directory not found: {data_dir}"


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so this must only
    be pointed at files that this project produced itself.
    """
    with path.open('rb') as f:
        return pickle.load(f)


# Word-ID documents and their labels for the train and test splits
train_docs = _load_pickle(data_dir / 'wordids_train.pkl')
train_labels = _load_pickle(data_dir / 'train_labels.pkl')
test_docs = _load_pickle(data_dir / 'wordids_test.pkl')
test_labels = _load_pickle(data_dir / 'test_labels.pkl')

# JSON is UTF-8 by specification; do not rely on the platform's locale encoding.
with (data_dir / 'vocab.json').open('r', encoding='utf-8') as f:
    vocab = json.load(f)

len(train_docs), len(test_docs), len(vocab)

In [None]:
# TODO(exercise): both values are Ellipsis placeholders to be filled in.
# Presumably the number of documents and the vocabulary size — confirm.
n_docs = ...
n_words = ...

In [None]:
# Let's build the bag-of-words representation really quickly.
# TODO(exercise): X and Y are Ellipsis placeholders, and the bare `...` below
# marks where the construction code goes. The last line reads `.shape`, so both
# must be replaced with array-like objects before this cell can run.
X = ...
Y = ...

...

X.shape, Y.shape

In [None]:
# Shuffle the dataset (using np random permutation)
# `p` is a random ordering of the indices 0 .. len(X) - 1.
# NOTE(review): no seed is set in this cell, so the ordering differs between runs.
p = np.random.permutation(len(X))
# TODO(exercise): Ellipsis placeholders — presumably X and Y re-indexed with `p`; confirm.
X = ...
Y = ...

In [None]:
# Split the dataset
# TODO(exercise): each right-hand side is an Ellipsis placeholder and must become
# a pair (train part, validation part); unpacking `...` as written raises TypeError.
x_train, x_valid = ...
y_train, y_valid = ...

# Displays the four shapes as the cell output.
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

In [None]:
# Brr, let's never do it ourselves: use scikit-learn's splitter instead.
from sklearn.model_selection import train_test_split

# TODO(exercise): the bare `...` marks where the train_test_split call goes;
# it is expected to define x_train, x_valid, y_train and y_valid read below.
...
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

In [None]:
class NonLin(torch.nn.Module):
    """Exercise skeleton for a non-linear classifier.

    No layers are defined yet, so ``forward`` currently returns its input
    unchanged. The inline comments describe the intended architecture.
    """

    def __init__(self, n_words):
        """Set up the module.

        Args:
            n_words: Not used yet; presumably the input feature size
                (vocabulary size) for the first layer — confirm when filling in.
        """
        # nn.Module keeps its parameter/submodule registries on the instance;
        # without this call the module cannot be called, printed, or given layers.
        super().__init__()
        # Linear -> BatchNorm -> Dropout
        ...

    def forward(self, x):
        """Return ``x`` unchanged until the layers are implemented."""
        # End with Sigmoid
        return x


In [None]:
# TODO(exercise): fill in the model, loss function and optimizer.
# All three are Ellipsis placeholders so the cell stays valid Python.
m = ...
lfn = ...  # BCE
opt = ...  # Adam with 0.005


print(m)

In [None]:
# TODO(exercise): Ellipsis placeholder — set the number of training epochs.
epochs = ...

In [None]:
# TODO(exercise): replace `...` with the real arguments to `train`.
# `train` returns a (model, losses) pair, which is unpacked here.
model, losses = train(...)