# Homework 4 - Chris Winsor

In [None]:
! pip install datasets
! pip install wandb
! pip install sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Using cached sklearn-0.0.post1.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0.post1-py3-none-any.whl size=2344 sha256=ccce605fcfce776ff763a671e785a7adf4459bf9855cad8dca65ed89e7edf9a1
  Stored in directory: /root/.cache/pip/wheels/14/25/f7/1cc0956978ae479e75140219088deb7a36f60459df242b1a72
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post1


In [None]:
import random
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F

import datasets

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import wandb

In [None]:
# Fail fast when no GPU is visible: the model and every batch are moved to
# `device` in the cells below, and there is no CPU fallback.
assert torch.cuda.is_available(), "the code requires CUDA"
device = torch.device("cuda")

In [None]:
# Log in to Weights & Biases up front; the training cell calls wandb.init()/wandb.log().
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcwinsor[0m ([33mmetrowest[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Load the IMDB dataset through the `datasets` library.
# The "train" and "test" splits are pulled out of the returned object in the next cell.
imdb = datasets.load_dataset("imdb")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Keep handles to the two splits used by the rest of the notebook.
imdb_train, imdb_test = imdb["train"], imdb["test"]

In [None]:
# Split the training data into train and validation sets, allocating 10% to
# validation, with shuffling.
# A fixed random_state keeps the split identical across kernel restarts, so the
# TF-IDF vocabulary fitted on _x_train and the reported accuracies are reproducible.
_x_train, _x_val, _y_train, _y_val = train_test_split(
    imdb_train['text'],
    imdb_train['label'],
    test_size=0.1,
    shuffle=True,
    random_state=42)
_x_test = imdb_test['text']
_y_test = imdb_test['label']

In [None]:
# Pre-process the data using TfidfVectorizer.
# The vectorizer is fitted on the training split only and then applied unchanged
# to the validation and test splits.
vectorizer = TfidfVectorizer(
    #  stop_words='english',
    max_features=2500, # 30000
    ngram_range=(1, 1))

vectorizer = vectorizer.fit(_x_train)

# Convert each sparse result to a dense array with .toarray(), which yields a
# plain np.ndarray. (.todense() would instead return the legacy np.matrix type.)
x_train = vectorizer.transform(_x_train).toarray()
y_train = np.array(_y_train)

x_val = vectorizer.transform(_x_val).toarray()
y_val = np.array(_y_val)

x_test = vectorizer.transform(_x_test).toarray()
y_test = np.array(_y_test)

In [None]:
# common routines
def logit_to_predictions(x, threshold):
    """Turn raw logits into hard 0/1 predictions.

    Args:
        x: torch.Tensor of raw (pre-sigmoid) logits, any shape.
        threshold: cut-off compared against sigmoid(x), i.e. it is a
            probability threshold, not a threshold on the logit itself.

    Returns:
        torch.LongTensor with the same shape as x, holding 1 where
        sigmoid(x) > threshold and 0 elsewhere.
    """
    # Use the argument, not a same-named global, so the result depends only on the inputs.
    return (torch.sigmoid(x) > threshold).long()

def mean_accuracy(predictions, y):
  """Fraction of positions at which `predictions` equals `y`.

  Args:
      predictions: torch.Tensor or np.ndarray of predicted labels.
      y: labels of the same (or broadcastable) shape and container type.

  Returns:
      A 0-dim torch.Tensor for tensor inputs, or a NumPy float for array inputs.
  """
  matches = predictions == y
  if isinstance(matches, torch.Tensor):
    # Tensor.mean() is not defined for bool tensors, so cast to float first.
    return matches.float().mean()
  return matches.mean()

In [None]:
# Make a text classifier using torch.nn
# Subclass from nn.Module
# Use nn.Linear, nn.ReLU, nn.Dropout, nn.BatchNorm1d, etc to define layers
# Add a parameter use_batch_norm so nn.BatchNorm1d is applied
# in hidden layers prior to linear

class TextClassifier(nn.Module):
    """Feed-forward classifier over fixed-size feature vectors.

    Layout: input Linear -> n_hidden_layers x ([BatchNorm1d] -> Linear -> ReLU
    -> Dropout) -> output Linear. The output is raw logits (no sigmoid/softmax).

    Args:
        input_size: number of input features.
        hidden_size: width of every hidden layer.
        output_size: number of output logits per example.
        n_hidden_layers: number of hidden blocks between input and output layers.
        use_batch_norm: if True, apply nn.BatchNorm1d before each hidden Linear.
    """

    def __init__(self, input_size, hidden_size, output_size,
                 n_hidden_layers, use_batch_norm):
        super().__init__()

        # Not used in forward(); retained so the parameter list / state_dict keys
        # of the module stay the same as before.
        self.my_parameter = torch.nn.Parameter(torch.randn(3))
        self.use_batch_norm = use_batch_norm
        self.num_hidden_layers = n_hidden_layers

        # input layer
        self.input_layer = nn.Linear(input_size, hidden_size)
        torch.nn.init.xavier_uniform_(self.input_layer.weight)

        # hidden blocks; hidden_batch_norms stays empty when use_batch_norm is False
        self.hidden_batch_norms = nn.ModuleList()
        self.hidden_linears = nn.ModuleList()
        for n in range(n_hidden_layers):

            if use_batch_norm:
                batch_norm = nn.BatchNorm1d(
                    num_features=hidden_size,
                    eps=1e-05, momentum=0.1)
                self.hidden_batch_norms.append(batch_norm)

            linear = nn.Linear(hidden_size, hidden_size)
            torch.nn.init.xavier_uniform_(linear.weight)
            self.hidden_linears.append(linear)

        # output layer
        self.output_layer = nn.Linear(hidden_size, output_size)
        torch.nn.init.xavier_uniform_(self.output_layer.weight)

    def forward(self, x):
        """
        Args:
        x: torch.FloatTensor[batch_size, input_features]

        Returns:
        torch.FloatTensor[batch_size, output_size]
        """
        h = self.input_layer(x)

        for n in range(self.num_hidden_layers):
            if self.use_batch_norm:
                h = self.hidden_batch_norms[n](h)
            h = self.hidden_linears[n](h)
            h = F.relu(h)
            # F.dropout defaults to training=True, so the module's mode must be
            # passed explicitly; otherwise dropout stays on after model.eval().
            h = F.dropout(h, training=self.training)

        h = self.output_layer(h)
        return h

In [None]:
# Training setup: hyperparameters, W&B run, model and optimizer
config = dict(
    run_name=datetime.now().strftime("train_%m%d_%H_%M_%S"),

    # network shape
    input_size=x_train.shape[1],
    num_hidden_layers=3,
    hidden_size=30,
    use_batch_norm=True,
    output_size=1,

    # optimizer settings ("momentum" is only read by the commented-out SGD variant below)
    learning_rate=1e-5,
    momentum=0.2,

    # schedule
    batch_size=64,
    num_epochs=60,
    eval_every=2000,

    # cut-off passed to logit_to_predictions
    logit_threshold=0.5,
)

# Number of rows in each split.
train_size, val_size, test_size = (
    features.shape[0] for features in (x_train, x_val, x_test))

batch_size = config["batch_size"]
# Integer division: only whole batches are counted.
num_batches = train_size // batch_size

# Start the W&B run and declare which summary statistic to keep per metric.
wandb.init(project="hw4_nn_text_classifier", config=config)
wandb.define_metric("batch_loss", summary="min")
wandb.define_metric("batch_accuracy", summary="max")
wandb.define_metric("val_accuracy", summary="max")

# Build the network from the config and place it on the target device.
model = TextClassifier(
    n_hidden_layers=config["num_hidden_layers"],
    use_batch_norm=config["use_batch_norm"],
    input_size=config["input_size"],
    hidden_size=config["hidden_size"],
    output_size=config["output_size"],
).to(device)
# wandb.watch(model, log='all')
wandb.watch(model)

optimizer = torch.optim.Adam(params=model.parameters(),
                             lr=config["learning_rate"])
# optimizer = torch.optim.SGD(model.parameters(),
#                             lr=config["learning_rate"],
#                             momentum=config["momentum"])

# Mini-batch training with periodic validation.
# global_step counts optimizer updates across all epochs; validation runs every
# config["eval_every"] steps and the best validation accuracy seen is tracked.
global_step = 0
max_val_accuracy = 0.
model.train()
for epoch in range(config["num_epochs"]):

    # Only num_batches full batches are visited; a trailing partial batch is skipped.
    for batch in range(num_batches):
        global_step += 1

        from_x = batch * batch_size
        to_x = from_x + batch_size
        x = torch.tensor(x_train[from_x: to_x], dtype=torch.float32).to(device)
        y = torch.tensor(y_train[from_x: to_x], dtype=torch.float32).to(device)

        # model output is [batch, 1]; flatten to [batch] to match y
        logits = model(x)
        logits = logits.reshape(-1)

        loss = F.binary_cross_entropy_with_logits(logits, y)

        predictions = logit_to_predictions(logits, config["logit_threshold"])
        num_correct = torch.sum(predictions == y)
        batch_accuracy = num_correct / batch_size

        # Log plain Python numbers rather than tensors.
        log_dict = {
            "batch_loss": loss.item(),
            "batch_accuracy": batch_accuracy.item()
        }
        wandb.log(step=global_step, data=log_dict)

        # update the model
        loss.backward()
        optimizer.step()
        # Must be *called*: a bare `optimizer.zero_grad` is a no-op and lets
        # gradients from every previous step accumulate.
        optimizer.zero_grad()

        if global_step % config["eval_every"] == 0:
            model.eval()
            with torch.no_grad():
                val_correct = 0
                # Step through the whole validation set, including the final
                # partial batch, so the count matches the val_size denominator.
                for from_xval in range(0, val_size, batch_size):
                    to_xval = from_xval + batch_size
                    x = torch.tensor(x_val[from_xval: to_xval], dtype=torch.float32).to(device)
                    y = torch.tensor(y_val[from_xval: to_xval], dtype=torch.float32).to(device)

                    logits = model(x)
                    logits = logits.reshape(-1)

                    predictions = logit_to_predictions(logits, config["logit_threshold"])
                    val_correct += torch.sum(predictions == y).item()
            # Switch back to training mode; otherwise BatchNorm would keep using
            # (and stop updating) its running statistics for the rest of training.
            model.train()

            val_accuracy = val_correct / val_size
            if val_accuracy > max_val_accuracy:
                max_val_accuracy = val_accuracy
            print("global_step: ", global_step, " val:accuracy: ", val_accuracy, " max_val_accuracy: ", max_val_accuracy)
            wandb.log(step=global_step,
                      data={"val_accuracy": val_accuracy})

            # ZONA - note torch.save does not like wandb.watch(model, log='all')
            # checkpoint_path = "{}_{}".format(config["run_name"], str(global_step))
            # torch.save(model, checkpoint_path)
print("final max_val_accuracy: ", max_val_accuracy)

global_step:  2000  val:accuracy:  0.7224  max_val_accuracy:  0.7224
global_step:  4000  val:accuracy:  0.82  max_val_accuracy:  0.82
global_step:  6000  val:accuracy:  0.8388  max_val_accuracy:  0.8388
global_step:  8000  val:accuracy:  0.8336  max_val_accuracy:  0.8388
global_step:  10000  val:accuracy:  0.8376  max_val_accuracy:  0.8388
global_step:  12000  val:accuracy:  0.8204  max_val_accuracy:  0.8388
global_step:  14000  val:accuracy:  0.8268  max_val_accuracy:  0.8388
global_step:  16000  val:accuracy:  0.8204  max_val_accuracy:  0.8388
global_step:  18000  val:accuracy:  0.8304  max_val_accuracy:  0.8388
global_step:  20000  val:accuracy:  0.8312  max_val_accuracy:  0.8388
final max_val_accuracy:  0.8388


In [None]:
# Test: accuracy of the trained model on the held-out test split.
model.eval()

with torch.no_grad():
    test_correct = 0
    # Walk the whole test set in steps of batch_size. The last slice may be
    # shorter than batch_size, so every example counted in the test_size
    # denominator is actually scored.
    for from_test in range(0, test_size, batch_size):
        to_test = from_test + batch_size
        x = torch.tensor(x_test[from_test: to_test], dtype=torch.float32).to(device)
        y = torch.tensor(y_test[from_test: to_test], dtype=torch.float32).to(device)

        # model output is [batch, 1]; flatten to [batch] to match y
        logits = model(x)
        logits = logits.reshape(-1)

        predictions = logit_to_predictions(logits, config["logit_threshold"])
        test_correct += torch.sum(predictions == y).item()

    test_accuracy = test_correct / test_size
    print("test_accuracy: ", test_accuracy)

test_accuracy:  0.83248
