In [1]:
%pwd

'/home/ec2-user/SageMaker/code/dlsyscourse-homework/final/notebooks'

In [22]:
# !pip install --quiet -r ../requirements.txt

1426.71s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [3]:
import os
import sys
sys.path.append(os.path.abspath(".."))
sys.path.append(os.path.abspath("../python"))

In [4]:
import dotenv
_ = dotenv.load_dotenv(dotenv_path="../conf/dev.env")

In [5]:
import tqdm
import numpy as np

In [6]:
import needle as ndl
import needle.nn as nn

import needle.nn as nn
import needle.ops as ops

from needle.autograd import Tensor
from needle import backend_ndarray as nd

In [7]:
import apps.etl
import apps.data
import apps.utils.aws
import apps.utils.common
import apps.models

## Hot code reloading, useful during dev:
%load_ext autoreload
%autoreload 1
%aimport apps.etl
%aimport apps.data
%aimport apps.models
%aimport apps.utils.common
%aimport apps.utils.aws.s3
%aimport apps.utils.aws.athena

In [8]:
import logging
logger = logging.getLogger("notebooks.debug")
apps.utils.common.setup_logging(config_file="../conf/logging.yml")

In [9]:
# apps.etl.init_raw_athena_table()

In [10]:
raw_days = sorted([p[0] for p in apps.utils.aws.athena.get_partitions("criteo_raw").values()])
print(f"{raw_days=}")

raw_days=['00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23']


In [11]:
# apps.utils.aws.athena.run_query("select * from criteo_raw where day = '00' limit 30")

In [12]:
dataset_iter = apps.data.read_s3_dataset(
    s3_prefix="anatoly/datasets/criteo-terabyte-click-log", 
    s3_path="preprocessed/joined",
    day_from = 0,
    day_to = 0,
    batch_size = 1024,
    limit_batches = 100,
    as_numpy=True
)

In [13]:
# pbar = tqdm.tqdm(desc="reading data", total=len(dataset_iter))
# for batch_id, (X_dense, X_sparse, Y) in enumerate(dataset_iter, start=1):
#     pbar.update(n=X_dense.shape[0])

In [14]:
device = ndl.cpu()
dense_layer_sizes = [512, 256, 64]
# interaction_layer_sizes = [512,512,256,1]
lr = 0.1 ## also try 4.0, 15.0
weight_decay = 0.0
model = apps.models.DLRM(
    dense_in_features = len(apps.etl.DENSE_COLUMNS),
    dense_layer_sizes=[512,256,64],
    device=device
)
model.train()

In [15]:
## Numerically unstable version:
class BinaryCrossEntropyLoss(nn.Module):
    def forward(self, input: Tensor, target: Tensor):
        x = input
        z = target
        size = nd.prod(x.shape)
        res = nn.ops.summation(x - x * z + ops.log(1 + nn.ops.exp(-x)))
        return nn.ops.divide_scalar(res, size)


In [12]:
import numpy as np
np.triu_indices(14, k=1)

(91,)

In [8]:
import torch
torch.triu_indices(14, 14, offset=1)

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
          4,  5,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  7,  7,
          7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 11, 11,
         12],
        [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  2,  3,  4,  5,  6,
          7,  8,  9, 10, 11, 12, 13,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
          4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  5,  6,  7,  8,  9, 10, 11, 12,
         13,  6,  7,  8,  9, 10, 11, 12, 13,  7,  8,  9, 10, 11, 12, 13,  8,  9,
         10, 11, 12, 13,  9, 10, 11, 12, 13, 10, 11, 12, 13, 11, 12, 13, 12, 13,
         13]])

In [16]:
# import torch
# target = torch.ones([10, 64], dtype=torch.float32)  # 64 classes, batch size = 10
# output = torch.full([10, 64], 1.5)  # A prediction (logit)
# loss_fn = torch.nn.BCEWithLogitsLoss()
# loss_fn(output, target)  # -log(sigmoid(1.5))

In [17]:
# output = Tensor(output.numpy())
# target = Tensor(target.numpy())
# loss_fn = BinaryCrossEntropyLoss()
# loss_fn(output, target)

In [18]:
# loss_fn = nn.SoftmaxLoss()
loss_fn = BinaryCrossEntropyLoss()

In [19]:
opt = ndl.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)

In [21]:
total_loss = 0
total_errors = 0
total_batches = 0
total_examples = 0

pbar = tqdm.tqdm(desc="reading data", total=len(dataset_iter))
for batch_id, (X_dense, X_sparse, Y) in enumerate(dataset_iter, start=1):
    pbar.update(n=X_dense.shape[0])
    batch_size = Y.shape[0]
    logger.debug(f"TRAIN on {batch_id=} of size {batch_size}...")
    X_dense = np.log(X_dense + 3)
    X_dense = Tensor(X_dense, device=device, requires_grad=False)
    X_sparse = Tensor(X_sparse, device=device, requires_grad=False)
    Y = Y.reshape(1, Y.shape[0])
    Y = Tensor(Y, device=device, requires_grad=False)

    opt.reset_grad()
    out = model(X_dense)
    loss = loss_fn(out, Y)
    loss.backward()
    opt.step()

    y_prob = out.numpy()
    y_pred = np.argmax(y_prob, axis=1)
    errors = np.not_equal(y_pred, Y.numpy()).sum()

    cur_loss = loss.numpy()[0]
    total_loss += cur_loss
    total_errors += errors
    total_batches += 1
    total_examples += batch_size

    avg_loss = (total_loss / total_batches)
    avg_error_rate = total_errors / total_examples

    logger.debug(f"TRAIN on {batch_id=} of size {batch_size}: done ({cur_loss=}, {total_loss=}, {total_batches=}, {avg_loss=:0.4f}, {avg_error_rate=:0.4f})")

reading data:   0%|          | 1024/195871983 [00:18<998:40:20, 54.48it/s] 


2023-01-08 22:23:18,294 - notebooks.debug - DEBUG - TRAIN on batch_id=1 of size 1024...




AssertionError: operation needs two equal-sized arrays