# Experiments with torchfm, which has a range of models based on FM (Factorization Machines). 
* None of these models are sequence-aware.
* Anaconda context must be set to "base". Eventually work with poetry.
* Try working with wandb (Weights & Biases)
* Starting with my_fm_copy.ipynb on July 21, 2021, integrate with elements of the code I wrote for rankfm. Specifically, I will read the data with the newlib.py library. 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torchfm
from torchfm import layer as fm_layer, model as fm_model
from torchfm.model import fm, lr, nfm, wd
import torch.nn.functional as F
import pandas as pd
import pandas_options
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
tt = torch.tensor
import numpy as np
# import tqdm
import random
import matplotlib.pyplot as plt
import myfunclib as myfm
import d2l_torch as d2l
import datalib
import wandb

from fastcore.all import L, AttrDict

In [3]:
torch.get_num_interop_threads(),  torch.get_num_threads()

(16, 16)

In [4]:
# device = d2l.try_gpu()
device = 'cuda'
print(device)

cuda


In [5]:
gdct = {
    'device': device,  # 'cpu'
    'embed_dim': 10,
    'nb_epochs': 10,
    'lr': 0.05,
    'wd': 1.e-5,
    'optim': 'adamW',
    'batch_size' : 4096
}

In [6]:
param_dct = AttrDict({
    'device': device,  # 'cpu'
    'embed_dim': 10,
    'nb_epochs': 100,
    'lr': 0.05,
    'wd': 1.e-5,
    'optim': 'adamW',
    'batch_size' : 1024*4
})

In [7]:
%%time 
in_file = "activity_reduced_with_attributes.csv"
dct = datalib.read_data_attributes_single_file(in_file, dct=param_dct, continuous_attrib=True)
dct.keys()

df_item_attrib columns:  ['D', 'avg_yr_l', 'avg_yr_h', 'IATA', 'LAT_DEC', 'LON_DEC', 'HEIGHT']
df_item_attrib shape:  (91, 7)
df_:  Index(['MEMBER_ID', 'D', 'age_departure', 'GENDER', 'avg_yr_l', 'avg_yr_h',
       'LAT_DEC', 'LON_DEC', 'HEIGHT'],
      dtype='object')
SHOULD NOT CREATE user_attrib_idx and item_attrib_idx manually! SHOULD DO THIS BEFORE CALL TO this method
CPU times: user 1.18 s, sys: 153 ms, total: 1.33 s
Wall time: 1.33 s


dict_keys(['device', 'embed_dim', 'nb_epochs', 'lr', 'wd', 'optim', 'batch_size', 'age_cuts', 'df_members', 'df_with_attrib', 'user_attrib_idx', 'item_attrib_idx', 'user_attrib_str', 'item_attrib_str', 'field_types', 'field_dims'])

In [8]:
def convert_cat_variables(dct):
    idx2member, member2idx = datalib.cat2dict(dct['df_with_attrib']['MEMBER_ID'])
    idx2dest, dest2idx = datalib.cat2dict(dct['df_with_attrib']['D'])
    idx2gender, gender2idx = datalib.cat2dict(dct.df_with_attrib.GENDER)
    
    dct['idx2member'] = idx2member
    dct['member2idx'] = idx2member
    dct['idx2dest'] = idx2dest
    dct['dest2idx'] = dest2idx
    dct['idx2gender'] = idx2gender
    dct.gender2idx = gender2idx
    
    df1 = dct['df_with_attrib'].copy()
    df1['MEMBER_ID'] = df1['MEMBER_ID'].map(member2idx)
    df1['D'] = df1['D'].map(dest2idx)
    df1['GENDER'] = df1.GENDER.map(gender2idx)
    print("nb dest: ", len(dest2idx))
    print("nb members: ", len(idx2member))
    return df1

In [9]:
dct.item_attrib_str, dct.user_attrib_str

((#6) ['D','avg_yr_l','avg_yr_h','LAT_DEC','LON_DEC','HEIGHT'],
 (#3) ['MEMBER_ID','age_departure','GENDER'])

In [10]:
dct.keys()

dict_keys(['device', 'embed_dim', 'nb_epochs', 'lr', 'wd', 'optim', 'batch_size', 'age_cuts', 'df_members', 'df_with_attrib', 'user_attrib_idx', 'item_attrib_idx', 'user_attrib_str', 'item_attrib_str', 'field_types', 'field_dims'])

In [11]:
df1 = convert_cat_variables(dct)
dct['df_with_attrib'] = df1

nb dest:  86
nb members:  46321


In [None]:
%%time 
# split data into train / valid / test data sets
datalib.train_valid_dct(dct, 0.1, 0.2, temporal=True, shuffle=True)
dataset_train = datalib.myDataset(dct, dct.data_train)
dataset_valid = datalib.myDataset(dct, dct.data_valid)
dataset_test  = datalib.myDataset(dct, dct.data_test)
dct.keys()

In [None]:
loader_train = DataLoader(dataset_train, shuffle=True, batch_size=dct.batch_size)
loader_valid = DataLoader(dataset_valid, shuffle=True, batch_size=dct.batch_size)
loader_test  = DataLoader(dataset_test,  shuffle=True, batch_size=dct.batch_size)

In [None]:
%%time 
print(loader_train.batch_size)
print(loader_train.dataset.data.shape)
for i,d in enumerate(loader_train):
    if i == 5: break

## DataLoader is functional

# NOT USED
batch_size = dct.batch_size
files = "attrib_2016.csv.gz"
#data_dict = myfm.getData(files, batch_size=batch_size, nrows='all')
data_dict = myfm.getData(files, batch_size=batch_size, nrows=20000, shuffle=False)
data_dict



data_dict['train_iter'].dataset.df.shape[0]

len(dataset_train)

train_iter = data_dict['train_iter']
field_dims = train_iter.dataset.field_dims
field_dims

field_dims = 20   # MEANING?

# cpu: device : -1
# gpu: device : 0, 1, ...
gdct

# field_dims: number of categories for each attribute. 
# This should be defined in read_single_file

Create a method with dataset and DataLoader

In [None]:
dct.keys()

In [None]:
# Field_dims are based on the full dataset. As such, I might have 45,000 members in the 
# full dataset, but only 35,000 in the validation set. That wastes computational resources
# due to an enlarged embedding layer, but does it create other issues? I think not. 
embed_dim = dct['embed_dim']
device = dct['device']
field_dims = dct.field_dims.to(device)
# Better would be to capture all fields with element > 1. <<<< TODO IN FUTURE
field_dims = torch.cat([field_dims[0:2], field_dims[3:4]], axis=0)
print("field_dims: ", field_dims)
net = fm.FactorizationMachineModel(field_dims, embed_dim)
net.to(device)


In [12]:
dct.embed_dim = 20
dct.nb_epochs = 50
dct.lr = 0.05
dct.wd = 1.e-5
dct.optim = 'adamW'
dct.device = 'cuda'

In [24]:
config = {
  'lr': dct.lr,
  'epochs': dct.nb_epochs,
  'batch_size': dct.batch_size,
  'optim': dct.optim,
  'wd': dct.wd,
  'embed_dim': dct.embed_dim,
  'device': 'cuda',
}

sweep_config = {
    'name' : 'sweep1',
    'method' : 'random',
    'parameters' : {
        'lr' : { 'min' : 0.005, 'max' : 0.05 },
        'optim' : { 'values' : [ 'adam', 'adamW' ] },
        'wd' : { 'values' : [1.e-5, 1.e-3]  },
        'batch_size' : { 'values' : [128, 512, 1024, 4096] },
        'epochs' : {'value' : 30},
        'embed_dim' : {'values' : [10,20,30]},
        'device' : {'value' : 'cuda'},
    }
}

metric = {
        'name' : 'loss'
}

sweep_config['metric'] = metric

sweep_id = wandb.sweep(sweep_config, project="Copa Recommender", entity="erlebacher")

Create sweep with ID: tj45rn4b
Sweep URL: https://wandb.ai/erlebacher/Copa%20Recommender/sweeps/tj45rn4b


In [25]:
import pprint
pprint.pprint(sweep_config)

{'method': 'random',
 'metric': {'name': 'loss'},
 'name': 'sweep1',
 'parameters': {'batch_size': {'values': [128, 512, 1024, 4096]},
                'device': {'value': 'cuda'},
                'embed_dim': {'values': [10, 20, 30]},
                'epochs': {'value': 30},
                'lr': {'max': 0.05, 'min': 0.005},
                'optim': {'values': ['adam', 'adamW']},
                'wd': {'values': [1e-05, 0.001]}}}


In [26]:
def build_dataset(batch_size):
    datalib.train_valid_dct(dct, 0.1, 0.2, temporal=True, shuffle=True)
    dataset_train = datalib.myDataset(dct, dct.data_train)
    loader_train = DataLoader(dataset_train, shuffle=True, batch_size=dct.batch_size)
    return loader_train

def build_network(config, dct):
    # device = config['device']
    device = 'cuda'
    embed_dim = config['embed_dim']
    field_dims = dct.field_dims.to(device)
    # Better would be to capture all fields with element > 1. <<<< TODO IN FUTURE
    #   0:2 : MEMBER_ID, D (or negD),  3:4 : GENDER
    field_dims = torch.cat([field_dims[0:2], field_dims[3:4]], axis=0)
    network = fm.FactorizationMachineModel(field_dims, embed_dim)
    return network.to(device)

def bpr_loss_func(pos, neg):
    return -torch.log(torch.sigmoid(pos-neg)).sum()

def build_optimizer(dct, network, lr, wd): 
    if dct.optim == "sgd":
        optimizer = torch.optim.SGD(network.parameters(),
                              lr=lr, momentum=0.9, 
                              weight_decay=wd)
    elif dct.optim == "adam":
        optimizer = torch.optim.Adam(network.parameters(),
                               lr=lr, 
                               weight_decay=wd)
    elif dct.optim == "adamW":
        optimizer = torch.optim.AdamW(network.parameters(),
                               lr=lr,
                               weight_decay=wd)
    return optimizer


def train_epoch(network, loader, optimizer, loss_func, nb_epochs):
    for epoch in range(nb_epochs):
        print("train_epoch: device: ", dct.device)
        total_loss = myfm.train_epoch_new(network, optimizer, loader, loss_func, device=dct.device, log_interval=50)
        #if epoch == 0:
            #wandb.run.summary["initial_loss"] = total_loss
        #if total_loss < lowest_loss:
            #lowest_loss = total_loss
            #epoch_lowest_loss = epoch
            #wandb.run.summary["lowest_loss"] = lowest_loss
            #wandb.run.summary["epoch_lowest_loss"] = epoch_lowest_loss
        # losses.append(total_loss)
        if epoch % 5 == 0:
            print(f"Epoch {epoch}, avg total_loss (per training sample): ", total_loss)
        wandb.log({"loss": total_loss, "epoch":epoch})

In [27]:
def train(config=None):
    # Initialize a new wandb run
    with wandb.init(config=config) as run:
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        loader = build_dataset(config['batch_size'])
        network = build_network(config, dct)
        optimizer = build_optimizer(dct, network, config['lr'], config['wd'])
        loss_func = bpr_loss_func
        nb_epochs = config["epochs"]
        avg_loss = train_epoch(network, loader, optimizer, loss_func, nb_epochs)

In [None]:
wandb.agent(sweep_id, train, count=20)

[34m[1mwandb[0m: Agent Starting Run: vg82vtbs with config:
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 30
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.015891534391008797
[34m[1mwandb[0m: 	optim: adam
[34m[1mwandb[0m: 	wd: 1e-05


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  477.474365234375
Epoch 0, avg total_loss (per training sample):  0.001409852315124285
train_epoch: device:  cuda
elapased time per epoch:  422.9700012207031
train_epoch: device:  cuda
elapased time per epoch:  473.80767822265625
train_epoch: device:  cuda
elapased time per epoch:  419.2924499511719
train_epoch: device:  cuda
elapased time per epoch:  476.96661376953125
train_epoch: device:  cuda
elapased time per epoch:  420.4491271972656
Epoch 5, avg total_loss (per training sample):  0.000995064676737247
train_epoch: device:  cuda
elapased time per epoch:  470.63720703125
train_epoch: device:  cuda
elapased time per epoch:  418.1485900878906
train_epoch: device:  cuda
elapased time per epoch:  471.3335266113281
train_epoch: device:  cuda
elapased time per epoch:  416.7745361328125
trai

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▅▄▄▃▃▂▂▂▂▂▂▁▂▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00087


[34m[1mwandb[0m: Agent Starting Run: gcldsztg with config:
[34m[1mwandb[0m: 	batch_size: 4096
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 10
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.032979206001899956
[34m[1mwandb[0m: 	optim: adam
[34m[1mwandb[0m: 	wd: 0.001


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  488.0847473144531
Epoch 0, avg total_loss (per training sample):  0.001366601383384856
train_epoch: device:  cuda
elapased time per epoch:  431.4553527832031
train_epoch: device:  cuda
elapased time per epoch:  424.41241455078125
train_epoch: device:  cuda
elapased time per epoch:  483.8220520019531
train_epoch: device:  cuda
elapased time per epoch:  473.2792663574219
train_epoch: device:  cuda
elapased time per epoch:  421.9920349121094
Epoch 5, avg total_loss (per training sample):  0.0009876083645560373
train_epoch: device:  cuda
elapased time per epoch:  479.3701171875
train_epoch: device:  cuda
elapased time per epoch:  425.8641052246094
train_epoch: device:  cuda
elapased time per epoch:  425.99359130859375
train_epoch: device:  cuda
elapased time per epoch:  476.85577392578125
tr

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▅▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00085


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: eg997veq with config:
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 30
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.02039591184533779
[34m[1mwandb[0m: 	optim: adam
[34m[1mwandb[0m: 	wd: 1e-05


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  445.2428894042969
Epoch 0, avg total_loss (per training sample):  0.0013207065033407385
train_epoch: device:  cuda
elapased time per epoch:  488.94915771484375
train_epoch: device:  cuda
elapased time per epoch:  446.30474853515625
train_epoch: device:  cuda
elapased time per epoch:  494.0961608886719
train_epoch: device:  cuda
elapased time per epoch:  433.78497314453125
train_epoch: device:  cuda
elapased time per epoch:  487.8219909667969
Epoch 5, avg total_loss (per training sample):  0.0009757957691202368
train_epoch: device:  cuda
elapased time per epoch:  430.517822265625
train_epoch: device:  cuda
elapased time per epoch:  488.3654479980469
train_epoch: device:  cuda
elapased time per epoch:  434.2811279296875
train_epoch: device:  cuda
elapased time per epoch:  489.5039672851562

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▅▄▄▃▃▃▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00084


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 1eqj11jx with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 10
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.04523394454224855
[34m[1mwandb[0m: 	optim: adam
[34m[1mwandb[0m: 	wd: 0.001


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  497.29046630859375
Epoch 0, avg total_loss (per training sample):  0.0012828624474067844
train_epoch: device:  cuda
elapased time per epoch:  429.76177978515625
train_epoch: device:  cuda
elapased time per epoch:  498.2505187988281
train_epoch: device:  cuda
elapased time per epoch:  443.3150939941406
train_epoch: device:  cuda
elapased time per epoch:  493.5647888183594
train_epoch: device:  cuda
elapased time per epoch:  437.46893310546875
Epoch 5, avg total_loss (per training sample):  0.0009725068778254723
train_epoch: device:  cuda
elapased time per epoch:  488.0267333984375
train_epoch: device:  cuda
elapased time per epoch:  440.3253173828125
train_epoch: device:  cuda
elapased time per epoch:  496.165283203125
train_epoch: device:  cuda
elapased time per epoch:  440.3710632324219

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▅▄▄▃▃▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00087


[34m[1mwandb[0m: Agent Starting Run: slznia8w with config:
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 20
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.028349235393016484
[34m[1mwandb[0m: 	optim: adam
[34m[1mwandb[0m: 	wd: 0.001


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  503.3672180175781
Epoch 0, avg total_loss (per training sample):  0.0012876497744404025
train_epoch: device:  cuda
elapased time per epoch:  445.4310607910156
train_epoch: device:  cuda
elapased time per epoch:  449.4047546386719
train_epoch: device:  cuda
elapased time per epoch:  509.2944641113281
train_epoch: device:  cuda
elapased time per epoch:  444.0576171875
train_epoch: device:  cuda
elapased time per epoch:  513.5250244140625
Epoch 5, avg total_loss (per training sample):  0.0009685923450614026
train_epoch: device:  cuda
elapased time per epoch:  437.85797119140625
train_epoch: device:  cuda
elapased time per epoch:  506.85552978515625
train_epoch: device:  cuda
elapased time per epoch:  442.246826171875
train_epoch: device:  cuda
elapased time per epoch:  516.212158203125
trai

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▅▅▄▄▃▃▃▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00086


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 7l4xycx7 with config:
[34m[1mwandb[0m: 	batch_size: 4096
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 20
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.015235320905138692
[34m[1mwandb[0m: 	optim: adamW
[34m[1mwandb[0m: 	wd: 1e-05


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  513.21435546875
Epoch 0, avg total_loss (per training sample):  0.0014341708888847668
train_epoch: device:  cuda
elapased time per epoch:  458.7340087890625
train_epoch: device:  cuda
elapased time per epoch:  512.0413818359375
train_epoch: device:  cuda
elapased time per epoch:  453.7820739746094
train_epoch: device:  cuda
elapased time per epoch:  515.83544921875
train_epoch: device:  cuda
elapased time per epoch:  448.7689208984375
Epoch 5, avg total_loss (per training sample):  0.0010442811674125053
train_epoch: device:  cuda
elapased time per epoch:  507.1667785644531
train_epoch: device:  cuda
elapased time per epoch:  446.1370544433594
train_epoch: device:  cuda
elapased time per epoch:  508.4474182128906
train_epoch: device:  cuda
elapased time per epoch:  446.5749816894531
train

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▆▄▄▃▃▃▂▃▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00086


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 4krve0j0 with config:
[34m[1mwandb[0m: 	batch_size: 4096
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 10
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.0335371893759723
[34m[1mwandb[0m: 	optim: adam
[34m[1mwandb[0m: 	wd: 1e-05


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  498.8308410644531
Epoch 0, avg total_loss (per training sample):  0.0013688793440143723
train_epoch: device:  cuda
elapased time per epoch:  438.9612121582031
train_epoch: device:  cuda
elapased time per epoch:  535.3766479492188
train_epoch: device:  cuda
elapased time per epoch:  466.8919372558594
train_epoch: device:  cuda
elapased time per epoch:  513.0874633789062
train_epoch: device:  cuda
elapased time per epoch:  438.6422119140625
Epoch 5, avg total_loss (per training sample):  0.0009649624219719357
train_epoch: device:  cuda
elapased time per epoch:  434.2995300292969
train_epoch: device:  cuda
elapased time per epoch:  499.24212646484375
train_epoch: device:  cuda
elapased time per epoch:  431.7870788574219
train_epoch: device:  cuda
elapased time per epoch:  490.8062438964844


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▅▄▄▃▃▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00084


[34m[1mwandb[0m: Agent Starting Run: j9mftx4d with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 20
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.010438178121646928
[34m[1mwandb[0m: 	optim: adam
[34m[1mwandb[0m: 	wd: 1e-05


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  483.349609375
Epoch 0, avg total_loss (per training sample):  0.001378927625135351
train_epoch: device:  cuda
elapased time per epoch:  428.41363525390625
train_epoch: device:  cuda
elapased time per epoch:  449.2831115722656
train_epoch: device:  cuda
elapased time per epoch:  509.5060119628906
train_epoch: device:  cuda
elapased time per epoch:  429.7388916015625
train_epoch: device:  cuda
elapased time per epoch:  485.41046142578125
Epoch 5, avg total_loss (per training sample):  0.001059230891468909
train_epoch: device:  cuda
elapased time per epoch:  424.12225341796875
train_epoch: device:  cuda
elapased time per epoch:  486.34674072265625
train_epoch: device:  cuda
elapased time per epoch:  432.19183349609375
train_epoch: device:  cuda
elapased time per epoch:  429.33453369140625
t

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▆▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00087


[34m[1mwandb[0m: Agent Starting Run: am11ih2n with config:
[34m[1mwandb[0m: 	batch_size: 4096
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 10
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.0398515459430002
[34m[1mwandb[0m: 	optim: adamW
[34m[1mwandb[0m: 	wd: 1e-05


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  442.6633605957031
Epoch 0, avg total_loss (per training sample):  0.0012648320572484642
train_epoch: device:  cuda
elapased time per epoch:  454.03271484375
train_epoch: device:  cuda
elapased time per epoch:  516.9612426757812
train_epoch: device:  cuda
elapased time per epoch:  468.6671142578125
train_epoch: device:  cuda
elapased time per epoch:  510.73492431640625
train_epoch: device:  cuda
elapased time per epoch:  464.3868713378906
Epoch 5, avg total_loss (per training sample):  0.0009258102224292966
train_epoch: device:  cuda
elapased time per epoch:  450.06402587890625
train_epoch: device:  cuda
elapased time per epoch:  512.63037109375
train_epoch: device:  cuda
elapased time per epoch:  443.33203125
train_epoch: device:  cuda
elapased time per epoch:  499.67327880859375
train_e

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▆▄▄▃▃▃▂▂▂▂▂▂▂▁▂▁▂▂▁▂▁▂▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00084


[34m[1mwandb[0m: Agent Starting Run: rfc6b2yv with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 20
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.03936168474363646
[34m[1mwandb[0m: 	optim: adamW
[34m[1mwandb[0m: 	wd: 1e-05


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  521.77197265625
Epoch 0, avg total_loss (per training sample):  0.0012205434909301678
train_epoch: device:  cuda
elapased time per epoch:  437.0054626464844
train_epoch: device:  cuda
elapased time per epoch:  433.39910888671875
train_epoch: device:  cuda
elapased time per epoch:  506.16192626953125
train_epoch: device:  cuda
elapased time per epoch:  433.9144592285156
train_epoch: device:  cuda
elapased time per epoch:  512.6065673828125
Epoch 5, avg total_loss (per training sample):  0.0009597587815427379
train_epoch: device:  cuda
elapased time per epoch:  437.3218688964844
train_epoch: device:  cuda
elapased time per epoch:  448.53228759765625
train_epoch: device:  cuda
elapased time per epoch:  508.3765869140625
train_epoch: device:  cuda
elapased time per epoch:  470.9920959472656


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▆▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▂▂▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00086


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: qrjs8doo with config:
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 30
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.03831749140656231
[34m[1mwandb[0m: 	optim: adam
[34m[1mwandb[0m: 	wd: 1e-05


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  450.4875183105469
Epoch 0, avg total_loss (per training sample):  0.0011692900300272263
train_epoch: device:  cuda
elapased time per epoch:  529.5761108398438
train_epoch: device:  cuda
elapased time per epoch:  445.15496826171875
train_epoch: device:  cuda
elapased time per epoch:  472.2511291503906
train_epoch: device:  cuda
elapased time per epoch:  530.941162109375
train_epoch: device:  cuda
elapased time per epoch:  456.9268798828125
Epoch 5, avg total_loss (per training sample):  0.0009110030231516467
train_epoch: device:  cuda
elapased time per epoch:  459.6313781738281
train_epoch: device:  cuda
elapased time per epoch:  517.1951293945312
train_epoch: device:  cuda
elapased time per epoch:  441.6054992675781
train_epoch: device:  cuda
elapased time per epoch:  498.8773498535156
t

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▆▅▅▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁

0,1
epoch,29.0
loss,0.00078


[34m[1mwandb[0m: Agent Starting Run: xik5q4l0 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 20
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.010844292352113896
[34m[1mwandb[0m: 	optim: adam
[34m[1mwandb[0m: 	wd: 0.001


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  445.6654968261719
Epoch 0, avg total_loss (per training sample):  0.0014247590439479783
train_epoch: device:  cuda
elapased time per epoch:  451.0376281738281
train_epoch: device:  cuda
elapased time per epoch:  445.4118957519531
train_epoch: device:  cuda
elapased time per epoch:  654.612060546875
train_epoch: device:  cuda
elapased time per epoch:  442.95538330078125
train_epoch: device:  cuda
elapased time per epoch:  440.9197998046875
Epoch 5, avg total_loss (per training sample):  0.0010616708576230622
train_epoch: device:  cuda
elapased time per epoch:  458.69891357421875
train_epoch: device:  cuda
elapased time per epoch:  513.9927368164062
train_epoch: device:  cuda
elapased time per epoch:  462.8186950683594
train_epoch: device:  cuda
elapased time per epoch:  443.5374755859375


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▆▅▄▄▄▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00085


[34m[1mwandb[0m: Agent Starting Run: e6q2r2a3 with config:
[34m[1mwandb[0m: 	batch_size: 4096
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 20
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.04004973788776358
[34m[1mwandb[0m: 	optim: adamW
[34m[1mwandb[0m: 	wd: 0.001


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  516.9572143554688
Epoch 0, avg total_loss (per training sample):  0.0012046279520072502
train_epoch: device:  cuda
elapased time per epoch:  432.1747741699219
train_epoch: device:  cuda
elapased time per epoch:  448.72735595703125
train_epoch: device:  cuda
elapased time per epoch:  443.71832275390625
train_epoch: device:  cuda
elapased time per epoch:  515.5322875976562
train_epoch: device:  cuda
elapased time per epoch:  459.5237731933594
Epoch 5, avg total_loss (per training sample):  0.0009321936376001492
train_epoch: device:  cuda
elapased time per epoch:  448.80224609375
train_epoch: device:  cuda
elapased time per epoch:  468.4808349609375
train_epoch: device:  cuda
elapased time per epoch:  579.7442626953125
train_epoch: device:  cuda
elapased time per epoch:  461.0269470214844
t

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▆▅▄▃▃▃▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▂▁▁▁▁

0,1
epoch,29.0
loss,0.00082


[34m[1mwandb[0m: Agent Starting Run: dw60bdab with config:
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 30
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.03928932062756571
[34m[1mwandb[0m: 	optim: adam
[34m[1mwandb[0m: 	wd: 0.001


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  449.71722412109375
Epoch 0, avg total_loss (per training sample):  0.0011596506091213557
train_epoch: device:  cuda
elapased time per epoch:  451.0053405761719
train_epoch: device:  cuda
elapased time per epoch:  517.078125
train_epoch: device:  cuda
elapased time per epoch:  451.84661865234375
train_epoch: device:  cuda
elapased time per epoch:  469.38665771484375
train_epoch: device:  cuda
elapased time per epoch:  454.82696533203125
Epoch 5, avg total_loss (per training sample):  0.0009142590240349935
train_epoch: device:  cuda
elapased time per epoch:  531.6666259765625
train_epoch: device:  cuda
elapased time per epoch:  447.29193115234375
train_epoch: device:  cuda
elapased time per epoch:  447.9087829589844
train_epoch: device:  cuda
elapased time per epoch:  517.601806640625
trai

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▇▅▅▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁

0,1
epoch,29.0
loss,0.00078


[34m[1mwandb[0m: Agent Starting Run: ltgpckaj with config:
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 10
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.030483570906115965
[34m[1mwandb[0m: 	optim: adamW
[34m[1mwandb[0m: 	wd: 1e-05


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  465.6015625
Epoch 0, avg total_loss (per training sample):  0.001369965567488081
train_epoch: device:  cuda
elapased time per epoch:  505.4992980957031
train_epoch: device:  cuda
elapased time per epoch:  433.0456848144531
train_epoch: device:  cuda
elapased time per epoch:  432.86407470703125
train_epoch: device:  cuda
elapased time per epoch:  434.4153747558594
train_epoch: device:  cuda
elapased time per epoch:  495.56866455078125
Epoch 5, avg total_loss (per training sample):  0.0009714953519473856
train_epoch: device:  cuda
elapased time per epoch:  433.6548156738281
train_epoch: device:  cuda
elapased time per epoch:  443.66455078125
train_epoch: device:  cuda
elapased time per epoch:  443.5268859863281
train_epoch: device:  cuda
elapased time per epoch:  525.9188232421875
train_ep

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▅▄▄▃▃▃▂▂▂▂▂▂▂▁▂▂▁▁▁▁▂▁▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00085


[34m[1mwandb[0m: Agent Starting Run: 4uhudlke with config:
[34m[1mwandb[0m: 	batch_size: 4096
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 10
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.04326689682321731
[34m[1mwandb[0m: 	optim: adam
[34m[1mwandb[0m: 	wd: 0.001


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  487.1710205078125
Epoch 0, avg total_loss (per training sample):  0.00130463567844249
train_epoch: device:  cuda
elapased time per epoch:  528.4822387695312
train_epoch: device:  cuda
elapased time per epoch:  458.085693359375
train_epoch: device:  cuda
elapased time per epoch:  469.4407958984375
train_epoch: device:  cuda
elapased time per epoch:  462.0194091796875
train_epoch: device:  cuda
elapased time per epoch:  462.34771728515625
Epoch 5, avg total_loss (per training sample):  0.0009773288619544223
train_epoch: device:  cuda
elapased time per epoch:  528.9724731445312
train_epoch: device:  cuda
elapased time per epoch:  463.45458984375
train_epoch: device:  cuda
elapased time per epoch:  458.2671813964844
train_epoch: device:  cuda
elapased time per epoch:  461.6809997558594
train

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▅▄▄▃▃▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00088


[34m[1mwandb[0m: Agent Starting Run: hy5eg89s with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 10
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.013099942892920692
[34m[1mwandb[0m: 	optim: adam
[34m[1mwandb[0m: 	wd: 1e-05


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  468.1512756347656
Epoch 0, avg total_loss (per training sample):  0.0014823028627399216
train_epoch: device:  cuda
elapased time per epoch:  482.0791320800781
train_epoch: device:  cuda
elapased time per epoch:  464.6546936035156
train_epoch: device:  cuda
elapased time per epoch:  519.667724609375
train_epoch: device:  cuda
elapased time per epoch:  447.3592529296875
train_epoch: device:  cuda
elapased time per epoch:  447.82806396484375
Epoch 5, avg total_loss (per training sample):  0.0010784211926887631
train_epoch: device:  cuda
elapased time per epoch:  449.8087158203125
train_epoch: device:  cuda
elapased time per epoch:  455.3436279296875
train_epoch: device:  cuda
elapased time per epoch:  514.5716552734375
train_epoch: device:  cuda
elapased time per epoch:  453.2630920410156
t

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▆▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00088


[34m[1mwandb[0m: Agent Starting Run: 7hsad4mz with config:
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 30
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.03338444573088391
[34m[1mwandb[0m: 	optim: adamW
[34m[1mwandb[0m: 	wd: 0.001


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  449.25152587890625
Epoch 0, avg total_loss (per training sample):  0.0011904604640110743
train_epoch: device:  cuda
elapased time per epoch:  450.3763427734375
train_epoch: device:  cuda
elapased time per epoch:  457.7994384765625
train_epoch: device:  cuda
elapased time per epoch:  456.53106689453125
train_epoch: device:  cuda
elapased time per epoch:  451.34771728515625
train_epoch: device:  cuda
elapased time per epoch:  517.6766967773438
Epoch 5, avg total_loss (per training sample):  0.0009415336067526396
train_epoch: device:  cuda
elapased time per epoch:  451.455078125
train_epoch: device:  cuda
elapased time per epoch:  446.8780517578125
train_epoch: device:  cuda
elapased time per epoch:  448.8956298828125
train_epoch: device:  cuda
elapased time per epoch:  455.6852722167969
tr

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▆▆▅▄▄▃▃▃▂▂▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁

0,1
epoch,29.0
loss,0.00078


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: f9wlh7at with config:
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	embed_dim: 30
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.013200838752628556
[34m[1mwandb[0m: 	optim: adam
[34m[1mwandb[0m: 	wd: 0.001


(804187, 9)
Create torch.tensor on device
FeaturesLinear, field_dims:  tensor([46321,    86,     3], device='cuda:0') tensor(46410, device='cuda:0')
train_epoch: device:  cuda
elapased time per epoch:  460.73785400390625
Epoch 0, avg total_loss (per training sample):  0.0014226548779091494
train_epoch: device:  cuda
elapased time per epoch:  475.19354248046875
train_epoch: device:  cuda
elapased time per epoch:  453.1077575683594
train_epoch: device:  cuda
elapased time per epoch:  449.3650207519531
train_epoch: device:  cuda
elapased time per epoch:  455.7306823730469
train_epoch: device:  cuda
elapased time per epoch:  516.085205078125
Epoch 5, avg total_loss (per training sample):  0.0010025364371738222
train_epoch: device:  cuda
elapased time per epoch:  444.9942932128906
train_epoch: device:  cuda
elapased time per epoch:  461.5966796875
train_epoch: device:  cuda
elapased time per epoch:  448.57415771484375
train_epoch: device:  cuda
elapased time per epoch:  459.002685546875
tra

In [None]:

run = wandb.init(project="Copa Recommender",
            config=config,
            save_code=True)

# Optional
wandb.watch(net)  # model

In [None]:
wanda_dict = AttrDict()
to_save = ["embed_dim", "nb_epochs", "lr", "wd", "optim", "batch_size", "nb_epochs", "device"]
for s in to_save:
    wanda_dict[s] = dct[s]

In [None]:
optimizer, loss_func = myfm.setup_trainer(net, dct)

## Candidate items to add to the Wandab logging dictionary
* Average time per iteration
* Time for set up

## To do
* How to save dependencies python files? 
* Work under poetry to make sure I have the proper Python environment

In [None]:
%%time
nb_epochs = dct.nb_epochs
print("nb_epochs: ", dct.nb_epochs)
losses = []
print("dct.keys(): ", dct.keys())
print("user attr str: ", dct.user_attrib_str)
print("item attr str: ", dct.item_attrib_str)
print("device: ", dct.device)

lowest_loss = 1.e10
lowest_epoch = 0

loader_train = DataLoader(dataset_train, shuffle=True, batch_size=dct.batch_size)  # Already defined

for epoch in range(nb_epochs):   # replace gdct['device'] by dct.device
    total_loss = myfm.train_epoch_new(net, optimizer, loader_train, loss_func, device=dct['device'], log_interval=10)
    if epoch == 0:
        wandb.run.summary["initial_loss"] = total_loss
    if total_loss < lowest_loss:
        lowest_loss = total_loss
        epoch_lowest_loss = epoch
        wandb.run.summary["lowest_loss"] = lowest_loss
        wandb.run.summary["epoch_lowest_loss"] = epoch_lowest_loss
    losses.append(total_loss)
    if epoch % 1 == 0:
        print(f"Epoch {epoch}, avg total_loss (per training sample): ", total_loss)
        
    wandb.log({"loss": total_loss, "epoch":epoch})
        
#  ERROR: Check that all variables are on the same device. HOW TO DO THIS? try 'cuda'
#  8500 training samples
# time GPU, 5.6 sec for 4 epochs, batch 512
# time CPU, 5.8 sec for 4 epochs, batch 512
# time CPU, 7.1 sec for 4 epochs, batch 32
# time CPU, 5.7 sec for 4 epochs, batch 4096
# time CPU, 6.8 sec for 4 epochs, batch 32
# time GPU, 7.0 sec for 4 epochs, batch 32
# time GPU, 5.0 sec for 4 epochs, batch 4096

# ERROR? The loss per training sample should be independent of the batch size

# x: one of its elements is 46475, and yet, the max index should be  46458. Why is this happening? Max index should be 46410 (sum of field_dims)

# I may need to improve my selection of negative samples to speed up convergence. This is much much slower than rankfm. Why? 
# 1) I might have an error
# 2) rankfm is written in C. So I should compare convergence rates between the two when running only MEMBER_ID, DEST, GENDER as one-hot encoded attributes. 
# It is also time to get wandb going so I can save my data. 
# What do I want to save? 
#   total_loss, lr, nb_epochs, device, optim, batch_size, wd, embed_dim

In [None]:
dct.keys()

In [None]:
list(net.parameters())[0].get_device()  # -1 for cpu

In [None]:
a = list(net.parameters())[0].device
a

In [None]:
%%time
test_iter = data_dict['train_iter']
print("length: ", len(test_iter.dataset))
test_iter = DataLoader(data_dict['train_data'], batch_size=4*1024, shuffle=True)
# fields: original dataframe as a torch array
# scores: scores from original dataframe
fields, scores = myfm.test_accuracy(net, test_iter, 'cpu')

In [None]:
for i,data in enumerate(test_iter):
    # data[0].shape = (B,3). Elements are member, item, age
    print(i, data[0].shape, data[1].shape, data[2].shape)
    break

Select a sample of members, and compute scores for all destinations

In [None]:
data_iter = data_dict['train_iter']
data_iter.dataset.dct.keys()
dct = data_iter.dataset.dct
dct.keys()

In [None]:
df = data_iter.dataset.df # dataframe
print(df.shape)
nb_members = 1000  # select random members
max_member = df['MEMBER_ID'].max()
max_dest = df['D'].max()
members = random.sample(range(0,max_member), 100)
destinations = list(range(0,max_dest+1))  # 0, 1, ..., max_dest
print("dest: ", destinations)
print("members[0]: ", members[0])
print("Size: ", df.groupby(['MEMBER_ID','D']).size().sum())
row = df.iloc[members[0],:]
print("row: ", row)

# create a dataframe with members*max_dest rows. 10000*100 = one million
# How to do this? 
#  1. create a specialized Dataset

#print('member_attr: ', dct['member_attr'])
#dct['idx2member'].keys()

In [None]:
data = myfm.AccuracyDataset(data_iter.dataset, destinations)
len(data)

In [None]:
# shuffling is irrelevant
accuracy_loader = DataLoader(data, batch_size=4096, shuffle=False)
#accuracy_loader = DataLoader(data, batch_size=gdct['batch_size'], shuffle=False)

In [None]:
fields, predict = myfm.test_accuracy(net, train_iter, 'cpu')

print("fields: ", fields[0:5])
print("predict: ", predict[0:5])

# Strong decrease in loss. However, is this overfitting? 
# TODO: create a pair-wise approach. So define negative samples. 
# Could weigh the negative samples: flights not taken in the further back in time would have higher weight 
#  than more recent flights. Is that reasonable? 

In [None]:
fields, predict = myfm.test_accuracy(net, accuracy_loader, 'cpu')

print("fields: ", fields[0:5])
print("predict: ", predict[0:50])
print(fields.shape, predict.shape)

# Strong decrease in loss. However, is this overfitting? 
# TODO: create a pair-wise approach. So define negative samples. 
# Could weigh the negative samples: flights not taken in the further back in time would have higher weight 
#  than more recent flights. Is that reasonable? 

In [None]:
# fields: member_id, dest, age
# prediction: score
field_np = np.asarray(fields)
predict_np = np.asarray(predict)
#print(field_np.shape, predict_np.reshape(-1,1).shape)
joined = np.concatenate((field_np, predict_np.reshape(-1,1)), axis=1)
#print(joined[0:7,:])
df = pd.DataFrame(joined, columns=['MEMBER_ID','D','age','rank']) #, predict)
#print(df.head())

cols = list(df.columns)[0:-1]
for col in cols:
    df[col] = df[col].astype('int')
df = df.iloc[1:]

# data_iter: used for training
D_set = data_iter.dataset.dct['D_set']
print(len(D_set))
D_set.iloc[35], D_set.loc[35]

# Why is first row have numbers approx 1.e31? This is the input data. It has nothing to do with the evaluator. 
# for i in range(predict.shape[0]):

merged = df.merge(D_set, how='inner', on='MEMBER_ID')
print("merged shape (all scores): ", merged.shape, merged['MEMBER_ID'].nunique())  # 2218 unique members
merged = merged[merged['rank'] > 0.5]
print("merged shape (scores > 0.5): ", merged.shape, merged['MEMBER_ID'].nunique())  # 2170 unique members
print(merged.head(10))

Determined the topN scores for all members in order. 

In [None]:
_df['MEMBER_ID'].max(

In [None]:
df1 = merged.groupby('MEMBER_ID').agg({'rank':list})
# df1 = merged.groupby('MEMBER_ID')['rank'].transform('count') #agg({'rank':list})
# print(df1)
print(df1.shape)

def sort_func(col):
    col1 = np.asarray(col) #.argsort()
    col1 = np.asarray(col).argsort()
    #col1 = sorted(col1, reverse=True)
    return col1
    
rank = df1['rank'].apply(sort_func)
df2 = df1.copy()
df2['argrank'] = rank
print(df2.head())
print("df2.shape: ", df2.shape)
df2['D'] = [list(range(0,len(D_set)))] * len(df2)
print("len(list(range(0,len(D_set))))= ", len(list(range(0,len(D_set)))) )
print("D_set: ", D_set)
df2
# df3 = pd.concat([_df, df2], axis=1)
# _df.shape, df2.shape

In [None]:
dst = list(data_iter.dataset.dct['idx2dest'].keys())
df
dst   # destinations 0 - 75 (76 values)
# I wish to apply argsort to them
df2['Dlist'] = [dst] * df2.shape[0]
# df2

# apply argrank to D Dlist

In [None]:
plt.plot(losses)
p

Compute scores of training data. 
* For each member_id, compute score for each destination. Rank destinations and compare against destinations actually travelled. 
* consider the 2016 data. For each user+user_attributes, cover a range of destinations. Each destination has its own destination attributes. 
Consider $n$ examples of destination attributes, compute a ranking of these $n$ items. There are 80 destinations and their attributes. Finally, 
there are attributes that are neither member destinations or desination attributes. 