# Data Management Tutorial
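This notebook aims to help users understand the functionality of the `ChoiceDataset` object. The `ChoiceDataset` is the core
class for holding the data used by the models: choice labels, user and session indices, item availability, and observables.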

Since this package was initially proposed for modelling consumer choices, the naming convention follows the consumer choice
scenario.

Author: Tianyu Du

In [4]:
# import required dependencies.
import numpy as np
import torch
from torch_choice.data import ChoiceDataset, JointDataset

In [5]:
def print_dict_shape(d):
    """Print the shape of every tensor-valued entry of a dictionary.

    Args:
        d: a dictionary mapping names to arbitrary values. Entries whose
            value is not a torch tensor are skipped without any output.
    """
    tensor_entries = ((name, value) for name, value in d.items() if torch.is_tensor(value))
    for name, value in tensor_entries:
        print(f'dict.{name}.shape={value.shape}')

## Creating a `ChoiceDataset` Object

In [3]:
# Sizes of the fake dataset built below; feel free to modify them as you want.
num_users = 10
num_items = 4
num_sessions = 500

# Number of observations (one label / user index / session index each).
length_of_dataset = 10000

# Seed both random number generators used by this notebook so that the fake
# data, and therefore every output below, is reproducible on a fresh kernel.
random_seed = 0
np.random.seed(random_seed)
torch.manual_seed(random_seed)

In [6]:
# Create observables/features/covariates; the number of features in each
# tensor is arbitrarily chosen. The leading dimension(s) of every tensor say
# what the features vary with (user, item, session, or a pair of them).
user_obs = torch.randn(num_users, 128)  # generate 128 features for each user, e.g., race, gender.
item_obs = torch.randn(num_items, 64)  # generate 64 features for each item, e.g., quality.
session_obs = torch.randn(num_sessions, 10)  # generate 10 features for each session, e.g., weekday indicator.
taste_obs = torch.randn(num_users, num_items, 20)  # generate 20 features for each user item pair.
price_obs = torch.randn(num_sessions, num_items, 12)  # generate 12 features for each session item pair, e.g., the price of that item on the shopping day.

In [7]:
# For every observation draw, uniformly at random, the chosen item (label),
# the user making the choice and the session in which it happens.
# The draws are made in the order label -> user_index -> session_index.
label, user_index, session_index = (
    torch.LongTensor(np.random.choice(upper_bound, size=length_of_dataset))
    for upper_bound in (num_items, num_users, num_sessions)
)

# assume all items are available in all sessions.
item_availability = torch.ones(num_sessions, num_items, dtype=torch.bool)

In [8]:
# Observable tensors are handed to __init__ as additional keyword arguments;
# they are collected in a dictionary first (insertion order is preserved).
observables = {
    'user_obs': user_obs,
    'item_obs': item_obs,
    'session_obs': session_obs,
    'taste_obs': taste_obs,
    'price_obs': price_obs,
}

dataset = ChoiceDataset(
    # pre-specified keywords of __init__: `label` is required, the user index,
    # session index and item availability are optional.
    label=label,
    user_index=user_index,
    session_index=session_index,
    item_availability=item_availability,
    # additional keywords of __init__
    **observables)

In [9]:
# Display the dataset's repr: it lists the shape of every stored tensor and the device.
dataset

ChoiceDataset(label=[10000], user_index=[10000], session_index=[10000], item_availability=[500, 4], observable_prefix=[5], user_obs=[10, 128], item_obs=[4, 64], session_obs=[500, 10], taste_obs=[10, 4, 20], price_obs=[500, 4, 12], device=cpu)

In [10]:
# Size attributes exposed by the dataset. Note that `len(dataset)` is the
# number of observations (the length of `label`), not the number of sessions.
print(f'{dataset.num_users=:}')
print(f'{dataset.num_items=:}')
print(f'{dataset.num_sessions=:}')
print(f'{len(dataset)=:}')

dataset.num_users=10
dataset.num_items=4
dataset.num_sessions=500
len(dataset)=10000


In [11]:
# clone: assigning a new label to the clone leaves the original untouched.
print(dataset.label[:10])
dataset_cloned = dataset.clone()
# The label holds one entry per observation, so the replacement must have
# len(dataset) entries (not num_sessions).
dataset_cloned.label = 99 * torch.ones(len(dataset))
print(dataset_cloned.label[:10])
print(dataset.label[:10])  # does not change the original dataset.

tensor([1, 2, 1, 3, 2, 1, 2, 2, 0, 0])
tensor([99., 99., 99., 99., 99., 99., 99., 99., 99., 99.])
tensor([1, 2, 1, 3, 2, 1, 2, 2, 0, 0])


In [12]:
# move to device: `.to(...)` returns a dataset whose tensors all live on the
# requested device. The devices are printed before and after the move.
print(f'{dataset.device=:}')
print(f'{dataset.label.device=:}')
print(f'{dataset.taste_obs.device=:}')
print(f'{dataset.user_index.device=:}')
print(f'{dataset.session_index.device=:}')

# Fall back to the CPU when no CUDA device is present, so that the notebook
# still runs from top to bottom on machines without a GPU.
target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
dataset = dataset.to(target_device)

print(f'{dataset.device=:}')
print(f'{dataset.label.device=:}')
print(f'{dataset.taste_obs.device=:}')
print(f'{dataset.user_index.device=:}')
print(f'{dataset.session_index.device=:}')

dataset.device=cpu
dataset.label.device=cpu
dataset.taste_obs.device=cpu
dataset.user_index.device=cpu
dataset.session_index.device=cpu
dataset.device=cuda:0
dataset.label.device=cuda:0
dataset.taste_obs.device=cuda:0
dataset.user_index.device=cuda:0
dataset.session_index.device=cuda:0


In [13]:
# Sanity check on the devices of the stored tensors; it passes silently here.
# NOTE(review): presumably raises when tensors live on different devices - confirm.
dataset._check_device_consistency()

In [14]:
# # NOTE: uncommenting and running this cell raises an error; this is intentional.
# dataset.label = dataset.label.to('cpu')
# dataset._check_device_consistency()

In [15]:
# create dictionary inputs for model.forward()
# As the printed shapes show, every observable in `x_dict` is expanded to
# (len(dataset), num_items, num_features), whatever its original shape was.
print_dict_shape(dataset.x_dict)

dict.user_obs.shape=torch.Size([10000, 4, 128])
dict.item_obs.shape=torch.Size([10000, 4, 64])
dict.session_obs.shape=torch.Size([10000, 4, 10])
dict.taste_obs.shape=torch.Size([10000, 4, 20])
dict.price_obs.shape=torch.Size([10000, 4, 12])


In [16]:
# __getitem__ to get a batch.
# Pick 5 random observations (positions in the dataset, not sessions) as the mini-batch.
dataset = dataset.to('cpu')
# Build the index tensor as int64 directly: going through `torch.Tensor`
# (float32) would round indices larger than 2**24.
indices = torch.as_tensor(np.random.choice(len(dataset), size=5, replace=False), dtype=torch.long)
print(indices)
subset = dataset[indices]
print(dataset)
print(subset)
print_dict_shape(subset.x_dict)

# The batch must contain exactly the selected rows of the full dataset.
assert torch.all(dataset.x_dict['price_obs'][indices, :, :] == subset.x_dict['price_obs'])
assert torch.all(dataset.label[indices] == subset.label)

tensor([5608, 3635, 9499, 9746, 6195])
ChoiceDataset(label=[10000], user_index=[10000], session_index=[10000], item_availability=[500, 4], observable_prefix=[5], user_obs=[10, 128], item_obs=[4, 64], session_obs=[500, 10], taste_obs=[10, 4, 20], price_obs=[500, 4, 12], device=cpu)
ChoiceDataset(label=[5], user_index=[5], session_index=[5], item_availability=[500, 4], observable_prefix=[5], user_obs=[10, 128], item_obs=[4, 64], session_obs=[500, 10], taste_obs=[10, 4, 20], price_obs=[500, 4, 12], device=cpu)
dict.user_obs.shape=torch.Size([5, 4, 128])
dict.item_obs.shape=torch.Size([5, 4, 64])
dict.session_obs.shape=torch.Size([5, 4, 10])
dict.taste_obs.shape=torch.Size([5, 4, 20])
dict.price_obs.shape=torch.Size([5, 4, 12])


In [17]:
# Print the labels of the batch next to the matching entries of the original
# dataset, before and after editing the batch in place.
print(subset.label)
print(dataset.label[indices])

subset.label += 1  # modifying the batch does not change the original dataset.

print(subset.label)
print(dataset.label[indices])

tensor([2, 1, 2, 0, 3])
tensor([2, 1, 2, 0, 3])
tensor([3, 2, 3, 1, 4])
tensor([2, 1, 2, 0, 3])


In [18]:
# Print the Python object ids of the batch label and of the indexed original label.
# NOTE(review): `dataset.label[indices]` builds a new tensor object on every
# call, so the two ids differ regardless of whether any memory is shared; the
# unchanged values printed in the previous cell are the actual evidence.
print(id(subset.label))
print(id(dataset.label[indices]))

139957071248016
139957071266896


## Using the PyTorch `DataLoader` for the Training Loop

In [None]:
from torch.utils.data.sampler import BatchSampler, SequentialSampler, RandomSampler

shuffle = False
batch_size = 32

# Decide the order in which observations are visited, then group the visited
# indices into lists of `batch_size` (the last list may be shorter).
if shuffle:
    base_sampler = RandomSampler(dataset)
else:
    base_sampler = SequentialSampler(dataset)
sampler = BatchSampler(base_sampler, batch_size=batch_size, drop_last=False)

# Each element produced by `sampler` is a whole list of indices, so one dataset
# lookup already returns a full mini-batch; the loader wraps it in a
# one-element list, which `collate_fn` unwraps again.
dataloader = torch.utils.data.DataLoader(
    dataset,
    sampler=sampler,
    num_workers=0,  # 0 if dataset.device == 'cuda' else os.cpu_count(),
    collate_fn=lambda x: x[0],
    pin_memory=(dataset.device == 'cpu'))


In [None]:
# Build the expected per-observation item features to compare batches against.
print(f'{item_obs.shape=:}')
# Item features are the same for every observation, so repeat them once per
# observation: the result needs len(dataset) rows (not num_sessions), because
# it is indexed by observation position when checking the batches.
item_obs_all = item_obs.view(1, num_items, -1).expand(len(dataset), -1, -1)
item_obs_all = item_obs_all.to(dataset.device)
label_all = label.to(dataset.device)
print(f'{item_obs_all.shape=:}')

item_obs.shape=torch.Size([4, 64])
item_obs_all.shape=torch.Size([10000, 4, 64])


In [None]:
# Walk through the batches and check each one against the expected rows.
# The position arithmetic relies on the loader visiting observations in order
# (`shuffle = False`), batch_size at a time.
for batch_number, batch in enumerate(dataloader):
    start = batch_number * batch_size
    stop = min(len(dataset), (batch_number + 1) * batch_size)  # the last batch may be shorter.
    rows = torch.arange(start, stop)

    expected_item_obs = item_obs_all[rows, :, :]
    expected_label = label_all[rows]
    assert torch.all(expected_item_obs == batch.x_dict['item_obs'])
    assert torch.all(expected_label == batch.label)

## Chaining Multiple Datasets: `JointDataset` Examples

In [None]:
# Two clones of the same dataset stand in for two different datasets; each
# sub-dataset is registered under the keyword name it is passed with.
dataset1, dataset2 = dataset.clone(), dataset.clone()
joint_dataset = JointDataset(
    the_dataset=dataset1,
    another_dataset=dataset2,
)

In [None]:
# Display the joint dataset's repr: one line per named sub-dataset.
joint_dataset

JointDataset with 2 sub-datasets: (
	the_dataset: ChoiceDataset(label=[10000], user_onehot=[10000, 10], item_availability=[10000, 4], variable_types=[5], user_obs=[10, 128], item_obs=[4, 64], session_obs=[10000, 234], taste_obs=[10, 4, 567], price_obs=[10000, 4, 12], device=cuda:0)
	another_dataset: ChoiceDataset(label=[10000], user_onehot=[10000, 10], item_availability=[10000, 4], variable_types=[5], user_obs=[10, 128], item_obs=[4, 64], session_obs=[10000, 234], taste_obs=[10, 4, 567], price_obs=[10000, 4, 12], device=cuda:0)
)