Transform to numpy arrays #3

Merged
merged 14 commits on Jun 30, 2021
2 changes: 2 additions & 0 deletions .gitattributes
@@ -1,3 +1,5 @@
*.pt filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.json filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
42 changes: 22 additions & 20 deletions code/dataset.py
@@ -5,41 +5,39 @@
import pickle
from torch.utils.data import Dataset, DataLoader
import torch

import json
def mkdir(path):
if os.path.isdir(path) == False:
os.makedirs(path)

import numpy as np
#%% DATALOADERS
class SequentialDataset(Dataset):
'''
Note: displayType has been uncommented for future easy implementation.
'''
def __init__(self, data, sample_uniform_action=False):
def __init__(self, data, sample_uniform_slate=False):

self.data = data
self.num_items = self.data['action'].max()+1
self.sample_uniform_action = sample_uniform_action
logging.info(f"Loading dataset with action size={self.data['action'].size()} and uniform candidate sampling={self.sample_uniform_action}")
self.num_items = self.data['slate'].max()+1
self.sample_uniform_slate = sample_uniform_slate
logging.info(f"Loading dataset with slate size={self.data['slate'].size()} and uniform candidate sampling={self.sample_uniform_slate}")

def __getitem__(self, idx):
batch = {key: val[idx] for key, val in self.data.items()}

if self.sample_uniform_action:
if self.sample_uniform_slate:
# Sample actions uniformly:
action = torch.randint_like(batch['action'], low=3, high=self.num_items)
action = torch.randint_like(batch['slate'], low=3, high=self.num_items)

# Add noclick action at pos0
# and the actual click action at pos 1 (unless noclick):
action[:,0] = 1
clicked = batch['click']!=1
action[:,1][clicked] = batch['click'][clicked]
batch['action'] = action
batch['slate'] = action
# Set click idx to 0 if noclick, and 1 otherwise:
batch['click_idx'] = clicked.long()



return batch

def __len__(self):
@@ -50,14 +48,19 @@ def load_dataloaders(data_dir,
batch_size=1024,
split_trainvalid=0.90,
t_testsplit = 5,
sample_uniform_action=False):
sample_uniform_slate=False):

logging.info('Load data..')
data = torch.load(f'{data_dir}/data.pt')
dataset = SequentialDataset(data, sample_uniform_action)
with np.load(f'{data_dir}/data.npz') as data_np:
data = {key: torch.tensor(val) for key, val in data_np.items()}
dataset = SequentialDataset(data, sample_uniform_slate)

with open(f'{data_dir}/ind2val.pickle', 'rb') as handle:
ind2val = pickle.load(handle)
with open(f'{data_dir}/ind2val.json', 'rb') as handle:
# Use string2int object_hook found here: https://stackoverflow.com/a/54112705
ind2val = json.load(
handle,
object_hook=lambda d: {int(k) if k.lstrip('-').isdigit() else k: v for k, v in d.items()}
)

num_validusers = int(len(dataset) * (1-split_trainvalid)/2)
num_testusers = int(len(dataset) * (1-split_trainvalid)/2)
@@ -79,16 +82,15 @@ }
}

dataloaders = {
phase: DataLoader(ds, batch_size=batch_size, shuffle=True)
phase: DataLoader(ds, batch_size=batch_size, shuffle=(phase=="train"), num_workers=12)
for phase, ds in subsets.items()
}
for key, dl in dataloaders.items():
logging.info(
f"In {key}: num_users: {len(dl.dataset)}, num_batches: {len(dl)}"
)


with open(f'{data_dir}/itemattr.pickle', 'rb') as handle:
itemattr = pickle.load(handle)
with np.load(f'{data_dir}/itemattr.npz', mmap_mode=None) as itemattr_file:
itemattr = {key : val for key, val in itemattr_file.items()}

return ind2val, itemattr, dataloaders
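As an aside on the int-keyed JSON trick above: JSON object keys are always strings, so the integer indices in ind2val have to be converted back on load. A minimal sketch of what the object_hook does, using a made-up toy payload rather than the real ind2val.json:

import json

# The hook is applied to every decoded dict; digit-like keys become ints,
# while other keys (e.g. "category") stay as strings.
def string2int(d):
    return {int(k) if k.lstrip('-').isdigit() else k: v for k, v in d.items()}

payload = '{"category": {"0": "PAD", "1": "noClick"}}'  # toy payload, not the actual file
ind2val = json.loads(payload, object_hook=string2int)
assert ind2val == {'category': {0: 'PAD', 1: 'noClick'}}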
3 changes: 3 additions & 0 deletions data/data.npz
Git LFS file not shown
3 changes: 3 additions & 0 deletions data/ind2val.json
Git LFS file not shown
3 changes: 3 additions & 0 deletions data/itemattr.npz
Git LFS file not shown
48 changes: 48 additions & 0 deletions data/transform_data_to_numpy.py
@@ -0,0 +1,48 @@
### TRANSFORM DATA FILES FROM PYTORCH ARRAYS TO NUMPY ARRAYS
# The original dataset was only available as a pytorch dataset,
# which makes it less accessible for non-pytorch users.
# Further, it was saved in pickle format, which is fragile across library versions.
# Lastly, some of the field names are fairly internal; change these to more understandable names.

#%% Imports
import torch
import numpy as np
import pickle


# %% Transform interaction data
# We rename the displayed items from "action" to "slate".
# Otherwise this is just a transformation from pytorch to numpy arrays.
data_pt = torch.load("data.pt")
# Transform some of the arrays directly to numpy arrays:
transform_directly = ['userId','click','click_idx']
data_np = {key : data_pt[key].numpy() for key in transform_directly}

# Transform the displayed items with name changes of the fields:
data_np['slate_lengths'] = data_pt['lengths'].numpy()
data_np['slate'] = data_pt['action'].numpy()
data_np['interaction_type'] = data_pt['displayType'].numpy()

# Save the interaction data directly with compressed numpy:
np.savez_compressed('data', **data_np)

# %% Transform the index file (ind2val):
# The userId and itemId mappings are scrambled and are not useful for any purpose.
# Remove these to reduce data size.
# Also, we have renamed "displayType" to "interaction_type" in data, so do the same here.

ind2val_old = pickle.load(open("ind2val.pickle", "rb"))

ind2val_new = {
'category' : ind2val_old['category'],
'interaction_type' : ind2val_old['displayType']
}
import json
with open('ind2val.json', 'w') as json_file:
json.dump(ind2val_new, json_file)

#%% Transform item attributes (itemattr.pickle)
# Save only the category vector, and save in npz format.
itemattr_old = pickle.load(open("itemattr.pickle","rb"))
itemattr_new = {'category' : itemattr_old['category']}
np.savez_compressed('itemattr', **itemattr_new)
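As a sanity check (illustrative, not part of this script), the converted files can be read back without any pytorch dependency; a minimal sketch, assuming the three files produced above sit in the working directory:

import json
import numpy as np

# Interaction data: a dict of plain numpy arrays.
with np.load("data.npz") as data_np:
    data = {key: val for key, val in data_np.items()}
print({key: arr.shape for key, arr in data.items()})

# Item attributes: only the category vector is kept.
with np.load("itemattr.npz") as itemattr_file:
    category = itemattr_file["category"]

# Index mapping: keys come back as strings here; see the object_hook in code/dataset.py.
with open("ind2val.json") as handle:
    ind2val = json.load(handle)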
19 changes: 19 additions & 0 deletions datahelper.py
@@ -0,0 +1,19 @@
import logging
from google_drive_downloader import GoogleDriveDownloader as gdd
def download_data_files(data_dir : str = "data", overwrite=False):
"""
Downloads the data from google drive.
If files exist they will not be downloaded again unless overwrite=True
"""
gdrive_file_ids = {
'data.npz' : '1VXKXIvPCJ7z4BCa4G_5-Q2XMAD7nXOc7',
'ind2val.json' : '1WOCKfuttMacCb84yQYcRjxjEtgPp6F4N',
'itemattr.npz' : '1rKKyMQZqWp8vQ-Pl1SeHrQxzc5dXldnR'
}
for filename, gdrive_id in gdrive_file_ids.items():
logging.info("Downloading {}".format(filename))
gdd.download_file_from_google_drive(file_id=gdrive_id,
dest_path="{}/{}".format(data_dir, filename),
overwrite=overwrite)
logging.info("Done downloading all files.")
return True
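A short usage sketch of this helper (assuming the googledrivedownloader package is installed):

import logging
import datahelper

logging.basicConfig(level="INFO")
# Files already present in ./data are skipped unless overwrite=True:
datahelper.download_data_files(data_dir="data", overwrite=False)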
1 change: 0 additions & 1 deletion dataset.py

This file was deleted.

108 changes: 108 additions & 0 deletions dataset_torch.py
@@ -0,0 +1,108 @@
#%% Imports
import torch
import datahelper
# datahelper.download_data_files()

#%%
import os
import logging
logging.basicConfig(format='%(asctime)s %(message)s', level='INFO')
import pickle
from torch.utils.data import Dataset, DataLoader
import torch
import json
def mkdir(path):
if os.path.isdir(path) == False:
os.makedirs(path)
import numpy as np
#%% DATALOADERS
class SequentialDataset(Dataset):
'''
Note: displayType has been uncommented for future easy implementation.
'''
def __init__(self, data, sample_uniform_slate=False):

self.data = data
self.num_items = self.data['slate'].max()+1
self.sample_uniform_slate = sample_uniform_slate
logging.info("Loading dataset with slate size={} and uniform candidate sampling={}".format(self.data['slate'].size(), self.sample_uniform_slate))

def __getitem__(self, idx):
batch = {key: val[idx] for key, val in self.data.items()}

if self.sample_uniform_slate:
# Sample actions uniformly:
action = torch.randint_like(batch['slate'], low=3, high=self.num_items)

# Add noclick action at pos0
# and the actual click action at pos 1 (unless noclick):
action[:,0] = 1
clicked = batch['click']!=1
action[:,1][clicked] = batch['click'][clicked]
batch['slate'] = action
# Set click idx to 0 if noclick, and 1 otherwise:
batch['click_idx'] = clicked.long()

return batch

def __len__(self):
return len(self.data['click'])

#%% PREPARE DATA IN TRAINING
def load_dataloaders(data_dir,
batch_size=1024,
split_trainvalid=0.90,
t_testsplit = 5,
num_workers = 0,
sample_uniform_slate=False):

logging.info("Download data if not in data folder..")
datahelper.download_data_files(data_dir=data_dir)

logging.info('Load data..')
with np.load("{}/data.npz".format(data_dir)) as data_np:
data = {key: torch.tensor(val) for key, val in data_np.items()}
dataset = SequentialDataset(data, sample_uniform_slate)

with open('{}/ind2val.json'.format(data_dir), 'rb') as handle:
# Use string2int object_hook found here: https://stackoverflow.com/a/54112705
ind2val = json.load(
handle,
object_hook=lambda d: {int(k) if k.lstrip('-').isdigit() else k: v for k, v in d.items()}
)

# Split dataset into train, validation and test:
num_validusers = int(len(dataset) * (1-split_trainvalid)/2)
num_testusers = int(len(dataset) * (1-split_trainvalid)/2)
torch.manual_seed(0)
num_users = len(dataset)
perm_user = torch.randperm(num_users)
valid_user_idx = perm_user[:num_validusers]
test_user_idx = perm_user[num_validusers:(num_validusers+num_testusers)]
train_user_idx = perm_user[(num_validusers+num_testusers):]
# Mask type: 1: train, 2: valid, 3: test
dataset.data['mask_type'] = torch.ones_like(dataset.data['click'])
dataset.data['mask_type'][valid_user_idx, t_testsplit:] = 2
dataset.data['mask_type'][test_user_idx, t_testsplit:] = 3

subsets = {
'train': dataset,
'valid': torch.utils.data.Subset(dataset, valid_user_idx),
'test': torch.utils.data.Subset(dataset, test_user_idx)
}

# Build dataloaders for each data subset:
dataloaders = {
phase: DataLoader(ds, batch_size=batch_size, shuffle=(phase=="train"), num_workers=num_workers)
for phase, ds in subsets.items()
}
for key, dl in dataloaders.items():
logging.info(
"In {}: num_users: {}, num_batches: {}".format(key, len(dl.dataset), len(dl))
)

# Load item attributes:
with np.load('{}/itemattr.npz'.format(data_dir), mmap_mode=None) as itemattr_file:
itemattr = {key : val for key, val in itemattr_file.items()}

return ind2val, itemattr, dataloaders
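Finally, a hedged end-to-end sketch of how dataset_torch.load_dataloaders might be called; the returned phases and batch keys follow the code above, everything else is illustrative:

import dataset_torch

ind2val, itemattr, dataloaders = dataset_torch.load_dataloaders(
    data_dir="data",
    batch_size=1024,
    num_workers=0,
    sample_uniform_slate=False,
)

for batch in dataloaders["train"]:
    # Each batch is a dict of tensors keyed by the fields in data.npz
    # (slate, click, click_idx, ...) plus the mask_type added during the split.
    print(batch["slate"].shape, batch["click"].shape)
    break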