Open-sourced PipeDLRM #122

Open · wants to merge 2 commits into base: pipedlrm
Empty file added __init__.py
123 changes: 123 additions & 0 deletions dlrm_data_pytorch.py
@@ -519,6 +519,39 @@ def make_criteo_data_and_loaders(args):
return train_data, train_loader, test_data, test_loader


# for PipeDLRM with multiple data loaders.
def make_criteo_loaders_with_sampler(args, train_data, test_data, num_ranks_in_first_stage):
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_data, num_replicas=num_ranks_in_first_stage,
        rank=args.rank)

    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_data, num_replicas=num_ranks_in_first_stage,
        rank=args.rank)

    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.mini_batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_wrapper_criteo,
        pin_memory=True,
        drop_last=False,  # True
        sampler=train_sampler,
    )
    test_loader = torch.utils.data.DataLoader(
        test_data,
        batch_size=args.test_mini_batch_size,
        shuffle=False,
        num_workers=args.test_num_workers,
        collate_fn=collate_wrapper_criteo,
        pin_memory=True,
        drop_last=False,  # True
        sampler=test_sampler,
    )
    return train_loader, test_loader, train_sampler
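A minimal usage sketch for this helper, assuming an args namespace with the fields referenced above (rank, mini_batch_size, num_workers, nepochs) and train_data/test_data produced by make_criteo_data_and_loaders; num_ranks_in_first_stage=4 matches the four input ranks in the launch script below, but is otherwise an assumption:

# Hypothetical driver code, not part of this PR.
train_data, _, test_data, _ = make_criteo_data_and_loaders(args)
train_loader, test_loader, train_sampler = make_criteo_loaders_with_sampler(
    args, train_data, test_data, num_ranks_in_first_stage=4)

for epoch in range(args.nepochs):
    train_sampler.set_epoch(epoch)  # standard DistributedSampler API: reshuffle the per-rank partition each epoch
    for batch in train_loader:
        pass  # forward/backward is driven by the pipeline runtime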


# uniform distribution (input data)
class RandomDataset(Dataset):

@@ -657,6 +690,96 @@ def make_random_data_and_loader(args, ln_emb, m_den):
return train_data, train_loader


# for PipeDLRM, pseudo data loader.
def make_random_data_and_loader_pipeline(args, ln_emb, m_den):
    train_data = RandomDataset(
        m_den,
        ln_emb,
        args.data_size,
        args.num_batches,
        args.mini_batch_size,
        args.num_indices_per_lookup,
        args.num_indices_per_lookup_fixed,
        1,  # num_targets
        args.round_targets,
        args.data_generation,
        args.data_trace_file,
        args.data_trace_enable_padding,
        reset_seed_on_access=True,
        rand_seed=args.numpy_rand_seed
    )  # WARNING: generates a batch of lookups at once
    test_data = RandomDataset(
        m_den,
        ln_emb,
        args.data_size,
        args.test_num_batches,
        args.test_mini_batch_size,
        args.num_indices_per_lookup,
        args.num_indices_per_lookup_fixed,
        1,  # num_targets
        args.round_targets,
        args.data_generation,
        args.data_trace_file,
        args.data_trace_enable_padding,
        reset_seed_on_access=True,
        rand_seed=args.numpy_rand_seed
    )  # WARNING: generates a batch of lookups at once

    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=1,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_wrapper_random,
        pin_memory=True,
        drop_last=False,  # True
    )

    test_loader = torch.utils.data.DataLoader(
        test_data,
        batch_size=1,
        shuffle=False,
        num_workers=args.test_num_workers,
        collate_fn=collate_wrapper_random,
        pin_memory=True,
        drop_last=False,  # True
    )
    return train_data, train_loader, test_data, test_loader
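Because each RandomDataset item already contains one full synthetic mini-batch (see the WARNING comments above), the DataLoader keeps batch_size=1 and simply streams pre-generated batches. A hedged consumption sketch, assuming the same args namespace as above:

# Hypothetical usage, not part of this PR.
train_data, train_loader, test_data, test_loader = \
    make_random_data_and_loader_pipeline(args, ln_emb, m_den)

for j, batch in enumerate(train_loader):
    if j >= args.num_batches:
        break  # each iteration yields one pre-generated random mini-batch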


def make_random_loader_with_sampler(args, train_data, test_data, num_ranks_in_first_stage):
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_data, num_replicas=num_ranks_in_first_stage,
        rank=args.rank)

    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_data, num_replicas=num_ranks_in_first_stage,
        rank=args.rank)

    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=1,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_wrapper_random,
        pin_memory=True,
        drop_last=False,  # True
        sampler=train_sampler,
    )

    test_loader = torch.utils.data.DataLoader(
        test_data,
        batch_size=1,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_wrapper_random,
        pin_memory=True,
        drop_last=False,  # True
        sampler=test_sampler,
    )
    # Assumed return, mirroring make_criteo_loaders_with_sampler above;
    # the diff hunk ends without an explicit return statement.
    return train_loader, test_loader, train_sampler
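The launch script below feeds real Criteo data to the first-stage ranks and random pseudo data to the remaining ranks. A hedged sketch of how a driver could choose between the two sampler-based helpers based on args.data_generation (the selection logic is an assumption, not taken from this diff):

# Hypothetical selection logic, not part of this PR.
if args.data_generation == "dataset":
    train_loader, test_loader, train_sampler = make_criteo_loaders_with_sampler(
        args, train_data, test_data, num_ranks_in_first_stage)
else:
    train_loader, test_loader, train_sampler = make_random_loader_with_sampler(
        args, train_data, test_data, num_ranks_in_first_stage)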


def generate_random_data(
    m_den,
    ln_emb,
3 changes: 3 additions & 0 deletions env.sh
@@ -0,0 +1,3 @@
#!/bin/bash
export PYTHONPATH=$PYTHONPATH:`pwd`
export PIPEDLRM_HOME=`pwd`
68 changes: 68 additions & 0 deletions exp/pipeline/dlrm_dac_pytorch.sh
@@ -0,0 +1,68 @@
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#WARNING: must have compiled PyTorch and caffe2

# check if an extra argument is passed to the test
if [[ $# == 1 ]]; then
dlrm_extra_option=$1
else
dlrm_extra_option=""
fi
#echo $dlrm_extra_option

dlrm_pt_bin="python main_with_runtime.py"
datagen=dataset
print_freq=256 # 256 # fix
nepochs=1

mini_batch_size=64
num_batches=2560 #613937 # fix
num_workers=4

test_num_batches=200 # fix
test_mini_batch_size=16384 #fix
test_num_workers=16

num_input_rank=3
nrank=6
ngpu=$((nrank-1))
conf_file=hybrid_conf.json
exp_name=tmp

#--mini-batch-size=64 --print-freq=256 --test-freq=1024 --print-time --test-mini-batch-size=128 --test-num-workers=16
#--arch-embedding-size="1460-583-10131227-2202608-305-24-12517-633-3-93145-5683-8351593-3194-27-14992-5461306-10-5652-2173-4-7046547-18-15-286181-105-142572"\

echo "run pytorch ..."
# WARNING: the following parameters will be set based on the data set
# --arch-embedding-size=... (sparse feature sizes)
# --arch-mlp-bot=... (the input to the first layer of bottom mlp)
for r in $(seq 0 $num_input_rank); do
echo "runing input rank $r"
$dlrm_pt_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1"\
--arch-embedding-size="1460-583-10131227-2202608-305-24-12517-633-3-93145-5683-8351593-3194-27-14992-5461306-10-5652-2173-4-7046547-18-15-286181-105-142572"\
--data-generation=$datagen --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz\
--loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=$mini_batch_size\
--print-freq=$print_freq --print-time --test-mini-batch-size=$test_mini_batch_size --test-num-workers=$test_num_workers\
--module models.dlrm.gpus=3 --rank $r --local-rank $r --master-addr 127.0.0.1\
--config-path models/dlrm/gpus\=3/$conf_file --distributed-backend gloo --num-ranks-in-server $nrank\
--use-gpu --num-batches $num_batches --test-num-batches $test_num_batches --nepochs $nepochs --print-freq=$print_freq $dlrm_extra_option 2>&1 > $exp_name/run_pt_$r.log &
done

datagen=random
for r in $(seq $((num_input_rank+1)) $ngpu); do
echo "runing rank $r"
$dlrm_pt_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1"\
--arch-embedding-size="1460-583-10131227-2202608-305-24-12517-633-3-93145-5683-8351593-3194-27-14992-5461306-10-5652-2173-4-7046547-18-15-286181-105-142572"\
--data-generation=$datagen --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz\
--loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=$mini_batch_size\
--print-freq=$print_freq --print-time --test-mini-batch-size=$test_mini_batch_size --test-num-workers=$test_num_workers\
--module models.dlrm.gpus=3 --rank $r --local-rank $r --master-addr 127.0.0.1\
--config-path models/dlrm/gpus\=3/$conf_file --distributed-backend gloo --num-ranks-in-server $nrank\
--use-gpu --num-batches $num_batches --test-num-batches $test_num_batches --nepochs $nepochs --print-freq=$print_freq $dlrm_extra_option 2>&1 > $exp_name/run_pt_$r.log &
done

echo "done"
4 changes: 4 additions & 0 deletions graph/__init__.py
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .graph import Graph, Node
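Re-exporting Graph and Node from the package __init__ lets callers import them from the package root rather than from graph.graph. A minimal consumer sketch, assuming the repo root is on PYTHONPATH (as set up by env.sh):

# Hypothetical consumer code, not part of this PR.
from graph import Graph, Node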