Commit 9ea5525

Merge branch 'master' into untyped_storage

drivanov committed Jan 29, 2024
2 parents 7cedd70 + fe78093 commit 9ea5525
Showing 33 changed files with 1,050 additions and 409 deletions.
20 changes: 11 additions & 9 deletions Jenkinsfile
@@ -318,7 +318,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-cpu"
image "dgllib/dgl-ci-cpu:v231103_1700"
image "dgllib/dgl-ci-cpu:v240123_1000"
args "-u root"
alwaysPull true
}
@@ -337,7 +337,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-cpu"
image "dgllib/dgl-ci-gpu:cu116_v231103_1700"
image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
args "-u root"
alwaysPull true
}
@@ -392,7 +392,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-cpu"
image "dgllib/dgl-ci-cpu:v231103_1700"
image "dgllib/dgl-ci-cpu:v240123_1000"
args "-u root"
alwaysPull true
}
@@ -411,7 +411,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-gpu"
image "dgllib/dgl-ci-gpu:cu116_v231103_1700"
image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
args "-u root --runtime nvidia"
alwaysPull true
}
@@ -451,6 +451,8 @@ pipeline {
steps {
unit_test_linux('tensorflow', 'cpu')
}
// Tensorflow is deprecated.
when { expression { false } }
}
}
post {
@@ -464,7 +466,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-gpu"
image "dgllib/dgl-ci-gpu:cu116_v231103_1700"
image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
args "-u root --runtime nvidia"
alwaysPull true
}
@@ -489,7 +491,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-cpu"
image "dgllib/dgl-ci-cpu:v231103_1700"
image "dgllib/dgl-ci-cpu:v240123_1000"
args "-u root --shm-size=4gb"
alwaysPull true
}
@@ -542,7 +544,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-gpu"
image "dgllib/dgl-ci-gpu:cu116_v231103_1700"
image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
args "-u root --runtime nvidia --shm-size=8gb"
alwaysPull true
}
@@ -571,7 +573,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-cpu"
image "dgllib/dgl-ci-cpu:v231103_1700"
image "dgllib/dgl-ci-cpu:v240123_1000"
args "-u root --shm-size=4gb"
alwaysPull true
}
@@ -618,7 +620,7 @@ pipeline {
agent {
docker {
label "dgl-ci-linux-cpu"
image "dgllib/dgl-ci-cpu:v231103_1700"
image "dgllib/dgl-ci-cpu:v240123_1000"
args "-u root"
alwaysPull true
}
1 change: 1 addition & 0 deletions docker/install/conda_env/torch_cpu_pip.txt
@@ -21,4 +21,5 @@ torch==1.13.0+cpu
torchdata
torcheval
torchmetrics
torch_geometric
tqdm
1 change: 1 addition & 0 deletions docker/install/conda_env/torch_gpu_pip.txt
@@ -19,4 +19,5 @@ torch==1.13.0+cu116
torchdata
torcheval
torchmetrics
torch_geometric
tqdm
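
Both environment files now pull in `torch_geometric` alongside the pinned torch stack. A quick, purely illustrative sanity check one could run inside the CI image to confirm the new dependency resolves (the exact versions are whatever the pinned files install):

```python
# Illustrative import check for the newly added dependency; not part of the CI scripts.
import torch
import torch_geometric

print("torch:", torch.__version__)                      # 1.13.0+cpu or 1.13.0+cu116 per the pins
print("torch_geometric:", torch_geometric.__version__)
```
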
16 changes: 8 additions & 8 deletions examples/core/rgcn/README.md
@@ -20,15 +20,15 @@ Below results are roughly collected from an AWS EC2 **g4dn.metal**, 384GB RAM, 9

| Dataset Size | CPU RAM Usage | Num of GPUs | GPU RAM Usage | Time Per Epoch(Training) |
| ------------ | ------------- | ----------- | ------------- | ------------------------ |
| ~1.1GB | ~5GB | 0 | 0GB | ~243s |
| ~1.1GB | ~3GB | 1 | 4.4GB | ~81s |
| ~1.1GB | ~7GB | 0 | 0GB | ~233s |
| ~1.1GB | ~5GB | 1 | 4.5GB | ~73.6s |

### Accuracies
```
Epoch: 01, Loss: 2.3302, Valid: 47.76%, Test: 46.58%
Epoch: 02, Loss: 1.5486, Valid: 48.31%, Test: 47.12%
Epoch: 03, Loss: 1.1469, Valid: 46.43%, Test: 45.18%
Test accuracy 45.1227
Epoch: 01, Loss: 2.3386, Valid: 47.67%, Test: 46.96%
Epoch: 02, Loss: 1.5563, Valid: 47.66%, Test: 47.02%
Epoch: 03, Loss: 1.1557, Valid: 46.58%, Test: 45.42%
Test accuracy 45.3850
```

## Run on `ogb-lsc-mag240m` dataset
@@ -54,8 +54,8 @@ Below results are roughly collected from an AWS EC2 **g4dn.metal**, 384GB RAM, 9

| Dataset Size | CPU RAM Usage | Num of GPUs | GPU RAM Usage | Time Per Epoch(Training) |
| ------------ | ------------- | ----------- | ------------- | ------------------------ |
| ~404GB | ~60GB | 0 | 0GB | ~216s |
| ~404GB | ~60GB | 1 | 7GB | ~157s |
| ~404GB | ~72GB | 0 | 0GB | ~325s |
| ~404GB | ~61GB | 1 | 14GB | ~178s |

### Accuracies
```
51 changes: 21 additions & 30 deletions examples/multigpu/graphbolt/node_classification.py
@@ -89,9 +89,7 @@ def create_dataloader(
features,
itemset,
device,
drop_last=False,
shuffle=True,
drop_uneven_inputs=False,
is_train,
):
############################################################################
# [HIGHLIGHT]
@@ -122,9 +120,9 @@ def create_dataloader(
datapipe = gb.DistributedItemSampler(
item_set=itemset,
batch_size=args.batch_size,
drop_last=drop_last,
shuffle=shuffle,
drop_uneven_inputs=drop_uneven_inputs,
drop_last=is_train,
shuffle=is_train,
drop_uneven_inputs=is_train,
)
############################################################################
# [Note]:
@@ -141,7 +139,10 @@ def create_dataloader(
if args.storage_device == "cpu":
datapipe = datapipe.copy_to(device)

dataloader = gb.DataLoader(datapipe, args.num_workers)
# Until https://github.com/dmlc/dgl/issues/7008, overlap should be False.
dataloader = gb.DataLoader(
datapipe, args.num_workers, overlap_feature_fetch=False
)

# Return the fully-initialized DataLoader object.
return dataloader
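
The hunks above collapse the three sampler flags into a single `is_train` switch and keep `overlap_feature_fetch` off until dmlc/dgl#7008 is resolved. A minimal sketch of the resulting helper, assuming the usual `gb` alias and an `args` namespace with `batch_size` and `num_workers`; the intermediate sampling and feature-fetch stages elided in the diff are omitted here as well:

```python
import dgl.graphbolt as gb

def build_dataloader(itemset, args, device, is_train):
    # One flag now drives every training-only sampler behaviour.
    datapipe = gb.DistributedItemSampler(
        item_set=itemset,
        batch_size=args.batch_size,
        drop_last=is_train,           # drop the ragged tail batch only when training
        shuffle=is_train,             # shuffle only when training
        drop_uneven_inputs=is_train,  # keep ranks in lockstep only when training
    )
    # The example guards this step with `if args.storage_device == "cpu"`.
    datapipe = datapipe.copy_to(device)
    # Feature-fetch overlap stays disabled until dmlc/dgl#7008 is fixed.
    return gb.DataLoader(datapipe, args.num_workers, overlap_feature_fetch=False)
```
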
@@ -187,7 +188,7 @@ def train(
epoch_start = time.time()

model.train()
total_loss = torch.tensor(0, dtype=torch.float).to(device)
total_loss = torch.tensor(0, dtype=torch.float, device=device)
########################################################################
# (HIGHLIGHT) Use Join Context Manager to solve uneven input problem.
#
@@ -227,20 +228,17 @@ def train(
loss.backward()
optimizer.step()

total_loss += loss
total_loss += loss.detach()

# Evaluate the model.
if rank == 0:
print("Validating...")
acc = (
evaluate(
rank,
model,
valid_dataloader,
num_classes,
device,
)
/ world_size
acc = evaluate(
rank,
model,
valid_dataloader,
num_classes,
device,
)
########################################################################
# (HIGHLIGHT) Collect accuracy and loss values from sub-processes and
@@ -252,14 +250,13 @@ def train(
dist.reduce(tensor=acc, dst=0)
total_loss /= step + 1
dist.reduce(tensor=total_loss, dst=0)
dist.barrier()

epoch_end = time.time()
if rank == 0:
print(
f"Epoch {epoch:05d} | "
f"Average Loss {total_loss.item() / world_size:.4f} | "
f"Accuracy {acc.item():.4f} | "
f"Accuracy {acc.item() / world_size:.4f} | "
f"Time {epoch_end - epoch_start:.4f}"
)
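
The two train-loop hunks above keep the running loss graph-free via `loss.detach()` and move the division by `world_size` out of `evaluate` and into the rank-0 print after `dist.reduce`. A compact sketch of that reduction pattern, assuming the process group is already initialized and both tensors live on the local device:

```python
import torch.distributed as dist

def report_metrics(rank, world_size, total_loss, acc, num_batches):
    # total_loss must be accumulated from detached values; summing the raw loss
    # would keep every iteration's autograd graph alive.
    total_loss = total_loss / num_batches
    dist.reduce(tensor=acc, dst=0)         # sum of per-rank accuracies on rank 0
    dist.reduce(tensor=total_loss, dst=0)  # sum of per-rank mean losses on rank 0
    if rank == 0:
        print(
            f"Average Loss {total_loss.item() / world_size:.4f} | "
            f"Accuracy {acc.item() / world_size:.4f}"
        )
```
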

@@ -301,29 +298,23 @@ def run(rank, world_size, args, devices, dataset):
dataset.feature,
train_set,
device,
drop_last=False,
shuffle=True,
drop_uneven_inputs=False,
is_train=True,
)
valid_dataloader = create_dataloader(
args,
dataset.graph,
dataset.feature,
valid_set,
device,
drop_last=False,
shuffle=False,
drop_uneven_inputs=False,
is_train=False,
)
test_dataloader = create_dataloader(
args,
dataset.graph,
dataset.feature,
test_set,
device,
drop_last=False,
shuffle=False,
drop_uneven_inputs=False,
is_train=False,
)

# Model training.
@@ -354,7 +345,7 @@ def run(rank, world_size, args, devices, dataset):
/ world_size
)
dist.reduce(tensor=test_acc, dst=0)
dist.barrier()
torch.cuda.synchronize()
if rank == 0:
print(f"Test Accuracy {test_acc.item():.4f}")

16 changes: 10 additions & 6 deletions examples/multigpu/node_classification_sage.py
@@ -171,12 +171,16 @@ def train(
use_uva,
):
# Instantiate a neighbor sampler
sampler = NeighborSampler(
[10, 10, 10],
prefetch_node_feats=["feat"],
prefetch_labels=["label"],
fused=(args.mode != "benchmark"),
)
if args.mode == "benchmark":
# A work-around to prevent CUDA running error. For more details, please
# see https://github.com/dmlc/dgl/issues/6697.
sampler = NeighborSampler([10, 10, 10], fused=False)
else:
sampler = NeighborSampler(
[10, 10, 10],
prefetch_node_feats=["feat"],
prefetch_labels=["label"],
)
train_dataloader = DataLoader(
g,
train_idx,
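
For reference, the fused-sampling workaround above (dmlc/dgl#6697) condensed into a standalone helper; `mode` stands in for the example's `--mode` argument, and the fanouts and prefetch keys are the ones used in the script:

```python
from dgl.dataloading import NeighborSampler

def build_sampler(mode: str) -> NeighborSampler:
    if mode == "benchmark":
        # Work-around for dmlc/dgl#6697: fall back to non-fused sampling and
        # skip feature prefetching in benchmark mode.
        return NeighborSampler([10, 10, 10], fused=False)
    return NeighborSampler(
        [10, 10, 10],
        prefetch_node_feats=["feat"],
        prefetch_labels=["label"],
    )
```
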
6 changes: 4 additions & 2 deletions examples/sampling/graphbolt/node_classification.py
@@ -287,7 +287,9 @@ def evaluate(args, model, graph, features, itemset, num_classes):


def train(args, graph, features, train_set, valid_set, num_classes, model):
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
optimizer = torch.optim.Adam(
model.parameters(), lr=args.lr, weight_decay=5e-4
)
dataloader = create_dataloader(
graph=graph,
features=features,
@@ -343,7 +345,7 @@ def parse_args():
parser.add_argument(
"--lr",
type=float,
default=0.0005,
default=1e-3,
help="Learning rate for optimization.",
)
parser.add_argument(
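
The two hunks above add L2 regularization to the optimizer and raise the default learning rate from 5e-4 to 1e-3. A tiny sketch of the new optimizer construction under those defaults; the `make_optimizer` helper itself is illustrative, not part of the example:

```python
import torch

def make_optimizer(model: torch.nn.Module, lr: float = 1e-3):
    # Commit defaults: lr 1e-3 (previously 5e-4) plus weight_decay 5e-4.
    return torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
```
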
24 changes: 12 additions & 12 deletions examples/sampling/graphbolt/rgcn/README.md
@@ -19,15 +19,15 @@ Below results are roughly collected from an AWS EC2 **g4dn.metal**, 384GB RAM, 9

| Dataset Size | CPU RAM Usage | Num of GPUs | GPU RAM Usage | Time Per Epoch(Training) |
| ------------ | ------------- | ----------- | ------------- | ------------------------ |
| ~1.1GB | ~4.5GB | 0 | 0GB | ~235s |
| ~1.1GB | ~2GB | 1 | 4.4GB | ~60s |
| ~1.1GB | ~5.3GB | 0 | 0GB | ~230s |
| ~1.1GB | ~3GB | 1 | 3.87GB | ~64.6s |

### Accuracies
```
Epoch: 01, Loss: 2.6736, Valid accuracy: 42.21%
Epoch: 02, Loss: 2.0809, Valid accuracy: 42.51%
Epoch: 03, Loss: 1.8143, Valid accuracy: 42.76%
Test accuracy 41.4817
Epoch: 01, Loss: 2.3434, Valid accuracy: 48.23%
Epoch: 02, Loss: 1.5646, Valid accuracy: 48.49%
Epoch: 03, Loss: 1.1633, Valid accuracy: 45.79%
Test accuracy 44.6792
```

## Run on `ogb-lsc-mag240m` dataset
@@ -47,17 +47,17 @@ Below results are roughly collected from an AWS EC2 **g4dn.metal**, 384GB RAM, 9

> **note:**
`buffer/cache` are highly used during train, it's about 300GB. If more RAM is available, more `buffer/cache` will be consumed as graph size is about 55GB and feature data is about 350GB.
One more thing, first epoch is quite slow as `buffer/cache` is not ready yet. For GPU train, first epoch takes **34:56min, 1.93s/it**.
One more thing, first epoch is quite slow as `buffer/cache` is not ready yet. For GPU train, first epoch takes **1030s**.
Even in following epochs, time consumption varies.

| Dataset Size | CPU RAM Usage | Num of GPUs | GPU RAM Usage | Time Per Epoch(Training) |
| ------------ | ------------- | ----------- | ------------- | ------------------------ |
| ~404GB | ~55GB | 0 | 0GB | ~197s |
| ~404GB | ~55GB | 1 | 7GB | ~119s |
| ~404GB | ~67GB | 0 | 0GB | ~248s |
| ~404GB | ~60GB | 1 | 15GB | ~166s |

### Accuracies
```
Epoch: 01, Loss: 2.3038, Valid accuracy: 46.33%
Epoch: 02, Loss: 2.1160, Valid accuracy: 46.47%
Epoch: 03, Loss: 2.0847, Valid accuracy: 48.38%
Epoch: 01, Loss: 2.1432, Valid accuracy: 50.21%
Epoch: 02, Loss: 1.9267, Valid accuracy: 50.77%
Epoch: 03, Loss: 1.8797, Valid accuracy: 53.38%
```
5 changes: 3 additions & 2 deletions graphbolt/include/graphbolt/cuda_ops.h
@@ -113,15 +113,16 @@ std::tuple<torch::Tensor, torch::Tensor> IndexSelectCSCImpl(
* given nodes and their indptr values.
*
* @param indptr The indptr tensor.
* @param nodes The nodes to read from indptr
* @param nodes The nodes to read from indptr. If not provided, assumed to be
* equal to torch.arange(indptr.size(0) - 1).
*
* @return Tuple of tensors with values:
* (indptr[nodes + 1] - indptr[nodes], indptr[nodes]), the returned indegrees
* tensor (first one) has size nodes.size(0) + 1 so that calling ExclusiveCumSum
* on it gives the output indptr.
*/
std::tuple<torch::Tensor, torch::Tensor> SliceCSCIndptr(
torch::Tensor indptr, torch::Tensor nodes);
torch::Tensor indptr, torch::optional<torch::Tensor> nodes);

/**
* @brief Given the compacted sub_indptr tensor, edge type tensor and
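
The updated doc comment spells out both the return value of `SliceCSCIndptr` and what an omitted `nodes` argument means. A pure-PyTorch illustration of that contract (names here are illustrative; the real operator is a CUDA kernel and additionally sizes the in-degree tensor `nodes.size(0) + 1` so an exclusive cumsum yields the output indptr):

```python
from typing import Optional

import torch

def slice_csc_indptr(indptr: torch.Tensor, nodes: Optional[torch.Tensor] = None):
    if nodes is None:
        # The newly optional argument: omitted nodes mean "all nodes".
        nodes = torch.arange(indptr.size(0) - 1, device=indptr.device)
    in_degree = indptr[nodes + 1] - indptr[nodes]
    sliced_indptr = indptr[nodes]
    return in_degree, sliced_indptr

# A 4-node CSC graph with 2, 0, 3 and 1 incoming edges per node.
indptr = torch.tensor([0, 2, 2, 5, 6])
print(slice_csc_indptr(indptr))                        # (tensor([2, 0, 3, 1]), tensor([0, 2, 2, 5]))
print(slice_csc_indptr(indptr, torch.tensor([2, 3])))  # (tensor([3, 1]), tensor([2, 5]))
```
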
9 changes: 5 additions & 4 deletions graphbolt/include/graphbolt/cuda_sampling_ops.h
@@ -19,7 +19,8 @@ namespace ops {
*
* @param indptr Index pointer array of the CSC.
* @param indices Indices array of the CSC.
* @param nodes The nodes from which to sample neighbors.
* @param nodes The nodes from which to sample neighbors. If not provided,
* assumed to be equal to torch.arange(indptr.size(0) - 1).
* @param fanouts The number of edges to be sampled for each node with or
* without considering edge types.
* - When the length is 1, it indicates that the fanout applies to all
Expand Down Expand Up @@ -49,9 +50,9 @@ namespace ops {
* the sampled graph's information.
*/
c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
torch::Tensor indptr, torch::Tensor indices, torch::Tensor nodes,
const std::vector<int64_t>& fanouts, bool replace, bool layer,
bool return_eids,
torch::Tensor indptr, torch::Tensor indices,
torch::optional<torch::Tensor> nodes, const std::vector<int64_t>& fanouts,
bool replace, bool layer, bool return_eids,
torch::optional<torch::Tensor> type_per_edge = torch::nullopt,
torch::optional<torch::Tensor> probs_or_mask = torch::nullopt);

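
`SampleNeighbors` adopts the same convention: with `nodes` omitted, neighbors are sampled for every node of the CSC graph. A rough CPU reference of that interface under a single uniform fanout, with no edge types, probability masks or layer sampling; it mirrors the contract only, not the CUDA implementation:

```python
from typing import Optional

import torch

def sample_neighbors(
    indptr: torch.Tensor,
    indices: torch.Tensor,
    nodes: Optional[torch.Tensor] = None,
    fanout: int = 2,
    replace: bool = False,
):
    if nodes is None:
        # Same default as the new C++ signature: all nodes in the graph.
        nodes = torch.arange(indptr.size(0) - 1)
    sampled = []
    for n in nodes.tolist():
        start, end = indptr[n].item(), indptr[n + 1].item()
        neighbors = indices[start:end]
        if replace and neighbors.numel() > 0:
            pick = torch.randint(neighbors.numel(), (fanout,))
        else:
            # Without replacement: take at most `fanout` distinct neighbors.
            pick = torch.randperm(neighbors.numel())[:fanout]
        sampled.append(neighbors[pick])
    return sampled

indptr = torch.tensor([0, 2, 2, 5, 6])
indices = torch.tensor([1, 3, 0, 1, 2, 0])
print(sample_neighbors(indptr, indices, fanout=2))  # neighbor samples for all 4 nodes
```
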
