diff --git a/fluid/SE-ResNeXt-152/readme.md b/fluid/SE-ResNeXt-152/readme.md
new file mode 100644
index 0000000..8c11c89
--- /dev/null
+++ b/fluid/SE-ResNeXt-152/readme.md
@@ -0,0 +1,16 @@
+# Benchmark SE-ResNeXt-152
+
+## For a single card:
+```
+env CUDA_VISIBLE_DEVICES=4 python train.py --use_parallel_mode=parallel_do --use_nccl=False --parallel=False --display_step=1
+```
+
+## For multiple cards:
+### Using parallel_do
+```
+env CUDA_VISIBLE_DEVICES=4,5,6,7 python train.py --use_parallel_mode=parallel_do --use_nccl=True --parallel=True --display_step=1
+```
+### Using parallel_exe
+```
+env CUDA_VISIBLE_DEVICES=4,5,6,7 python train.py --use_parallel_mode=parallel_exe --use_nccl=True --parallel=True --display_step=1
+```
diff --git a/fluid/SE-ResNeXt-152/run.sh b/fluid/SE-ResNeXt-152/run.sh
deleted file mode 100644
index 053076c..0000000
--- a/fluid/SE-ResNeXt-152/run.sh
+++ /dev/null
@@ -1 +0,0 @@
-env CUDA_VISIBLE_DEVICES=4 python train_parallel_do.py --use_nccl=False --parallel=False
diff --git a/fluid/SE-ResNeXt-152/train_parallel_executor.py b/fluid/SE-ResNeXt-152/train.py
similarity index 56%
rename from fluid/SE-ResNeXt-152/train_parallel_executor.py
rename to fluid/SE-ResNeXt-152/train.py
index 9c3e36d..13fc951 100644
--- a/fluid/SE-ResNeXt-152/train_parallel_executor.py
+++ b/fluid/SE-ResNeXt-152/train.py
@@ -18,26 +18,65 @@ import distutils.util
 
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
-import paddle.v2.dataset.flowers as flowers
+import paddle.dataset.flowers as flowers
 import paddle.fluid.profiler as profiler
 
+fluid.default_startup_program().random_seed = 111
+
 
 def parse_args():
-    parser = argparse.ArgumentParser('SE-ResNeXt-152 parallel-executor model.')
+    parser = argparse.ArgumentParser('SE-ResNeXt-152 parallel profile.')
+    parser.add_argument(
+        '--class_number', type=int, default=1000, help='the number of classes')
+    parser.add_argument(
+        '--use_parallel_mode',
+        type=str,
+        default='parallel_exe',
+        choices=['parallel_do', 'parallel_exe'],
+        help='The parallel mode ("parallel_do" or "parallel_exe").')
+    parser.add_argument('--batch_size', type=int, default=12, help='total batch size; overridden to per_gpu_batch_size * cards_num.')
+    parser.add_argument('--per_gpu_batch_size', type=int, default=12, help='batch size per GPU.')
     parser.add_argument(
         '--use_mem_opt',
         type=distutils.util.strtobool,
         default=True,
-        help='use memory optimize')
-    parser.add_argument('--per_gpu_batch_size', type=int, default=12, help='')
+        help='use memory optimize or not.')
+    parser.add_argument(
+        '--do_profile',
+        type=distutils.util.strtobool,
+        default=True,
+        help='do profile or not.')
     parser.add_argument(
         '--number_iteration',
        type=int,
-        default=100,
-        help='total batch num for per_gpu_batch_size')
+        default=50,
+        help='total number of batches to run.')
     parser.add_argument('--display_step', type=int, default=1, help='')
+    parser.add_argument(
+        '--skip_first_steps',
+        type=int,
+        default=2,
+        help='The number of initial steps to skip, for a more accurate profile.')
+    parser.add_argument(
+        '--parallel',
+        type=distutils.util.strtobool,
+        default=True,
+        help='It is valid only when use_parallel_mode is parallel_do.')
+    parser.add_argument(
+        '--use_nccl',
+        type=distutils.util.strtobool,
+        default=True,
+        help='It is valid only when use_parallel_mode is parallel_do.')
+    parser.add_argument(
+        '--use_python_reader',
+        type=distutils.util.strtobool,
+        default=True,
+        help='It is valid only when use_parallel_mode is parallel_do. '
+        'If use_python_reader is True, a Python reader is used to feed data, '
+        'which includes the data transfer from CPU to GPU. Otherwise, '
+        'the training data is kept on the GPU side constantly.')
 
     args = parser.parse_args()
     return args
@@ -45,8 +84,13 @@ def parse_args():
 
 def print_arguments(args):
     print('----------- Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s=%s' % (arg, value))
+    if args.use_parallel_mode != "parallel_do":
+        # parallel_exe always uses NCCL, so force the flag so that the
+        # printed configuration matches what actually runs.
+        args.use_nccl = True
+    # Print all arguments, sorted by name.
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s=%s' % (arg, value))
 
 
 def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
@@ -178,16 +222,99 @@ def net_conf(image, label, class_dim):
     return out, avg_cost, accuracy, accuracy5
 
 
-def train():
-    args = parse_args()
+def train_parallel_do(args):
 
-    cards = os.getenv("CUDA_VISIBLE_DEVICES") or ""
-    cards_num = len(cards.split(","))
-    batch_size = args.per_gpu_batch_size * cards_num
+    class_dim = 1000
+    image_shape = [3, 224, 224]
 
-    print_arguments(args)
-    print("cards_num=" + str(cards_num))
-    print("batch_size=" + str(batch_size))
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if args.parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places, use_nccl=args.use_nccl)
+
+        with pd.do():
+            image_ = pd.read_input(image)
+            label_ = pd.read_input(label)
+            out = SE_ResNeXt(input=image_, class_dim=class_dim)
+            cost = fluid.layers.cross_entropy(input=out, label=label_)
+            avg_cost = fluid.layers.mean(x=cost)
+            accuracy = fluid.layers.accuracy(input=out, label=label_)
+            pd.write_output(avg_cost)
+            pd.write_output(accuracy)
+
+        avg_cost, accuracy = pd()
+        avg_cost = fluid.layers.mean(x=avg_cost)
+        accuracy = fluid.layers.mean(x=accuracy)
+    else:
+        out = SE_ResNeXt(input=image, class_dim=class_dim)
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        accuracy = fluid.layers.accuracy(input=out, label=label)
+
+    #optimizer = fluid.optimizer.SGD(learning_rate=0.002)
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=fluid.layers.piecewise_decay(
+            boundaries=[100], values=[0.1, 0.2]),
+        momentum=0.9,
+        regularization=fluid.regularizer.L2Decay(1e-4))
+    opts = optimizer.minimize(avg_cost)
+
+    if args.use_mem_opt:
+        fluid.memory_optimize(fluid.default_main_program())
+
+    place = fluid.CUDAPlace(0)
+    # place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    train_reader = paddle.batch(flowers.train(), batch_size=args.batch_size)
+
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
+    train_reader_iter = train_reader()
+    if not args.use_python_reader:
+        data = train_reader_iter.next()
+        feed_dict = feeder.feed(data)
+
+    time_record = []
+
+    for batch_id in range(args.number_iteration):
+        if args.do_profile and batch_id >= 5 and batch_id < 8:
+            with profiler.profiler('All', 'total',
+                                   '/tmp/profile_parallel_do') as prof:
+                exe.run(fluid.default_main_program(),
+                        feed=feeder.feed(train_reader_iter.next())
+                        if args.use_python_reader else feed_dict,
+                        fetch_list=[],
+                        use_program_cache=True)
+            continue
+
+        train_start = time.time()
+        cost_val = exe.run(fluid.default_main_program(),
+                           feed=feeder.feed(train_reader_iter.next())
+                           if args.use_python_reader else feed_dict,
+                           fetch_list=[avg_cost.name]
+                           if batch_id % args.display_step == 0 else [],
+                           use_program_cache=True)
+        train_stop = time.time()
+        step_time = train_stop - train_start
+        time_record.append(step_time)
+
+        if batch_id % args.display_step == 0:
+            print("iter=%d, elapse=%f, cost=%s" %
+                  (batch_id, step_time, np.array(cost_val[0])))
+
+    for _ in range(args.skip_first_steps):
+        del time_record[0]
+
+    for ele in time_record:
+        print ele
+
+    print("average time:{0}".format(np.mean(time_record)))
+
+
+def train_parallel_exe(args):
 
     class_dim = 1000
     image_shape = [3, 224, 224]
@@ -201,12 +328,14 @@
         shapes=[[-1, 3, 224, 224], [-1, 1]],
         lod_levels=[0, 0],
         dtypes=['float32', 'int64'])
+    # currently, double buffer only supports one device.
     #data_file = fluid.layers.create_double_buffer_reader(reader=data_file, place='CUDA:0')
     image, label = fluid.layers.read_file(reader)
 
     prediction, avg_cost, accuracy, accuracy5 = net_conf(image, label,
                                                          class_dim)
 
+    #optimizer = fluid.optimizer.SGD(learning_rate=0.002)
     optimizer = fluid.optimizer.Momentum(
         learning_rate=fluid.layers.piecewise_decay(
             boundaries=[100], values=[0.1, 0.2]),
@@ -220,13 +349,13 @@
     exe = fluid.ParallelExecutor(loss_name=avg_cost.name, use_cuda=True)
 
-    batch_id = -1
     time_record = []
-    for i in xrange(args.number_iteration):
-        batch_id += 1
-        if batch_id >= 5 and batch_id < 7:
-            with profiler.profiler('All', 'total', '/tmp/profile') as prof:
+    for batch_id in xrange(args.number_iteration):
+
+        if args.do_profile and batch_id >= 5 and batch_id < 8:
+            with profiler.profiler('All', 'total',
+                                   '/tmp/profile_parallel_exe') as prof:
                 exe.run([])
             continue
 
@@ -241,7 +370,9 @@
             print("iter=%d, elapse=%f, cost=%s" %
                   (batch_id, period, np.array(cost_val[0])))
 
-    del time_record[0]
+    for _ in range(args.skip_first_steps):
+        del time_record[0]
+
     for ele in time_record:
         print ele
 
@@ -249,4 +380,16 @@
 
 
 if __name__ == '__main__':
-    train()
+    args = parse_args()
+
+    cards = os.getenv("CUDA_VISIBLE_DEVICES") or ""
+    cards_num = len(cards.split(","))
+    args.batch_size = args.per_gpu_batch_size * cards_num
+
+    print_arguments(args)
+    print("cards_num=" + str(cards_num))
+
+    if args.use_parallel_mode == "parallel_do":
+        train_parallel_do(args)
+    else:
+        train_parallel_exe(args)
diff --git a/fluid/SE-ResNeXt-152/train_parallel_do.py b/fluid/SE-ResNeXt-152/train_parallel_do.py
deleted file mode 100644
index 6e32e55..0000000
--- a/fluid/SE-ResNeXt-152/train_parallel_do.py
+++ /dev/null
@@ -1,259 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import time
-import argparse
-import distutils.util
-
-import paddle.v2 as paddle
-import paddle.fluid as fluid
-import paddle.v2.dataset.flowers as flowers
-import paddle.fluid.profiler as profiler
-
-
-def parse_args():
-    parser = argparse.ArgumentParser('SE-ResNeXt-152 parallel profile.')
-    parser.add_argument('--per_gpu_batch_size', type=int, default=12, help='')
-    parser.add_argument(
-        '--use_mem_opt',
-        type=distutils.util.strtobool,
-        default=True,
-        help='use memory optimize')
-    parser.add_argument(
-        '--skip_first_steps',
-        type=int,
-        default=2,
-        help='The first num of steps to skip, for better performance profile')
-    parser.add_argument(
-        '--total_batch_num',
-        type=int,
-        default=40,
-        help='total batch num for per_gpu_batch_size')
-    parser.add_argument(
-        '--parallel',
-        type=distutils.util.strtobool,
-        default=True,
-        help='use parallel_do')
-    parser.add_argument(
-        '--use_nccl',
-        type=distutils.util.strtobool,
-        default=False,
-        help='use_nccl')
-    parser.add_argument(
-        '--use_python_reader',
-        type=distutils.util.strtobool,
-        default=True,
-        help='use python reader to feed data')
-
-    args = parser.parse_args()
-    return args
-
-
-def print_arguments(args):
-    print('----------- Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s=%s' % (arg, value))
-
-
-def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
-                  act=None):
-    conv = fluid.layers.conv2d(
-        input=input,
-        num_filters=num_filters,
-        filter_size=filter_size,
-        stride=stride,
-        padding=(filter_size - 1) / 2,
-        groups=groups,
-        act=None,
-        bias_attr=False)
-    return fluid.layers.batch_norm(input=conv, act=act)
-
-
-def squeeze_excitation(input, num_channels, reduction_ratio):
-    pool = fluid.layers.pool2d(
-        input=input, pool_size=0, pool_type='avg', global_pooling=True)
-    squeeze = fluid.layers.fc(input=pool,
-                              size=num_channels / reduction_ratio,
-                              act='relu')
-    excitation = fluid.layers.fc(input=squeeze,
-                                 size=num_channels,
-                                 act='sigmoid')
-    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
-    return scale
-
-
-def shortcut(input, ch_out, stride):
-    ch_in = input.shape[1]
-    if ch_in != ch_out:
-        if stride == 1:
-            filter_size = 1
-        else:
-            filter_size = 3
-        return conv_bn_layer(input, ch_out, filter_size, stride)
-    else:
-        return input
-
-
-def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
-    conv0 = conv_bn_layer(
-        input=input, num_filters=num_filters, filter_size=1, act='relu')
-    conv1 = conv_bn_layer(
-        input=conv0,
-        num_filters=num_filters,
-        filter_size=3,
-        stride=stride,
-        groups=cardinality,
-        act='relu')
-    conv2 = conv_bn_layer(
-        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
-    scale = squeeze_excitation(
-        input=conv2,
-        num_channels=num_filters * 2,
-        reduction_ratio=reduction_ratio)
-
-    short = shortcut(input, num_filters * 2, stride)
-
-    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
-
-
-def SE_ResNeXt(input, class_dim, infer=False):
-    cardinality = 64
-    reduction_ratio = 16
-    depth = [3, 8, 36, 3]
-    num_filters = [128, 256, 512, 1024]
-
-    conv = conv_bn_layer(
-        input=input, num_filters=64, filter_size=3, stride=2, act='relu')
-    conv = conv_bn_layer(
-        input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
-    conv = conv_bn_layer(
-        input=conv, num_filters=128, filter_size=3, stride=1, act='relu')
-    conv = fluid.layers.pool2d(
-        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
-
-    for block in range(len(depth)):
-        for i in range(depth[block]):
-            conv = bottleneck_block(
-                input=conv,
-                num_filters=num_filters[block],
-                stride=2 if i == 0 and block != 0 else 1,
-                cardinality=cardinality,
-                reduction_ratio=reduction_ratio)
-
-    pool = fluid.layers.pool2d(
-        input=conv, pool_size=0, pool_type='avg', global_pooling=True)
-    if not infer:
-        drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
-    else:
-        drop = pool
-    out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
-    return out
-
-
-def time_stamp():
-    return int(round(time.time() * 1000))
-
-
-def train():
-    args = parse_args()
-
-    cards = os.getenv("CUDA_VISIBLE_DEVICES") or ""
-    cards_num = len(cards.split(","))
-    step_num = args.total_batch_num / cards_num
-    batch_size = args.per_gpu_batch_size * cards_num
-
-    print_arguments(args)
-    print("cards_num=" + str(cards_num))
-    print("batch_size=" + str(batch_size))
-    print("total_batch_num=" + str(args.total_batch_num))
-    print("step_num=" + str(step_num))
-
-    class_dim = 1000
-    image_shape = [3, 224, 224]
-
-    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.parallel:
-        places = fluid.layers.get_places()
-        pd = fluid.layers.ParallelDo(places, use_nccl=args.use_nccl)
-
-        with pd.do():
-            image_ = pd.read_input(image)
-            label_ = pd.read_input(label)
-            out = SE_ResNeXt(input=image_, class_dim=class_dim)
-            cost = fluid.layers.cross_entropy(input=out, label=label_)
-            avg_cost = fluid.layers.mean(x=cost)
-            accuracy = fluid.layers.accuracy(input=out, label=label_)
-            pd.write_output(avg_cost)
-            pd.write_output(accuracy)
-
-        avg_cost, accuracy = pd()
-        avg_cost = fluid.layers.mean(x=avg_cost)
-        accuracy = fluid.layers.mean(x=accuracy)
-    else:
-        out = SE_ResNeXt(input=image, class_dim=class_dim)
-        cost = fluid.layers.cross_entropy(input=out, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        accuracy = fluid.layers.accuracy(input=out, label=label)
-
-    #optimizer = fluid.optimizer.SGD(learning_rate=0.002)
-    optimizer = fluid.optimizer.Momentum(
-        learning_rate=fluid.layers.piecewise_decay(
-            boundaries=[100], values=[0.1, 0.2]),
-        momentum=0.9,
-        regularization=fluid.regularizer.L2Decay(1e-4))
-    opts = optimizer.minimize(avg_cost)
-
-    if args.use_mem_opt:
-        fluid.memory_optimize(fluid.default_main_program())
-
-    place = fluid.CUDAPlace(0)
-    # place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-
-    train_reader = paddle.batch(flowers.train(), batch_size=batch_size)
-    test_reader = paddle.batch(flowers.test(), batch_size=batch_size)
-    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
-    train_reader_iter = train_reader()
-    data = train_reader_iter.next()
-    feed_dict = feeder.feed(data)
-
-    for pass_id in range(1):
-        #with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-        train_time = 0.0
-
-        for step_id in range(step_num):
-            train_start = time.time()
-            exe.run(fluid.default_main_program(),
-                    feed=feeder.feed(train_reader_iter.next())
-                    if args.use_python_reader else feed_dict,
-                    fetch_list=[],
-                    use_program_cache=True)
-            train_stop = time.time()
-            step_time = train_stop - train_start
-            if step_id >= args.skip_first_steps:
-                train_time += step_time
-            print("step_id=" + str(step_id) + " step_time=" + str(step_time))
-        print("\n\n\n")
-        calc_step_num = step_num - args.skip_first_steps
-        print("calc_step_num=" + str(calc_step_num) + " total_train_time=" +
-              str(train_time) + " ave_step_time=" + str(
-                  float(train_time) / calc_step_num))
-
-
-if __name__ == '__main__':
-    train()
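A note on the boolean switches in train.py: they are declared with `type=distutils.util.strtobool` rather than `type=bool`, presumably because argparse with `type=bool` would treat any non-empty string, including "False", as true. A minimal sketch of the behavior this relies on:

```python
from distutils.util import strtobool

# strtobool maps 'y/yes/t/true/on/1' to 1 and 'n/no/f/false/off/0' to 0,
# so a command-line value like --use_nccl=False really parses as falsy.
print(strtobool('False'))  # -> 0
print(strtobool('True'))   # -> 1
print(bool('False'))       # -> True; this is why type=bool would be wrong
```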
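Both training paths report performance the same way: record the wall time of every step, drop the first `skip_first_steps` warm-up entries, and average the rest. A standalone sketch of that protocol, using hypothetical step times:

```python
import numpy as np

# Hypothetical per-step wall times in seconds; the first steps are
# slower because of one-time setup costs (warm-up).
time_record = [0.92, 0.88, 0.52, 0.51, 0.50]
skip_first_steps = 2

# Drop the warm-up steps, as train.py does before reporting.
for _ in range(skip_first_steps):
    del time_record[0]

# Average only the steady-state steps.
print("average time:{0}".format(np.mean(time_record)))
```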