Skip to content

Commit

Permalink
Dev/wangyi/fix/inference on ipu (PaddlePaddle#176)
Browse files Browse the repository at this point in the history
1. fix inference problem on CPU
2. fix inference problem on IPU
  • Loading branch information
yiakwy-xpu-ml-framework-team committed Sep 18, 2021
1 parent 0cc55f8 commit 3eb17d6
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,20 @@ Examples:

> bash infer_with_ipu.sh
Example:

```
(py37_paddle-ipu) [docker-λ>] leiw@gbnwx-pod006-3-in_docker_dev:~/Paddle/python/paddle/fluid/tests/unittests/ipu/test_dataset/mnist$ bash infer_with_ipu.sh
[09/18 07:14:40] mnist:infer INFO: Reading data ...
[09/18 07:14:40] mnist:infer INFO: Complete reading image infer_3.png
[09/18 07:14:40] mnist:infer INFO: Constructing the computation graph ...
[09/18 07:15:12] mnist:infer INFO: Computation graph built.
[09/18 07:15:12] mnist:infer INFO: Change batch size of var %s from %d to %d
[09/18 07:15:12] mnist:infer INFO: Drawing IR graph ...
[09/18 07:15:12] mnist:infer INFO: Complete drawing.
digit hand write number picture is recognized as : 3
```

## Inference on IPU with Analysis API (c++) backend

We will add this example in the future
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,21 @@ MNSIT 数据集足够小,可以在1个IPU上运行。对于精度对齐市足

> bash infer_with_ipu.sh
示例:

```
(py37_paddle-ipu) [docker-λ>] leiw@gbnwx-pod006-3-in_docker_dev:~/Paddle/python/paddle/fluid/tests/unittests/ipu/test_dataset/mnist$ bash infer_with_ipu.sh
[09/18 07:14:40] mnist:infer INFO: Reading data ...
[09/18 07:14:40] mnist:infer INFO: Complete reading image infer_3.png
[09/18 07:14:40] mnist:infer INFO: Constructing the computation graph ...
[09/18 07:15:12] mnist:infer INFO: Computation graph built.
[09/18 07:15:12] mnist:infer INFO: Change batch size of var %s from %d to %d
[09/18 07:15:12] mnist:infer INFO: Drawing IR graph ...
[09/18 07:15:12] mnist:infer INFO: Complete drawing.
digit hand write number picture is recognized as : 3
```
## 通过 Analysis API (c++) 在 `IPU` 上做推理
我们将加入相关示例
Expand Down
74 changes: 60 additions & 14 deletions python/paddle/fluid/tests/unittests/ipu/test_dataset/mnist/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ def parse_args():
type=bool,
default=False,
help="Whether to use IPU or not.")
parser.add_argument(
"--ues_ipu_model",
type=bool,
default=True,
help="use model trained on IPU devices"
)
parser.add_argument(
"--num_ipus",
type=int,
Expand All @@ -75,8 +81,6 @@ def parse_args():
action="store_false",
help="draw IR graph for debug"
)
parser.add_argument(
"--num_epochs", type=int, default=5, help="number of epochs.")
parser.add_argument(
"--save_dir",
type=str,
def apply_pseudo_batch_size_pass(prog, batch_size, var_name):
    """Rewrite the batch dimension of feed variable ``var_name`` to ``batch_size``.

    The model was exported as a static graph with a fixed batch size; this
    pass patches the feed var's shape descriptor in-place so inference can
    run with a different batch size.

    Args:
        prog: the loaded inference ``Program`` whose global block is patched.
        batch_size (int): new leading (batch) dimension for the feed var.
        var_name (str): name of the feed variable to patch.

    Raises:
        ValueError: if ``var_name`` is not found in the program's global block.
    """
    global_block = prog.global_block()
    if var_name in global_block.vars:
        feed_var = global_block.vars[var_name]  # Call Python Block API
        # TODO(yiakwy) : trailing dims (1, 28, 28) are hard coded for MNIST
        old_shape = feed_var.desc.shape()
        feed_var.desc.set_shape([batch_size, 1, 28, 28])
        # Format the message explicitly: logger.info does not interpolate
        # positional args that are never passed.
        logger.info("Change batch size of var %s from %d to %d"
                    % (var_name, old_shape[0], batch_size))
        return

    raise ValueError("Cannot find variable %s in the program description" % var_name)

def read_batch_size(prog, var_name):
    """Return the leading (batch) dimension recorded for ``var_name``.

    Args:
        prog: the loaded inference ``Program`` to inspect.
        var_name (str): name of the variable whose shape is read.

    Raises:
        ValueError: if ``var_name`` is not found in the program's global block.
    """
    block = prog.global_block()
    try:
        target = block.vars[var_name]  # Python Block API lookup
    except KeyError:
        raise ValueError("Cannot find variable %s in the program description" % var_name)
    return target.desc.shape()[0]

def apply_pseudo_rm_op_by_type_pass(prog, op_type):
    """Remove every operator of type ``op_type`` from the program's global block.

    Also clears the ``is_target`` flag on every operator so none of them is
    still marked as a fetch target after the feed/fetch ops are stripped.

    Args:
        prog: the loaded inference ``Program`` to mutate in-place.
        op_type (str): operator type to delete (e.g. ``"feed"``, ``"fetch"``).
    """
    global_block = prog.global_block()
    # Clear the target flag on every op first. The original interleaved this
    # with removal inside one forward enumerate loop, which is a
    # mutate-while-iterating bug: after _remove_op(i) the remaining ops shift
    # left, so the op that followed a removed one was skipped.
    for op in global_block.ops:
        op.desc.set_is_target(False)
    # Iterate indices in reverse so _remove_op never shifts an index we have
    # not visited yet.
    for i in reversed(range(len(global_block.ops))):
        if global_block.ops[i].type == op_type:
            global_block._remove_op(i)
            logger.info("Remove operator %d of type %s" % (i, op_type))

def apply_pseudo_rm_vars_pass(prog, var_name):
    """Delete variable ``var_name`` from the program's global block.

    Args:
        prog: the loaded inference ``Program`` to mutate in-place.
        var_name (str): name of the variable to remove.

    Raises:
        ValueError: if ``var_name`` is not found in the global block.
    """
    block = prog.global_block()
    if var_name not in block.vars:
        raise ValueError("Cannot find var %s in the program description" % var_name)
    block._remove_var(var_name)
    # Flush so the removal is reflected in the underlying program desc.
    prog.desc.flush()
    logger.info("Remove var %s" % var_name)

def load_image(file):
im = Image.open(file).convert('L')
im = im.resize((28, 28), Image.ANTIALIAS)
Expand Down Expand Up @@ -144,12 +180,12 @@ def main():
# Reading images
logger.info("Reading data ...")
pwd = os.path.dirname(os.path.realpath(__file__))
img = load_image(os.path.join(pwd,FLAGS.img))
img = load_image(os.path.join(pwd,FLAGS.img))
logger.info("Complete reading image %s" % FLAGS.img)

save_dir = FLAGS.save_dir
num_ipus = FLAGS.num_ipus
enable_pipelining = not FLAGS.no_pipelining
enable_pipelining = FLAGS.no_pipelining
will_draw_ir_graph = FLAGS.draw_ir_graph

# add model
Expand All @@ -162,22 +198,32 @@ def main():
logger.info("Constructing the computation graph ...")
[infer_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dir, infer_exc,
model_filename="recognize_digits_%s_test.pdmodel" % DEVICE_SUFFIX, params_filename="recognize_digits_%s.pdiparams" % DEVICE_SUFFIX)
model_filename="recognize_digits_%s.pdmodel" % DEVICE_SUFFIX, params_filename="recognize_digits_%s.pdiparams" % DEVICE_SUFFIX)
logger.info("Computation graph built.")


if FLAGS.use_ipu:
# TODO(yiakwy) : for the moment, we store our model trained on IPU as static graph
# which means that the batch size is fixed.
#
# We will apply passes to trains from batch size to `None` or `-1` upon the generated graph description later
apply_pseudo_batch_size_pass(infer_program, 1, feed_target_names[0])
else:
pass
# We will apply passes to transform batch size from a static number to `None` or `-1` or another number upon the generated graph description
# apply_pseudo_batch_size_pass(infer_program, 1, feed_target_names[0])

# TODO(yiakwy) : workaround
batch_size = read_batch_size(infer_program, feed_target_names[0])
img_with_64 = np.tile(img, (batch_size, 1, 1, 1))
img = img_with_64

apply_pseudo_rm_op_by_type_pass(infer_program, "feed")
apply_pseudo_rm_op_by_type_pass(infer_program, "fetch")
apply_pseudo_rm_vars_pass(infer_program, "feed")
apply_pseudo_rm_vars_pass(infer_program, "fetch")
else:
if FLAGS.ues_ipu_model:
apply_pseudo_batch_size_pass(infer_program, 1, feed_target_names[0])

if FLAGS.use_ipu:

if FLAGS.use_ipu:#False:
# Pipeline with tensorflow frontend: https://docs.graphcore.ai/projects/tensorflow1-user-guide/en/latest/perf_training.html#pipelined-training
ipu_strategy = compiler.get_ipu_strategy()
ipu_strategy.is_training = False
Expand All @@ -192,7 +238,7 @@ def main():
logger.info("Compiling graph on IPU devices ...")
feed_list = feed_target_names
fetch_list = [ out.name for out in fetch_targets]
infer_program = ipu_compiler.compile(feed_list, fetch_list, infer=True)
infer_program = ipu_compiler.compile(feed_list, fetch_list)
logger.info("Complete compiling.")
else:
if will_draw_ir_graph:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,4 +102,4 @@ def outputs(self):
if not self.is_built:
raise ValueError("The model is not built!")
return self._outputs


Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ def train(epochs, exec, model, feeder, save_dir,
metrics = exec.run(train_program,
feed={inp.name : data[i] for i, inp in enumerate(model.inputs)},
fetch_list=model.outputs[1])

if batch_id % 100 == 0:
if validation_loss > 0:
print("Epoch %d, batch %d, Cost %f, Validation Cost %f" % (
Expand All @@ -238,15 +239,19 @@ def train(epochs, exec, model, feeder, save_dir,
step += 1

if save_dir is not None:
if True:#not model.cfg.get("use_ipu", False):
# TODO(yiak) : does not work in IPU
if not model.cfg.get("use_ipu", False):
paddle.static.save_inference_model(
save_dir+"recognize_digits_%s" % model.cfg.get("device_suffix", "cpu"),
model.inputs[0], model.outputs[0],
exec, program=train_program
)
else:
paddle.static.save(train_program, save_dir+"recognize_digits_%s_test" % model.cfg.get("device_suffix", "ipu"))
paddle.static.save_inference_model(
save_dir+"recognize_digits_%s" % model.cfg.get("device_suffix", "cpu"),
model.inputs[0], model.outputs[0],
exec, program=train_program.org_program
)
# paddle.static.save(train_program.org_program, save_dir+"recognize_digits_%s_test" % model.cfg.get("device_suffix", "ipu"))

# find the best pass
best = sorted(report,key=lambda record: float(record[1]))[0]
Expand Down Expand Up @@ -321,6 +326,12 @@ def main():
cfg["use_ipu"] = FLAGS.use_ipu
cfg["device_suffix"] = device_suffix

# create config
cfg = {}
cfg["batch_size"] = BATCH_SIZE
cfg["use_ipu"] = FLAGS.use_ipu
cfg["device_suffix"] = device_suffix

# create model
mnist = MNIST(cfg)

Expand Down Expand Up @@ -376,4 +387,4 @@ def main():
return 0

if __name__ == "__main__":
sys.exit(main())
sys.exit(main())

0 comments on commit 3eb17d6

Please sign in to comment.