From 19f4d00f9b13aa67369e32ec7cd3518950c6f30e Mon Sep 17 00:00:00 2001 From: Ciro Santilli Date: Mon, 6 Aug 2018 02:03:18 +0100 Subject: [PATCH] qemu: expose rr run: expose forgotten -Q, document it --- README.adoc | 65 ++++++++++++++++++++++++++++++++++++++++++------ build-usage.adoc | 1 + common | 3 ++- run | 46 +++++++++++++++++++++++++++++----- run-usage.adoc | 3 +++ 5 files changed, 104 insertions(+), 14 deletions(-) diff --git a/README.adoc b/README.adoc index 3be5c23f..e48399c0 100644 --- a/README.adoc +++ b/README.adoc @@ -7011,19 +7011,70 @@ TODO do even more awesome offline post-mortem analysis things, such as: ==== QEMU record and replay -QEMU supports deterministic record and replay by saving external inputs, which would be awesome to understand the kernel, as you would be able to examine a single run as many times as you would like. +QEMU runs are not deterministic by default, however it does support a record and replay mechanism that allows you to replay a previous run deterministically: -This mechanism first requires a trace to be generated on an initial record run. The trace is then used on the replay runs to make them deterministic. +This awesome feature allows you to examine a single run as many times as you would like until you understand everything: -Unfortunately it is not working in the current QEMU: https://stackoverflow.com/questions/46970215/how-to-use-qemus-deterministic-record-and-replay-feature-for-a-linux-kernel-boo +.... +# Record a run. +./run -F '/rand_check.out;/poweroff.out;' -r +# Replay the run. +./run -F '/rand_check.out;/poweroff.out;' -R +.... 
+ +By comparing the terminal output of both runs, we can see that they are exactly the same, including things which normally differ across runs: + +* timestamps of dmesg output +* <> output -Patches were merged in post v2.12.0-rc2 but it crashed for me and I opened a minimized bug report: https://bugs.launchpad.net/qemu/+bug/1762179 +The record and replay feature was revived around QEMU v3.0.0. It existed earlier but had bit-rotted completely. As of v3.0.0 it is still flaky: sometimes we get deadlocks, and only a limited number of command line arguments are supported. -We don't expose record and replay on our scripts yet since it was was not very stable, but we will do so when it stabilizes. +Documented at: https://github.com/qemu/qemu/blob/v2.12.0/docs/replay.txt + +TODO: using `-r` as above leads to a kernel warning: + +.... +rcu_sched detected stalls on CPUs/tasks +.... -<> is a good way to test out if record and replay is actually deterministic. +TODO: replay deadlocks intermittently at disk operations, last kernel message: + +.... +EXT4-fs (sda): re-mounted. Opts: block_validity,barrier,user_xattr +.... + +TODO: replay with network gets stuck: + +.... +./run -F '/sbin/ifup -a;wget -S google.com;/poweroff.out;' -r +./run -F '/sbin/ifup -a;wget -S google.com;/poweroff.out;' -R +.... + +after the message: + +.... +adding dns 10.0.2.3 +.... + +There is explicit network support in the QEMU patches, but either it is buggy or we are not using the correct magic options. + +TODO: `arm` and `aarch64` only seem to work with initrd, since I cannot plug in a working IDE disk device. See also: https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg05245.html + +Then, when I tried with <> and no disk: + +.... +./build -aA -i +./run -aA -F '/rand_check.out;/poweroff.out;' -i -r +./run -aA -F '/rand_check.out;/poweroff.out;' -i -R +.... + +QEMU crashes with: + +.... +ERROR:replay/replay-time.c:49:replay_read_clock: assertion failed: (replay_file && replay_mutex_locked()) +.... 
-Alternatively, https://github.com/mozilla/rr[`mozilla/rr`] claims it is able to run QEMU: but using it would require you to step through QEMU code itself. Likely doable, but do you really want to? +I had the same error previously on x86-64, but it was fixed: https://bugs.launchpad.net/qemu/+bug/1762179 so maybe they forgot to fix it for `aarch64`? ==== QEMU trace multicore diff --git a/build-usage.adoc b/build-usage.adoc index 5d0c5c03..52db7faf 100644 --- a/build-usage.adoc +++ b/build-usage.adoc @@ -35,6 +35,7 @@ on top of it. |`-M` |`VARIANT` |gem5 build variant. |`-p` | |Pass extra arguments to the `rootfs_post_build_script`. +|`-Q` |`VARIANT` |QEMU build variant. |`-S` | |Don't build QEMU with SDL support. Graphics such as X11 won't work, only the terminal. |`-s` | |Add a custom suffix to the build. diff --git a/common b/common index c5d79d15..f0b9c64d 100644 --- a/common +++ b/common @@ -69,7 +69,8 @@ set_common_vars() { common_images_dir="${buildroot_out_dir}/images" host_dir="${buildroot_out_dir}/host" common_qemu_run_dir="${out_arch_dir}/qemu/${common_run_id}" - common_qemu_termout_file="${common_qemu_run_dir}/termout.txt" + common_qemu_termout_file="${common_qemu_run_dir}/termout.txt" + common_qemu_rrfile="${common_qemu_run_dir}/rrfile" common_linux_custom_dir="${build_dir}/linux-custom" common_linux_variant_dir="${common_linux_custom_dir}.${linux_variant}" common_qemu_custom_dir="${build_dir}/host-qemu-custom" diff --git a/run b/run index 88a782e8..58ce2b72 100755 --- a/run +++ b/run @@ -27,6 +27,7 @@ initramfs=false memory=256M nographic=true prebuilt=false +rr= root= tmux=false tmux_args= @@ -35,7 +36,7 @@ trace_enabled=false # just to prevent QEMU from emitting a warning that '' is not valid. 
trace_type=pr_manager_run vnc= -while getopts a:c:DdE:e:F:f:G:ghIiKkL:M:m:N:n:PT:t:U:uVX:x OPT; do +while getopts a:c:DdE:e:F:f:G:ghIiKkL:M:m:N:n:PQ:RrT:t:U:uVX:x OPT; do case "$OPT" in a) arch="$OPTARG" @@ -108,6 +109,15 @@ while getopts a:c:DdE:e:F:f:G:ghIiKkL:M:m:N:n:PT:t:U:uVX:x OPT; do P) prebuilt=true ;; + Q) + common_qemu_variant="$OPTARG" + ;; + R) + rr=replay + ;; + r) + rr=record + ;; T) trace_enabled=true trace_type="$OPTARG" @@ -209,7 +219,7 @@ ${gem5opts} \ --dtb "${common_gem5_system_dir}/arm/dt/armv8_gem5_v1_big_little_2_2.dtb" \\ --kernel="${common_vmlinux}" \\ --little-cpus=2 \\ -${extra_flags} \\ +${extra_flags} \ " else gem5_common="\ @@ -282,16 +292,39 @@ ${vnc}" extra_flags="${extra_flags} -initrd '${common_images_dir}/rootfs.cpio' \\ " fi + + # Disk related options. if "$ramfs"; then # TODO why is this needed, and why any string works. root='root=/dev/anything' else if [ ! "$arch" = mips64 ]; then - extra_flags="${extra_flags} -drive 'file=${common_images_dir}/rootfs.ext2.qcow2,format=qcow2,if=virtio,snapshot' \\ + if [ -n "$rr" ]; then + driveif=none + rrid=',id=img-direct' + root='root=/dev/sda' + else + driveif=virtio + root='root=/dev/vda' + rrid= + fi + extra_flags="${extra_flags} -drive 'file=${common_images_dir}/rootfs.ext2.qcow2,format=qcow2,if=${driveif},snapshot${rrid}' \\ +" + if [ -n "$rr" ]; then + extra_flags="${extra_flags} \\ +-drive driver=blkreplay,if=none,image=img-direct,id=img-blkreplay \\ +-device ide-hd,drive=img-blkreplay \\ " - root='root=/dev/vda' + fi fi fi + + if [ -n "$rr" ]; then + extra_flags="${extra_flags} \ +-object filter-replay,id=replay,netdev=net0 \\ +-icount 'shift=7,rr=${rr},rrfile=${common_qemu_rrfile}' \\ +" + fi case "$arch" in x86_64) if "$kgdb"; then @@ -342,7 +375,8 @@ ${extra_flags} \ mips64) if ! 
"$ramfs"; then root='root=/dev/hda' - extra_flags="${extra_flags} -drive 'file=${common_images_dir}/rootfs.ext2.qcow2,format=qcow2,snapshot' \\ + extra_flags="${extra_flags} \ +-drive 'file=${common_images_dir}/rootfs.ext2.qcow2,format=qcow2,snapshot' \\ " fi cmd="\ @@ -359,7 +393,7 @@ fi if "$tmux"; then if "$gem5"; then eval "./tmu 'sleep 2;./gem5-shell -n ${common_run_id} ${tmux_args};'" - elif "$debug"; then + elif "$debug"; then eval "./tmu ./rungdb -a '${arch} -L ${common_linux_variant}' -n ${common_run_id} ${tmux_args}" fi fi diff --git a/run-usage.adoc b/run-usage.adoc index 79586a53..2939a7b1 100644 --- a/run-usage.adoc +++ b/run-usage.adoc @@ -44,7 +44,10 @@ Any |`-N` |`VARIANT` |gem5 source input variant. |`-n` | |Run ID. +|`-R` | |Deterministically replay a QEMU run previously recorded with `-r`. +|`-r` | |Record a QEMU run for later replay with `-R`. |`-P` | |Run the downloaded prebuilt images. +|`-Q` |`VARIANT` |QEMU build variant. |`-T` |`TRACE_TYPES` |Set trace events to be enabled. If not given, gem5 tracing is completely disabled, while QEMU tracing is enabled but uses default traces that are very rare and don't affect