Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 49 additions & 85 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,42 +39,6 @@ void job_step_update(def value=currentBuild.currentResult) {
jobStatusUpdate(job_status_internal, env.STAGE_NAME, value)
}

/**
 * Run the NLT fault injection tests and publish a JUnit summary of any
 * Valgrind memcheck errors that were recorded.
 *
 * Steps:
 *   1. Try to unstash 'nltr'; a failure is reported but not fatal.
 *   2. Run ./ci/docker_nlt.sh for the el8.fault-injection class.
 *   3. If any *.memcheck.xml files exist, grep them for <error> records and
 *      write a simple JUnit report for the current (sanitized) stage name.
 *
 * @return Map with a single key 'nlttest_time' holding the elapsed run time
 *         as returned by durationSeconds().
 */
Map nlt_test() {
    // groovylint-disable-next-line NoJavaUtilDate
    Date startDate = new Date()
    try {
        unstash('nltr')
    } catch (e) {
        // Best effort: carry on without the stashed data from the NLT stage.
        print 'Unstash failed, results from NLT stage will not be included'
    }
    sh label: 'Fault injection testing using NLT',
       script: './ci/docker_nlt.sh --class-name el8.fault-injection fi'
    List filesList = []
    filesList.addAll(findFiles(glob: '*.memcheck.xml'))
    int vgfail = 0
    int vgerr = 0
    if (filesList) {
        // '|| true' keeps the step from failing when grep finds no match.
        String rcs = sh label: 'Check for Valgrind errors',
                        script: "grep -E '<error( |>)' ${filesList.join(' ')} || true",
                        returnStdout: true
        if (rcs) {
            // Fixed: this previously assigned to an undeclared 'vfail', so
            // 'vgfail' stayed 0 and the report never recorded a failure.
            vgfail = 1
        }
        String suite = sanitizedStageName()
        junitSimpleReport suite: suite,
                          file: suite + '_valgrind_results.xml',
                          fails: vgfail,
                          errors: vgerr,
                          name: 'Valgrind_Memcheck',
                          class: 'Valgrind',
                          message: 'Valgrind Memcheck error detected',
                          testdata: rcs
    }
    int runTime = durationSeconds(startDate)
    Map runData = ['nlttest_time': runTime]
    return runData
}

// Don't define this as a type or it loses its global scope
target_branch = env.CHANGE_TARGET ? env.CHANGE_TARGET : env.BRANCH_NAME
String sanitized_JOB_NAME() {
Expand Down Expand Up @@ -345,8 +309,11 @@ pipeline {
defaultValue: 'ci_vm9',
description: 'Label to use for 9 VM functional tests')
string(name: 'CI_NLT_1_LABEL',
defaultValue: 'ci_nlt_1',
defaultValue: 'ci_nlt_vm1',
description: 'Label to use for NLT tests')
string(name: 'CI_FI_1_LABEL',
defaultValue: 'ci_fi_vm1',
description: 'Label to use for Fault Injection (FI) tests')
string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_LABEL',
defaultValue: 'ci_nvme5',
description: 'Label to use for the Functional Hardware Medium (MD on SSD) stages')
Expand Down Expand Up @@ -798,7 +765,7 @@ pipeline {
}
}
}
stage('NLT on EL 8.8') {
stage('NLT') {
when {
beforeAgent true
expression { params.CI_NLT_TEST && !skipStage() }
Expand All @@ -809,11 +776,23 @@ pipeline {
steps {
job_step_update(
unitTest(timeout_time: 60,
inst_repos: prRepos(),
test_script: 'ci/unit/test_nlt.sh',
inst_repos: daosRepos(),
test_script: 'ci/unit/test_nlt.sh' +
' --system-ram-reserved 4' +
' --max-log-size 1950MiB' +
' --dfuse-dir /localhome/jenkins/' +
' --log-usage-save nltir.xml' +
' --log-usage-export nltr.json' +
' --class-name nlt all',
with_valgrind: 'memcheck',
valgrind_pattern: '*memcheck.xml',
always_script: 'ci/unit/test_nlt_post.sh',
testResults: 'nlt-junit.xml',
unstash_opt: true,
unstash_tests: false,
inst_rpms: unitPackages()))
inst_rpms: unitPackages(target: 'el9'),
image_version: 'el9.7',
prov_env_vars: 'VM_CPUS=14'))
// recordCoverage(tools: [[parser: 'COBERTURA', pattern:'nltir.xml']],
// skipPublishingChecks: true,
// id: 'tlc', name: 'Fault Injection Interim Report')
Expand All @@ -823,9 +802,10 @@ pipeline {
always {
unitTestPost artifacts: ['nlt_logs/'],
testResults: 'nlt-junit.xml',
always_script: 'ci/unit/test_nlt_post.sh',
referenceJobName: 'daos-stack/daos/release%252F2.6',
valgrind_stash: 'el8-gcc-nlt-memcheck'
valgrind_stash: 'nlt-memcheck',
valgrind_pattern: '*memcheck.xml',
NLT: true
recordIssues enabledForFailure: true,
failOnError: false,
ignoreQualityGate: true,
Expand Down Expand Up @@ -1010,62 +990,47 @@ pipeline {
}
} // post
} // stage('Functional on Ubuntu 20.04')
stage('Fault injection testing on EL 8.8') {
stage('Fault injection testing') {
when {
beforeAgent true
expression { !skipStage() }
}
agent {
dockerfile {
filename 'utils/docker/Dockerfile.el.8'
label 'docker_runner_fi'
additionalBuildArgs dockerBuildArgs(repo_type: 'stable',
parallel_build: true,
deps_build: true)
args '--tmpfs /mnt/daos_0'
}
label params.CI_FI_1_LABEL
}
steps {
job_step_update(
sconsBuild(parallel_build: true,
scons_args: 'PREFIX=/opt/daos TARGET_TYPE=release BUILD_TYPE=debug',
build_deps: 'no'))
job_step_update(nlt_test())
// recordCoverage(tools: [[parser: 'COBERTURA', pattern:'nltr.xml']],
// skipPublishingChecks: true,
// id: 'fir', name: 'Fault Injection Report')
unitTest(timeout_time: 240,
inst_repos: daosRepos(),
test_script: 'ci/unit/test_nlt.sh --memcheck no' +
' --system-ram-reserved 4 --server-debug WARN' +
' --log-usage-import nltr.json' +
' --log-usage-save nltr.xml' +
' --class-name fault-injection fi',
with_valgrind: '',
always_script: 'ci/unit/test_nlt_post.sh',
testResults: 'nlt-junit.xml',
unstash_opt: true,
unstash_tests: false,
inst_rpms: unitPackages(target: 'el9') + ' daos-client-tests',
image_version: 'el9.7',
prov_env_vars: 'VM_CPUS=14'))
}
post {
always {
discoverGitReferenceBuild referenceJob: 'daos-stack/daos/release%252F2.6',
unitTestPost artifacts: ['nlt_logs/'],
testResults: 'nlt-junit.xml',
with_valgrind: '',
FI: true
discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master',
scm: 'daos-stack/daos',
requiredResult: hudson.model.Result.UNSTABLE
recordIssues enabledForFailure: true,
/* ignore warning/errors from PMDK logging system */
filters: [excludeFile('pmdk/.+')],
failOnError: false,
ignoreQualityGate: true,
qualityGates: [[threshold: 1, type: 'TOTAL_ERROR'],
[threshold: 1, type: 'TOTAL_HIGH'],
[threshold: 1, type: 'NEW_NORMAL', unstable: true],
[threshold: 1, type: 'NEW_LOW', unstable: true]],
tools: [issues(pattern: 'nlt-errors.json',
name: 'Fault injection issues',
id: 'Fault_Injection'),
issues(pattern: 'nlt-client-leaks.json',
name: 'Fault injection leaks',
id: 'NLT_client')],
scm: 'daos-stack/daos'
junit testResults: 'nlt-junit.xml'
stash name: 'fault-inject-valgrind',
includes: '*.memcheck.xml',
allowEmpty: true
archiveArtifacts artifacts: 'nlt_logs/el8.fault-injection/',
archiveArtifacts artifacts: 'nlt_logs/fault-injection/',
allowEmptyArchive: true
job_status_update()
}
}
} // stage('Fault injection testing on EL 8.8')
} // stage('Fault injection testing')
stage('Test RPMs on EL 8.6') {
when {
beforeAgent true
Expand Down Expand Up @@ -1277,9 +1242,8 @@ pipeline {
} // stages
post {
always {
valgrindReportPublish valgrind_stashes: ['el8-gcc-nlt-memcheck',
'el8-gcc-unit-memcheck',
'fault-inject-valgrind']
valgrindReportPublish valgrind_stashes: ['nlt-memcheck',
'el8-gcc-unit-memcheck']
job_status_update('final_status')
jobStatusWrite(job_status_internal)
}
Expand Down
42 changes: 0 additions & 42 deletions ci/docker_nlt.sh

This file was deleted.

7 changes: 4 additions & 3 deletions ci/unit/test_nlt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ mydir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
# Copy over the install tree and some of the build tree.
rsync -rlpt -z -e "ssh $SSH_KEY_ARGS" .build_vars* opt-daos.tar utils requirements-utest.txt jenkins@"$NODE":build/

# shellcheck disable=SC2029
ssh -tt "$SSH_KEY_ARGS" jenkins@"$NODE" "DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \
$(cat "$mydir/test_nlt_node.sh")"
ssh -T "$SSH_KEY_ARGS" jenkins@"$NODE" \
"DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \
DAOS_NO_PROXY=\"${DAOS_NO_PROXY:-}\" \
bash -s -- $*" < "$mydir/test_nlt_node.sh"
12 changes: 9 additions & 3 deletions ci/unit/test_nlt_node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
set -uex

sudo bash -c 'echo 1 > /proc/sys/kernel/sysrq'
sudo mkdir -p /mnt/daos
# using mmap()'ed ULT stacks requires bumping the system default
if [ "$(sudo sysctl -n vm.max_map_count)" -lt "1000000" ] ; then
sudo sysctl vm.max_map_count=1000000
Expand Down Expand Up @@ -40,5 +39,12 @@ pip install /opt/daos/lib/daos/python/
# set high open file limit in the shell to avoid extra warning
sudo prlimit --nofile=1024:262144 --pid $$
prlimit -n
HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" ./utils/node_local_test.py --max-log-size 1700MiB \
--dfuse-dir /localhome/jenkins/ --log-usage-save nltir.xml --log-usage-export nltr.json all

mkdir -p nlt_logs
sudo mount -t tmpfs tmpfs nlt_logs
sudo chown jenkins:jenkins nlt_logs

TMPDIR="$(pwd)/nlt_logs" \
HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \
NO_PROXY="${DAOS_NO_PROXY:-}" \
exec ./utils/node_local_test.py "$@"
8 changes: 6 additions & 2 deletions ci/unit/test_nlt_post.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,18 @@ mkdir nlt_logs
# Copy any log files. Use rsync filters here to allow us to specify
# all files we want to copy, as it's much more flexible than using
# standard wildcards.
rsync -v -dprt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":/tmp/ \

# Assuming that node_local_test.py is run with --class-name,
# the logs will be in build/nlt_logs/ on the node.
rsync -v -rlpt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/nlt_logs/ \
--filter="include dnt*.log" --filter="include dnt*.log.bz2" \
--filter="include dnt_fi_*_logs" \
--filter="include dnt_fi_*_logs" --filter="include */" \
--filter="exclude *" nlt_logs/

rsync -v -dpt -z -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/ \
--filter="include nlt*.json" --filter="include dnt*.xml" \
--filter="include nltir.xml" --filter="include nltr.json" \
--filter="include nlt-junit.xml" --filter="exclude *" ./

mkdir -p vm_test
mv nlt-errors.json vm_test/
5 changes: 0 additions & 5 deletions src/tests/ftest/cart/util/cart_logtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,6 @@ def __init__(self, log_iter, quiet=False):
self.fi_triggered = False
self.fi_location = None
self.skip_suffixes = []
self.skip_substrings = []
self._tracers = []
self.ftest_mode = False

Expand Down Expand Up @@ -445,10 +444,6 @@ def _check_pid_from_log_file(self, pid, abort_on_warning, leak_wf, show_memleaks
show = False
if show and any(map(line.get_msg().endswith, self.skip_suffixes)):
show = False
if show:
line_msg = line.get_msg().casefold()
if any(sub in line_msg for sub in self.skip_substrings):
show = False
if show:
# Allow WARNING or ERROR messages, but anything higher like assert should
# trigger a failure.
Expand Down
1 change: 0 additions & 1 deletion utils/nlt_server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ engines:
- DAOS_MD_CAP=1024
- DAOS_STRICT_SHUTDOWN=1
- DAOS_TARGET_OVERSUBSCRIBE=1
- ABT_STACK_OVERFLOW_CHECK=mprotect
storage:
-
class: ram
Expand Down
15 changes: 5 additions & 10 deletions utils/node_local_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4897,13 +4897,6 @@ def sizeof_fmt(num, suffix='B'):
if ignore_busy:
lto.skip_suffixes.append(" DER_BUSY(-1012): 'Device or resource busy'")

lto.skip_substrings.extend([
'sluggish ec boundary report from rank',
'sluggish stable epoch reporting',
'progress callback was not called for too long',
'rpc failed; rc:',
])

try:
lto.check_log_file(abort_on_warning=True,
show_memleaks=show_memleaks,
Expand Down Expand Up @@ -5792,7 +5785,7 @@ def _prep(self):
# pylint: disable-next=no-member
num_cores = len(os.sched_getaffinity(0))

if num_cores < 20:
if num_cores < 14:
max_child = 1
else:
max_child = int(num_cores / 4 * 3)
Expand Down Expand Up @@ -6492,12 +6485,14 @@ def run(wf, args):
run_fi = False

if args.perf_check or fi_test or fi_test_dfuse:
fs = subprocess.run([os.path.join(conf['PREFIX'], 'bin', 'fault_status')], check=False)
fi_env = os.environ.copy()
fi_env['PATH'] = f'{conf["PREFIX"]}/bin:{fi_env["PATH"]}'
fs = subprocess.run(['fault_status'], check=False, env=fi_env)
print(fs)
if fs.returncode == 0:
run_fi = True
else:
print("Unable to detect fault injection feature, skipping testing")
print("Unable to detect fault injection feature - skipping FI testing")

if run_fi:
args.server_debug = 'INFO'
Expand Down
Loading