Merge remote-tracking branch 'upstream/develop' into feature/use_jcb_atm
* upstream/develop:
  Add CCPP suite and FASTER option to UFS build (NOAA-EMC#2521)
  New "atmanlfv3inc" Rocoto job (NOAA-EMC#2420)
  Hotfix to disable STALLED in CI as an error (NOAA-EMC#2523)
  Add restart on failure capability for the forecast executable (NOAA-EMC#2510)
  Update parm/transfer list files to match vetted GFSv16 set (NOAA-EMC#2517)
  Update gdas_gsibec_ver to 20240416 (NOAA-EMC#2497)
  Adding more cycles to gempak script gfs_meta_sa2.sh (NOAA-EMC#2518)
  Update gsi_enkf.sh hash to 457510c (NOAA-EMC#2514)
  Enable using the FV3_global_nest_v1 CCPP suite (NOAA-EMC#2512)
  CI Refactoring and STALLED case detection (NOAA-EMC#2488)
  Add C768 and C1152 S2SW test cases (NOAA-EMC#2509)
  Fix paths for refactored prepocnobs task (NOAA-EMC#2504)
danholdaway committed Apr 23, 2024
2 parents 0d89f2f + d0e1cc8 commit 0a920cd
Showing 121 changed files with 2,033 additions and 1,321 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -179,7 +179,7 @@ ush/global_cycle.sh
ush/global_cycle_driver.sh
ush/jediinc2fv3.py
ush/ufsda
ush/finddate.sh
ush/soca
ush/make_NTC_file.pl
ush/make_ntc_bull.pl
ush/make_tif.sh
101 changes: 42 additions & 59 deletions ci/Jenkinsfile
@@ -4,18 +4,22 @@ def HOME = 'none'
def caseList = ''
// Location of the custom workspaces for each machine in the CI system. They are persistent for each iteration of the PR.
def custom_workspace = [hera: '/scratch1/NCEPDEV/global/CI', orion: '/work2/noaa/stmp/CI/ORION', hercules: '/work2/noaa/stmp/CI/HERCULES']
def repo_url = 'git@github.com:NOAA-EMC/global-workflow.git'
def STATUS = 'Passed'

pipeline {

agent { label 'built-in' }

options {
skipDefaultCheckout()
parallelsAlwaysFailFast()
//parallelsAlwaysFailFast()
}

stages { // This initial stage is used to get the Machine name from the GitHub labels on the PR
// which is used to designate the nodes in the Jenkins Controller by the agent label.
// Each Jenkins node is connected to said machine via a Java agent over an SSH tunnel.
// no op 2

stage('Get Machine') {
agent { label 'built-in' }
@@ -46,10 +50,7 @@ pipeline {
properties([parameters([[$class: 'NodeParameterDefinition', allowedSlaves: ['built-in', 'Hera-EMC', 'Orion-EMC'], defaultSlaves: ['built-in'], name: '', nodeEligibility: [$class: 'AllNodeEligibility'], triggerIfResult: 'allCases']])])
HOME = "${WORKSPACE}"
sh(script: "mkdir -p ${HOME}/RUNTESTS;rm -Rf ${HOME}/RUNTESTS/error.logs")
pullRequest.addLabel("CI-${Machine}-Building")
if (pullRequest.labels.any { value -> value.matches("CI-${Machine}-Ready") }) {
pullRequest.removeLabel("CI-${Machine}-Ready")
}
sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --add-label "CI-${Machine}-Building" --remove-label "CI-${Machine}-Ready" """)
}
echo "Building and running on ${Machine} in directory ${HOME}"
}
@@ -108,26 +109,23 @@ pipeline {
catch (Exception error_arch) { echo "Failed to archive error log ${line}: ${error_arch.getMessage()}" }
}
}
repo_url=sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --repo PR_BUILD_${env.CHANGE_ID}", returnStdout: true).trim()
gist_url=sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --gist PR_BUILD_${env.CHANGE_ID}", returnStdout: true).trim()
try {
pullRequest.comment("Build failed on **${Machine}** with error logs:${error_logs_message}\n\nFollow link here to view the contents of the above file(s): [(link)](${gist_url})")
sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --repo PR_BUILD_${env.CHANGE_ID}")
gist_url=sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --gist PR_BUILD_${env.CHANGE_ID}", returnStdout: true).trim()
sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Build **FAILED** on **${Machine}** with error logs:\n\\`\\`\\`\n${error_logs_message}\\`\\`\\`\n\nFollow link here to view the contents of the above file(s): [(link)](${gist_url})" """)
} catch (Exception error_comment) {
echo "Failed to comment on PR: ${error_comment.getMessage()}"
}
error("Failed to build system on ${Machine}")
}
}
sh(script: './link_workflow.sh')
// sh(script: "echo ${HOMEgfs} > BUILT_semaphor")
sh(script: "echo ${HOMEgfs} > BUILT_semaphor")
}
}
if (env.CHANGE_ID && system == 'gfs') {
try {
if (pullRequest.labels.any { value -> value.matches("CI-${Machine}-Building") }) {
pullRequest.removeLabel("CI-${Machine}-Building")
}
pullRequest.addLabel("CI-${Machine}-Running")
sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --add-label "CI-${Machine}-Running" --remove-label "CI-${Machine}-Building" """)
} catch (Exception e) {
echo "Failed to update label from Buildng to Running: ${e.getMessage()}"
}
@@ -144,6 +142,7 @@ pipeline {
}

stage('Run Tests') {
failFast false
matrix {
agent { label "${machine}-emc" }
axes {
@@ -175,19 +174,22 @@
when {
expression { return caseList.contains(Case) }
}
failFast false
steps {
script {
HOMEgfs = "${HOME}/gfs" // common HOMEgfs is used to launch the scripts that run the experiments
pslot = sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh get_pslot ${HOME}/RUNTESTS ${Case}", returnStdout: true).trim()
def pslot = sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh get_pslot ${HOME}/RUNTESTS ${Case}", returnStdout: true).trim()
def error_file = "${HOME}/RUNTESTS/${pslot}_error.logs"
sh(script: " rm -f ${error_file}")
try {
sh(script: "${HOMEgfs}/ci/scripts/run-check_ci.sh ${HOME} ${pslot}")
sh(script: "${HOMEgfs}/ci/scripts/run-check_ci.sh ${HOME} ${pslot} ${system}")
} catch (Exception error_experiment) {
sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh cancel_all_batch_jobs ${HOME}/RUNTESTS")
sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh cancel_batch_jobs ${pslot}")
ws(HOME) {
def error_logs = ""
def error_logs_message = ""
if (fileExists("RUNTESTS/error.logs")) {
def fileContent = readFile 'RUNTESTS/error.logs'
if (fileExists(error_file)) {
def fileContent = readFile error_file
def lines = fileContent.readLines()
for (line in lines) {
echo "archiving: ${line}"
@@ -201,15 +203,22 @@
}
}
}
repo_url = sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --repo PR_${env.CHANGE_ID}", returnStdout: true).trim()
gist_url = sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --gist PR_${env.CHANGE_ID}", returnStdout: true).trim()
try {
pullRequest.comment("Experiment ${Case} failed on ${Machine} with error logs: ${error_logs_message}\n\nFollow link here to view the contents of the above file(s): [(link)](${gist_url})")
gist_url = sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --gist PR_${env.CHANGE_ID}", returnStdout: true).trim()
sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Experiment ${Case} **FAILED** on ${Machine} with error logs:\n\\`\\`\\`\n${error_logs_message}\\`\\`\\`\n\nFollow link here to view the contents of the above file(s): [(link)](${gist_url})" """)
sh(script: "${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --repo PR_${env.CHANGE_ID}")
} catch (Exception error_comment) {
echo "Failed to comment on PR: ${error_comment.getMessage()}"
}
} else {
echo "No error logs found for failed cases in $HOME/RUNTESTS/error.logs"
echo "No error logs found for failed cases in $HOME/RUNTESTS/${pslot}_error.logs"
}
STATUS = 'Failed'
try {
sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "CI-${Machine}-Running" --add-label "CI-${Machine}-${STATUS}" """, returnStatus: true)
sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Experiment ${Case} **FAILED** on ${Machine}\nin\\`${HOME}/RUNTESTS/${pslot}\\`" """)
} catch (Exception e) {
echo "Failed to update label from Running to ${STATUS}: ${e.getMessage()}"
}
error("Failed to run experiments ${Case} on ${Machine}")
}
@@ -220,49 +229,23 @@
}
}
}
}

post {
always {
script {
if(env.CHANGE_ID) {
try {
for (label in pullRequest.labels) {
if (label.contains("${Machine}")) {
pullRequest.removeLabel(label)
}
}
} catch (Exception e) {
echo "Failed to remove labels: ${e.getMessage()}"
}
}
}
}
success {
script {
if(env.CHANGE_ID) {
try {
pullRequest.addLabel("CI-${Machine}-Passed")
def timestamp = new Date().format('MM dd HH:mm:ss', TimeZone.getTimeZone('America/New_York'))
pullRequest.comment("**CI SUCCESS** ${Machine} at ${timestamp}\n\nBuilt and ran in directory `${HOME}`")
} catch (Exception e) {
echo "Failed to add success label or comment: ${e.getMessage()}"
}
stage( 'FINALIZE' ) {
when {
expression {
STATUS == 'Passed'
}
}
}
failure {
script {
if(env.CHANGE_ID) {
agent { label "${machine}-emc" }
steps {
script {
try {
pullRequest.addLabel("CI-${Machine}-Failed")
def timestamp = new Date().format('MM dd HH:mm:ss', TimeZone.getTimeZone('America/New_York'))
pullRequest.comment("**CI FAILED** ${Machine} at ${timestamp}<br>Built and ran in directory `${HOME}`")
sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "CI-${Machine}-Running" --remove-label "CI-${Machine}-Building" --add-label "CI-${Machine}-${STATUS}" """, returnStatus: true)
sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "**CI ${STATUS}** ${Machine} at <br>Built and ran in directory \\`${HOME}\\`" """, returnStatus: true)
} catch (Exception e) {
echo "Failed to add failure label or comment: ${e.getMessage()}"
echo "Failed to update label from Running to ${STATUS}: ${e.getMessage()}"
}
}
}
}
}
}
}
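
Note on the Jenkinsfile changes above: the refactor replaces the Jenkins pullRequest plugin calls (pullRequest.addLabel, pullRequest.removeLabel, pullRequest.comment) with GitHub CLI invocations (gh pr edit, gh pr comment), so label and comment updates no longer depend on the plugin. As a rough sketch of the same label-swap pattern outside Jenkins, the small Python helper below is an illustrative assumption (the function name and subprocess wrapper are not part of this commit; only the gh flags mirror the diff):

# Hypothetical helper mirroring the Jenkinsfile's label-swap pattern; not code from this commit.
import subprocess

def swap_pr_label(pr_number: int, repo: str, add: str, remove: str) -> None:
    """Swap CI status labels on a PR via the GitHub CLI."""
    cmd = [
        "gh", "pr", "edit", str(pr_number),
        "--repo", repo,
        "--add-label", add,
        "--remove-label", remove,
    ]
    # check=False mirrors the Jenkinsfile's returnStatus: true usage,
    # where a failed label update should not abort the pipeline.
    subprocess.run(cmd, check=False)

# Example with assumed values:
# swap_pr_label(2521, "NOAA-EMC/global-workflow",
#               add="CI-Hera-Running", remove="CI-Hera-Building")
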
14 changes: 14 additions & 0 deletions ci/cases/hires/C1152_S2SW.yaml
@@ -0,0 +1,14 @@
experiment:
system: gfs
mode: forecast-only

arguments:
pslot: {{ 'pslot' | getenv }}
app: S2SW
resdetatmos: 1152
resdetocean: 0.25
comroot: {{ 'RUNTESTS' | getenv }}/COMROOT
expdir: {{ 'RUNTESTS' | getenv }}/EXPDIR
idate: 2019120300
edate: 2019120300
yaml: {{ HOMEgfs }}/ci/cases/yamls/gfs_defaults_ci.yaml
14 changes: 14 additions & 0 deletions ci/cases/hires/C768_S2SW.yaml
@@ -0,0 +1,14 @@
experiment:
system: gfs
mode: forecast-only

arguments:
pslot: {{ 'pslot' | getenv }}
app: S2SW
resdetatmos: 768
resdetocean: 0.25
comroot: {{ 'RUNTESTS' | getenv }}/COMROOT
expdir: {{ 'RUNTESTS' | getenv }}/EXPDIR
idate: 2019120300
edate: 2019120300
yaml: {{ HOMEgfs }}/ci/cases/yamls/gfs_defaults_ci.yaml
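
The two new hi-res case files above are Jinja2-templated: pslot and the RUNTESTS paths are pulled from the environment through a getenv filter, and HOMEgfs comes from the rendering context. As a minimal sketch of how such a case file could be rendered and loaded (the filter registration and function name are assumptions about the surrounding tooling, not code from this commit):

# Assumed rendering sketch for the CI case YAML files; the real workflow tooling may differ.
import os
import yaml
from jinja2 import Environment

def render_case(path: str, homegfs: str) -> dict:
    env = Environment()
    # Mirror the '| getenv' filter used in the case files.
    env.filters["getenv"] = lambda name, default="": os.environ.get(name, default)
    with open(path) as f:
        rendered = env.from_string(f.read()).render(HOMEgfs=homegfs)
    return yaml.safe_load(rendered)

# Example with assumed paths:
# case = render_case("ci/cases/hires/C768_S2SW.yaml", "/path/to/HOMEgfs")
# print(case["arguments"]["resdetatmos"])  # 768
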
59 changes: 35 additions & 24 deletions ci/scripts/check_ci.sh
@@ -68,7 +68,7 @@ pr_list_dbfile="${GFS_CI_ROOT}/open_pr_list.db"

pr_list=""
if [[ -f "${pr_list_dbfile}" ]]; then
pr_list=$("${HOMEgfs}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --display | grep -v Failed | grep Running | awk '{print $1}') || true
pr_list=$("${HOMEgfs}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --list Open Running) || true
fi
if [[ -z "${pr_list+x}" ]]; then
echo "no PRs open and ready to run cases on .. exiting"
@@ -100,7 +100,7 @@ for pr in ${pr_list}; do
sed -i "1 i\`\`\`" "${output_ci}"
sed -i "1 i\All CI Test Cases Passed on ${MACHINE_ID^}:" "${output_ci}"
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}"
"${HOMEgfs}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
"${HOMEgfs}/ci/scripts/utils/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
# Check to see if this PR was opened by the weekly tests and, if so, close it if it passed on all platforms
weekly_labels=$(${GH} pr view "${pr}" --repo "${REPO_URL}" --json headRefName,labels,author --jq 'select(.author.login | contains("emcbot")) | select(.headRefName | contains("weekly_ci")) | .labels[].name ') || true
if [[ -n "${weekly_labels}" ]]; then
@@ -133,28 +133,40 @@ for pr in ${pr_list}; do
if [[ ! -f "${db}" ]]; then
continue
fi
rocoto_stat_output=$("${rocotostat}" -w "${xml}" -d "${db}" -s | grep -v CYCLE) || true
num_cycles=$(echo "${rocoto_stat_output}" | wc -l) || true
num_done=$(echo "${rocoto_stat_output}" | grep -c Done) || true
# num_succeeded=$("${rocotostat}" -w "${xml}" -d "${db}" -a | grep -c SUCCEEDED) || true
echo "${pslot} Total Cycles: ${num_cycles} number done: ${num_done}" || true
num_failed=$("${rocotostat}" -w "${xml}" -d "${db}" -a | grep -c -E 'FAIL|DEAD') || true
if [[ ${num_failed} -ne 0 ]]; then
"${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Failed"
error_logs=$("${rocotostat}" -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs "${rocotocheck}" -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true
{
echo "Experiment ${pslot} *** FAILED *** on ${MACHINE_ID^}"
echo "Experiment ${pslot} with ${num_failed} tasks failed at $(date +'%D %r')" || true
echo "Error logs:"
echo "${error_logs}"
} >> "${output_ci}"
sed -i "1 i\`\`\`" "${output_ci}"
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}"
"${HOMEgfs}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
cancel_all_batch_jobs "${pr_dir}/RUNTESTS/"
break

set +e
rocoto_state="$("${HOMEgfs}/ci/scripts/utils/rocotostat.py" -w "${xml}" -d "${db}")"
rocoto_error=$?
rm -f "${output_ci_single}"
if [[ "${rocoto_error}" -ne 0 ]]; then
"${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Failed"
if [[ "${rocoto_state}" == "STALLED" ]]; then
# shellcheck disable=SC2312
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body "Experiment ${pslot} **${rocoto_state}** on ${MACHINE_ID^} at $(date +'%D %r')"
"${HOMEgfs}/ci/scripts/utils/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
cancel_all_batch_jobs "${pr_dir}/RUNTESTS"
exit "${rocoto_error}"
fi
error_logs=$("${rocotostat}" -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs "${rocotocheck}" -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true
# shellcheck disable=SC2086
${HOMEgfs}/ci/scripts/utils/publish_logs.py --file ${error_logs} --repo "PR_${pr}" > /dev/null
# shellcheck disable=SC2086
gist_url="$("${HOMEgfs}/ci/scripts/utils/publish_logs.py" --file ${error_logs} --gist "PR_${pr}")"
{
echo "Experiment ${pslot} **${rocoto_state}** on ${MACHINE_ID^} at $(date +'%D %r')" || true
echo ""
echo "Error logs:"
echo "\`\`\`"
echo "${error_logs}"
echo "\`\`\`"
echo "Follow link here to view the contents of the above file(s): [(link)](${gist_url})"
} >> "${output_ci_single}"
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"
"${HOMEgfs}/ci/scripts/utils/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
cancel_all_batch_jobs "${pr_dir}/RUNTESTS"
exit "${rocoto_error}"
fi
if [[ "${num_done}" -eq "${num_cycles}" ]]; then
if [[ "${rocoto_state}" == "DONE" ]]; then
# Remove experiment cases that completed successfully
rm -Rf "${pslot_dir}"
rm -Rf "${pr_dir}/RUNTESTS/COMROOT/${pslot}"
@@ -164,7 +176,6 @@ for pr in ${pr_list}; do
echo "Experiment ${pslot} **SUCCESS** on ${MACHINE_ID^} at ${DATE}" >> "${output_ci_single}"
echo "Experiment ${pslot} *** SUCCESS *** at ${DATE}" >> "${output_ci}"
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"

fi
done
done
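
In the check_ci.sh changes above, cycle and task accounting moves from inline rocotostat/grep parsing to ci/scripts/utils/rocotostat.py, which prints an overall workflow state (e.g. DONE, RUNNING, STALLED) and signals failure through its exit code. That script is not shown in this diff; the sketch below is only an assumed illustration of how such a state could be derived from per-task statuses, not the actual implementation:

# Assumed illustration of classifying a Rocoto workflow state from task
# statuses; the real ci/scripts/utils/rocotostat.py may differ.
from collections import Counter

def classify(task_states: list[str]) -> str:
    counts = Counter(s.upper() for s in task_states)
    if counts["DEAD"] or counts["FAIL"]:
        return "FAIL"
    if counts["RUNNING"] or counts["SUBMITTING"] or counts["QUEUED"]:
        return "RUNNING"
    if counts["SUCCEEDED"] == sum(counts.values()) and counts["SUCCEEDED"] > 0:
        return "DONE"
    # Nothing active and not everything succeeded: treat as stalled.
    return "STALLED"

# Example: classify(["SUCCEEDED", "SUCCEEDED", "QUEUED"]) returns "RUNNING"
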