Skip to content

Commit

Permalink
feat: System Health Report (backport #26046) (#26085)
Browse files Browse the repository at this point in the history
* feat: System Health Report

(cherry picked from commit e06901a)

* feat: background worker monitoring

(cherry picked from commit d410af7)

* feat: better bench doctor in UI

(cherry picked from commit d7a0ed8)

* feat: socketio health check

(cherry picked from commit 023297b)

# Conflicts:
#	realtime/handlers/frappe_handlers.js

* feat: email health checks

(cherry picked from commit 2df9e2e)

* feat: Errors in System Health

(cherry picked from commit 7bfa31f)

* feat: database health stats

(cherry picked from commit b9ed8c5)

* feat: cache health

(cherry picked from commit 92dc5f3)

* feat: backup health

(cherry picked from commit 614857e)

* feat: system health - users

(cherry picked from commit 5b70060)

* refactor: Single page instead of tabs

(cherry picked from commit 99d2dea)

* feat: background jobs test

(cherry picked from commit 7411c4f)

* fix: exception handling for health report

(cherry picked from commit cbf4351)

* chore: rename child doctypes

(cherry picked from commit a94534a)

* feat: highlight bad indicators

(cherry picked from commit 4f406d7)

* fix(UX): help links and relative URLs

also closes #23020

(cherry picked from commit d40b2a2)

* feat: extend highlight to child tables

(cherry picked from commit b0ce404)

* refactor: use table for errors

(cherry picked from commit 9154e42)

* feat: failing scheduled jobs

(cherry picked from commit c712780)

* refactor: misc

- fix styles
- hardcode perm check
- few more indicators
- cache directory size for 5 min (rapid refreshes should be fast enough)

(cherry picked from commit c9a8cd6)

* chore: conflicts

---------

Co-authored-by: Ankush Menat <ankush@frappe.io>
  • Loading branch information
mergify[bot] and ankush committed Apr 22, 2024
1 parent d089765 commit 7b8a923
Show file tree
Hide file tree
Showing 23 changed files with 1,278 additions and 4 deletions.
Empty file.
98 changes: 98 additions & 0 deletions frappe/desk/doctype/system_health_report/system_health_report.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Copyright (c) 2024, Frappe Technologies and contributors
// For license information, please see license.txt

frappe.ui.form.on("System Health Report", {
	/**
	 * Poll the status of the background test job once per second and mirror
	 * it into the `background_jobs_check` field.
	 *
	 * Polling stops when the job reports "finished" or "failed", or once the
	 * attempt cap is exceeded.
	 *
	 * @param {object} frm - The form instance; `frm.doc.test_job_id` is sent
	 *     as the job id (presumably populated by the server — verify).
	 */
	onload(frm) {
		const max_poll_attempts = 30;
		let poll_attempts = 0;

		const interval = setInterval(() => {
			// Count the attempt when it is dispatched, not when it succeeds:
			// otherwise a request that keeps failing would never hit the cap
			// and the interval would run forever.
			poll_attempts += 1;
			if (poll_attempts > max_poll_attempts) {
				clearInterval(interval);
			}

			frappe
				.xcall(
					"frappe.desk.doctype.system_health_report.system_health_report.get_job_status",
					{ job_id: frm.doc.test_job_id }
				)
				.then((status) => {
					if (["finished", "failed"].includes(status)) {
						clearInterval(interval);
					}
					if (status) {
						frm.set_value("background_jobs_check", status);
					}
				})
				.catch((e) => {
					// Keep polling until the cap; a single failed request
					// should not end the check early.
					console.error("Failed to fetch background job status", e);
				});
		}, 1000);
	},

	/**
	 * Run the socket.io ping check, make the form read-only and (re)start
	 * the indicator highlighting.
	 *
	 * The check is marked "Fail" up front and flipped to "Pass" only when a
	 * "pong" event arrives in response to the emitted "ping".
	 *
	 * @param {object} frm - The form instance.
	 */
	refresh(frm) {
		frm.set_value("socketio_ping_check", "Fail");

		// refresh can run many times for the same form; drop the listener from
		// the previous run so handlers do not pile up on the socket.
		if (frm.system_health_pong_handler) {
			frappe.realtime.off("pong", frm.system_health_pong_handler);
		}
		frm.system_health_pong_handler = () => {
			frm.set_value("socketio_ping_check", "Pass");
			frm.set_value(
				"socketio_transport_mode",
				frappe.realtime.socket.io?.engine?.transport?.name
			);
		};
		frappe.realtime.on("pong", frm.system_health_pong_handler);

		frappe.realtime.emit("ping");
		frm.disable_save();
		frm.trigger("setup_highlight");
	},

	/**
	 * Highlight fields (and child-table cells) whose values indicate a problem.
	 *
	 * Injects the `.health-check-failed` style once, applies it immediately
	 * and then re-evaluates every second so values that arrive later
	 * (job status, socket.io pong) are reflected.
	 *
	 * @param {object} frm - The form instance.
	 */
	setup_highlight(frm) {
		// Unset values become "" so text checks never throw; an unset text
		// field is therefore treated as unhealthy, like socketio_ping_check.
		const as_text = (val) => String(val ?? "").toLowerCase();

		// field => is bad?
		// Keys containing a dot are "<child table field>.<column fieldname>".
		// Numeric checks keep loose `==` on purpose so numeric strings match.
		const conditions = {
			scheduler_status: (val) => as_text(val) !== "active",
			background_jobs_check: (val) => as_text(val) !== "finished",
			total_background_workers: (val) => val == 0,
			binary_logging: (val) => as_text(val) !== "on",
			socketio_ping_check: (val) => val !== "Pass",
			socketio_transport_mode: (val) => val !== "websocket",
			onsite_backups: (val) => val == 0,
			failed_logins: (val) => val > frm.doc.total_users,
			total_errors: (val) => val > 50,
			// 5% excluding very small numbers
			unhandled_emails: (val) =>
				val > 3 && frm.doc.handled_emails > 3 && val / frm.doc.handled_emails > 0.05,
			failed_emails: (val) =>
				val > 3 &&
				frm.doc.total_outgoing_emails > 3 &&
				val / frm.doc.total_outgoing_emails > 0.05,
			// 10% excluding very small numbers
			pending_emails: (val) =>
				val > 3 &&
				frm.doc.total_outgoing_emails > 3 &&
				val / frm.doc.total_outgoing_emails > 0.1,
			"queue_status.pending_jobs": (val) => val > 50,
			"background_workers.utilization": (val) => val > 70,
			"background_workers.failed_jobs": (val) => val > 50,
			"top_errors.occurrences": (val) => val > 10,
			"failing_scheduled_jobs.failure_rate": (val) => val > 10,
		};

		// This handler runs on every refresh; inject the stylesheet only once.
		const style_id = "system-health-report-highlight";
		if (!document.getElementById(style_id)) {
			const style = document.createElement("style");
			style.id = style_id;
			style.textContent = `.health-check-failed {
				font-weight: bold;
				color: var(--text-color);
				background-color: var(--bg-red);
			}`;
			document.head.appendChild(style);
		}

		const update_fields = () => {
			Object.entries(conditions).forEach(([field, condition]) => {
				try {
					if (field.includes(".")) {
						const [table, fieldname] = field.split(".");

						frm.fields_dict[table].grid.grid_rows.forEach((row) => {
							const is_bad = condition(row.doc[fieldname]);
							$(row.columns[fieldname]).toggleClass("health-check-failed", is_bad);
						});
					} else {
						const is_bad = condition(frm.doc[field]);
						const df = frm.fields_dict[field];
						$(df.disp_area).toggleClass("health-check-failed", is_bad);
					}
				} catch (e) {
					// Best effort: one broken indicator must not stop the rest.
					console.log("Failed to evaluate", field, e);
				}
			});
		};

		update_fields();

		// Replace the timer from the previous refresh, otherwise each refresh
		// leaves one more interval running.
		if (frm.system_health_highlight_interval) {
			clearInterval(frm.system_health_highlight_interval);
		}
		frm.system_health_highlight_interval = setInterval(update_fields, 1000);
	},
});
Loading

0 comments on commit 7b8a923

Please sign in to comment.