<a href="https://colab.research.google.com/github/eyasu-taye/ncc_med_healthcheck/blob/main/hc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import re
from datetime import datetime, timedelta
from prettytable import PrettyTable

# --- Load the log file safely ---
with open("/content/mdc2_hc_12122025.txt", encoding="utf-8", errors="ignore") as f:
    log_data = f.read()

results = []

def add_row(category, status, remarks):
    results.append([category.strip(), status.strip(), remarks.strip()])

# --- STEP 1: Split sections by markers ---
split_pattern = r"(#+\n#\s*[A-Za-z0-9_\-/ ]+?\s*#\n#+)"
parts = re.split(split_pattern, log_data)

sections = []
for i in range(1, len(parts), 2):
    header = parts[i]
    content = parts[i + 1] if i + 1 < len(parts) else ""
    name = re.search(r"#\s*([A-Za-z0-9_\-/ ]+?)\s*#", header)
    section_name = name.group(1).strip().lower() if name else "unknown"
    sections.append((section_name, content.strip()))

# --- STEP 2: Analyze each section ---
for section_name, content in sections:
    up_items = re.findall(r"([A-Za-z0-9_\-:.]+).*?\bUP\b", content)
    down_items = re.findall(r"([A-Za-z0-9_\-:.]+).*?\b(DOWN|FAIL|ERROR)\b", content, re.I)

       # --- SPS STATUS ---
    if "sps-status" in section_name:
        up_items = re.findall(r"([A-Za-z0-9_\-:.]+).*?\bUP\b", content)
        down_items = re.findall(r"([A-Za-z0-9_\-:.]+).*?\b(DOWN|FAIL|ERROR)\b", content, re.I)
        down_list = ", ".join(set(d[0] for d in down_items)) if down_items else "None"
        add_row("SPS Status",
                "⚠️ Issues" if down_items else "✅ Stable",
                f"{len(up_items)} UP, {len(down_items)} DOWN ({down_list})")

    # --- DB STATS ---
    elif "db-stats" in section_name:
        total_nodes = re.search(r"Total DB nodes:\s+(\d+)", content)
        responding_nodes = re.search(r"DB nodes responding:\s+(\d+)", content)
        record_sets = len(re.findall(r'^[A-Za-z0-9_]+\s+\d', content, re.M))

        total_nodes = int(total_nodes.group(1)) if total_nodes else 0
        responding_nodes = int(responding_nodes.group(1)) if responding_nodes else 0
        add_row("DB Stats",
                "✅ OK" if total_nodes == responding_nodes else "⚠️ Issue",
                f"{responding_nodes}/{total_nodes} nodes responding, {record_sets} record sets found")

    # --- XDR STATUS ---
    elif re.search(r'\bxdr status\b', section_name) and "pod" not in section_name:
        lag = re.findall(r"lag[:\s]+(\d+)", content)
        latency = re.findall(r"latency_ms[:\s]+(\d+)", content)
        state = re.search(r"Service State:\s*(\w+)", content)
        avg_lag = sum(map(int, lag)) / len(lag) if lag else 0
        avg_lat = sum(map(int, latency)) / len(latency) if latency else 0
        state_val = state.group(1) if state else "UNKNOWN"
        status = "✅ OK" if state_val.upper() == "UP" and avg_lag < 500 else "⚠️ High Lag/Down"
        add_row("XDR Status", status,
                f"State: {state_val}, Avg lag {avg_lag:.1f} ms, Avg latency {avg_lat:.1f} ms")

    # --- POD XDR STATUS ---
    elif "pod xdr" in section_name or ("xdr status" in section_name and "pod" in section_name):
        pods = re.findall(r"(\S+)\s+lag[:=]\s*(\d+).*?latency[:=]\s*(\d+)", content)
        total_pods = len(pods)
        high_lag = [(p, l, t) for p, l, t in pods if int(l) > 1000 or int(t) > 1000]
        avg_lag = sum(int(l) for _, l, _ in pods)/total_pods if total_pods else 0
        if total_pods:
            status = "⚠️ High Lag" if high_lag else "✅ Normal"
            top = ", ".join([f"{p}(lag {l}, lat {t})" for p, l, t in high_lag[:3]]) if high_lag else "All normal"
            add_row("Pod XDR Status", status, f"Pods: {total_pods}, Avg lag: {avg_lag:.1f}, {top}")
        else:
            add_row("Pod XDR Status", "ℹ️ No Data", "No pod lag/latency entries")

    # --- AEROSPIKE RESTART STATUS ---
    elif "restart" in section_name and "aerospike" in section_name:
        now = datetime.now()
        restarts = []
        for pod, t in re.findall(r'(\S+)\s+(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})', content):
            try:
                restart_time = datetime.strptime(t, "%Y-%m-%dT%H:%M:%S")
                if (now - restart_time) <= timedelta(hours=24):
                    restarts.append((pod, t))
            except ValueError:
                pass
        if restarts:
            add_row("Aerospike Restart Status",
                    f"⚠️ {len(restarts)} Found",
                    ", ".join([f"{p} @ {t}" for p, t in restarts[:5]]) + (" ..." if len(restarts) > 5 else ""))
        else:
            add_row("Aerospike Restart Status", "✅ None", "No recent restarts")


    # Diameter Peers
    elif "diameter" in section_name:
        peers = re.findall(r"(SS7|PGW|DRA).*?(UP|DOWN)", content)
        total = len(peers)
        down_peers = [p[0] for p in peers if p[1].lower() == "down"]
        add_row("Diameter Peers",
                "⚠️ Some Down" if down_peers else "✅ All UP",
                f"Total: {total}, Down: {len(down_peers)} ({', '.join(down_peers) if down_peers else 'None'})")

    # Replication Status
    elif "replication" in section_name:
        ok = len(re.findall(r"Running.*Yes", content, re.I))
        failed = len(re.findall(r"(No|Error)", content, re.I))
        add_row("Replication Status",
                "⚠️ Issue" if failed else "✅ OK",
                f"Running: {ok}, Failed: {failed}")

    # Pod Restarts (24h)
    elif "restart" in section_name:
        now = datetime.now()
        restarts = []
        for pod, t in re.findall(r'(\S+)\s+(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})', content):
            try:
                restart_time = datetime.strptime(t, "%Y-%m-%dT%H:%M:%S")
                if (now - restart_time) <= timedelta(hours=24):
                    restarts.append((pod, t))
            except ValueError:
                pass
        if restarts:
            add_row("Pod Restarts (24h)",
                    f"⚠️ {len(restarts)} Found",
                    ", ".join([f"{p[0]} @ {p[1]}" for p in restarts[:100]]) + (" ..." if len(restarts) > 100 else ""))
        else:
            add_row("Pod Restarts (24h)", "✅ None", "No recent restarts")

    # Node Status
    elif "node status" in section_name:
        nodes = re.findall(r"(mdc\S+).*?(Ready|NotReady)", content)
        ready = [n for n in nodes if "Ready" in n[1]]
        not_ready = [n for n in nodes if "NotReady" in n[1]]
        add_row("Node Status",
                "⚠️ Not Ready" if not_ready else "✅ All Ready",
                f"Total: {len(nodes)}, Ready: {len(ready)}, NotReady: {len(not_ready)} ({', '.join([n[0] for n in not_ready])})")

    # Pod CPU Status
    elif "pod cpu status" in section_name:
        rows = re.findall(r"(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)m\s+(\d+)", content)
        cpu_vals = [int(r[4]) for r in rows]
        mem_vals = [int(r[5]) for r in rows]
        avg_cpu = sum(cpu_vals)/len(cpu_vals) if cpu_vals else 0
        avg_mem = sum(mem_vals)/len(mem_vals) if mem_vals else 0
        high_cpu = [(r[1], r[4]) for r in rows if int(r[4]) > avg_cpu]
        high_mem = [(r[1], r[5]) for r in rows if int(r[5]) > avg_mem]
        add_row("Pod CPU/Memory Usage",
                "✅ Normal" if not high_cpu and not high_mem else "⚠️ High Usage",
                f"Pods: {len(rows)}, Above Avg CPU: {len(high_cpu)}, Above Avg MEM: {len(high_mem)}")

    # Flush-max-ms
    elif "flush-max-ms" in section_name:
        flush = [int(v) for v in re.findall(r"(\d+)", content)]
        high = [v for v in flush if v > 1000]
        add_row("Flush-Max-MS", "⚠️ High Latency" if high else "✅ Normal",
                f"Count >1000ms: {len(high)}, Max: {max(flush) if flush else 0}ms")

    # Check Clock Skew AS
    elif "clock_skew_as" in section_name:
        skews = [int(v) for v in re.findall(r"skew-ms\s+(\d+)", content)]
        avg_skew = sum(skews)/len(skews) if skews else 0
        max_skew = max(skews) if skews else 0
        add_row("Clock Skew (AS)",
                "⚠️ High Skew" if max_skew > 100 else "✅ Normal",
                f"Avg: {avg_skew:.1f} ms, Max: {max_skew} ms, Samples: {len(skews)}")

    # Backup Status
    elif "check_backup_status" in section_name or "check_btel_backup_status" in section_name:
        total = re.search(r"total\s+([\d.]+[KMG]?)", content)
        dirs = re.findall(r"^drwx", content, re.M)
        add_row("Backup Status", "✅ OK", f"Dirs: {len(dirs)}, Size: {total.group(1) if total else 'Unknown'}")

    # Disk Space
    elif "diskspace" in section_name:
        disks = re.findall(r"(/\S+)\s+\d+\S*\s+\d+\S*\s+\d+\S*\s+(\d+)%", content)
        if disks:
            high_usage = [(m, int(u)) for m, u in disks if int(u) > 85]
            if high_usage:
                mounts_str = ", ".join([f"{m}: {u}%" for m, u in high_usage])
                add_row(section_name,
                        f"⚠️ {len(high_usage)} Mounts >85%",
                        mounts_str)
            else:
                add_row(section_name, "✅ OK", "All mounts below 85%")
        else:
            add_row(section_name, "ℹ️ No Data", "No mount info found")

    # A/C Device Dump Status
    elif "check_a/c_device_dump_status" in section_name:
        folders = re.findall(r"^drwx", content, re.M)
        total_size = re.search(r"total\s+([\d.]+[KMG]?)", content)
        size_str = total_size.group(1) if total_size else "Unknown"
        add_row("A/C Device Dump Status", "✅ OK", f"Folders: {len(folders)}, Total Size: {size_str}")

    # EDR / CDR Generation
    elif "edr_status" in section_name or "cdr_status" in section_name:
        files = re.findall(r"\.EDR\.gz|\.CDR\.gz", content)
        total_size = re.search(r"total\s+([\d.]+[KMG]?)", content)
        size_str = total_size.group(1) if total_size else "Unknown"
        add_row(
            "EDR Generation" if "edr" in section_name else "CDR Generation",
            "✅ OK" if files else "⚠️ None Found",
            f"Files: {len(files)}, Size: {size_str}"
        )

    # Cron Jobs
    elif "cronjob" in section_name:
        jobs = re.findall(r"^\S+\s+\S+\s+\S+\s+(True|False)", content, re.M)
        suspended = [j for j in jobs if j.lower() == "true"]
        add_row("Cron Jobs",
                "⚠️ Suspended Jobs" if suspended else "✅ OK",
                f"Total: {len(jobs)}, Suspended: {len(suspended)}")

    else:
        add_row(section_name, "ℹ️ Parsed", f"{len(up_items)} UP, {len(down_items)} DOWN")

# --- STEP 3: Display Results as HTML Table ---
html_table = "<table>\n"
html_table += "  <tr><th>Section</th><th>Status</th><th>Remarks</th></tr>\n"
for row in results:
    html_table += "  <tr>\n"
    for item in row:
        html_table += f"    <td>{item}</td>\n"
    html_table += "  </tr>\n"
html_table += "</table>"

from IPython.display import display, HTML
display(HTML(html_table))

Section,Status,Remarks
SPS Status,⚠️ Issues,"907 UP, 1 DOWN (Admin)"
DB Stats,✅ OK,"116/116 nodes responding, 390 record sets found"
XDR Status,✅ OK,"State: UP, Avg lag 0.5 ms, Avg latency 86.5 ms"
Pod XDR Status,ℹ️ No Data,No pod lag/latency entries
Aerospike Restart Status,✅ None,No recent restarts
Pod Restarts (24h),⚠️ 3 Found,"ig-admincli-5c76956f79-h9jm2 @ 2025-12-11T06:38:24, get-dump-cron-29424780-ddlh6 @ 2025-12-11T21:00:00, rms-server-56f6994ff4-d645c @ 2025-12-12T02:17:18"
check iohd/cm/ltm/ nosw,ℹ️ Parsed,"0 UP, 0 DOWN"
aerospike pods memory status,ℹ️ Parsed,"0 UP, 0 DOWN"
SPS Status,⚠️ Issues,"907 UP, 1 DOWN (Admin)"
Diameter Peers,✅ All UP,"Total: 0, Down: 0 (None)"


In [2]:
import re
from collections import defaultdict
from prettytable import PrettyTable

# --- Read log text safely ---
with open("/content/MED_HC_12122025.txt", encoding="utf-8", errors="ignore") as f:
    text = f.read()

results = []

# --- Cluster Stack ---
if re.search(r'pacemaker|corosync', text, re.I):
    results.append(["Cluster Stack", "✅ Active", "Pacemaker & Corosync running fine"])
else:
    results.append(["Cluster Stack", "❌ Inactive", "Cluster services not running"])

# --- Node Health ---
nodes = re.findall(r'(?:node\d+|10\.\d+\.\d+\.\d+)', text)
unique_nodes = sorted(set(nodes))
count_online = len(unique_nodes)

if unique_nodes:
    results.append([
        "Node Health",
        f"✅ {count_online} node(s) online",
        ", ".join(unique_nodes)
    ])
else:
    results.append([
        "Node Health",
        "⚠️ Unknown",
        "No node info found"
    ])

# --- Resources ---
if re.search(r'Started|running', text, re.I):
    results.append(["Resources", "✅ All started", "Including VIPs and LVs"])
else:
    results.append(["Resources", "⚠️ Some stopped", "Check resource list"])

# --- Storage ---
node_blocks = re.split(r'(?=\b(?:node\d+|10\.\d+\.\d+\.\d+)\b)', text)
high_usage_overall = []

for block in node_blocks:
    node_match = re.search(r'(node\d+|10\.\d+\.\d+\.\d+)', block)
    node_name = node_match.group(1) if node_match else "Unknown"

    # Find mount points and usage %, ignoring tmpfs/devtmpfs
    disk_usage = re.findall(r'(\S+)\s+\d+\S*\s+\d+\S*\s+\d+\S*\s+(\d+)%', block)
    for m, u in disk_usage:
        if m.startswith("/dev"):
            usage = int(u)
            if usage > 85:
                high_usage_overall.append((node_name, m, usage))

if high_usage_overall:
    nodes_high = sorted(set(node for node, _, _ in high_usage_overall))
    count_nodes = len(nodes_high)
    mounts_summary = ", ".join([f"{m} {u}% on {node}" for node, m, u in high_usage_overall])
    results.append([
        "Storage",
        f"⚠️ {count_nodes} node(s) above 85%",
        mounts_summary
    ])
else:
    results.append(["Storage", "✅ OK", "All mounts below 85% usage"])

# --- Historical Issues (Fence Failures) ---
failure_count = defaultdict(int)
failure_nodes = defaultdict(list)
pattern = r"reboot of (\S+) failed"

for line in text.splitlines():
    match = re.search(pattern, line)
    if match:
        node = match.group(1)
        failure_count[node] += 1
        failure_nodes[node].append(line.strip())

if failure_count:
    total_failures = sum(failure_count.values())
    nodes_summary = ", ".join(f"{node} ({count})" for node, count in failure_count.items())
    results.append([
        "Historical Issues",
        f"⚠️ {total_failures} failure(s)",
        nodes_summary
    ])
else:
    results.append([
        "Historical Issues",
        "✅ Clean",
        "No recent failures"
    ])

# --- Missing Package ---
if re.search(r'missing\s+pcs', text, re.I):
    results.append(["Missing Package", "⚠️ pcs missing", "Reinstall pcs if needed"])
else:
    results.append(["Missing Package", "✅ Installed", "All required packages present"])

# --- STEP 3: Display Results as HTML Table ---
html_table = "<table>\n"
html_table += "  <tr><th>Category</th><th>Status</th><th>Remarks</th></tr>\n"
for row in results:
    html_table += "  <tr>\n"
    for item in row:
        html_table += f"    <td>{item}</td>\n"
    html_table += "  </tr>\n"
html_table += "</table>"

from IPython.display import display, HTML
display(HTML(html_table))

Category,Status,Remarks
Cluster Stack,✅ Active,Pacemaker & Corosync running fine
Node Health,✅ 98 node(s) online,"10.4.24.11, 10.43.14.10, 10.43.14.11, 10.43.14.12, 10.43.14.13, 10.43.14.14, 10.43.14.15, 10.43.14.16, 10.43.14.2, 10.43.14.3, 10.43.14.32, 10.43.14.33, 10.43.14.34, 10.43.14.35, 10.43.14.36, 10.43.14.37, 10.43.14.38, 10.43.14.39, 10.43.14.4, 10.43.14.40, 10.43.14.41, 10.43.14.42, 10.43.14.43, 10.43.14.44, 10.43.14.45, 10.43.14.46, 10.43.14.47, 10.43.14.48, 10.43.14.49, 10.43.14.5, 10.43.14.50, 10.43.14.51, 10.43.14.52, 10.43.14.53, 10.43.14.54, 10.43.14.55, 10.43.14.56, 10.43.14.57, 10.43.14.58, 10.43.14.6, 10.43.14.7, 10.43.14.8, 10.43.14.9, 10.43.20.10, 10.43.20.11, 10.43.20.2, 10.43.20.3, 10.43.20.32, 10.43.20.33, 10.43.20.34, 10.43.20.35, 10.43.20.36, 10.43.20.37, 10.43.20.38, 10.43.20.39, 10.43.20.4, 10.43.20.40, 10.43.20.41, 10.43.20.42, 10.43.20.43, 10.43.20.44, 10.43.20.45, 10.43.20.46, 10.43.20.47, 10.43.20.48, 10.43.20.49, 10.43.20.5, 10.43.20.50, 10.43.20.51, 10.43.20.52, 10.43.20.53, 10.43.20.54, 10.43.20.55, 10.43.20.56, 10.43.20.6, 10.43.20.7, 10.43.20.8, 10.43.20.9, 10.43.21.130, 10.43.21.131, 10.43.21.132, 10.43.21.133, 10.43.21.134, 10.43.21.150, 10.43.21.151, 10.43.21.152, 10.43.21.153, 10.43.21.154, 10.43.21.155, 10.43.21.156, 10.43.21.157, 10.43.21.158, 10.43.21.159, 10.43.21.160, 10.43.21.161, 10.43.21.162, 10.43.21.163, 10.43.21.164"
Resources,✅ All started,Including VIPs and LVs
Storage,⚠️ 13 node(s) above 85%,"/dev/mapper/backupvg-backuplv 87% on 10.43.21.156, /dev/mapper/rootvg-root 94% on 10.43.21.162, /dev/mapper/rootvg-usr 97% on 10.43.21.162, /dev/mapper/rootvg-usr 97% on 10.43.21.163, /dev/mapper/rootvg-home 90% on 10.43.21.164, /dev/mapper/rootvg-home 90% on 10.43.14.32, /dev/mapper/rootvg-root 95% on 10.43.14.54, /dev/mapper/rootvg-root 95% on 10.43.14.56, /dev/mapper/rootvg-usr 97% on 10.43.14.56, /dev/mapper/rootvg-usr 97% on 10.43.14.57, /dev/mapper/rootvg-home 90% on 10.43.14.58, /dev/loop0 100% on 10.43.14.58, /dev/loop1 100% on 10.43.14.58, /dev/loop2 100% on 10.43.14.58, /dev/mapper/backupvg-backuplv 86% on 10.43.20.32, /dev/mapper/backupvg-backuplv 90% on 10.43.20.36, /dev/mapper/rootvg-usr 97% on 10.43.20.55, /dev/mapper/rootvg-home 90% on 10.43.20.56, /dev/loop0 100% on 10.43.20.56, /dev/loop1 100% on 10.43.20.56, /dev/loop2 100% on 10.43.20.56"
Historical Issues,⚠️ 1012 failure(s),"mdc1nmappp02.safaricomet.net (2), mdc1nmappp04.safaricomet.net (4), mdc1nmappa03.safaricomet.net (2), mdc1nmuia01.safaricomet.net (20), mdc1nmepccgfp02.safaricomet.net (868), mdc1nmepccgfp06.safaricomet.net (98), mdc1nmcgwdba01.safaricomet.net (10), mdc1nmnccappp10.safaricomet.net (6), mdc1nmnccdbp02.safaricomet.net (2)"
Missing Package,✅ Installed,All required packages present
