In [1]:
# deepest_samples_per_hole.py
import os
import pandas as pd

# --- Paths ---
# Absolute, machine-specific location of the raw drilling CSV; edit this before
# running the cell on another machine.
INPUT_CSV = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\00_Projects\99_IsaNorth_CaseStudy\Drilling_Data.csv"
# The cleaned workbook is written next to the input CSV (same directory).
OUTPUT_XLSX = os.path.join(os.path.dirname(INPUT_CSV), "Drilling_Data_cleaned.xlsx")

def main():
    """Extract the deepest sample row of every hole and save it to Excel.

    Reads ``INPUT_CSV``, keeps one row per hole (the row with the largest
    numeric ``Depth``; on ties the first occurrence wins), sorts the result by
    hole name and writes it to the ``DeepestSamples`` sheet of ``OUTPUT_XLSX``.

    Raises:
        FileNotFoundError: if ``INPUT_CSV`` does not exist.
        KeyError: if neither hole-name column nor the ``Depth`` column exists.
    """
    if not os.path.exists(INPUT_CSV):
        raise FileNotFoundError(f"Input CSV not found: {INPUT_CSV}")

    samples = pd.read_csv(INPUT_CSV)

    # Prefer 'New_Hole_I' (requested); accept 'New_Hole_ID' as a fallback.
    hole_col = None
    for candidate in ('New_Hole_I', 'New_Hole_ID'):
        if candidate in samples.columns:
            hole_col = candidate
            break
    if hole_col is None:
        raise KeyError("Hole name column not found. Expected 'New_Hole_I' (or 'New_Hole_ID').")
    if 'Depth' not in samples.columns:
        raise KeyError("Depth column not found. Expected 'Depth'.")

    # Non-numeric depths become NaN and are discarded together with rows that
    # have no hole name, so idxmax below never sees missing values.
    samples['Depth'] = pd.to_numeric(samples['Depth'], errors='coerce')
    samples = samples.dropna(subset=[hole_col, 'Depth'])

    # Row label of the maximum depth within each hole.
    deepest_labels = samples.groupby(hole_col)['Depth'].idxmax()

    # Select those rows and order them by hole name for readability.
    deepest = (
        samples.loc[deepest_labels]
        .sort_values(by=hole_col)
        .reset_index(drop=True)
    )

    with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as writer:
        deepest.to_excel(writer, index=False, sheet_name="DeepestSamples")

    summary = (f"Saved {len(deepest)} deepest samples "
               f"(unique holes) to:\n{OUTPUT_XLSX}")
    print(summary)

# Entry point: runs when executed as a script or as a notebook cell
# (where __name__ is "__main__"), but not when imported as a module.
if __name__ == "__main__":
    main()


Saved 2918 deepest samples (unique holes) to:
C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\00_Projects\99_IsaNorth_CaseStudy\Drilling_Data_cleaned.xlsx


In [2]:
# drilling_stats.py
import os
import pandas as pd
import numpy as np

# --- Paths ---
# Absolute, machine-specific location of the one-row-per-hole workbook written
# by the previous cell; edit this before running on another machine.
INPUT_XLSX = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\00_Projects\99_IsaNorth_CaseStudy\Drilling_Data_cleaned.xlsx"
# The stats report is written next to the input workbook.
# NOTE: this rebinds the OUTPUT_XLSX name that the earlier cell also defines,
# so after this cell runs the global no longer points at the cleaned workbook.
OUTPUT_XLSX = os.path.join(os.path.dirname(INPUT_XLSX), "Drilling_Data_stats.xlsx")

# --- Config ---
DRILL_COL = "Drilling_T"   # drilling type column
DEPTH_COL = "Depth"        # depth column
# hole name column (support both spellings just in case); the first candidate
# present in the data is the one used.
HOLE_COL_CANDIDATES = ["New_Hole_I", "New_Hole_ID"]

def normalize_drill_type(s: pd.Series) -> pd.Series:
    """Normalise drilling-type labels.

    Labels are trimmed and upper-cased, 'PERC' is merged into 'RAB', and any
    missing value is reported as 'UNKNOWN'.

    A value counts as missing when it is a real null (None / NaN / pd.NA) or
    when, after trimming, it is empty or one of the textual null placeholders
    'nan', 'none' or '<NA>' in any letter case.

    Args:
        s: Series of raw drilling-type labels (any dtype).

    Returns:
        Series of normalised labels with the same index as ``s``.
    """
    # Record real nulls BEFORE stringifying: astype(str) would turn them into
    # ordinary text ("nan", "None", "<NA>") that is easy to miss afterwards.
    missing = s.isna()

    cleaned = s.astype(str).str.strip().str.upper()

    # Also treat blank cells and stringified null placeholders as missing.
    missing = missing | cleaned.isin(["", "NAN", "NONE", "<NA>"])

    # Merge logic
    cleaned = cleaned.replace({"PERC": "RAB"})  # requested merge

    return cleaned.mask(missing, "UNKNOWN")

def q(series, p):
    """Return the ``p``-quantile of ``series``, or NaN when it is empty."""
    if len(series) == 0:
        return np.nan
    return series.quantile(p)

def main():
    """Build the depth statistics report for the cleaned drilling workbook.

    Reads ``INPUT_XLSX``, normalises the drilling type (PERC merged into RAB),
    then computes per-class depth statistics, overall depth statistics, the
    class distribution and the ten deepest rows. Rounded versions are printed;
    the unrounded tables are written to ``OUTPUT_XLSX``, one sheet each.

    Raises:
        FileNotFoundError: if ``INPUT_XLSX`` does not exist.
        KeyError: if the hole, drilling-type or depth column is missing.
    """
    if not os.path.exists(INPUT_XLSX):
        raise FileNotFoundError(f"Input Excel not found: {INPUT_XLSX}")

    data = pd.read_excel(INPUT_XLSX)

    # Use the first hole-name spelling that is present in the workbook.
    hole_col = None
    for candidate in HOLE_COL_CANDIDATES:
        if candidate in data.columns:
            hole_col = candidate
            break
    if hole_col is None:
        raise KeyError(f"Hole column not found. Looked for: {HOLE_COL_CANDIDATES}")
    if DRILL_COL not in data.columns:
        raise KeyError(f"Drilling type column '{DRILL_COL}' not found.")
    if DEPTH_COL not in data.columns:
        raise KeyError(f"Depth column '{DEPTH_COL}' not found.")

    # Non-numeric depths become NaN; rows lacking a hole name or a depth are dropped.
    data[DEPTH_COL] = pd.to_numeric(data[DEPTH_COL], errors="coerce")
    data = data.dropna(subset=[hole_col, DEPTH_COL])

    # Normalised drilling type (PERC -> RAB) is the grouping key below.
    data["Drilling_T_norm"] = normalize_drill_type(data[DRILL_COL])

    # --- Per-class statistics ---
    # "holes" is a row count; it equals the number of holes only if the input
    # workbook holds one row per hole.
    depth_by_class = data.groupby("Drilling_T_norm", dropna=False)[DEPTH_COL]
    per_class = depth_by_class.agg(
        holes="count",
        mean_depth="mean",
        median_depth="median",
        std_depth="std",
        min_depth="min",
        p25_depth=lambda s: q(s, 0.25),
        p75_depth=lambda s: q(s, 0.75),
        max_depth="max",
    )
    per_class = per_class.sort_values("mean_depth", ascending=False)

    # --- Overall statistics ---
    depth = data[DEPTH_COL]
    metric_names = [
        "n_holes", "mean_depth", "median_depth", "std_depth",
        "min_depth", "p25_depth", "p75_depth", "max_depth",
    ]
    metric_values = [
        int(data.shape[0]),
        depth.mean(),
        depth.median(),
        depth.std(),
        depth.min(),
        q(depth, 0.25),
        q(depth, 0.75),
        depth.max(),
    ]
    overall = pd.DataFrame({"metric": metric_names, "value": metric_values})

    # --- Extra: class counts and proportions ---
    class_counts = per_class[["holes"]].rename(columns={"holes": "count"})
    total_count = class_counts["count"].sum()
    class_counts["proportion_%"] = class_counts["count"] / total_count * 100

    # --- Extra: Top 10 deepest holes overall (for quick QA) ---
    wanted_cols = [hole_col, DRILL_COL, "Drilling_T_norm", DEPTH_COL]
    shown_cols = [name for name in wanted_cols if name in data.columns]
    deepest_first = data.sort_values(DEPTH_COL, ascending=False)
    top10 = deepest_first.head(10)[shown_cols].reset_index(drop=True)

    # Rounded copies are for console display only; the Excel report below
    # receives the full-precision tables.
    depth_stat_cols = [
        "mean_depth", "median_depth", "std_depth", "min_depth",
        "p25_depth", "p75_depth", "max_depth",
    ]
    per_class_rounded = per_class.round({name: 2 for name in depth_stat_cols})
    overall_rounded = overall.assign(value=overall["value"].round(2))
    class_counts_rounded = class_counts.round({"proportion_%": 2})

    # Print to console
    console_sections = [
        ("\n=== Per-class depth statistics (PERC merged into RAB) ===", per_class_rounded),
        ("\n=== Overall depth statistics ===", overall_rounded),
        ("\n=== Class distribution ===", class_counts_rounded),
        ("\n=== Top 10 deepest holes (overall) ===", top10),
    ]
    for heading, table in console_sections:
        print(heading)
        print(table)

    # Write Excel report. Tables indexed by drilling class keep their index;
    # the other two are written without the positional index.
    with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as writer:
        per_class.to_excel(writer, sheet_name="Per_Class_Stats")
        overall.to_excel(writer, sheet_name="Overall_Stats", index=False)
        class_counts.to_excel(writer, sheet_name="Class_Distribution")
        top10.to_excel(writer, sheet_name="Top10_Deepest", index=False)

    print(f"\nSaved stats report to:\n{OUTPUT_XLSX}")

# Entry point: runs when executed as a script or as a notebook cell
# (where __name__ is "__main__"), but not when imported as a module.
if __name__ == "__main__":
    main()



=== Per-class depth statistics (PERC merged into RAB) ===
                 holes  mean_depth  median_depth  std_depth  min_depth  \
Drilling_T_norm                                                          
DD                  73      391.50        349.00     219.25      70.20   
ACORE                1      125.00        125.00        NaN     125.00   
REVC                66       94.20         80.00      42.48       6.00   
RAB               2579       11.54          5.00      22.39       0.80   
AUGER              199        3.27          1.39       2.87       1.39   

                 p25_depth  p75_depth  max_depth  
Drilling_T_norm                                   
DD                  202.90      575.5    1075.00  
ACORE               125.00      125.0     125.00  
REVC                 60.00      120.0     198.00  
RAB                   5.00        9.0     288.00  
AUGER                 1.39        6.1      18.29  

=== Overall depth statistics ===
         metric    value
0     