In [4]:
# JUPYTER NOTEBOOK CELL — Clean Rocks/Soils/Seds with lock-tolerant Excel reader

import os
import re
import time
import numpy as np
import pandas as pd

# =========================
# Paths / Settings
# =========================
BASE_FOLDER = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel"

# Input workbooks: one per sample medium, all located directly in BASE_FOLDER.
ROCKS_XLSX, SOILS_XLSX, SEDS_XLSX = (
    os.path.join(BASE_FOLDER, workbook_name)
    for workbook_name in (
        "NW-East_QLD_Geochem_rocks.xlsx",
        "NW-East_QLD_Geochem_Soils.xlsx",
        "NW-East_QLD_Geochem_Seds.xlsx",
    )
)

# Destination folder for the cleaned CSVs; created up front so later writes cannot
# fail on a missing directory.
OUTDIR = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NW-East_Isa"
os.makedirs(OUTDIR, exist_ok=True)

# Sheet selector per workbook (0-based index or sheet name). Change these if the
# data are not on the first sheet.
ROCKS_SHEET = SOILS_SHEET = SEDS_SHEET = 0

# Columns that are converted to numbers and then passed through the negative-value rules.
# Columns absent from a given workbook are skipped and reported.
NUMERIC_COLUMNS = [
    "Au", "Au1", "Au2", "Au3", "Au4",
    "Cu", "Pb", "Zn", "Ag", "As", "Bi", "Mo", "Mn", "Fe", "Ni", "Co", "Cr",
    "V", "Ba", "Cd", "Sn", "Sb", "Hg", "Te", "P", "W", "Zr", "Ti", "Mg",
    "Th", "U", "Pt", "Pd", "S", "F",
]

# =========================
# Helpers
# =========================
# Well-formed scientific notation, e.g. "1e-05", "1.5E3", "-2.e+4", ".5e2".
_SCI_NOTATION_RE = re.compile(r"[+-]?(?:\d+\.?\d*|\.\d+)[eE][+-]?\d+")


def normalize_number_string(s: str) -> str:
    """Normalize number-like text (preserves decimals) to help parsing.

    Steps, in order:
      1. ``None`` and float NaN become the empty string.
      2. Non-string values are converted with ``str()``.
      3. Surrounding whitespace and literal ``%`` are removed; the Unicode
         minus sign is replaced with an ASCII hyphen.
      4. If the text contains a comma but no dot, the comma is treated as the
         decimal separator.
      5. If the result is well-formed scientific notation it is returned
         unchanged, so the exponent survives.
      6. Otherwise everything except digits, dots and minus signs is dropped
         and only the first dot is kept.

    Args:
        s: Raw cell value (normally a string, but any object is accepted).

    Returns:
        A cleaned string intended for ``pd.to_numeric``; may be empty or still
        unparsable (e.g. ``"1-5"``), in which case the caller's coercion
        decides the outcome.
    """
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return ""
    if not isinstance(s, str):
        s = str(s)
    s = s.strip()
    s = s.replace("%", "").replace("−", "-")  # remove literal %; normalize unicode minus
    # If comma but no dot, treat comma as decimal separator
    if "," in s and "." not in s:
        s = s.replace(",", ".")

    # Scientific notation must be recognised BEFORE the character filter below:
    # stripping the "e" would turn "1.5E3" into "1.53" and "1e-05" into "1-05",
    # silently corrupting or dropping the value. str() of small/large floats
    # produces exactly this notation.
    candidate = s.strip()
    if _SCI_NOTATION_RE.fullmatch(candidate):
        return candidate

    # Keep digits, dots, and minus signs only
    s = re.sub(r"[^0-9.\-]+", "", s)
    # If multiple dots, keep first as decimal point
    if s.count(".") > 1:
        first = s.find(".")
        s = s[: first + 1] + s[first + 1:].replace(".", "")
    return s

def to_numeric(series: pd.Series) -> pd.Series:
    """Return *series* as numbers, cleaning each value's text first.

    Every element is passed through ``normalize_number_string``; anything
    that still cannot be parsed afterwards is coerced to NaN.
    """
    cleaned_text = series.map(normalize_number_string)
    return pd.to_numeric(cleaned_text, errors="coerce")

def apply_negative_rules_to_series(s: pd.Series) -> pd.Series:
    """
    Rewrite negative entries of a numeric column.

    Rules, applied in this order:
      - values below -99 are replaced with NaN
      - every remaining negative value (down to and including -99) is
        replaced with half of its absolute value, so it becomes positive

    Non-numeric entries are coerced to NaN first; zero, positive values and
    NaN pass through unchanged. The input's index is preserved.
    """
    values = pd.to_numeric(s, errors="coerce")

    # Step 1: blank out everything below the -99 cut-off.
    below_cutoff = values < -99
    values = values.where(~below_cutoff, np.nan)

    # Step 2: whatever is still negative now lies in [-99, 0); flip and halve it.
    still_negative = (values >= -99) & (values < 0)
    halved = values.abs() / 2
    return values.where(~still_negative, halved)

def make_new_hole_id(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a ``New_Hole_ID`` column built as
    Prospect_Code + '_' + Company + '_' + Sheet_Number + '_' + tail.

    ``tail`` is the ``Hole_ID`` column when the frame has one, otherwise the
    ``Sheet_Number`` column. Any of the three leading columns that is missing
    is created as blanks (with a warning). All parts are blank-filled,
    stringified and stripped; in the joined ID, whitespace runs become '_',
    repeated '_' collapse to one, and leading/trailing '_' are removed.

    The frame is modified in place and also returned.
    """
    def _as_clean_text(column: pd.Series) -> pd.Series:
        return column.fillna("").astype(str).str.strip()

    for col in ("Prospect_Code", "Company", "Sheet_Number"):
        if col not in df.columns:
            print(f"[Warn] Missing column '{col}', using blanks.")
            df[col] = ""
        df[col] = _as_clean_text(df[col])

    if "Hole_ID" in df.columns:
        tail = _as_clean_text(df["Hole_ID"])
    else:
        tail = df["Sheet_Number"]

    raw_id = df["Prospect_Code"] + "_" + df["Company"] + "_" + df["Sheet_Number"] + "_" + tail
    underscored = raw_id.str.replace(r"\s+", "_", regex=True)
    collapsed = underscored.str.replace(r"_+", "_", regex=True)
    df["New_Hole_ID"] = collapsed.str.strip("_")
    return df

def read_excel_safely(path: str, sheet, attempts: int = 5, delay_s: float = 2.0) -> pd.DataFrame:
    """
    Robust Excel loader:
      1) Try pandas.read_excel(engine='openpyxl') with every cell read as text
      2) On PermissionError, wait ``delay_s`` seconds and retry, up to
         ``attempts`` tries in total; any other error skips straight to step 3
      3) Fallback: openpyxl read-only mode -> build DataFrame from the rows

    Args:
        path: Path of the workbook to read.
        sheet: Sheet selector; an ``int`` is a 0-based position, anything
            else is used as the sheet name.
        attempts: Maximum number of pandas read attempts before falling back.
        delay_s: Pause between attempts after a PermissionError, in seconds.

    Returns:
        DataFrame of the sheet. In the fallback path the first row supplies
        the column names (blank headers become ``Unnamed_<position>``),
        non-null cells are converted to ``str`` and null cells are ``None``.
        An empty sheet yields an empty DataFrame.

    Raises:
        PermissionError: If the fallback also fails for any reason. The
            original exception is attached as ``__cause__``.
    """
    # 1 & 2: pandas with retries
    for attempt in range(attempts):
        try:
            return pd.read_excel(path, sheet_name=sheet, dtype=str, engine="openpyxl")
        except PermissionError:
            if attempt < attempts - 1:
                print(f"[Warn] Permission denied opening {os.path.basename(path)}. "
                      f"Close Excel/OneDrive sync on the file if open. Retrying in {delay_s:.0f}s...")
                time.sleep(delay_s)
            else:
                print("[Warn] Falling back to openpyxl read-only mode...")
        except Exception as e:
            # Other errors -> break to fallback
            print(f"[Warn] read_excel failed ({type(e).__name__}: {e}). Falling back...")
            break

    # 3) Fallback: openpyxl read-only
    try:
        import openpyxl
        wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
        try:
            ws = wb[wb.sheetnames[sheet]] if isinstance(sheet, int) else wb[sheet]
            rows_iter = ws.values
            headers = next(rows_iter, None)
            if headers is None:
                # Sheet has no rows at all: report it as empty data rather than
                # letting StopIteration be mislabelled as a locked file.
                return pd.DataFrame()
            df = pd.DataFrame(rows_iter, columns=[str(h).strip() if h is not None else f"Unnamed_{i}"
                                                  for i, h in enumerate(headers)])
        finally:
            # Always release the file handle, even when reading the sheet fails.
            wb.close()
        # Ensure string dtype to match our downstream expectations
        for c in df.columns:
            df[c] = df[c].astype(str).where(df[c].notna(), None)
        return df
    except Exception as e:
        # Keep the PermissionError type callers may already catch, but chain the
        # real cause so the traceback shows what actually went wrong.
        raise PermissionError(
            f"Unable to read '{path}'. It may be locked by Excel/OneDrive, or inaccessible.\n"
            f"Details: {type(e).__name__}: {e}"
        ) from e

def clean_file(input_xlsx: str, sheet, output_csv: str):
    """Clean one workbook sheet and write the result as CSV.

    Pipeline: load the sheet with ``read_excel_safely``, add ``New_Hole_ID``
    via ``make_new_hole_id``, convert every column listed in
    ``NUMERIC_COLUMNS`` that exists in the data to numbers and apply the
    negative-value rules, then save to ``output_csv`` without the index.

    If ``input_xlsx`` does not exist an error is printed and nothing is
    written. Listed numeric columns absent from the sheet are skipped and
    reported. Returns ``None`` in all cases.
    """
    # Guard clause: nothing to do without an input file.
    if not os.path.exists(input_xlsx):
        print(f"[Error] File not found: {input_xlsx}")
        return

    df = read_excel_safely(input_xlsx, sheet)
    print(f"[Info] Loaded {os.path.basename(input_xlsx)} → {df.shape}")

    # The ID is built before any numeric conversion touches the frame.
    df = make_new_hole_id(df)

    # Split the configured columns into those we can clean and those to report.
    present_cols, missing_cols = [], []
    for col in NUMERIC_COLUMNS:
        (present_cols if col in df.columns else missing_cols).append(col)

    for col in present_cols:
        as_numbers = to_numeric(df[col])
        df[col] = apply_negative_rules_to_series(as_numbers)

    if missing_cols:
        print(f"[Note] Missing numeric columns in {os.path.basename(input_xlsx)} (skipped): {missing_cols}")

    df.to_csv(output_csv, index=False)
    print(f"[Saved] {output_csv}")

# =========================
# Run for Rocks / Soils / Sediments
# =========================
# Process the three workbooks in a fixed order: rocks, soils, then sediments.
# Each cleaned CSV is written into OUTDIR.
for workbook_path, sheet_selector, cleaned_name in (
    (ROCKS_XLSX, ROCKS_SHEET, "All_QLD_Geochem_rocks_cleaned.csv"),
    (SOILS_XLSX, SOILS_SHEET, "All_QLD_Geochem_Soils_cleaned.csv"),
    (SEDS_XLSX, SEDS_SHEET, "All_QLD_Geochem_Seds_cleaned.csv"),
):
    clean_file(workbook_path, sheet_selector, os.path.join(OUTDIR, cleaned_name))


[Info] Loaded NW-East_QLD_Geochem_rocks.xlsx → (71732, 119)
[Note] Missing numeric columns in NW-East_QLD_Geochem_rocks.xlsx (skipped): ['Au2', 'Au3', 'Au4', 'F']
[Saved] C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NW-East_Isa\All_QLD_Geochem_rocks_cleaned.csv
[Info] Loaded NW-East_QLD_Geochem_Soils.xlsx → (337085, 104)
[Note] Missing numeric columns in NW-East_QLD_Geochem_Soils.xlsx (skipped): ['Au2', 'Au3', 'Au4', 'F']
[Saved] C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NW-East_Isa\All_QLD_Geochem_Soils_cleaned.csv
[Info] Loaded NW-East_QLD_Geochem_Seds.xlsx → (131128, 108)
[Note] Missing numeric columns in NW-East_QLD_Geochem_Seds.xlsx (skipped): ['Au2', 'Au3', 'Au4', 'F']
[Saved] C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NW-East_Isa\All_QLD_Geochem_Seds_cleaned.csv
