In [1]:
# JUPYTER NOTEBOOK CELL — Clean NWQLD_Geochem_Seds.csv to NWQLD_Geochem_Seds_cleaned.csv

import os
import re
import numpy as np
import pandas as pd

# =========================
# Paths
# =========================
# Absolute Windows paths written as raw strings so the backslashes stay literal.
# The cleaned file goes to the same folder as the input, with a "_cleaned" suffix.
INPUT_CSV  = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NWQLD\NWQLD_Geochem_Seds.csv"
OUTPUT_CSV = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NWQLD\NWQLD_Geochem_Seds_cleaned.csv"

# Optional delimiter / encoding hints
# NOTE: when CSV_SEP is None, read_csv_fallback does not pass `sep` at all, so
# pandas falls back to its default comma delimiter (the delimiter is not sniffed).
CSV_SEP = None        # e.g., ";" if semicolon-delimited; None → pandas default (",")
UTF8_FIRST = True     # try UTF-8 first, then latin1 if it fails; False reads latin1 directly

# Columns to convert to numbers + apply negative rules.
# Names that are not present in the loaded CSV are skipped and listed in a [Note] message.
NUMERIC_COLUMNS = [
    "Au","Au1","Au2","Au3","Au4","Cu","Pb","Zn","Ag","As","Bi","Mo","Mn","Fe","Ni","Co","Cr",
    "V","Ba","Cd","Sn","Sb","Hg","Te","P","W","Zr","Ti","Mg","Th","U","Pt","Pd","S","F"
]

# =========================
# Helpers
# =========================
def normalize_number_string(s: str) -> str:
    """Normalize number-like text so that ``pd.to_numeric`` can parse it.

    Steps, in order:
      1. ``None`` / float NaN → ``""``; any other non-string is passed through ``str()``.
      2. Surrounding whitespace and literal ``%`` are removed; the unicode
         minus (U+2212) becomes an ASCII ``-``.
      3. A comma is treated as the decimal separator only when the text
         contains no dot (``"12,5"`` → ``"12.5"``).
      4. Well-formed scientific notation (``"1.5E-3"``, ``"1e5"``) is returned
         as-is. Without this step the character filter below would drop the
         exponent marker and silently corrupt the value.
      5. Everything except digits, ``.`` and ``-`` is stripped, and any dots
         after the first one are removed.

    Args:
        s: Raw cell value (normally a string; ``None``, NaN and numbers are accepted).

    Returns:
        The cleaned text. It may still be unparseable (e.g. ``""`` or ``"-"``);
        callers are expected to coerce with ``errors="coerce"``.

    Note:
        Qualifier characters such as ``<`` or ``>`` are discarded, so
        ``"<0.01"`` becomes ``"0.01"``.
    """
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return ""
    if not isinstance(s, str):
        s = str(s)
    s = s.strip()
    s = s.replace("%", "").replace("−", "-")  # drop literal %; normalize unicode minus
    if "," in s and "." not in s:              # comma as decimal
        s = s.replace(",", ".")

    # Scientific notation must bypass the character filter, which would turn
    # "1.5E-3" into "1.5-3" (unparseable) and "1e5" into "15" (wrong value).
    candidate = s.strip()
    if re.fullmatch(r"[+-]?(?:\d+\.?\d*|\.\d+)[eE][+-]?\d+", candidate):
        return candidate

    s = re.sub(r"[^0-9.\-]+", "", s)           # keep digits, dot, minus
    if s.count(".") > 1:                       # collapse multiple dots
        first = s.find(".")
        s = s[: first + 1] + s[first + 1:].replace(".", "")
    return s

def to_numeric(series: pd.Series) -> pd.Series:
    """Clean every element with ``normalize_number_string``, then coerce to numbers.

    Values that still cannot be parsed after cleaning become NaN.
    """
    cleaned = series.map(normalize_number_string)
    return pd.to_numeric(cleaned, errors="coerce")

def apply_negative_rules_to_series(s: pd.Series) -> pd.Series:
    """
    Replace negative entries according to two rules:
      - values below -99 are blanked out (NaN)
      - remaining negatives (from -99 up to, but excluding, 0) become
        half of their absolute value
    Non-numeric entries are coerced to NaN first; values >= 0 pass through.
    """
    values = pd.to_numeric(s, errors="coerce")

    # Rule 1: anything below -99 is dropped.
    too_low = values < -99
    values = values.where(~too_low, np.nan)

    # Rule 2: the negatives that survived rule 1 are flipped and halved.
    small_negative = (values >= -99) & (values < 0)
    halved = values.abs() / 2
    return values.where(~small_negative, halved)

def make_new_hole_id(df: pd.DataFrame) -> pd.DataFrame:
    """
    New_Hole_ID = Prospect_Code + '_' + Company + '_' + Sheet_Number + '_' + (Hole_ID or Sheet_Number)

    The last segment is chosen per row: the stripped ``Hole_ID`` when it is
    non-empty, otherwise that row's ``Sheet_Number``. ``Sheet_Number`` is also
    used for every row when the ``Hole_ID`` column does not exist. Without the
    per-row fallback, rows with a blank ``Hole_ID`` would get an ID with one
    segment fewer than all other rows.

    After concatenation, whitespace runs are replaced by ``_``, repeated
    underscores are collapsed, and leading/trailing underscores are removed.

    Args:
        df: Frame to annotate. It is modified in place: ``Prospect_Code``,
            ``Company`` and ``Sheet_Number`` are created as blank columns if
            missing (with a warning), NaN-filled with ``""``, cast to ``str``
            and stripped; ``New_Hole_ID`` is added.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for col in ("Prospect_Code", "Company", "Sheet_Number"):
        if col not in df.columns:
            print(f"[Warn] Missing column '{col}', using blanks.")
            df[col] = ""
        df[col] = df[col].fillna("").astype(str).str.strip()

    if "Hole_ID" in df.columns:
        hole_id = df["Hole_ID"].fillna("").astype(str).str.strip()
        # Row-wise fallback: blank Hole_ID values borrow the Sheet_Number.
        tail = hole_id.where(hole_id != "", df["Sheet_Number"])
    else:
        tail = df["Sheet_Number"]

    combined = df["Prospect_Code"] + "_" + df["Company"] + "_" + df["Sheet_Number"] + "_" + tail
    df["New_Hole_ID"] = (
        combined.str.replace(r"\s+", "_", regex=True)
        .str.replace(r"_+", "_", regex=True)
        .str.strip("_")
    )
    return df

def read_csv_fallback(path: str, sep=None, utf8_first=True) -> pd.DataFrame:
    """Load a CSV with every column as text, falling back from UTF-8 to latin1.

    Args:
        path: Location of the CSV file.
        sep: Delimiter forwarded to ``pd.read_csv``; when ``None`` the argument
            is omitted and pandas applies its own default.
        utf8_first: If true, attempt UTF-8 and retry with latin1 only on a
            ``UnicodeDecodeError``; if false, read as latin1 straight away.

    Returns:
        The loaded DataFrame (all columns read with ``dtype=str``).
    """
    read_opts = {"dtype": str, "low_memory": False}
    if sep is not None:
        read_opts["sep"] = sep

    # Guard clause: caller asked to skip the UTF-8 attempt entirely.
    if not utf8_first:
        return pd.read_csv(path, encoding="latin1", **read_opts)

    try:
        frame = pd.read_csv(path, encoding="utf-8", **read_opts)
    except UnicodeDecodeError:
        print("[Warn] UTF-8 decode failed — retrying with 'latin1' encoding.")
        return pd.read_csv(path, encoding="latin1", **read_opts)
    return frame

# =========================
# Load → Clean → Save
# =========================
# Make sure the output folder exists. dirname() is "" when OUTPUT_CSV is a bare
# filename, and os.makedirs("") raises, so only create a folder when there is one.
out_dir = os.path.dirname(OUTPUT_CSV)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

df = read_csv_fallback(INPUT_CSV, sep=CSV_SEP, utf8_first=UTF8_FIRST)
print(f"[Info] Loaded {os.path.basename(INPUT_CSV)} → {df.shape}")

# Build New_Hole_ID first (before numeric coercion), while every column is still text
df = make_new_hole_id(df)

# Convert only the specified numeric columns that exist, then apply negative rules
present_cols = [c for c in NUMERIC_COLUMNS if c in df.columns]
missing_cols = [c for c in NUMERIC_COLUMNS if c not in df.columns]
for col in present_cols:
    df[col] = apply_negative_rules_to_series(to_numeric(df[col]))

if missing_cols:
    print(f"[Note] Missing numeric columns (skipped): {missing_cols}")

# Save cleaned CSV (row index is not written)
df.to_csv(OUTPUT_CSV, index=False)
print(f"[Saved] {OUTPUT_CSV}")


[Warn] UTF-8 decode failed — retrying with 'latin1' encoding.
[Info] Loaded NWQLD_Geochem_Seds.csv → (84977, 108)
[Note] Missing numeric columns (skipped): ['Au2', 'Au3', 'Au4', 'F']
[Saved] C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NWQLD\NWQLD_Geochem_Seds_cleaned.csv
