In [2]:
# JUPYTER NOTEBOOK CELL — Clean numeric columns in NWQLD_Geochem_Soils.csv

import os
import re
import numpy as np
import pandas as pd

# -------------------------------------------------------------------
# Input / output locations
# -------------------------------------------------------------------
FOLDER = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NWQLD"
INPUT_FILE = os.path.join(FOLDER, "NWQLD_Geochem_Soils.csv")
OUTPUT_FILE = os.path.join(FOLDER, "NWQLD_Geochem_Soils_cleaned.csv")

# -------------------------------------------------------------------
# CSV reading options
# -------------------------------------------------------------------
# CSV_SEP: delimiter handed to pd.read_csv as `sep` (e.g. ";").
#   When None, no `sep` is passed, so pandas falls back to its default
#   delimiter (a comma) — it does not sniff the delimiter.
CSV_SEP = None
# CSV_UTF8_FIRST: True  -> read as UTF-8, retry as latin1 on a decode error;
#                 False -> read as latin1 straight away.
CSV_UTF8_FIRST = True

# -------------------------------------------------------------------
# Columns that are parsed as numbers and then passed through the
# negative-value rules. Columns absent from the CSV are skipped.
# -------------------------------------------------------------------
NUMERIC_COLUMNS = [
    "Au", "Au1", "Cu", "Pb", "Zn", "Ag", "As", "Bi",
    "Mo", "Mn", "Fe", "Ni", "Co", "Cr", "V", "Ba",
    "Cd", "Sn", "Sb", "Hg", "Te", "P", "W", "Zr",
    "Ti", "Mg", "Th", "U", "Pt", "Pd", "S",
]

# =========================
# Helpers
# =========================
def normalize_number_string(s: str) -> str:
    """Normalize number-like text so it can be parsed by ``pd.to_numeric``.

    Steps, in order:
      1. ``None`` and float NaN become the empty string.
      2. Non-string values are converted with ``str()``.
      3. Surrounding whitespace and literal ``%`` signs are removed; the
         Unicode minus sign (U+2212) is replaced by an ASCII hyphen.
      4. If the text contains a comma but no dot, the comma is treated as
         the decimal separator (so ``"1,234"`` becomes ``"1.234"``).
      5. Well-formed scientific notation (``"1e5"``, ``"1.5E-3"``) is
         returned unchanged.
      6. Otherwise every character other than digits, ``.`` and ``-`` is
         dropped, and any dots after the first one are removed.

    Args:
        s: The raw cell value (normally a string; ``None``, NaN and other
            scalars are tolerated).

    Returns:
        The cleaned text. It may still be unparsable (e.g. ``""`` or
        ``"1-5"``); callers are expected to coerce such values to NaN.
    """
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return ""
    if not isinstance(s, str):
        s = str(s)
    s = s.strip()
    s = s.replace("%", "").replace("−", "-")  # drop literal %; normalize minus
    if "," in s and "." not in s:              # comma as decimal
        s = s.replace(",", ".")
    # Keep scientific notation intact. The character filter below would strip
    # the exponent marker, silently turning "1e5" into "15" and "1.5E-3" into
    # the unparsable "1.5-3".
    if re.fullmatch(r"[+-]?(?:\d+\.?\d*|\.\d+)[eE][+-]?\d+", s):
        return s
    s = re.sub(r"[^0-9.\-]+", "", s)           # keep digits, ., -
    if s.count(".") > 1:                       # collapse multiple dots
        first = s.find(".")
        s = s[:first+1] + s[first+1:].replace(".", "")
    return s

def to_numeric(series: pd.Series) -> pd.Series:
    """Parse a column of raw text into numbers.

    Each element is first cleaned with ``normalize_number_string``; the
    cleaned text is then converted with ``pd.to_numeric``, and anything
    that still cannot be parsed becomes NaN.
    """
    cleaned = series.map(normalize_number_string)
    return pd.to_numeric(cleaned, errors="coerce")

def apply_negative_rules_to_series(s: pd.Series) -> pd.Series:
    """
    Replace negative values according to two rules.

      - values below -99 are replaced by NaN
      - negatives from -99 up to (but excluding) 0 are replaced by half
        of their absolute value, e.g. -50 → 25

    The input is coerced to numeric first, so unparsable entries become NaN.
    Non-negative values are returned unchanged.
    """
    values = pd.to_numeric(s, errors="coerce")

    # Rule 1: blank out everything strictly below -99.
    below_cutoff = values < -99
    values = values.mask(below_cutoff, np.nan)

    # Rule 2: flip the remaining negatives to positive and halve them.
    remaining_negative = (values < 0) & (values >= -99)
    half_magnitude = values.abs() / 2
    return values.mask(remaining_negative, half_magnitude)

# =========================
# Read the input CSV (UTF-8 first, latin1 as fallback)
# =========================
# Forward `sep` to pandas only when a delimiter was configured explicitly.
read_kwargs = {} if CSV_SEP is None else {"sep": CSV_SEP}

try:
    if CSV_UTF8_FIRST:
        # No explicit encoding: pandas decodes the file as UTF-8 by default.
        df = pd.read_csv(INPUT_FILE, dtype=str, **read_kwargs)
    else:
        df = pd.read_csv(INPUT_FILE, dtype=str, encoding="latin1", **read_kwargs)
except UnicodeDecodeError:
    # The file is not valid UTF-8; read it again as latin1.
    df = pd.read_csv(INPUT_FILE, dtype=str, encoding="latin1", **read_kwargs)

print(f"[Info] Loaded {os.path.basename(INPUT_FILE)} → {df.shape}")

# =========================
# Convert the configured columns to numbers and apply the negative rules
# =========================
missing_cols = []
for col in NUMERIC_COLUMNS:
    if col not in df.columns:
        # Not present in this file: remember it for the warning below.
        missing_cols.append(col)
        continue
    df[col] = apply_negative_rules_to_series(to_numeric(df[col]))

if missing_cols:
    print(f"[Warn] Missing columns (skipped): {missing_cols}")

# =========================
# Write the cleaned table (row index is not written)
# =========================
df.to_csv(OUTPUT_FILE, index=False)
print(f"[Saved] {OUTPUT_FILE}")


[Info] Loaded NWQLD_Geochem_Soils.csv → (220699, 104)
[Saved] C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NWQLD\NWQLD_Geochem_Soils_cleaned.csv
