In [2]:
# JUPYTER NOTEBOOK CELL — Merge Drilling / Rocks / Soils / (Sediments if present) across two locations

import os
import pandas as pd

# =========================
# Paths
# =========================
# Input folders, one per location. The FILES mapping below joins the
# per-dataset CSV file names onto these two folders.
DATA1_FOLDER = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NWQLD"
DATA2_FOLDER = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NW-East_Isa"

# Output folder: the merge loop writes the merged CSVs and schema reports here.
OUT_FOLDER   = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NW-East-West_Isa"
# Create the output folder up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(OUT_FOLDER, exist_ok=True)

# Per dataset type: (file name in DATA1_FOLDER, file name in DATA2_FOLDER).
_FILE_NAMES = {
    "Drilling": (
        "geochem_cleaned.csv",
        "NW-East_Isa_Geochem_Drilling_Samples_cleaned.csv",
    ),
    "Rocks": (
        "NWQLD_Geochem_Rocks_cleaned.csv",
        "NW-East_Isa_Geochem_rocks_cleaned.csv",
    ),
    "Soils": (
        "NWQLD_Geochem_Soils_cleaned.csv",
        "NW-East_Isa_Geochem_Soils_cleaned.csv",
    ),
    # Sediments are optional — a missing file is tolerated by the merge loop
    "Sediments": (
        "NWQLD_Geochem_Seds_cleaned.csv",
        "NW-East_Isa_Geochem_Seds_cleaned.csv",
    ),
}

# Dataset type -> (full path in location 1, full path in location 2).
FILES = {
    label: (
        os.path.join(DATA1_FOLDER, first_name),
        os.path.join(DATA2_FOLDER, second_name),
    )
    for label, (first_name, second_name) in _FILE_NAMES.items()
}

# =========================
# Helpers
# =========================
def read_csv_safe(path: str) -> pd.DataFrame | None:
    """Read CSV as strings with low_memory=False; return None if not found."""
    if not os.path.exists(path):
        print(f"[Skip] File not found: {path}")
        return None
    try:
        return pd.read_csv(path, dtype=str, low_memory=False)
    except UnicodeDecodeError:
        # Windows-1252 fallback
        return pd.read_csv(path, dtype=str, low_memory=False, encoding="latin1")

def trim_headers(df: pd.DataFrame) -> pd.DataFrame:
    """Strip whitespace from headers and make duplicate names unique.

    String column labels are stripped of leading/trailing whitespace;
    non-string labels are left as they are. If stripping yields repeated
    names, the first occurrence keeps its name and each later occurrence is
    renamed to ``<name>_<n>`` with ``n`` starting at 2. A suffix is skipped
    when the resulting name is already used by another column, so the final
    labels are always unique. No column is dropped.

    The frame is modified in place and also returned.
    """
    stripped = [c.strip() if isinstance(c, str) else c for c in df.columns]

    if len(set(stripped)) != len(stripped):
        # Every name already present is off-limits for generated suffixes;
        # otherwise ["a", "a", "a_2"] would turn into ["a", "a_2", "a_2"].
        reserved = set(stripped)
        counts = {}
        unique = []
        for name in stripped:
            if name not in counts:
                counts[name] = 1
                unique.append(name)
                continue
            # Advance the counter until the suffixed name is free.
            while True:
                counts[name] += 1
                candidate = f"{name}_{counts[name]}"
                if candidate not in reserved:
                    break
            reserved.add(candidate)
            unique.append(candidate)
        stripped = unique

    df.columns = stripped
    return df

def schema_report(df1: pd.DataFrame, df2: pd.DataFrame, name1: str, name2: str) -> pd.DataFrame:
    """Build a column presence table for two frames.

    The result has one row per column name found in either frame (sorted),
    with a ``column`` field plus boolean ``in_<name1>`` / ``in_<name2>``
    fields telling which frame carries that column.
    """
    first_cols = set(df1.columns)
    second_cols = set(df2.columns)
    ordered = sorted(first_cols | second_cols)

    presence = {
        "column": ordered,
        f"in_{name1}": [label in first_cols for label in ordered],
        f"in_{name2}": [label in second_cols for label in ordered],
    }
    return pd.DataFrame(presence)

def align_and_concat(df1: pd.DataFrame, df2: pd.DataFrame, source1: str, source2: str) -> pd.DataFrame:
    """Stack two frames vertically over the sorted union of their columns.

    Columns missing from one frame are filled with NaN for its rows. Each
    frame's rows are tagged in a ``Source`` column with ``source1`` or
    ``source2`` respectively, and the result gets a fresh 0..n-1 index.
    """
    union_cols = sorted(set(df1.columns).union(df2.columns))
    tagged = [
        frame.reindex(columns=union_cols).assign(Source=label)
        for frame, label in ((df1, source1), (df2, source2))
    ]
    return pd.concat(tagged, axis=0, ignore_index=True)

# =========================
# Merge loop
# =========================
for dataset_name, (path1, path2) in FILES.items():
    print(f"\n=== {dataset_name} ===")
    # Location 1 is read first so any "[Skip]" notices keep their order.
    df1, df2 = read_csv_safe(path1), read_csv_safe(path2)

    have_first = df1 is not None
    have_second = df2 is not None

    # Nothing to do when neither location has this dataset.
    if not (have_first or have_second):
        print(f"[Skip] No files present for {dataset_name}.")
        continue

    # Both outputs for this dataset share the same naming scheme.
    out_csv = os.path.join(OUT_FOLDER, f"NW-East-West_Isa_{dataset_name}_merged.csv")
    rep_path = os.path.join(OUT_FOLDER, f"NW-East-West_Isa_{dataset_name}_schema_report.csv")

    if have_first and have_second:
        # Both present: clean headers, report schema differences, then merge.
        df1 = trim_headers(df1)
        df2 = trim_headers(df2)

        report = schema_report(df1, df2, "DataSet1", "DataSet2")
        report.to_csv(rep_path, index=False)
        print(f"[Info] Schema report saved → {rep_path}")

        merged = align_and_concat(df1, df2, "DataSet1", "DataSet2")
        merged.to_csv(out_csv, index=False)
        print(f"[Saved] Merged {dataset_name} → {out_csv}")
    elif have_second:
        # Only location 2 exists: write it out as-is (headers trimmed) and
        # report its schema against an empty frame.
        df2 = trim_headers(df2)
        df2.to_csv(out_csv, index=False)
        print(f"[Saved] Only Data set 2 available → {out_csv}")
        rep = schema_report(df2, pd.DataFrame(), "DataSet2", "DataSet1_EMPTY")
        rep.to_csv(rep_path, index=False)
    else:
        # Only location 1 exists: same handling, mirrored.
        df1 = trim_headers(df1)
        df1.to_csv(out_csv, index=False)
        print(f"[Saved] Only Data set 1 available → {out_csv}")
        rep = schema_report(df1, pd.DataFrame(), "DataSet1", "DataSet2_EMPTY")
        rep.to_csv(rep_path, index=False)

print("\n[Done] Merging completed. Check the schema reports for any column mismatches before downstream use.")



=== Drilling ===
[Info] Schema report saved → C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NW-East-West_Isa\NW-East-West_Isa_Drilling_schema_report.csv
[Saved] Merged Drilling → C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NW-East-West_Isa\NW-East-West_Isa_Drilling_merged.csv

=== Rocks ===
[Info] Schema report saved → C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NW-East-West_Isa\NW-East-West_Isa_Rocks_schema_report.csv
[Saved] Merged Rocks → C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NW-East-West_Isa\NW-East-West_Isa_Rocks_merged.csv

=== Soils ===
[Info] Schema report saved → C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\05_Excel\NW-East-West_Isa\NW-East-West_Isa_Soils_schema_report.csv
[Saved] Merged Soils → C:\Users\Julian.Diaz\OneDrive - XENITH CONSULT