In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Raise the pandas display limit to 200 columns for DataFrame output in this notebook
pd.set_option("display.max_columns", 200)

In [None]:
# Locate the raw wells CSV relative to the current working directory
ROOT = Path("..")
WELLS_PATH = ROOT.joinpath("data", "raw", "wells.csv")

# Show whether the file is present, alongside the path that was checked
WELLS_PATH.exists(), WELLS_PATH

In [None]:
# Read the wells table, then report its dimensions and column names
wells = pd.read_csv(WELLS_PATH)
wells.shape, list(wells.columns)

In [None]:
# Optional down-sampling so the pipeline can be developed on a small subset.
# Leave USE_SAMPLE = False to process every row.
USE_SAMPLE = False
SAMPLE_N = 20

if USE_SAMPLE:
    # Cap the request at the table size: sampling without replacement raises
    # when asked for more rows than exist (small file, or this cell re-run
    # after the table was already reduced).
    wells = wells.sample(min(SAMPLE_N, len(wells)), random_state=42).copy()

# Kept at the top level on purpose: an expression nested inside the `if`
# is never rendered as cell output.
wells.shape

In [None]:
# take a look at the data: preview the first 10 rows
wells.head(10)

In [None]:
# check for missingness and class balance
# Only the final expression of a cell is displayed, so both summaries are
# named and returned together; otherwise the missingness table is computed
# and silently thrown away.
missing_share = wells.isna().mean().sort_values(ascending=False).head(20)  # 20 columns with the highest share of missing values
arsenic_cat_counts = wells["ArsenicCat"].value_counts(dropna=False)  # counts per category, missing included
missing_share, arsenic_cat_counts



In [None]:
# Schema check: list every required column that is absent from the loaded
# table (an empty list means none are missing).
# NOTE(review): spellings such as "ActcualDepth" and "GelogicUnit" presumably
# mirror the CSV headers - confirm against the file before "correcting" them.
REQUIRED = [
    "DataID",
    "xCoord",
    "yCoord",
    "pH",
    "ActcualDepth",
    "Depth_IDW",
    "DepthType",
    "GelogicUnit",
    "ArsenicCat",
    "ArsenicCat2",
    "ArsenicOld",
    "WellDepth",
]
present_columns = set(wells.columns)
[name for name in REQUIRED if name not in present_columns]


In [None]:
# pH cleaning: coerce to numeric, then keep only values in the interval (0, 14];
# zero, negative and above-14 readings become missing.
ph_numeric = pd.to_numeric(wells["pH"], errors="coerce")
ph_in_range = (ph_numeric > 0) & (ph_numeric <= 14)
wells["pH"] = ph_numeric.where(ph_in_range)
wells["pH"].describe()


In [None]:
# Depth feature: prefer the actual measured depth, fall back to the depth
# interpolated by inverse distance weighting (IDW), and finally to WellDepth.
for depth_col in ("ActcualDepth", "Depth_IDW", "WellDepth"):
    wells[depth_col] = pd.to_numeric(wells[depth_col], errors="coerce")

wells["depth_value"] = (
    wells["ActcualDepth"]
    .fillna(wells["Depth_IDW"])
    .fillna(wells["WellDepth"])
)

wells["depth_value"].describe()


In [None]:
# create a value to help with exploratory plots
def parse_arsenic_value(v):
    """Convert a raw arsenic entry to a float, or NaN when it cannot be parsed.

    Surrounding whitespace and "," separators are removed. A leading "<" is
    stripped and the number that follows is returned unchanged, so "<5"
    becomes 5.0.

    Parameters
    ----------
    v : scalar
        Raw entry. Missing values (as judged by ``pd.isna``) give NaN.

    Returns
    -------
    float
        The parsed number, or NaN for missing / non-numeric entries.
    """
    if pd.isna(v):
        return np.nan
    s = str(v).strip().replace(",", "")
    if s.startswith("<"):
        s = s[1:].strip()
    try:
        return float(s)
    except ValueError:
        # Non-numeric text. Catching only ValueError (instead of a bare
        # `except`) lets KeyboardInterrupt and real bugs propagate.
        return np.nan
    
# Parse every raw ArsenicOld entry into a numeric column, then preview raw vs parsed
arsenic_raw = wells["ArsenicOld"]
wells["arsenic_value"] = arsenic_raw.apply(parse_arsenic_value)
wells.loc[:, ["ArsenicOld", "arsenic_value"]].head(10)

In [None]:
# Create the target used in modeling: ArsenicCat coerced to numeric
# (unparseable entries become missing) and stored as a nullable integer.
arsenic_cat_numeric = pd.to_numeric(wells["ArsenicCat"], errors="coerce")
wells["y"] = arsenic_cat_numeric.astype("Int64")
wells["y"].value_counts(dropna=False)


In [None]:
# rule for dealing with dups - for each location keep the highest arsenic measurement
wells["xCoord"] = pd.to_numeric(wells["xCoord"], errors="coerce")
wells["yCoord"] = pd.to_numeric(wells["yCoord"], errors="coerce")

# Location key: both coordinates rounded to 6 decimals and joined as text
wells["loc_key"] = wells["xCoord"].round(6).astype(str) + "_" + wells["yCoord"].round(6).astype(str)

# Order rows so the one to keep comes first within each location: highest
# arsenic value first (missing treated as -1, i.e. lowest), ties broken by the
# larger target. The helper columns exist only on this temporary frame, so no
# scratch column is left behind on `wells`.
ranked = (
    wells.assign(
        _ars_sort=wells["arsenic_value"].fillna(-1),
        _has_loc=wells["xCoord"].notna() & wells["yCoord"].notna(),
    )
    .sort_values(["loc_key", "_ars_sort", "y"], ascending=[True, False, False])
)

# Rows with a missing coordinate share a "nan" key, but they are not known to
# be the same place - never discard them as duplicates of each other.
is_repeat = ranked.duplicated("loc_key", keep="first") & ranked["_has_loc"]
wells_dedup = ranked.loc[~is_repeat].drop(columns=["_ars_sort", "_has_loc"])

# Shapes before and after de-duplication
wells.shape, wells_dedup.shape


In [None]:
# Build the minimal modeling table: pick the needed columns from the
# de-duplicated wells and give them modeling-friendly names.
column_renames = {
    "DataID": "well_id",
    "xCoord": "longitude",
    "yCoord": "latitude",
    "pH": "ph",
    "depth_value": "depth",
    "DepthType": "depth_type",
    "GelogicUnit": "geology_unit",
    "ArsenicCat2": "EPA_threshold",
}
# Renamed columns first, then the two that keep their names
selected_columns = [*column_renames, "arsenic_value", "y"]

model_df = wells_dedup[selected_columns].rename(columns=column_renames)

model_df.head(10)


In [None]:
# class balance of the target across all rows of `wells` (before de-duplication),
# with missing targets kept in the count rather than dropped
wells["y"].value_counts(dropna=False)

In [None]:
# final data clean up - drop rows missing any of the core predictor/target columns
core = ["well_id", "longitude", "latitude", "ph", "depth", "geology_unit", "EPA_threshold", "y"]
before = len(model_df)
model_df_clean = model_df.dropna(subset=core).copy()
after = len(model_df_clean)

# Row counts before/after the drop, plus the class balance of the cleaned table.
# (A mid-cell value_counts() call was removed: its result was never displayed.)
(before, after), model_df_clean["y"].value_counts(dropna=False)


In [None]:
# Write the cleaned modeling table to data/processed, creating the folder if needed
out_dir = ROOT.joinpath("data", "processed")
out_dir.mkdir(parents=True, exist_ok=True)

out_path = out_dir.joinpath("model_table.csv")
model_df_clean.to_csv(out_path, index=False)

# Echo the destination path as the cell output
out_path