In [1]:
import io
import pandas as pd
from pathlib import Path

# ◼ Adjust this to point at your metadata CSV folder:
DATA_FOLDER = Path(
    r"C:\Users\danielg\PycharmProjects\Taccari_et_al\GroundwaterFlowGNN-main\data\input\piezometers\csv\csv"
)

# Columns of the final table (besides the "file" index). Declared up front so
# the table keeps a stable schema even when no file yields a record.
METADATA_FIELDS = [
    "locatie",
    "external_id",
    "x_coord",
    "y_coord",
    "ground_elev",
    "top_filter",
    "bottom_filter",
]


def parse_metadata_block(lines):
    """Extract the requested fields from the first metadata block of a file.

    The metadata block starts at the first line that begins with ``LOCATIE``
    and contains ``X-COORDINAAT``. It ends at the next blank line, at the next
    line that begins with ``LOCATIE`` and contains ``PEIL DATUM``, or at the
    end of the file, whichever comes first.

    Parameters
    ----------
    lines : list of str
        The file content, already split into lines.

    Returns
    -------
    dict or None
        A dict keyed by ``METADATA_FIELDS`` built from the first data row of
        the block (identifiers as read, numeric fields converted with
        ``float``). ``None`` when the file has no metadata header or the
        block holds a header but no data row.

    Raises
    ------
    KeyError
        If the block lacks one of the expected columns.
    ValueError
        If a numeric field cannot be converted to ``float``.
    """
    # 1) find the metadata header (first block)
    start = next(
        (
            idx
            for idx, line in enumerate(lines)
            if line.startswith("LOCATIE") and "X-COORDINAAT" in line
        ),
        None,
    )
    if start is None:
        # no metadata header in this file
        return None

    # 2) find end of that block (blank line or next header)
    end = len(lines)
    for idx in range(start + 1, len(lines)):
        line = lines[idx]
        if not line.strip() or (
            line.startswith("LOCATIE") and "PEIL DATUM" in line
        ):
            end = idx
            break

    # 3) read only that slice into a DataFrame; dtype=str keeps identifiers
    #    exactly as written (no numeric coercion of codes)
    block = io.StringIO("\n".join(lines[start:end]))
    dfm = pd.read_csv(block, sep=",", engine="python", header=0, dtype=str)
    if dfm.empty:
        # header present but no data row: nothing to extract
        return None

    # 4) normalize columns and grab the first row
    dfm.columns = dfm.columns.str.strip().str.upper()
    row = dfm.iloc[0]

    # 5) keep only the requested fields
    return {
        "locatie"       : row["LOCATIE"],
        "external_id"   : row["EXTERNE AANDUIDING"],
        "x_coord"       : float(row["X-COORDINAAT"]),
        "y_coord"       : float(row["Y-COORDINAAT"]),
        "ground_elev"   : float(row["MAAIVELD NAP"]),
        "top_filter"    : float(row["BOVENKANT FILTER"]),
        "bottom_filter" : float(row["ONDERKANT FILTER"]),
    }


records = []

# sorted(): glob order depends on the filesystem; sorting makes the row order
# of the resulting table reproducible across runs and machines.
for f in sorted(DATA_FOLDER.glob("*.csv")):
    # utf-8-sig reads plain UTF-8 as well and drops a leading BOM, which
    # would otherwise defeat the startswith("LOCATIE") header detection.
    lines = f.read_text(encoding="utf-8-sig").splitlines()

    fields = parse_metadata_block(lines)
    if fields is None:
        # no usable metadata block in this file
        continue

    records.append({"file": f.name, **fields})

# 6) build the final metadata table; explicit columns keep set_index("file")
#    working even when no record was collected
meta_df = pd.DataFrame(records, columns=["file", *METADATA_FIELDS]).set_index("file")

print(meta_df)


ERROR! Session/line number was not unique in database. History logging moved to new session 135
                                           locatie   external_id    x_coord  \
file                                                                          
B39E0275001_1028_39EP0033001.csv          B39E0275   39EP0033001  169470.00   
B39E0275002_1028_39EP0033002.csv          B39E0275   39EP0033002  169470.00   
B39E2783001_1036_39E-2783001.csv          B39E2783   39E-2783001  168961.01   
B39E2784001_1036_39E-2784001.csv          B39E2784   39E-2784001  168261.55   
B39E2786001_1036_39E-2786001.csv          B39E2786   39E-2786001  168915.19   
...                                            ...           ...        ...   
HEM-P00-31002_1018_HEM-P00-31-2.csv  HEM-P00-31002  HEM-P00-31-2  175967.00   
HEM-P00-31003_1018_HEM-P00-31-3.csv  HEM-P00-31003  HEM-P00-31-3  175967.00   
P39F0006006_1033_39F-0006001.csv          P39F0006   39F-0006001  176220.00   
P40B0010001_1029_40B-0010001.csv   

In [2]:
# Persist the metadata table (the "file" index is written as the first column)
# so later preprocessing steps can load it without re-parsing the raw CSVs.
well_metadata_csv = "C:\\Users\\danielg\\PycharmProjects\\Taccari_et_al\\GroundwaterFlowGNN-main\\data\\preprocessed\\well_metadata.csv"
meta_df.to_csv(well_metadata_csv)