In [1]:
import os
import datetime as dt
import logging
import waterdata_utils as wdu
import duckdb
import pyarrow.parquet as pq


In [2]:
usgs_sites = [
    "09152500",   # Gunnison River Near Grand Junction, CO
    "09095500",   # Colorado River Near Cameo, CO
    "09106150",   # Colorado River Below Grand Valley Div NR Palisade, CO
    "09163500",   # Colorado River Near Colorado-utah State Line
    "09306500",   # White River Near Watson, Utah
    "09251000",   # Yampa River Near Maybell, CO
    "09260050",   # Yampa River at Deerlodge Park, CO
    "09260000",   # Little Snake River Near Lily, CO
    "09261000",   # Green River Near Jensen, UT
#    "09315000",   # Green River at Green River, UT
    "09302000",   # Duchesne River Near Randlett, UT
    "09180000",   # Dolores River Near Cisco, UT
]

In [16]:
# === CONFIGURATION ===
sites = usgs_sites
parameter_codes = ['00060', '00010']
start_date = '2022-01-01'
end_date = '2025-03-01'
service_code = 'iv'
output_root = '/Volumes/T7_raw_I/waterdata_lake'
# ======================


# Configure logging ------------------------------------------------
os.makedirs('logs', exist_ok=True)
log_name = 'logs/' + dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'
logging.basicConfig(filename=log_name,
                    level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
# -------------------------------------------------------------------

In [5]:
# Itterate through each site main function
for site in sites:
    print(f"Processing site: {site}")
    all_data = []

    # Itterate through each parameter code
    for pcode in parameter_codes:
        
        # Fetch data for the current site and parameter code from NWIS
        df_raw = wdu.fetch_data(site, pcode, start_date, end_date, service_code)
        if df_raw is None:
            continue

        # Clean and transform the data, standardizing column names and types
        df_transformed = wdu.transform_data(df_raw, site, pcode)
        if df_transformed.empty:
            continue

        # Write the data to a parquet file
        wdu.write_to_datalake(df_transformed, site, output_root)

Processing site: 09152500
Processing site: 09095500
Processing site: 09106150
Processing site: 09163500
Processing site: 09306500
Processing site: 09251000
Processing site: 09260050
Processing site: 09260000
Processing site: 09261000
Processing site: 09302000
Processing site: 09180000


In [30]:
query = ("""
SELECT *
FROM read_parquet('/Volumes/T7_raw_I/waterdata_lake/timeseries_iv/**/*.parquet')
LIMIT 10;
"""
)

query = (
    "SELECT * "
    f"FROM '{output_root}/timeseries_vi/' "
    "WHERE site = '09180000' AND year = 2023"
)
query = (
    "SELECT date_trunc('day', datetime) AS day, "
    "AVG(value) AS avg_flow "
    "FROM read_parquet('/Volumes/T7_raw_I/waterdata_lake/timeseries_iv/**/*.parquet') "
    "WHERE site = '09180000' "
    "AND parameter = '00060' "  # Replace with actual parameter code if needed"
    "GROUP BY day "
    "ORDER BY day;"
)

print(query)

SELECT date_trunc('day', datetime) AS day, AVG(value) AS avg_flow FROM read_parquet('/Volumes/T7_raw_I/waterdata_lake/timeseries_iv/**/*.parquet') WHERE site = '09180000' AND parameter = '00060' GROUP BY day ORDER BY day;


In [31]:




# Connect to DuckDB and execute the query
with duckdb.connect() as con:
    result = con.execute(query).fetchdf()
# Print the result
print(result)

Empty DataFrame
Columns: [day, avg_flow]
Index: []


In [25]:
import duckdb

con = duckdb.connect()

df = con.execute("""
    SELECT *
    FROM read_parquet('/volumes/T7_raw_I/waterdata_lake/timeseries_iv/')
    WHERE site = '09180000' AND year = 2023
""").fetchdf()

IOException: IO Error: No files found that match the pattern "/volumes/T7_raw_I/waterdata_lake/timeseries_iv/"

In [10]:
df = duckdb.read_parquet(
    f"{output_root}/timeseries_iv/site=09180000/year=2023/data.parquet"
)

In [11]:
print(df)

┌──────────┬─────────────────────┬───────────┬────────┬─────────────────┬───────┐
│   site   │      datetime       │ parameter │ value  │ approval_status │ year  │
│ varchar  │    timestamp_ns     │  varchar  │ double │     varchar     │ int64 │
├──────────┼─────────────────────┼───────────┼────────┼─────────────────┼───────┤
│ 09180000 │ 2023-01-01 00:00:00 │ 00010     │    2.8 │ A               │  2023 │
│ 09180000 │ 2023-01-01 00:15:00 │ 00010     │    2.8 │ A               │  2023 │
│ 09180000 │ 2023-01-01 00:30:00 │ 00010     │    2.8 │ A               │  2023 │
│ 09180000 │ 2023-01-01 00:45:00 │ 00010     │    2.8 │ A               │  2023 │
│ 09180000 │ 2023-01-01 01:00:00 │ 00010     │    2.8 │ A               │  2023 │
│ 09180000 │ 2023-01-01 01:15:00 │ 00010     │    2.8 │ A               │  2023 │
│ 09180000 │ 2023-01-01 01:30:00 │ 00010     │    2.8 │ A               │  2023 │
│ 09180000 │ 2023-01-01 01:45:00 │ 00010     │    2.8 │ A               │  2023 │
│ 09180000 │ 202

In [None]:
table = pq.read_table('data/data_lake/timeseries_iv/site=09152500/year=2022/data.parquet')
df = table.to_pandas()