In [6]:
import os
import datetime as dt
import logging
from utils.fetch_data import fetch_nwis_data
from utils.execute_sql_script import execute_sql_script
from utils.transform_data import transform_nwis_iv_data, transform_nwis_site_data
from utils.write_to_datalake import write_to_datalake
import duckdb
import pyarrow.parquet as pq
from pathlib import Path



In [2]:
# USGS gaging stations to process, keyed by site number with the station
# description as the value. Dict insertion order is preserved, so the derived
# list keeps the same ordering as the table below.
usgs_site_names = {
    "09152500": "Gunnison River Near Grand Junction, CO",
    "09095500": "Colorado River Near Cameo, CO",
    "09106150": "Colorado River Below Grand Valley Div NR Palisade, CO",
    "09163500": "Colorado River Near Colorado-Utah State Line",
    "09306500": "White River Near Watson, Utah",
    "09251000": "Yampa River Near Maybell, CO",
    "09260050": "Yampa River at Deerlodge Park, CO",
    "09260000": "Little Snake River Near Lily, CO",
    "09261000": "Green River Near Jensen, UT",
    "09315000": "Green River at Green River, UT",
    "09302000": "Duchesne River Near Randlett, UT",
    "09180000": "Dolores River Near Cisco, UT",
}

# Site numbers only, in the order listed above.
usgs_sites = list(usgs_site_names)

In [3]:
# === CONFIGURATION ===
# NWIS request settings.
sites = usgs_sites
# presumably discharge and water temperature — verify against the USGS parameter code list
parameter_codes = ['00060', '00010']
start_date = '2022-01-01'
# The request window ends yesterday (formatted YYYY-MM-DD).
yesterday = dt.date.today() - dt.timedelta(days=1)
end_date = yesterday.isoformat()
service_code = 'iv'

# Paths are anchored on the parent of the directory the notebook runs from.
notebook_dir = Path.cwd()
project_root = notebook_dir.resolve().parents[0]
data_dir = project_root / 'data'
datalake_path = data_dir / 'hydrology_datalake'
db_path = data_dir / 'hydrology.duckdb'
# Earlier path layouts, kept for reference:
# output_root = Path(__file__).resolve().parents[2] / 'hydrology_datalake' # ok for main.py
# output_root = '/Volumes/T7_raw_I/waterdata_lake'
# datalake_path = output_root / 'timeseries_iv'
# ======================


# Configure logging ------------------------------------------------
# Log at INFO level to a timestamped file under ./logs (relative to the
# current working directory); the directory is created if it is missing.
os.makedirs('logs', exist_ok=True)
run_stamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
log_name = f'logs/{run_stamp}.log'
logging.basicConfig(
    filename=log_name,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
# -------------------------------------------------------------------

In [None]:
# Join every configured site code into one comma-and-space separated string.
# NOTE(review): no later cell visible here reads `sites_str`; the site-metadata
# fetch below rebuilds the same string inline.
sites_str = ', '.join(sites)

In [4]:

# Request site-service metadata for all configured sites in a single call.
site_list = ', '.join(sites)
sites_df = fetch_nwis_data(service_code='site', site=site_list)

# Show which metadata columns came back.
sites_df.columns

Index(['agency_cd', 'site_no', 'station_nm', 'site_tp_cd', 'lat_va', 'long_va',
       'dec_lat_va', 'dec_long_va', 'coord_meth_cd', 'coord_acy_cd',
       'coord_datum_cd', 'dec_coord_datum_cd', 'district_cd', 'state_cd',
       'county_cd', 'country_cd', 'land_net_ds', 'map_nm', 'map_scale_fc',
       'alt_va', 'alt_meth_cd', 'alt_acy_va', 'alt_datum_cd', 'huc_cd',
       'basin_cd', 'topo_cd', 'instruments_cd', 'construction_dt',
       'inventory_dt', 'drain_area_va', 'contrib_drain_area_va', 'tz_cd',
       'local_time_fg', 'reliability_cd', 'gw_file_cd', 'nat_aqfr_cd',
       'aqfr_cd', 'aqfr_type_cd', 'well_depth_va', 'hole_depth_va',
       'depth_src_cd', 'project_no'],
      dtype='object')

In [5]:
# Run the raw NWIS site metadata through the project's site transform helper.
# Later cells read site_code, site_name, agency_code, latitude, longitude,
# site_type and hydro_area_name from the result.
sites_cleaned = transform_nwis_site_data(sites_df)

In [None]:
# Narrow the run to the first configured site only; every later cell that
# loops over `sites` will therefore process a single station.
first_site = sites[0]
sites = [first_site]

# Report where the data lake lives and which directory the notebook runs from.
print(f'Output root: {datalake_path}')
print(notebook_dir)

In [15]:
# Run the schema build script first, then the view definitions, both against
# the on-disk DuckDB file. `ck` ends up holding the result of the last script.
sql_dir = project_root / 'src' / 'sql'
for script_name in ('build_hydrology_duckdb.sql', 'hydrology_datalake_views.sql'):
    ck = execute_sql_script(sql_dir / script_name, duckdb_path=db_path)

In [11]:
# Load the cleaned site metadata into the `site` table of the on-disk database.
# NOTE(review): this is a plain INSERT, so re-running the cell will attempt to
# insert the same rows again — confirm how the `site` table handles duplicates.
with duckdb.connect(str(db_path)) as con:
    # Expose the DataFrame to SQL under the name `sites_cleaned` for this connection.
    con.register('sites_cleaned', sites_cleaned)
    # Copy the seven listed columns across by name; any other columns of
    # `site` are left to whatever defaults the table defines.
    con.execute("""
        INSERT INTO site (
            site_code,
            site_name,
            agency_code,
            latitude,
            longitude,
            site_type,
            hydro_area_name
        )
        SELECT
            site_code,
            site_name,
            agency_code,
            latitude,
            longitude,
            site_type,
            hydro_area_name
        FROM sites_cleaned
    """)

In [13]:
# Read back from the persisted `site` table to confirm the insert reached the
# database file. Querying `sites_cleaned` here would not do that: a fresh
# connection carries no registered views, so DuckDB resolves that name to the
# in-memory pandas DataFrame and never touches the database.
with duckdb.connect(str(db_path)) as con:
    df = con.execute("""
            SELECT *
            FROM site
        """).fetch_df()

# Preview only the first rows.
print(df.head())

  site_code                                          site_name agency_code  \
0  09095500                     COLORADO RIVER NEAR CAMEO, CO.        USGS   
1  09106150  COLO RIVER BELOW GRAND VALLEY DIV NR PALISADE, CO        USGS   
2  09152500            GUNNISON RIVER NEAR GRAND JUNCTION, CO.        USGS   
3  09163500       COLORADO RIVER NEAR COLORADO-UTAH STATE LINE        USGS   
4  09180000                       DOLORES RIVER NEAR CISCO, UT        USGS   

    latitude   longitude    site_type  hydro_area_name  
0  39.239146 -108.266195  stream gage             <NA>  
1  39.098592 -108.355086  stream gage             <NA>  
2  38.983316 -108.450645  stream gage             <NA>  
3  39.132760 -109.027055  stream gage             <NA>  
4  38.797208 -109.195114  stream gage             <NA>  


In [16]:
# Pull the first 10 rows of the `vw_nwis_iv_local` view from the on-disk
# database into `df`. The frame is only assigned here, not displayed.
with duckdb.connect(str(db_path)) as con:
    df = con.execute("""
                     SELECT *
                     FROM vw_nwis_iv_local
                     LIMIT 10
                     """).fetch_df()

In [None]:
# Main ingest loop: for every configured site and parameter code, fetch the
# NWIS series, transform it, and write it to the data lake. Combinations that
# yield no data are skipped, and each skip is recorded in the run log so gaps
# in the lake can be traced afterwards.
for site in sites:
    print(f"Processing site: {site}")

    # Iterate through each parameter code
    for pcode in parameter_codes:

        # Fetch data for the current site and parameter code from NWIS
        df_raw = fetch_nwis_data(site, pcode, start_date, end_date, service_code)
        if df_raw is None:
            logging.warning(
                "No data returned for site %s, parameter %s (%s to %s); skipping",
                site, pcode, start_date, end_date,
            )
            continue

        # Clean and transform the data, standardizing column names and types
        df_transformed = transform_nwis_iv_data(df_raw, site, pcode)
        if df_transformed.empty:
            logging.warning(
                "Transformed data is empty for site %s, parameter %s; skipping",
                site, pcode,
            )
            continue

        # Write the data to a parquet file
        write_to_datalake(df_transformed, site, datalake_path)
        logging.info(
            "Wrote %d rows for site %s, parameter %s",
            len(df_transformed), site, pcode,
        )

In [None]:
# Glob matching every parquet file in the instantaneous-values area of the
# data lake. Built with pathlib and rendered with forward slashes so the SQL
# works on any OS (DuckDB accepts '/' separators on Windows too).
# NOTE(review): assumes files live under <datalake_path>/timeseries_iv —
# confirm against write_to_datalake.
iv_parquet_glob = (datalake_path / 'timeseries_iv' / '**' / '*.parquet').as_posix()

# Preview query: first 10 rows across all partitions.
query = (
    "SELECT * "
    f"FROM read_parquet('{iv_parquet_glob}') "
    "LIMIT 10;"
)

print(query)

In [None]:
# Glob matching every parquet file in the instantaneous-values area of the
# data lake, rendered with forward slashes so the SQL is OS-independent.
# NOTE(review): assumes files live under <datalake_path>/timeseries_iv —
# confirm against write_to_datalake.
iv_parquet_glob = (datalake_path / 'timeseries_iv' / '**' / '*.parquet').as_posix()

# All rows for one site and one year; `site` and `year` are filtered as columns.
query = (
    "SELECT * "
    f"FROM '{iv_parquet_glob}' "
    "WHERE site = '09180000' AND year = 2023"
)
print(query)

In [None]:
# Glob matching every parquet file in the instantaneous-values area of the
# data lake, rendered with forward slashes so the SQL is OS-independent.
# NOTE(review): assumes files live under <datalake_path>/timeseries_iv —
# confirm against write_to_datalake.
iv_parquet_glob = (datalake_path / 'timeseries_iv' / '**' / '*.parquet').as_posix()

# Daily mean per site and parameter for 2024, bucketed on the stored
# `datetime` column.
query = (
    "SELECT "
    "  site, "
    "  date_trunc('day', datetime) AS day, "
    "  parameter, "
    "  AVG(value) AS daily_mean "
    f"FROM read_parquet('{iv_parquet_glob}') "
    # Optional extra filters, currently disabled:
    #  "WHERE parameter = '00060' "
    #  " AND datetime >= '2023-01-01' "
    "WHERE year = 2024 "
    "GROUP BY site, date_trunc('day', datetime), parameter "
    "ORDER BY site, day;"
)


In [None]:
# Glob matching every parquet file in the instantaneous-values area of the
# data lake, rendered with forward slashes so the SQL is OS-independent.
# NOTE(review): assumes files live under <datalake_path>/timeseries_iv —
# confirm against write_to_datalake.
iv_parquet_glob = (datalake_path / 'timeseries_iv' / '**' / '*.parquet').as_posix()

# All 2024 rows plus a `local_time` column: the stored `datetime` is
# interpreted as UTC and then converted to America/Denver.
query = (
    "SELECT "
    "  datetime AT TIME ZONE 'UTC' AT TIME ZONE 'America/Denver' AS local_time, * "
    f"FROM read_parquet('{iv_parquet_glob}') "
    # Optional extra filters, currently disabled:
    #  "WHERE parameter = '00060' "
    #  " AND datetime >= '2023-01-01' "
    "WHERE year = 2024 "
)
print(query)

In [None]:
# Show the current working directory and the data lake root. `Path` comes from
# the notebook's import cell, and `datalake_path` is the configured root
# (`output_root` only exists in commented-out configuration lines).
print(Path.cwd())
print(datalake_path)

In [None]:
# Run whichever SQL string `query` currently holds on a throwaway in-memory
# DuckDB connection (no database file is opened) and load the result into a
# DataFrame. The connection is closed when the `with` block exits.
with duckdb.connect() as con:
    result = con.execute(query).fetchdf()
# Print the full result frame as plain text.
print(result)

In [None]:
# Load the view definitions and apply them to the on-disk database.
# Views created on an in-memory connection disappear as soon as that
# connection closes, so the script has to target `db_path` to have any
# lasting effect.
# NOTE(review): the earlier execute_sql_script(...) call for this same file
# appears to do the same thing against db_path — this cell may be redundant.
views_sql_path = project_root / 'src' / 'sql' / 'hydrology_datalake_views.sql'
with open(views_sql_path, 'r') as f:
    sql_script = f.read()

with duckdb.connect(str(db_path)) as con:
    con.execute(sql_script)


In [None]:
# Daily mean per site and parameter for 2024, bucketed on local time via the
# `vw_nwis_iv_local` view. The ORDER BY must use the `day_local` alias defined
# in the SELECT list; there is no `day` output column in this query.
query = (
    "SELECT "
    "  site, "
    "  date_trunc('day', datetime_local) AS day_local, "
    "  parameter, "
    "  AVG(value) AS daily_mean "
    "FROM vw_nwis_iv_local "
    # Optional extra filters, currently disabled:
    #  "WHERE parameter = '00060' "
    #  " AND datetime >= '2023-01-01' "
    "WHERE year = 2024 "
    "GROUP BY site, date_trunc('day', datetime_local), parameter "
    "ORDER BY site, day_local;"
)


In [None]:
# Run the current `query` against the on-disk database. The preceding query
# reads from `vw_nwis_iv_local`, which is defined in the database file; a
# fresh in-memory connection has no such view and the query would fail there.
with duckdb.connect(str(db_path)) as con:
    result = con.execute(query).fetchdf()

# Print the result
print(result)

In [None]:
# One site-year of instantaneous values, read via SQL from the configured
# data lake instead of a machine-specific absolute path. read_parquet is given
# an explicit recursive glob rather than a bare directory, and the connection
# is scoped with `with` so it is closed when the query finishes.
# NOTE(review): assumes files live under <datalake_path>/timeseries_iv —
# confirm against write_to_datalake.
iv_parquet_glob = (datalake_path / 'timeseries_iv' / '**' / '*.parquet').as_posix()

with duckdb.connect() as con:
    df = con.execute(f"""
        SELECT *
        FROM read_parquet('{iv_parquet_glob}')
        WHERE site = '09180000' AND year = 2023
    """).fetchdf()

In [None]:
# Open a single partition file directly. The path is built from the configured
# `datalake_path` (`output_root` only exists in commented-out configuration
# lines) and rendered with forward slashes.
# Note: duckdb.read_parquet returns a DuckDB relation, not a pandas DataFrame.
site_year_file = (
    datalake_path / 'timeseries_iv' / 'site=09180000' / 'year=2023' / 'data.parquet'
)
df = duckdb.read_parquet(site_year_file.as_posix())

In [None]:
# Print whatever `df` currently holds as plain text (it is reassigned by
# several earlier cells, so the content depends on which one ran last).
print(df)

In [None]:
# Read one site-year partition with pyarrow and convert it to pandas. The path
# is anchored on the configured `datalake_path` rather than a directory
# relative to the current working directory, so it resolves the same way no
# matter where the notebook is launched from.
# NOTE(review): assumes the site=/year= layout under
# <datalake_path>/timeseries_iv — confirm against write_to_datalake.
partition_file = (
    datalake_path / 'timeseries_iv' / 'site=09152500' / 'year=2022' / 'data.parquet'
)
table = pq.read_table(str(partition_file))
df = table.to_pandas()