In [None]:
import pandas as pd
import datetime as dt
import os
import logging
from pathlib import Path
import sys
sys.path.append('../src')
from utils.fetch_data import fetch_nwis_data, fetch_site_parameters, fetch_approval_status
from utils.transform_data import transform_nwis_iv_data
from utils.write_to_datalake import write_to_datalake
import utils.duckdb_utils as db
from collections import namedtuple
# import utils.site_list as sl

In [None]:
# db.run_sql_file(Path('../db/views.sql'))


In [None]:
# Inspect the database: list its tables, then describe vw_nwis_iv_local.
# Each statement runs on its own short-lived connection; the frames are
# printed once the connection has been released.
with db.connect_duckdb() as con:
    tables = con.execute("SHOW TABLES").df()
print(tables)

with db.connect_duckdb() as con:
    result = con.execute("DESCRIBE vw_nwis_iv_local").df()
print(result)

In [None]:
#db.write_meta_tables_to_csv()

In [None]:
# === CONFIGURATION ===
# The settings below are disabled (none of these names are defined by this
# cell); they are kept for reference only.
#
# start_date = '2020-01-01'
# end_date = (dt.date.today() - dt.timedelta(days=1)).strftime('%Y-%m-%d')
# service_code = 'iv'
# notebook_dir = Path.cwd()
# project_root = notebook_dir.resolve().parents[0]
# datalake_path = project_root / 'data' / 'hydrology_datalake'
# db_path = project_root / 'data' / 'hydrology.duckdb'
# # output_root = Path(__file__).resolve().parents[2] / 'hydrology_datalake' # ok for main.py
# # output_root = '/Volumes/T7_raw_I/waterdata_lake'
# # datalake_path = output_root / 'timeseries_iv'
# ======================

# Configure logging ------------------------------------------------
# One timestamped log file per run, written under ./logs (relative to the
# current working directory).
os.makedirs('logs', exist_ok=True)
log_name = f"logs/{dt.datetime.now():%Y-%m-%d_%H-%M-%S}.log"
logging.basicConfig(
    filename=log_name,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
# -------------------------------------------------------------------

In [None]:
# Lightweight record pairing a site's database id with its site code.
SiteInfo = namedtuple('SiteInfo', ['id', 'code'])

# Load every site whose `source` column mentions NWIS (case-insensitive).
with db.connect_duckdb() as con:
    result = con.execute("SELECT site_id, site_cd FROM site WHERE lower(source) LIKE '%nwis%'").fetchall()

# Each row is a (site_id, site_cd) tuple, so it maps straight onto SiteInfo.
sites = list(map(SiteInfo._make, result))
for site in sites:
    print(site.id, site.code)

In [None]:
# Preview the (id, code) pairs held in `sites`.
# The id is bound to `site_id` rather than `id` so the loop does not shadow the
# built-in id() for the rest of the kernel session.
for site_id, code in sites:
    print(f"Processing site: {code} (ID: {site_id})")

In [None]:
# Spot-check fetch_approval_status against one hard-coded site code. The
# result is only stored; the loop that would print it is disabled below.
approval_check = fetch_approval_status(site_cd="09095500")
#if approval_check is not None:
#    for parameter_code, max_approval_date in approval_check:
#        print(f"Site code: {code}, Parameter code: {parameter_code}, Max approval date: {max_approval_date}")




In [None]:
# Print the (parameter code, max approval date) pairs returned for each site.
# `site_id` is used instead of `id` so the built-in id() is not shadowed.
for site_id, code in sites:
    logging.info(f"Processing site: {code}, ID: {site_id}")

    # Look up the approval status for the site; skip the site on None.
    approval_status = fetch_approval_status(site_cd=code)
    if approval_status is None:
        continue

    for parameter_code, max_approval_date in approval_status:
        print(f"Site code: {code}, Parameter code: {parameter_code}, Max approval date: {max_approval_date}")


In [None]:
# NOTE: rebinds `sites`; every cell executed after this one only sees the first site.
sites = sites[:1]  # Limit to first site for testing

In [None]:

# Incremental refresh, one site at a time:
#   1. look up each parameter's max approval date,
#   2. fetch and transform readings from that date through yesterday,
#   3. per calendar year, overlay the new rows on the rows already stored in
#      the datalake parquet files.
# NOTE: `merged` is built but not written anywhere yet (see TODO at the end).

# Loop-invariant inputs, computed once instead of on every iteration.
parquet_glob = (
    "/Volumes/T7_raw_I/ucpo_waterdata/hydrology_datalake/"
    "timeseries_iv/site=*/year=*/*.parquet"
)
fetch_end_date = (dt.date.today() - dt.timedelta(days=1)).strftime('%Y-%m-%d')
key_cols = ['read_ts', 'parameter_cd']  # identifies one reading within a site

# `site_id` (not `id`) so the built-in id() is not shadowed in the kernel.
for site_id, code in sites:
    logging.info(f"Processing site: {code}, ID: {site_id}")

    # (parameter code, max approval date) pairs for the site.
    approval_status = fetch_approval_status(site_cd=code)
    if not approval_status:
        logging.warning(f"No approval status found for site {code} (ID: {site_id}). Skipping...")
        continue

    all_data = []
    for parameter_code, max_approval_date in approval_status:
        logging.info(f"Fetching data for parameter: {parameter_code}")

        # Fetch NWIS data for the site and parameter
        df_raw = fetch_nwis_data(
            site_code=code,
            parameter_code=parameter_code,
            start_date=max_approval_date,
            end_date=fetch_end_date,
        )
        if df_raw is None:
            continue

        df_transformed = transform_nwis_iv_data(df_raw, site_code=code, parameter_code=parameter_code)
        if not df_transformed.empty:
            all_data.append(df_transformed)

    if not all_data:
        continue

    df_combined = pd.concat(all_data, ignore_index=True)

    # Handle one calendar year of new readings at a time.
    for year, year_rows in df_combined.groupby(df_combined['read_ts'].dt.year):
        print(f"Year {year}: {len(year_rows)} records for site {code}")

        query = (
            "SELECT * "
            f" FROM read_parquet('{parquet_glob}')"
            f" WHERE site_cd IN ('{code}')"
            f" AND year(read_ts) = {year}"
        )
        with db.connect_duckdb() as con:
            existing = con.execute(query).df()

        # Drop timezone info so both sides compare on the same naive timestamps.
        # The groupby yields pieces of df_combined; assign() builds a new frame
        # instead of writing into such a piece (which can trigger pandas'
        # chained-assignment warning).
        new_data = year_rows.assign(read_ts=year_rows['read_ts'].dt.tz_localize(None))

        if existing.empty:
            merged = new_data
        else:
            existing = existing.assign(read_ts=existing['read_ts'].dt.tz_localize(None))

            # Stored rows whose (read_ts, parameter_cd) reappears in the new
            # data are superseded; keep the rest and append the new rows.
            is_superseded = existing.set_index(key_cols).index.isin(
                new_data.set_index(key_cols).index
            )
            filtered_existing = existing[~is_superseded]
            merged = pd.concat([filtered_existing, new_data], ignore_index=True)

        # TODO: write `merged` to the datalake. Nothing is persisted yet.

            
        

In [None]:

# Record for one lookup result: a parameter code and the latest date (as a
# 'YYYY-MM-DD' string) on which vw_nwis_iv_local has a row with approval_status 'A'.
ApprovalStatus = namedtuple('ApprovalStatus', ['parameter_code', 'max_approval_date'])

# `site_id` (not `id`) so the built-in id() is not shadowed in the kernel.
for site_id, code in sites:
    logging.info(f"Processing site: {code}, ID: {site_id}")

    # Fetch parameter codes for the site
    parameter_codes = fetch_site_parameters(site_id=site_id)
    if not parameter_codes:
        logging.warning(f"No parameters found for site {code} (ID: {site_id}). Skipping...")
        continue

    for parameter in parameter_codes:
        query = (
            "SELECT parameter_cd, strftime(max(datetime_utc), '%Y-%m-%d') AS date"
            " FROM vw_nwis_iv_local"
            f" WHERE site_cd = '{code}'"
            f" AND parameter_cd = '{parameter}'"
            " AND approval_status = 'A'"
            " GROUP BY site_cd, parameter_cd"
        )

        with db.connect_duckdb() as con:
            result = con.execute(query).fetchall()

        # Each row is (parameter_cd, date); an empty result simply prints nothing.
        for row in result:
            approval_status = ApprovalStatus(*row)
            print(approval_status)

        

In [None]:
# Ad-hoc check: read up to 500 stored rows for one site code and one year
# straight from the datalake parquet files.
code = '09041395'
year = 2025

parquet_glob = (
    "/Volumes/T7_raw_I/ucpo_waterdata/hydrology_datalake/"
    "timeseries_iv/site=*/year=*/*.parquet"
)
query = (
    f"SELECT *  FROM read_parquet('{parquet_glob}')"
    f" WHERE site_cd IN ('{code}')"
    f" AND year(read_ts) = {year}"
    " LIMIT 500"
)

with db.connect_duckdb() as con:
    existing = con.execute(query).df()


In [None]:
# For every site: look up its parameter codes in the database, fetch each one
# between start_date and end_date, and write the combined result to the datalake.

# The notebook's configuration block is disabled (wrapped in a string literal),
# so the date window is defined here; the values mirror that disabled block.
start_date = '2020-01-01'
end_date = (dt.date.today() - dt.timedelta(days=1)).strftime('%Y-%m-%d')

# `site_id` (not `id`) so the built-in id() is not shadowed in the kernel.
for site_id, code in sites:
    logging.info(f"Processing site: {code}, ID: {site_id}")

    query = (
        "SELECT p.parameter_cd"
        " FROM parameter p"
        " INNER JOIN site_parameter sp ON p.parameter_id = sp.parameter_id"
        f" WHERE sp.site_id = '{site_id}'"
    )

    with db.connect_duckdb() as con:
        params = con.execute(query).fetchall()
    parameter_codes = [param[0] for param in params]
    print(parameter_codes)

    all_data = []
    # Loop through the site's parameter codes.
    # NOTE(review): these calls pass `site=` / `pcode=`, while the incremental
    # cell in this notebook passes `site_code=` / `parameter_code=` to the same
    # helpers. Confirm which keywords utils.fetch_data and utils.transform_data
    # actually accept.
    for pcode in parameter_codes:
        df_raw = fetch_nwis_data(
            site=code,
            pcode=pcode,
            start_date=start_date,
            end_date=end_date,
        )
        if df_raw is None:
            continue

        df_transformed = transform_nwis_iv_data(df_raw, site=code, pcode=pcode)
        if not df_transformed.empty:
            all_data.append(df_transformed)

    if all_data:
        df_combined = pd.concat(all_data, ignore_index=True)
        write_to_datalake(df_combined, site=code)

    
    

In [None]:
# Show the column labels of `df_raw`, which is whatever frame the most recently
# executed fetch loop left behind in the kernel.
df_raw.columns

In [None]:
# One connection, two reads: the 2023 'Water temperature' annual stats with
# approval_status 'A', and the distinct parameter codes in vw_nwis_iv_local.
with db.connect_duckdb() as con:
    result = con.execute("SELECT * FROM vw_nwis_annual_stats_local WHERE approval_status = 'A' AND year = 2023 AND parameter_nm = 'Water temperature'").df()
    distinct_rows = con.execute("SELECT DISTINCT parameter_cd FROM vw_nwis_iv_local").fetchall()

# Each fetched row is a 1-tuple; unwrap it to the bare code.
param_codes = [row[0] for row in distinct_rows]
print(param_codes)

In [None]:
# Highest max_value per site, year and parameter for the 'Colorado River'
# hydro area, restricted to rows with approval_status 'A'.
# `year` is part of the GROUP BY, so it is selected as well; without it the
# output holds one row per year with nothing to tell those rows apart.
query = """
    SELECT site_nm, year, parameter_nm, max(max_value) AS max_value
    FROM vw_nwis_annual_stats_local
    WHERE approval_status = 'A' AND hydro_area_nm = 'Colorado River'
    GROUP BY site_nm, year, parameter_nm
    """

with db.connect_duckdb() as con:
    result = con.execute(query).df()
    print(result.head())

In [None]:
# Look for annual-stat rows that have no min_date and preview the first few.
null_min_date_sql = "SELECT * FROM vw_nwis_annual_stats_local WHERE min_date IS NULL"

with db.connect_duckdb() as con:
    result = con.execute(null_min_date_sql).df()
    print(result.head())

In [None]:
# Jupyter only renders a bare expression when it is the last statement of the
# cell, so the column listing has to be printed explicitly to show up.
print(result.columns)
print(result['parameter_cd'].unique())

In [None]:
# List the parameter codes linked to one site, looked up by site code.
site = '09095500'
query = (
    "SELECT p.parameter_cd"
    " FROM parameter p"
    " INNER JOIN site_parameter sp ON p.parameter_id = sp.parameter_id"
    " INNER JOIN site s ON sp.site_id = s.site_id"
    f" WHERE s.site_cd = '{site}'"
)

with db.connect_duckdb() as con:
    params = con.execute(query).fetchall()

# Rows come back as 1-tuples; keep just the code.
pcodes = [row[0] for row in params]
print(pcodes)





In [None]:
# Collect the table names in the database and the column names of `site`.
# In both result sets the name is the first field of each row.
with db.connect_duckdb() as con:
    tables = con.execute("SHOW TABLES").fetchall()
table_names = [name for name, *_ in tables]
print(table_names)

with db.connect_duckdb() as con:
    columns = con.execute("DESCRIBE site").fetchall()
column_names = [name for name, *_ in columns]
print(column_names)

In [None]:
# Preview the exported site_parameter CSV. Both code columns are read as
# strings so values such as '00060' keep their leading zeros.
code_dtypes = {"parameter_cd": "string", "site_cd": "string"}
ck = pd.read_csv("../artifacts/site_parameter.csv", dtype=code_dtypes)
print(ck.head())

In [None]:
# Refresh the site_parameter table through the project helper, pointing it at
# ../artifacts. NOTE(review): presumably this reloads the table from
# ../artifacts/site_parameter.csv; confirm against utils.duckdb_utils.
db.refresh_db_from_csv(table_name="site_parameter", csv_path="../artifacts")

In [None]:
# Preview the table after a refresh. The table name is held in one variable so
# the printed label always matches the table that was actually queried (the
# label previously said 'parameter' while the query read `site`).
# NOTE(review): the refresh cell targets site_parameter; switch `checked_table`
# if that is the table meant to be verified here.
checked_table = "site"

with db.connect_duckdb() as con:
    refreshed_data = con.execute(f"SELECT * FROM {checked_table}").df()

# First 5 rows are enough for a visual check.
print(f"Refreshed data in '{checked_table}' table: {refreshed_data[:5]}")

In [None]:
# Two reservoir site records prepared as a DataFrame. Key order defines the
# column order of the frame.
data = [
    {
        'site_id': 46,
        'site_cd': '2003',
        'site_nm': 'SHADOW MOUNTAIN RESERVOIR',
        'site_dsc': None,
        'agency_cd': 'BOR',
        'agency_nm': 'US Bureau of Reclamation',
        'lat_dd': 40.22698,
        'lon_dd': -105.84385,
        'elev_m': None,
        'site_type': 'lake',
        'hydro_area_cd': 'SMR',
        'hydro_area_nm': 'Shadow Mountain Reservoir',
        'source': 'https://www.usbr.gov/uc/water/hydrodata/reservoir_data/<site_cd>/csv/<parameter_cd>.csv'
    },
    {
        'site_id': 47,
        'site_cd': '100118',
        'site_nm': 'WILLOW CREEK RESERVOIR',
        'site_dsc': None,
        'agency_cd': 'BOR',
        'agency_nm': 'US Bureau of Reclamation',
        'lat_dd': 40.146932,
        'lon_dd': -105.942513,
        'elev_m': None,
        'site_type': 'lake',
        'hydro_area_cd': 'WCR',
        'hydro_area_nm': 'Willow Creek Reservoir',
        'source': 'https://www.usbr.gov/uc/water/hydrodata/reservoir_data/<site_cd>/csv/<parameter_cd>.csv'
    }
]

# Take the clock reading once so created_ts and updated_ts are identical on
# these new rows (two separate now() calls differ by a few microseconds).
load_ts = pd.Timestamp.now()
df = pd.DataFrame(data).assign(created_ts=load_ts, updated_ts=load_ts)

In [None]:
# Read the whole `site` table into `result`.
# The connection helper is only available through the `db` alias
# (utils.duckdb_utils); a bare connect_duckdb() raises NameError.
with db.connect_duckdb() as con:
    #con.register('site_tmp', df)
    result = con.execute("SELECT * FROM site").df()

In [None]:
# Two parameter records prepared as a DataFrame. Key order defines the column
# order of the frame; parameter codes are strings so leading zeros survive.
data = [
    {
        'parameter_id': 30,
        'parameter_cd': '63160',
        'parameter_nm': 'Stream surface elevation',
        'parameter_dsc': 'NAVD 1988',
        'unit_cd': 'ft',
        'unit_nm': 'Feet'
    },
    {
        'parameter_id': 31,
        'parameter_cd': '00054',
        'parameter_nm': 'Reservoir storage',
        'parameter_dsc': None,
        'unit_cd': 'af',
        'unit_nm': 'Acre-feet'
    }
]

# Take the clock reading once so created_ts and updated_ts are identical on
# these new rows (two separate now() calls differ by a few microseconds).
load_ts = pd.Timestamp.now()
df = pd.DataFrame(data).assign(created_ts=load_ts, updated_ts=load_ts)

In [None]:
# Read the whole `parameter` table. NOTE: this rebinds `df`, replacing any
# frame a previous cell stored under that name.
# The connection helper is only available through the `db` alias
# (utils.duckdb_utils); a bare connect_duckdb() raises NameError.
with db.connect_duckdb() as con:
    df = con.execute("SELECT * FROM parameter").df()

In [None]:
# List every site/parameter pairing together with the site and parameter
# names, ordered by site id and then parameter id.
query = (
    "SELECT sp.site_id, s.site_nm, p.parameter_id, p.parameter_nm, p.parameter_cd"
    " FROM site_parameter sp"
    " LEFT JOIN parameter p"
    " ON sp.parameter_id = p.parameter_id"
    " LEFT JOIN site s"
    " ON sp.site_id = s.site_id"
    " ORDER BY sp.site_id, p.parameter_id"
)
#query = "select * from parameter"

# The connection helper is only available through the `db` alias
# (utils.duckdb_utils); a bare connect_duckdb() raises NameError.
with db.connect_duckdb() as con:
    result = con.execute(query).df()

# Disabled draft, kept as comments. It used to be a trailing string literal,
# which Jupyter echoes as the cell's output.
#
# print(query)
# with connect_duckdb() as con:
#     df = con.execute(query).df()
#     print(df.head())


In [None]:
# Reduce `result` to the five columns listed below, numbering the rows from 1
# and stamping them. The clock is read once so created_ts and updated_ts are
# identical (two separate now() calls differ by a few microseconds).
load_ts = pd.Timestamp.now()
result = result.assign(
    created_ts=load_ts,
    updated_ts=load_ts,
    site_parameter_id=range(1, len(result) + 1),
)[['site_parameter_id', 'site_id', 'parameter_id', 'created_ts', 'updated_ts']].copy()


In [None]:

# Append the rows in `result` to site_parameter and preview the table.
# The connection helper is only available through the `db` alias
# (utils.duckdb_utils); a bare connect_duckdb() raises NameError.
# NOTE(review): the INSERT is not idempotent; re-running this cell (or the
# identical cell elsewhere in this notebook) appends the same rows again.
with db.connect_duckdb() as con:
    con.register('site_parameter_tmp', result)
    con.execute("INSERT INTO site_parameter SELECT * FROM site_parameter_tmp")
    print(con.execute("SELECT * FROM site_parameter").df().head())

In [None]:
# Reduce `result` to the five columns listed below, numbering the rows from 1
# and stamping them. The clock is read once so created_ts and updated_ts are
# identical (two separate now() calls differ by a few microseconds).
# NOTE(review): this cell repeats an earlier cell of the notebook; one of the
# two is probably redundant.
load_ts = pd.Timestamp.now()
result = result.assign(
    created_ts=load_ts,
    updated_ts=load_ts,
    site_parameter_id=range(1, len(result) + 1),
)[['site_parameter_id', 'site_id', 'parameter_id', 'created_ts', 'updated_ts']].copy()


In [None]:

# Append the rows in `result` to site_parameter and preview the table.
# The connection helper is only available through the `db` alias
# (utils.duckdb_utils); a bare connect_duckdb() raises NameError.
# NOTE(review): this repeats an earlier INSERT cell of the notebook; running
# both appends the same rows twice.
with db.connect_duckdb() as con:
    con.register('site_parameter_tmp', result)
    con.execute("INSERT INTO site_parameter SELECT * FROM site_parameter_tmp")
    print(con.execute("SELECT * FROM site_parameter").df().head())

In [None]:
# Earlier draft of this cell's query, kept for reference. It was assigned and
# then immediately overwritten, so it never ran:
#
#     SELECT hydro_area_nm, site_nm, year, max_value, max_date
#     FROM vw_nwis_annual_stats_local
#     WHERE parameter_cd = '00060'
#     order by year DESC, hydro_area_nm, site_nm
#
query = "SELECT * FROM parameter"

# The connection helper is only available through the `db` alias
# (utils.duckdb_utils); a bare connect_duckdb() raises NameError.
with db.connect_duckdb() as con:
    result = con.execute(query).df()
    print(result.head())

In [None]:
# Build parameter rows from the USGS parameter-code listing.
excluded_codes = ['00060', '00010']   # dropped from the listing before numbering
first_new_parameter_id = 23           # ids are assigned sequentially from here

# Every column is read as str so codes keep their leading zeros.
nwis_params = pd.read_csv("../artifacts/USGS_parameter_codes.tsv", sep="\t", dtype=str)
nwis_params = nwis_params.loc[~nwis_params['parameter_cd'].isin(excluded_codes)].reset_index(drop=True)
nwis_params['parameter_id'] = range(first_new_parameter_id, first_new_parameter_id + len(nwis_params))

# Read the clock once so both timestamp columns hold the same value.
# NOTE(review): these columns are named create_ts / update_ts, whereas other
# cells in this notebook use created_ts / updated_ts; confirm which names the
# parameter table actually has.
load_ts = pd.Timestamp.now()
nwis_params['create_ts'] = load_ts
nwis_params['update_ts'] = load_ts

In [None]:
# Reorder/trim nwis_params to exactly the columns of `result`, in the same
# order. Relies on `result` still holding the frame whose layout is wanted.
column_list = list(result.columns)
nwis_params = nwis_params.loc[:, column_list].copy()

In [None]:
# Append the prepared nwis_params rows to the parameter table.
# The connection helper is only available through the `db` alias
# (utils.duckdb_utils); a bare connect_duckdb() raises NameError.
# NOTE(review): the INSERT is not idempotent; re-running appends the rows again.
with db.connect_duckdb() as con:
    con.register('nwis_params', nwis_params)
    con.execute("INSERT INTO parameter SELECT * FROM nwis_params")

In [None]:
# Query used by the cell that reads `query`. The alternative is kept as a
# comment; as a live assignment it was overwritten on the next line.
# query = "SELECT * FROM site"
query = "SELECT * FROM parameter"

In [None]:
# Run ../db/views.sql through the project helper. The helper is only available
# through the `db` alias (utils.duckdb_utils); a bare run_sql_file() raises NameError.
db.run_sql_file(Path('../db/views.sql'))

In [None]:
# Run whatever SQL is currently stored in `query` and preview the result.
# The connection helper is only available through the `db` alias
# (utils.duckdb_utils); a bare connect_duckdb() raises NameError.
with db.connect_duckdb() as con:
    result = con.execute(query).df()
print(result.head())

In [None]:
# Load parameter rows from CSV and append them to the parameter table.
# parameter_cd is read as str: codes such as '00060' would otherwise be parsed
# as integers and lose their leading zeros.
param = pd.read_csv('../artifacts/parameters.csv', dtype={'parameter_cd': str})

# The connection helper is only available through the `db` alias
# (utils.duckdb_utils); a bare connect_duckdb() raises NameError.
# NOTE(review): the INSERT is not idempotent; re-running appends the rows again.
with db.connect_duckdb() as con:
    con.register('param', param)
    con.execute('INSERT INTO parameter SELECT * FROM param')