In [None]:
import pandas as pd
from pathlib import Path

import sys
sys.path.append('../src')
from utils.duckdb_utils import connect_duckdb
from utils.duckdb_utils import run_sql_file
# import utils.site_list as sl

In [4]:
# Show which tables the database contains. The first field of every
# SHOW TABLES row is taken as the table name.
with connect_duckdb() as con:
    tables = con.execute("SHOW TABLES").fetchall()
    table_names = [name for name, *_ in tables]

    print(table_names)

# Same idea for the columns of site_parameter: the first field of every
# DESCRIBE row is taken as the column name.
with connect_duckdb() as con:
    columns = con.execute("DESCRIBE site_parameter").fetchall()
    column_names = [name for name, *_ in columns]

    print(column_names)

In [7]:
# Pull the whole site table into pandas and print its column labels.
with connect_duckdb() as con:
    site_rows = con.execute("SELECT * FROM site")
    df = site_rows.df()
    print(df.columns)

Index(['site_id', 'site_cd', 'site_nm', 'site_dsc', 'agency_cd', 'agency_nm',
       'lat_dd', 'lon_dd', 'elev_m', 'site_type', 'hydro_area_cd',
       'hydro_area_nm', 'source', 'create_ts', 'update_ts'],
      dtype='object')


In [23]:
# Two Bureau of Reclamation reservoir sites, one dict per row. The keys are
# the first 13 column names printed for the site table; the two audit
# columns are appended below.
data = [
    {
        'site_id': 46,
        'site_cd': '2003',
        'site_nm': 'SHADOW MOUNTAIN RESERVOIR',
        'site_dsc': None,
        'agency_cd': 'BOR',
        'agency_nm': 'US Bureau of Reclamation',
        'lat_dd': 40.22698,
        'lon_dd': -105.84385,
        'elev_m': None,
        'site_type': 'lake',
        'hydro_area_cd': 'SMR',
        'hydro_area_nm': 'Shadow Mountain Reservoir',
        'source': 'https://www.usbr.gov/uc/water/hydrodata/reservoir_data/<site_cd>/csv/<parameter_cd>.csv'
    },
    {
        'site_id': 47,
        'site_cd': '100118',
        'site_nm': 'WILLOW CREEK RESERVOIR',
        'site_dsc': None,
        'agency_cd': 'BOR',
        'agency_nm': 'US Bureau of Reclamation',
        'lat_dd': 40.146932,
        'lon_dd': -105.942513,
        'elev_m': None,
        'site_type': 'lake',
        'hydro_area_cd': 'WCR',
        'hydro_area_nm': 'Willow Creek Reservoir',
        'source': 'https://www.usbr.gov/uc/water/hydrodata/reservoir_data/<site_cd>/csv/<parameter_cd>.csv'
    }
]

df = pd.DataFrame(data)

# Use the audit-column names the site table actually has (create_ts /
# update_ts), and take the clock reading once so a freshly created row has
# identical create and update stamps.
load_ts = pd.Timestamp.now()
df['create_ts'] = load_ts
df['update_ts'] = load_ts

In [25]:
# Read the current contents of the site table into `result`.
# The registration of `df` as 'site_tmp' stays disabled, so nothing is
# written by this cell.
with connect_duckdb() as con:
    #con.register('site_tmp', df)
    site_rows = con.execute("SELECT * FROM site")
    result = site_rows.df()

In [15]:
# Two parameter definitions, one dict per row; the audit columns are
# appended below.
data = [
    {
        'parameter_id': 30,
        'parameter_cd': '63160',
        'parameter_nm': 'Stream surface elevation',
        'parameter_dsc': 'NAVD 1988',
        'unit_cd': 'ft',
        'unit_nm': 'Feet'
    },
    {
        'parameter_id': 31,
        'parameter_cd': '00054',
        'parameter_nm': 'Reservoir storage',
        'parameter_dsc': None,
        'unit_cd': 'af',
        'unit_nm': 'Acre-feet'
    }
]
df = pd.DataFrame(data)

# Name the audit columns create_ts / update_ts, the spelling used elsewhere
# in this notebook for the parameter table, and take the clock reading once
# so both stamps of a new row are identical.
load_ts = pd.Timestamp.now()
df['create_ts'] = load_ts
df['update_ts'] = load_ts

In [16]:
# Load the full parameter table into `df` (this replaces whatever `df`
# held before the cell ran).
with connect_duckdb() as con:
    parameter_rows = con.execute("SELECT * FROM parameter")
    df = parameter_rows.df()

In [None]:
path = '../artifacts/site_parameter.csv'

# Read site_id as nullable Int64. With plain int64, read_csv raises as soon
# as one site_id is blank, so the not-null filter below could never apply.
df = pd.read_csv(path, usecols=['site_id', 'parameter_cd'],
                 dtype={'site_id': 'Int64', 'parameter_cd': 'str'})
# Keep only complete (site_id, parameter_cd) pairs, then return to the plain
# int64 dtype the original cell used for the join.
df = df.dropna(subset=['site_id', 'parameter_cd']).astype({'site_id': 'int64'})

# Resolve each pair against the parameter and site tables. Both joins are
# LEFT JOINs, so a pair with no match is kept with NULLs in the joined columns.
query = (
    "SELECT sp.site_id, s.site_nm, p.parameter_id, p.parameter_nm, p.parameter_cd"
    " FROM sp_tmp sp"
    " LEFT JOIN parameter p"
    " ON sp.parameter_cd = p.parameter_cd"
    " LEFT JOIN site s"
    " ON sp.site_id = s.site_id"
    " WHERE sp.parameter_cd IS NOT NULL"
    " ORDER BY sp.site_id, p.parameter_id"
)
with connect_duckdb() as con:
    # Make the DataFrame visible to SQL under the name used in the query.
    con.register('sp_tmp', df)
    result = con.execute(query).df()

# Report pairs whose parameter_cd found no row in `parameter`; they carry a
# missing parameter_id. Non-fatal: `result` itself is left unchanged.
n_unmatched = int(result['parameter_id'].isna().sum())
if n_unmatched:
    print(f"Warning: {n_unmatched} row(s) have no matching parameter_id")


In [36]:
# Shape `result` into the five columns shown for site_parameter:
# site_parameter_id, site_id, parameter_id, create_ts, update_ts.
# The clock is read once so both audit stamps of a new row are identical,
# and the columns carry the table's own names (create_ts / update_ts).
load_ts = pd.Timestamp.now()
result['create_ts'] = load_ts
result['update_ts'] = load_ts
# Surrogate key: 1..N in the current row order.
result['site_parameter_id'] = range(1, len(result) + 1)
result = result[['site_parameter_id', 'site_id', 'parameter_id', 'create_ts', 'update_ts']].copy()


In [37]:

with connect_duckdb() as con:
    # Expose the DataFrame to SQL under a temporary name, then append every
    # row of it to site_parameter.
    con.register('site_parameter_tmp', result)
    con.execute("INSERT INTO site_parameter SELECT * FROM site_parameter_tmp")
    # Read the table back and show its first rows as a quick check.
    stored = con.execute("SELECT * FROM site_parameter").df()
    print(stored.head())

   site_parameter_id  site_id  parameter_id                  create_ts  \
0                  1        1             3 2025-06-23 17:58:27.213919   
1                  2        1            31 2025-06-23 17:58:27.213919   
2                  3        2             1 2025-06-23 17:58:27.213919   
3                  4        2             2 2025-06-23 17:58:27.213919   
4                  5        2            23 2025-06-23 17:58:27.213919   

                   update_ts  
0 2025-06-23 17:58:27.213919  
1 2025-06-23 17:58:27.213919  
2 2025-06-23 17:58:27.213919  
3 2025-06-23 17:58:27.213919  
4 2025-06-23 17:58:27.213919  


In [None]:
# Shape `result` into site_parameter_id, site_id, parameter_id, create_ts,
# update_ts. The clock is read once so both audit stamps match, and the
# columns use the create_ts / update_ts spelling shown for the table.
load_ts = pd.Timestamp.now()
result['create_ts'] = load_ts
result['update_ts'] = load_ts
# Surrogate key: 1..N in the current row order.
result['site_parameter_id'] = range(1, len(result) + 1)
result = result[['site_parameter_id', 'site_id', 'parameter_id', 'create_ts', 'update_ts']].copy()


In [None]:

# NOTE(review): this INSERT appends all of `result` every time the cell runs;
# whether a repeat run duplicates rows or fails depends on constraints on
# site_parameter that are not visible here -- confirm before re-running.
with connect_duckdb() as con:
    con.register('site_parameter_tmp', result)
    con.execute("INSERT INTO site_parameter SELECT * FROM site_parameter_tmp")
    # Read the table back and show its first rows.
    stored = con.execute("SELECT * FROM site_parameter").df()
    print(stored.head())

In [None]:
# Query against the annual-stats view, kept for reference only. The original
# cell assigned it to `query` and overwrote it on the next line, so it was
# never executed; it now has its own name instead of being a dead store.
annual_stats_query = """
    SELECT hydro_area_nm, site_nm, year, max_value, max_date
    FROM vw_nwis_annual_stats_local
    WHERE parameter_cd = '00060'
    order by year DESC, hydro_area_nm, site_nm
    """

# The statement this cell actually runs.
query = "SELECT * FROM parameter"

with connect_duckdb() as con:
    result = con.execute(query).df()
    print(result.head())

In [None]:
# Load the USGS parameter-code list with every column as text, so codes such
# as '00060' keep their leading zeros.
nwis_params = pd.read_csv("../artifacts/USGS_parameter_codes.tsv", sep="\t", dtype=str)
# Leave out codes 00060 and 00010 -- presumably already present in the
# parameter table; confirm.
nwis_params = nwis_params.loc[~nwis_params['parameter_cd'].isin(['00060', '00010'])].reset_index(drop=True)
# Sequential ids starting at 23.
# NOTE(review): another cell in this notebook defines parameter_id 30 and 31;
# with more than seven rows here the ranges overlap -- confirm the start value.
nwis_params['parameter_id'] = range(23, len(nwis_params) + 23)
# Read the clock once so create_ts and update_ts are identical for new rows.
load_ts = pd.Timestamp.now()
nwis_params['create_ts'] = load_ts
nwis_params['update_ts'] = load_ts

In [None]:
# Reorder (and restrict) nwis_params to exactly the columns `result` has,
# in the same order.
column_list = list(result.columns)
nwis_params = nwis_params.loc[:, column_list].copy()

In [None]:
# Append every row of nwis_params to the parameter table.
with connect_duckdb() as con:
    # The registered name must match the one used in the INSERT below.
    con.register('nwis_params', nwis_params)
    insert_sql = "INSERT INTO parameter SELECT * FROM nwis_params"
    con.execute(insert_sql)

In [None]:
# Statement run by the later cell that executes `query`. The original cell
# first assigned "SELECT * FROM site" and overwrote it immediately; that dead
# assignment is removed.
query = "SELECT * FROM parameter"

In [None]:
# Execute the SQL script that holds the view definitions.
views_sql = Path('../db') / 'views.sql'
run_sql_file(views_sql)

In [None]:
# Run whichever statement `query` currently holds and keep the rows in `result`.
with connect_duckdb() as con:
    fetched = con.execute(query)
    result = fetched.df()
# Preview the first rows once the connection block has finished.
print(result.head())

In [None]:
# Load the parameter rows from CSV with pandas' default type inference.
param = pd.read_csv(filepath_or_buffer='../artifacts/parameters.csv')

# Append every CSV row to the parameter table.
with connect_duckdb() as con:
    # The registered name must match the one used in the INSERT below.
    con.register('param', param)
    insert_sql = 'INSERT INTO parameter SELECT * FROM param'
    con.execute(insert_sql)