In [None]:
import pandas as pd
import numpy as np
import requests
import sys
import logging
import os
import datetime as dt
# Put ../src on the module search path before the `utils` imports below
# (presumably where that package lives -- confirm against the repo layout).
# Guarded so that re-running this cell does not keep appending duplicates.
if '../src' not in sys.path:
    sys.path.append('../src')

from utils.fetch_data import fetch_nwis_data
import utils.duckdb_utils as du
import utils.site_list as sl



In [None]:
# Logging setup ------------------------------------------------------
# Each run logs to its own file under ./logs, named after the start time.
os.makedirs('logs', exist_ok=True)
run_stamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
log_name = 'logs/' + run_stamp + '.log'
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename=log_name,
)
# ---------------------------------------------------------------------

In [None]:
print(sl.nwis_sites)
# NOTE: this comma-separated string is replaced by the query result below.
sites = ", ".join(sl.nwis_sites)

usgs_names_sql = "SELECT site_nm from site WHERE agency_cd = 'USGS'"
usgs_gr_sql = "SELECT site_cd, site_nm from site WHERE agency_cd = 'USGS' AND hydro_area_cd = 'GR'"
max_site_id_sql = "SELECT max(site_id) FROM site"

# Names of every USGS site currently stored in the `site` table.
with du.connect_duckdb() as con:
    result = con.execute(usgs_names_sql).fetchall()
    sites = [record[0] for record in result]
    print(sites)

# USGS sites whose hydrologic area code is 'GR', as a DataFrame.
with du.connect_duckdb() as con:
    result = con.execute(usgs_gr_sql).df()
    print(result)

# Highest site_id currently in use (returned as a one-element row).
with du.connect_duckdb() as con:
    result = con.execute(max_site_id_sql).fetchone()
    print(result)

In [None]:
# (source column, site-table column) pairs for the NWIS site response.
# The maps double as column selectors via list(map), so the pair order is
# the column order, and the identity pair for agency_cd keeps that column.
_nwis_column_pairs = (
    ('site_no', 'site_cd'),
    ('station_nm', 'site_nm'),
    ('agency_cd', 'agency_cd'),
    ('dec_lat_va', 'lat_dd'),
    ('dec_long_va', 'lon_dd'),
    ('alt_va', 'elev_ft'),
    ('site_tp_cd', 'site_type'),
)
nwis_rename_map = dict(_nwis_column_pairs)

# Same idea for the Bureau of Reclamation metadata columns.
_bor_column_pairs = (
    ('site_id', 'site_cd'),
    ('site_metadata.site_name', 'site_nm'),
    ('site_metadata.lat', 'lat_dd'),
    ('site_metadata.longi', 'lon_dd'),
    ('site_metadata.elevation', 'elev_ft'),
)
bor_rename_map = dict(_bor_column_pairs)

In [None]:
# Site codes to look up in the existing `site` table.
sites = [
    "09272400",
    "09301500",
    "09295100",
    "09295000",
    "09314500",
    "404417108524900",
    "09244490",
    "09247600",
]
# Render the codes as a quoted, comma-separated list for the SQL IN clause.
quoted_sites = ", ".join(f"'{code}'" for code in sites)
lookup_sql = f"SELECT site_cd, site_nm FROM site WHERE site_cd IN ({quoted_sites})"
with du.connect_duckdb() as con:
    result = con.execute(lookup_sql).fetchall()
    # Keep just the site codes that the query returned.
    sites_ckd = [record[0] for record in result]
    print(f"Checked sites: {sites_ckd}")


In [None]:
# Query the NWIS 'site' service once for all site codes, passed as a single
# comma-separated string.
site_arg = ",".join(sites)
nwis_metadata = fetch_nwis_data(site=site_arg, service_code='site')

In [None]:


# Shape the NWIS site response into rows matching the `site` table columns.

# First site_id assigned to this batch.
# NOTE(review): hard-coded; presumably max(site_id) + 1 at the time of
# writing -- confirm against the database before re-running.
NWIS_FIRST_SITE_ID = 48

# Keep only the mapped NWIS columns, renamed to the site-table names.
selected = nwis_metadata[list(nwis_rename_map)].rename(columns=nwis_rename_map).reset_index(drop=True)
merged = selected.merge(sl.hydrologic_areas, on='site_cd', how='left')
merged['elev_m'] = (merged['elev_ft'] * 0.3048).round()  # feet -> whole metres
merged['lat_dd'] = merged['lat_dd'].round(7)
merged['lon_dd'] = merged['lon_dd'].round(7)
merged['agency_nm'] = 'US Geological Survey'
# Substring replacement: any occurrence of 'ST' / 'LK' in the code is expanded.
merged['site_type'] = merged['site_type'].str.replace('ST', 'Stream')
merged['site_type'] = merged['site_type'].str.replace('LK', 'Lake')
merged['source'] = 'NWIS API'
merged['site_dsc'] = None
merged['site_id'] = range(NWIS_FIRST_SITE_ID, NWIS_FIRST_SITE_ID + len(merged))
# Capture the time once so create_ts and update_ts are exactly equal on
# freshly created rows (two separate now() calls differ by microseconds).
batch_ts = dt.datetime.now()
merged['create_ts'] = batch_ts
merged['update_ts'] = batch_ts

# Classify each site by the river named in site_nm (case-insensitive).
# np.select takes the first matching condition, so the order below matters.
conditions = [
    merged['site_nm'].str.contains('Duchesne', case=False, na=False),
    merged['site_nm'].str.contains('Green', case=False, na=False),
    merged['site_nm'].str.contains('Yampa', case=False, na=False),
    merged['site_nm'].str.contains('Uinta', case=False, na=False),
    merged['site_nm'].str.contains('Price', case=False, na=False),
]

choices_cd = ['DU', 'GR', 'YA', 'UI', 'PR']
choices_nm = ['Duchesne River', 'Green River', 'Yampa River', 'Uinta River', 'Price River']
# NOTE(review): if sl.hydrologic_areas supplies hydro_area_cd / hydro_area_nm
# columns, the two assignments below overwrite what the merge brought in --
# confirm that is intended.
merged['hydro_area_cd'] = np.select(conditions, choices_cd, default="unknown")
merged['hydro_area_nm'] = np.select(conditions, choices_nm, default="unknown")

# Final column selection and order, sorted by site code.
nwis_metadata_cleaned = merged[
    ['site_id', 'site_cd', 'site_nm', 'site_dsc', 'agency_cd', 'agency_nm',
    'lat_dd', 'lon_dd', 'elev_m', 'site_type', 'hydro_area_cd',
    'hydro_area_nm', 'source', 'create_ts', 'update_ts']].sort_values(by='site_cd')


In [None]:
# Print the whole `site` table as a DataFrame for a visual check.
all_sites_sql = 'SELECT * FROM site'
with du.connect_duckdb() as con:
    result = con.execute(all_sites_sql).df()
    print(result)
    

In [None]:
# Shape the Bureau of Reclamation reservoir metadata into rows matching the
# `site` table columns.

# Download the metadata CSV and keep only the mapped columns, renamed.
bor_metadata = pd.read_csv("https://www.usbr.gov/uc/water/hydrodata/reservoir_data/meta.csv")
bor_selected = bor_metadata[list(bor_rename_map)].rename(columns=bor_rename_map).reset_index(drop=True)
# Compare site codes as strings, then keep one row per configured BOR site.
bor_selected['site_cd'] = bor_selected['site_cd'].astype(str)
bor_filtered = bor_selected[bor_selected['site_cd'].isin(sl.bor_sites)].drop_duplicates(subset='site_cd')

bor_merged = bor_filtered.merge(sl.hydrologic_areas, on='site_cd', how='left')
bor_merged['elev_m'] = (bor_merged['elev_ft'] * 0.3048).round()  # feet -> whole metres
bor_merged['lat_dd'] = bor_merged['lat_dd'].round(7)
bor_merged['lon_dd'] = bor_merged['lon_dd'].round(7)
bor_merged['agency_nm'] = 'US Bureau of Reclamation'
bor_merged['agency_cd'] = 'BOR'
bor_merged['site_type'] = "Lake"
# URL template stored verbatim; the <...> placeholders are not filled in here.
bor_merged['source'] = 'https://www.usbr.gov/uc/water/hydrodata/reservoir_data/<site_cd>/csv/<parameter_cd>.csv'
bor_merged['site_dsc'] = None
# NOTE(review): starting ID 18 is hard-coded -- confirm it is still free.
bor_merged['site_id'] = range(18, len(bor_merged) + 18)
# Capture the time once so create_ts and update_ts are exactly equal on
# freshly created rows (two separate now() calls differ by microseconds).
batch_ts = dt.datetime.now()
bor_merged['create_ts'] = batch_ts
bor_merged['update_ts'] = batch_ts

# Final column selection and order.
bor_metadata_cleaned = bor_merged[
    ['site_id', 'site_cd', 'site_nm', 'site_dsc', 'agency_cd', 'agency_nm',
    'lat_dd', 'lon_dd', 'elev_m', 'site_type', 'hydro_area_cd',
    'hydro_area_nm', 'source', 'create_ts', 'update_ts']]



In [None]:
# Load three DataFrames over one connection: sites with site_id above 47,
# the full parameter table, and the full site_parameter table.
new_sites_sql = "SELECT * FROM site WHERE site_id > 47"
parameters_sql = "SELECT * FROM parameter"
site_param_sql = "SELECT * FROM site_parameter"
with du.connect_duckdb() as con:
    new_sites = con.execute(new_sites_sql).df()
    parameters = con.execute(parameters_sql).df()
    site_param = con.execute(site_param_sql).df()
    

In [None]:
# Report the highest site_parameter_id currently in `site_parameter`.
max_id_sql = "SELECT MAX(site_parameter_id) FROM site_parameter"
with du.connect_duckdb() as con:
    result = con.execute(max_id_sql).fetchone()
    max_id = result[0]
    print(f"max site_parameter_id from 'site_parameter' table: {max_id}")


In [None]:
# Build new `site_parameter` rows from the (site_id, parameter_cd) pairs
# listed in new_site_param.csv.
df = pd.read_csv("new_site_param.csv",
                 dtype={'site_id': int, 'parameter_cd': str})
# Attach parameter columns by code. This is a left join, so a code with no
# match in `parameters` yields a row with a missing parameter_id.
merged_df = pd.merge(df, parameters, on='parameter_cd', how='left')
# NOTE(review): starting ID 193 is hard-coded -- confirm it is still free.
merged_df['site_parameter_id'] = range(193, len(merged_df) + 193)
# Capture the time once so create_ts and update_ts are exactly equal on
# freshly created rows (two separate now() calls differ by microseconds).
batch_ts = dt.datetime.now()
merged_df['create_ts'] = batch_ts
merged_df['update_ts'] = batch_ts
# Fix the column order explicitly.
merged_df = merged_df[['site_parameter_id', 'site_id', 'parameter_id', 'create_ts', 'update_ts']]

In [None]:
# Append the prepared rows to `site_parameter`. The insert is positional
# (SELECT *), so merged_df's column order has to line up with the table's.
insert_site_param_sql = "INSERT INTO site_parameter SELECT * FROM new_site_param"
with du.connect_duckdb() as con:
    # Make the DataFrame queryable under the name used in the SQL above.
    con.register('new_site_param', merged_df)
    con.execute(insert_site_param_sql)

In [None]:
# Prepare the CBRFC site rows for the `site` table, working on a copy so the
# frame held in sl.cbrfc_sites_df is not modified.
cbrfc_metadata = sl.cbrfc_sites_df.copy()
# NOTE(review): starting ID 28 is hard-coded -- confirm it is still free.
cbrfc_metadata['site_id'] = range(28, len(cbrfc_metadata) + 28)
# Capture the time once so create_ts and update_ts are exactly equal on
# freshly created rows (two separate now() calls differ by microseconds).
batch_ts = dt.datetime.now()
cbrfc_metadata['create_ts'] = batch_ts
cbrfc_metadata['update_ts'] = batch_ts
# Final column selection and order.
cbrfc_metadata_cleaned = cbrfc_metadata[
    ['site_id', 'site_cd', 'site_nm', 'site_dsc', 'agency_cd', 'agency_nm',
    'lat_dd', 'lon_dd', 'elev_m', 'site_type', 'hydro_area_cd',
    'hydro_area_nm', 'source', 'create_ts', 'update_ts']]

In [None]:
print(sl.hydrologic_areas)
# Spot-check: download the parameter-17 CSV for the first BOR site.
# .iloc[0] selects by position; the previous ['site_cd'][0] was a label
# lookup that only finds the first row when the index happens to start at 0.
first_bor_site_cd = bor_merged['site_cd'].iloc[0]
ck = pd.read_csv(f"https://www.usbr.gov/uc/water/hydrodata/reservoir_data/{first_bor_site_cd}/csv/17.csv")

In [None]:
# Hand the schema file to du.run_sql_file (presumably executes its SQL
# against the DuckDB database -- confirm in utils.duckdb_utils).
schema_sql_path = '../db/schema.sql'
du.run_sql_file(schema_sql_path)

In [None]:
# Positional insert from `staging_table` into `site`; a later cell registers
# a DataFrame under that name before executing this statement.
query = "\n".join([
    "",
    "INSERT INTO site",
    "SELECT * FROM staging_table",
    "",
])
print(query)

In [None]:
# Show the column definitions of `site` as reported by PRAGMA table_info.
table_info_sql = "PRAGMA table_info('site')"
with du.connect_duckdb() as con:
    columns = con.execute(table_info_sql).fetchall()
    print(columns)

In [None]:
# Stage the cleaned CBRFC rows and run the insert statement held in `query`.
with du.connect_duckdb() as con:
    try:
        # Expose the DataFrame to SQL under the name the statement selects from.
        con.register('staging_table', cbrfc_metadata_cleaned)
        con.execute(query)
        logging.info(f"✅ Successfully executed SQL query: {query}")
    except Exception as e:
        # Record the failure in the run log, then re-raise so the cell still
        # fails visibly. The message says "query" because a SQL string, not
        # a file, is being executed here.
        logging.error(f"❌ Error executing SQL query {query}: {e}")
        raise


In [None]:
# Count the rows in `site`, echoing the number to the notebook and the log.
count_sql = "SELECT COUNT(*) FROM site"
with du.connect_duckdb() as con:
    result = con.execute(count_sql).fetchone()
    row_count_msg = f"Number of rows in 'site' table: {result[0]}"
    print(row_count_msg)
    logging.info(row_count_msg)


In [None]:
# Read the full `site` table into a DataFrame, kept in `result` for the
# export in the next cell.
site_table_sql = "SELECT * FROM site"
with du.connect_duckdb() as con:
    result = con.execute(site_table_sql).df()

In [None]:
# Write the DataFrame held in `result` to a CSV in the working directory,
# leaving out the index column.
csv_path = 'site_metadata.csv'
result.to_csv(csv_path, index=False)