In [1]:
import sys
import logging
import pandas as pd
import os
import datetime as dt
from pathlib import Path
sys.path.append('../src')
import utils.duckdb_utils as du
from utils.fetch_data import fetch_nwis_data, get_available_parameters
from utils.transform_data import transform_nwis_iv_data
from utils.write_to_datalake import write_to_datalake




In [2]:
# === CONFIGURATION ===
#parameter_codes = ['00060', '00010', '62614']
start_date = '2024-01-01'
# The requested window ends at yesterday's local date.
end_date = (dt.date.today() - dt.timedelta(days=1)).strftime('%Y-%m-%d')
service_code = 'iv'
notebook_dir = Path.cwd()
# The project root is taken to be the parent of the current working directory.
project_root = notebook_dir.resolve().parents[0]
datalake_path = project_root / 'data' / 'hydrology_datalake'
db_path = project_root / 'data' / 'hydrology.duckdb'
# ======================


# Configure logging ------------------------------------------------
log_dir = project_root / 'logs'
log_dir.mkdir(parents=True, exist_ok=True)
# One timestamped log file per execution of this cell; kept as a str, as before.
log_name = str(log_dir / (dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'))
# force=True replaces handlers already attached to the root logger. Without it,
# re-running this cell computes a new log_name but basicConfig does nothing and
# messages keep going to the previously configured file.
logging.basicConfig(filename=log_name,
                    level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    force=True)
# -------------------------------------------------------------------

In [31]:
# Two lookup statements kept in a single string and separated by ';':
# first the USGS site codes, then the parameter codes that are 5 characters long.
lookup_sql = """
        SELECT site_cd
        FROM site
        WHERE agency_cd = 'USGS';
        SELECT parameter_cd
        FROM parameter
        WHERE length(parameter_cd) = 5
        """
# One list entry per statement; the surrounding whitespace is left as-is.
queries = lookup_sql.split(';')

# Single-statement query: every site, with NULL placeholder columns for
# parameter_cd and notes.
query = "SELECT site_id, site_cd, site_nm, NULL AS parameter_cd, NULL AS notes FROM site"

print(queries)

["\n        SELECT site_cd\n        FROM site\n        WHERE agency_cd = 'USGS'", '\n        SELECT parameter_cd\n        FROM parameter\n        WHERE length(parameter_cd) = 5\n        ']


In [32]:
# Export the result of `query` to artifacts/site_parameter.csv.
# The target is anchored on project_root rather than the current working
# directory, and the folder is created first because to_csv raises when the
# destination directory does not exist.
artifacts_dir = project_root / 'artifacts'
artifacts_dir.mkdir(parents=True, exist_ok=True)
with du.connect_duckdb() as con:
    con.execute(query).df().to_csv(artifacts_dir / 'site_parameter.csv', index=False)
    

In [7]:

# Run every statement in `queries` and keep the non-NULL values of the first
# column of its rows, producing one list of values per statement.
result = []
with du.connect_duckdb() as con:
    for statement in queries:
        first_column = [record[0] for record in con.execute(statement).fetchall()]
        result.append([value for value in first_column if value is not None])

print(result)

[['09041395', '09095500', '09106150', '09106485', '09147022', '09152500', '09163500', '09180000', '09251000', '09260000', '09260050', '09261000', '09302000', '09306500', '09315000', '09328960', '09379900'], ['62614']]


In [9]:
# Spot check: ask the fetch helper which parameters it reports for site 09163500.
# The bare expression is the cell's displayed output.
get_available_parameters(site="09163500")

[]

In [4]:
# Fetch, transform and store data one site at a time.
# NOTE(review): `sites` and `parameter_codes` must already exist in the kernel.
# In this notebook `parameter_codes` only appears as a commented-out line in the
# configuration cell and `sites` is assigned in the last cell, so confirm the
# intended run order before relying on Restart & Run All.

# `sites` may be a plain iterable of site codes or the DataFrame read from the
# `site` table. Iterating a DataFrame directly yields its column names, not its
# rows, so take the site_cd column in that case.
site_codes = sites['site_cd'].tolist() if isinstance(sites, pd.DataFrame) else sites

for site in site_codes:
    logging.info(f"Fetching data for site {site}")
    all_data = []  # transformed frames for this site, one per parameter code
    for pcode in parameter_codes:
        df_raw = fetch_nwis_data(
            site=site,
            pcode=pcode,
            start_date=start_date,
            end_date=end_date,
        )
        if df_raw is None:
            # Nothing was returned for this site/parameter pair.
            continue

        df_transformed = transform_nwis_iv_data(df_raw, site, pcode)
        if not df_transformed.empty:
            all_data.append(df_transformed)

    if all_data:
        # Write once per site, after all parameter codes have been collected.
        df_combined = pd.concat(all_data, ignore_index=True)
        write_to_datalake(df_combined, site)
    else:
        logging.info(f"No data to write for site {site}")

    


In [None]:
# Annual statistics rows whose hydro area name contains 'Colo', for 1980 only.
query = """
    SELECT *
    FROM vw_nwis_annual_stats_local
    WHERE hydro_area_nm LIKE '%Colo%' AND
         year = 1980
"""

# Echo the statement so the cell output shows what will be executed.
print(query)

In [None]:
# Annual statistics rows whose hydro area name contains 'Green' or 'Colo',
# restricted to the years 1999 and 2000.
query = """
    SELECT *
    FROM vw_nwis_annual_stats_local
    WHERE (hydro_area_nm LIKE '%Green%' OR
           hydro_area_nm LIKE '%Colo%') AND
         YEAR IN (1999, 2000)
"""

# Echo the statement so the cell output shows what will be executed.
print(query)

In [None]:
# Row counts per site and year, read from the parquet files under the data
# lake's timeseries_iv folder and joined to the site table for the site name.
# The glob is derived from datalake_path instead of a user-specific absolute
# path; as_posix() keeps forward slashes, matching the original literal.
iv_parquet_glob = (datalake_path / 'timeseries_iv' / 'site=*' / 'year=*' / '*.parquet').as_posix()

query = f"""
    SELECT s.site_nm, iv.site_cd, year, COUNT(*)
    FROM read_parquet('{iv_parquet_glob}') iv
    INNER JOIN site s ON iv.site_cd = s.site_cd
    GROUP BY s.site_nm, iv.site_cd, year
    ORDER BY s.site_nm, iv.site_cd, year
"""

# Echo the statement so the resolved parquet path is visible in the output.
print(query)

In [None]:
# Distinct parameter codes present in the vw_nwis_iv_local view.
query = "SELECT DISTINCT parameter_cd FROM vw_nwis_iv_local"

In [None]:
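# Disabled on purpose. Uncomment to run the SQL in ../db/views.sql through the
# duckdb helper (presumably this (re)creates the views queried above; confirm).
#du.run_sql_file('../db/views.sql')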

In [None]:
# Execute whatever statement `query` currently holds and print it as a DataFrame.
with du.connect_duckdb() as conn:
    executed = conn.execute(query)
    result = executed.df()
print(result)

In [None]:
# Load every column of the site table into a DataFrame named `sites`.
site_sql = "SELECT * FROM site"
with du.connect_duckdb() as conn:
    sites = conn.execute(site_sql).df()