In [13]:
import os
import datetime as dt
import logging
import waterdata_utils as wdu
import duckdb
import pyarrow.parquet as pq


In [3]:
usgs_sites = [
    "09152500",   # Gunnison River Near Grand Junction, CO
]

In [4]:
# === CONFIGURATION ===
sites = usgs_sites
parameter_codes = ['00060', '00010']
start_date = '2022-01-01'
end_date = '2022-03-01'
service_code = 'iv'
output_root = 'data/data_lake'
# ======================


# Configure logging ------------------------------------------------
os.makedirs('logs', exist_ok=True)
log_name = 'logs/' + dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'
logging.basicConfig(filename=log_name,
                    level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
# -------------------------------------------------------------------

In [6]:
# Itterate through each site main function
for site in sites:
    print(f"Processing site: {site}")
    all_data = []

    # Itterate through each parameter code
    for pcode in parameter_codes:
        
        # Fetch data for the current site and parameter code from NWIS
        df_raw = wdu.fetch_data(site, pcode, start_date, end_date, service_code)
        if df_raw is None:
            continue

        # Clean and transform the data, standardizing column names and types
        df_transformed = wdu.transform_data(df_raw, site, pcode)
        if df_transformed.empty:
            continue

        # Write the data to a parquet file
        wdu.write_to_datalake(df_transformed, site, output_root)

Processing site: 09152500


In [16]:
query = (
    "SELECT * "
    "FROM 'data/data_lake/timeseries_iv/site=09152500/year=2022/data.parquet'"
    "LIMIT 10"
)
# Connect to DuckDB and execute the query
with duckdb.connect() as con:
    result = con.execute(query).fetchdf()
# Print the result
print(result)

       site            datetime parameter  value approval_status  year
0  09152500 2022-01-01 07:00:00     00010    2.4               A  2022
1  09152500 2022-01-01 07:15:00     00010    2.4               A  2022
2  09152500 2022-01-01 07:30:00     00010    2.4               A  2022
3  09152500 2022-01-01 07:45:00     00010    2.4               A  2022
4  09152500 2022-01-01 08:00:00     00010    2.4               A  2022
5  09152500 2022-01-01 08:15:00     00010    2.4               A  2022
6  09152500 2022-01-01 08:30:00     00010    2.4               A  2022
7  09152500 2022-01-01 08:45:00     00010    2.4               A  2022
8  09152500 2022-01-01 09:00:00     00010    2.4               A  2022
9  09152500 2022-01-01 09:15:00     00010    2.3               A  2022


In [10]:
df = duckdb.read_parquet('data/data_lake/timeseries_iv/site=09152500/year=2022/data.parquet')

In [12]:
print(df)

┌──────────┬─────────────────────┬───────────┬────────┬─────────────────┬───────┐
│   site   │      datetime       │ parameter │ value  │ approval_status │ year  │
│ varchar  │    timestamp_ns     │  varchar  │ double │     varchar     │ int64 │
├──────────┼─────────────────────┼───────────┼────────┼─────────────────┼───────┤
│ 09152500 │ 2022-01-01 07:00:00 │ 00010     │    2.4 │ A               │  2022 │
│ 09152500 │ 2022-01-01 07:15:00 │ 00010     │    2.4 │ A               │  2022 │
│ 09152500 │ 2022-01-01 07:30:00 │ 00010     │    2.4 │ A               │  2022 │
│ 09152500 │ 2022-01-01 07:45:00 │ 00010     │    2.4 │ A               │  2022 │
│ 09152500 │ 2022-01-01 08:00:00 │ 00010     │    2.4 │ A               │  2022 │
│ 09152500 │ 2022-01-01 08:15:00 │ 00010     │    2.4 │ A               │  2022 │
│ 09152500 │ 2022-01-01 08:30:00 │ 00010     │    2.4 │ A               │  2022 │
│ 09152500 │ 2022-01-01 08:45:00 │ 00010     │    2.4 │ A               │  2022 │
│ 09152500 │ 202

In [14]:
table = pq.read_table('data/data_lake/timeseries_iv/site=09152500/year=2022/data.parquet')
df = table.to_pandas()

ArrowTypeError: Unable to merge: Field site has incompatible types: string vs dictionary<values=int32, indices=int32, ordered=0>