In [14]:
import concurrent.futures
import os
import json
import shutil
from datetime import datetime
import pandas as pd
import numpy as np

In [15]:
# Project storage root on the cluster file system.
r = '/caldera/hovenweep/projects/usgs/water'
d = os.path.join(r, 'wymtwsc', 'dketchum')

# When the cluster path is not present, fall back to a layout under the
# user's home directory.
if not os.path.isdir(d):
    home = os.path.expanduser('~')
    d = os.path.join(home, 'data', 'IrrigationGIS')

# Data directories below the selected base directory.
c404, dads = (os.path.join(d, name) for name in ('conus404', 'dads'))
ghcn = os.path.join(d, 'climate', 'ghcn')

# Station table read later in the notebook.
sites = os.path.join(dads, 'met', 'stations', 'madis_29OCT2024.csv')

# 'ba' selects the '*_ba' input/output directories; any other value selects
# the plain 'station_data' / 'parquet' pair.
model_target = 'uncorrected'
csv_files, p_files = (
    os.path.join(c404, name)
    for name in (
        ('station_data_ba', 'parquet_ba') if model_target == 'ba'
        else ('station_data', 'parquet')
    )
)


In [16]:
# Inputs and outputs for this run: the station table, the directory holding
# one sub-directory per station, and the directory receiving merged files.
stations, root, outdir = sites, csv_files, p_files

# Period of record that each merged station series is checked against.
start_date, end_date = '2000-01-01', '2022-09-30'

# debug gates the single-station run further down.
debug = True

# NOTE(review): workers, missing_file and output_target are not used in the
# cells visible here -- presumably consumed later; confirm.
workers = 1
missing_file = None
output_target = 'uncorrected'

In [17]:
# Hourly timestamps spanning the configured period of record.
start, end = pd.to_datetime(start_date), pd.to_datetime(end_date)
expected_index = pd.date_range(start=start, end=end, freq='h')

# Read the station table; tables using the STAID/LAT/LON header are renamed
# to the fid/latitude/longitude names used below.
station_list = pd.read_csv(stations)
if 'LAT' in station_list.columns:
    station_list = station_list.rename(columns=dict(STAID='fid', LAT='latitude', LON='longitude'))

# Keep stations inside the bounding box (lower bounds inclusive, upper bounds exclusive).
w, s, e, n = -125.0, 25.0, -67.0, 53.0
station_list = station_list[
    (station_list['latitude'] < n) & (station_list['latitude'] >= s)
    & (station_list['longitude'] < e) & (station_list['longitude'] >= w)
]

# Shuffle the rows, then collect the station ids, which double as directory names.
station_list = station_list.sample(frac=1)
subdirs = station_list['fid'].tolist()

print(f'{len(subdirs)} directories to check')

53590 directories to check


In [22]:
def conus404_parquet(root_, subdir_, expected_index_, outdir_):
    """Merge one station's parquet extracts into a single gzip-compressed parquet file.

    Every ``*.parquet`` file found in ``root_/subdir_`` is read, the frames are
    concatenated, rows with a duplicated ``dt`` value are dropped (first
    occurrence kept), and the result is sorted by ``dt`` and written to
    ``outdir_/<subdir_>.parquet.gzip``. The written frame carries ``dt`` both
    as its index and as a regular column.

    Parameters
    ----------
    root_ : str
        Directory containing one sub-directory per station.
    subdir_ : str
        Name of the station sub-directory; also used as the output file stem.
    expected_index_ : pandas.DatetimeIndex or iterable of datetime-like
        Timestamps the merged record is expected to cover. Its distinct years
        and its length are used for the completeness checks.
    outdir_ : str
        Directory receiving the merged file; created if it does not exist.

    Returns
    -------
    None
        Progress and problems are reported with ``print``. Nothing is written
        when the station directory is absent, holds no parquet files, or
        holds fewer files than expected years with at least one year missing.
    """
    subdir_path = os.path.join(root_, subdir_)
    print(subdir_path)
    out_file = os.path.join(outdir_, f'{subdir_}.parquet.gzip')

    if not os.path.isdir(subdir_path):
        if os.path.exists(out_file):
            print(f'{os.path.basename(out_file)} exists, skipping')
        else:
            print(f'{subdir_} not found')
        return

    # Sorted so the keep='first' de-duplication below does not depend on the
    # arbitrary order returned by os.listdir.
    parquet_files_ = sorted(f for f in os.listdir(subdir_path) if f.endswith('.parquet'))
    if not parquet_files_:
        print(f'{subdir_} has no parquet files')
        return

    # Distinct years taken from the index in one vectorized step, as plain ints.
    required_years_ = sorted(int(y) for y in pd.DatetimeIndex(expected_index_).year.unique())

    if len(parquet_files_) < len(required_years_):
        # Assumes file names end in '_<year>.parquet' -- TODO confirm against
        # the extraction step. Tokens that are not integers are ignored.
        found_years_ = set()
        for name in parquet_files_:
            token = os.path.splitext(name)[0].split('_')[-1]
            try:
                found_years_.add(int(token))
            except ValueError:
                continue

        missing = [y for y in required_years_ if y not in found_years_]
        if missing:
            # Show at most the first five missing years.
            print(f'{subdir_} missing {len(missing)} years: {missing[:5]}')
            return

    dfs = []
    for i, file in enumerate(parquet_files_):
        c = pd.read_parquet(os.path.join(subdir_path, file))
        if i == 0:
            # Preview of the first file read.
            print(c.head())
        dfs.append(c)

    df = pd.concat(dfs)
    df = df.drop_duplicates(subset='dt', keep='first')
    df = df.set_index('dt').sort_index()

    # Report a shortfall of more than 15 records relative to the expected index.
    missing = len(expected_index_) - df.shape[0]
    if missing > 15:
        print(f'{subdir_} is missing {missing} records')

    # Keep 'dt' available as a column in addition to the index.
    df['dt'] = df.index
    os.makedirs(outdir_, exist_ok=True)
    df.to_parquet(out_file, compression='gzip')
    # The source directory is left in place; the cleanup call stays disabled.
    # shutil.rmtree(subdir_path)
    now = datetime.now().strftime('%Y%m%d %H:%M')
    print(f'wrote {out_file},{now}')


In [23]:
# With debug enabled, run the conversion on a single hand-picked station
# so its printed output can be inspected.
if debug:
    for subdir in ('E4939',):
        conus404_parquet(
            root,
            subdir,
            expected_index,
            outdir,
        )

/caldera/hovenweep/projects/usgs/water/wymtwsc/dketchum/conus404/station_data/E4939
    latitude  longitude   elev stype          T2         TD2    QVAPOR  \
0  42.062099 -93.698402  303.0   IEM  264.192200  259.629852  0.001376   
1  42.062099 -93.698402  303.0   IEM  260.865295  256.627594  0.001073   
2  42.062099 -93.698402  303.0   IEM  258.290771  253.972870  0.000857   
3  42.062099 -93.698402  303.0   IEM  256.608063  252.200455  0.000736   
4  42.062099 -93.698402  303.0   IEM  254.663788  251.362976  0.000696   

        U10       V10          PSFC  ACSWDNLSM        lat        lon  \
0  5.738750  2.803222  96767.179688        0.0  42.058777 -93.717865   
1  4.267025  1.518956  96892.562500        0.0  42.058777 -93.717865   
2  3.572805  1.393518  96948.750000        0.0  42.058777 -93.717865   
3  3.725136  1.291566  96964.593750        0.0  42.058777 -93.717865   
4  2.706985  1.375903  97006.320312        0.0  42.058777 -93.717865   

          x         y          dt  
0 