# CAMELS Digitization

Read the USGS streamflow station records from the [CAMELS dataset](https://ral.ucar.edu/solutions/products/camels), discretize each station's series using a 3-bit (8-bin) equiprobable binning, and write each digitized streamflow series to its own file.

In [1]:
import os
import time
import pandas as pd
import numpy as np
import scipy.stats as st
import collections
from multiprocessing import Pool

# Directory that holds the USGS streamflow folders (local external-drive path).
db_path = '/media/danbot/T7 Touch/camels_db/usgs_streamflow/'

# One path, with a trailing slash, for every entry found directly in db_path.
folders = []
for entry in os.listdir(db_path):
    folders.append(f'{db_path}{entry}/')

In [2]:
# Collect the full path of every file inside every streamflow folder.
all_filepaths = []
for folder in folders:
    all_filepaths.extend(f'{folder}{name}' for name in os.listdir(folder))


In [3]:
def generate_uniform_noise(precision, n):
    """Draw ``n`` non-negative noise samples used to break ties in a series.

    Samples are drawn uniformly from ``[-w, w)`` and every negative draw is
    then raised to zero, so about half of the returned values are exactly 0.

    The half-width is ``w = (10**(precision + 6) / 2 - 1) / 10**(precision + 7)``,
    i.e. ``0.05 - 10**-(precision + 7)``.
    NOTE(review): ``w`` is therefore ~0.05 for every ``precision``; confirm
    that this is the intended scaling.

    Args:
        precision: Number of decimal places of the series being perturbed.
        n: Number of samples to draw.

    Returns:
        Array of ``n`` floats in ``[0, w)``.
    """
    numerator = 10**(precision + 6) / 2 - 1
    half_width = numerator / 10**(precision + 7)
    draws = np.random.uniform(low=-half_width, high=half_width, size=n)
    # Only a lower bound is applied: negative draws become 0.
    return np.clip(draws, 0., None)


def determine_series_precision(df, label='flow'):
    """Return the largest number of decimal places found in a column.

    Each value is converted with ``str()`` and parsed as a ``Decimal`` so that
    the count reflects digits after the decimal point only. Values with no
    fractional part (integers), non-finite values (NaN, inf) and values that
    cannot be parsed count as 0 decimals; scientific notation such as
    ``1e-05`` is counted by its exponent (5 decimals).

    Args:
        df: DataFrame containing the column to inspect.
        label: Name of the column to inspect. Defaults to ``'flow'``.

    Returns:
        The maximum number of decimal places as an int, or 0 when the column
        is empty.
    """
    from decimal import Decimal, InvalidOperation

    def _n_decimals(value):
        # Count the digits to the right of the decimal point of one value.
        try:
            exponent = Decimal(str(value)).as_tuple().exponent
        except InvalidOperation:
            return 0
        # NaN and infinities report a string exponent ('n', 'N', 'F').
        if not isinstance(exponent, int):
            return 0
        return max(-exponent, 0)

    return max((_n_decimals(v) for v in df[label].to_numpy()), default=0)


def add_dummy_precision(df, label, i=0, series_precision=2):
    """Add small noise to duplicated values so equiprobable binning can work.

    Up to ``10 - i`` passes are made. On each pass every value flagged by
    ``Series.duplicated()`` (all repeats after the first occurrence) has a
    sample from ``generate_uniform_noise`` added to it. The loop stops as soon
    as no duplicates remain. This is best effort: duplicates may still be
    present when the pass limit is reached.

    ``df`` is modified in place and is also returned, so the result can be
    used either way.

    Args:
        df: DataFrame holding the series; mutated in place.
        label: Name of the column to perturb.
        i: Starting value of the pass counter (passes stop at 10).
        series_precision: Kept for backward compatibility; the precision is
            always re-derived from the data via
            ``determine_series_precision(df)``.

    Returns:
        The same DataFrame object, always (never ``None``).
    """
    series_precision = determine_series_precision(df)

    while i < 10:
        dupe_mask = df[label].duplicated()
        n_dupes = int(dupe_mask.sum())
        if n_dupes == 0:
            break

        df.loc[dupe_mask, label] += generate_uniform_noise(series_precision, n_dupes)

        i += 1
        # Raise the precision on each pass, but never use more than 3.
        series_precision = min(series_precision + 1, 3)

    return df
        

In [4]:
def derive_equiprobable_bin_edges(df, n_bins):
    """Compute the edges of ``n_bins`` equiprobable bins for the flow series.

    The edges are the empirical quantiles of ``df['flow']`` at ``n_bins + 1``
    evenly spaced probabilities from 0 to 1, as computed by
    ``scipy.stats.mstats.mquantiles`` with its default settings.

    Args:
        df: DataFrame with a ``'flow'`` column.
        n_bins: Number of bins.

    Returns:
        Array of ``n_bins + 1`` bin edges.
    """
    probs = np.linspace(0.0, 1.0, n_bins + 1)
    return st.mstats.mquantiles(df['flow'], prob=probs)


def convert_to_datetime_str(row):
    """Build a timestamp from a row's ``'year'``, ``'month'`` and ``'day'``.

    The three fields are joined as ``year-month-day`` and the resulting string
    is parsed with ``pd.to_datetime``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with ``'year'``,
            ``'month'`` and ``'day'`` entries.

    Returns:
        The parsed value returned by ``pd.to_datetime``.
    """
    date_text = '{}-{}-{}'.format(row['year'], row['month'], row['day'])
    return pd.to_datetime(date_text)


In [None]:
def process_files(f, n_bins=128):
    """Digitize one streamflow file into equiprobable bins and save it as CSV.

    The input is read as whitespace-separated columns
    ``id, year, month, day, flow, flag``. Negative flows are clipped to 0,
    duplicated flow values are perturbed with ``add_dummy_precision``, bin
    edges are taken from ``derive_equiprobable_bin_edges`` and each flow is
    assigned a bin number with ``np.digitize`` (right-inclusive, against the
    upper edge of every bin). The output holds the columns ``datetime`` and
    ``bin_no`` and is written to ``<output folder>/<station>.csv``.

    Stations whose output file already exists are skipped.

    Args:
        f: Path of the input file; the station id is the part of the file
            name before the first underscore.
        n_bins: Number of equiprobable bins. Defaults to 128.

    Returns:
        None.

    Raises:
        FileNotFoundError: If the output folder does not exist.
    """
    stn = f.split('/')[-1].split('_')[0]
    # NOTE(review): the folder is named '<n_bins>bit' although n_bins is a
    # bin count, not a bit count -- confirm the intended naming.
    folder = f'/media/danbot/T7 Touch/camels_db/digitized_series/{n_bins}bit/'
    output_filepath = f'{stn}.csv'

    if not os.path.isdir(folder):
        raise FileNotFoundError(f'Output folder does not exist: {folder}')

    # Skip already-processed stations before paying for the CSV read.
    if os.path.exists(folder + output_filepath):
        return None

    df = pd.read_csv(
        f,
        sep=r'\s+',
        header=None,
        names=['id', 'year', 'month', 'day', 'flow', 'flag'],
    )
    df['flow'] = df['flow'].clip(lower=0, upper=None)

    # add_dummy_precision perturbs df in place; only rebind when it actually
    # hands a frame back, so a None return cannot clobber the data.
    deduplicated = add_dummy_precision(df, 'flow')
    if deduplicated is not None:
        df = deduplicated

    if df['flow'].isna().any():
        for _ in range(5):
            print("NULL VALUES FOUND")
        print('')

    equiprobable_bin_edges = derive_equiprobable_bin_edges(df, n_bins)

    df['datetime'] = pd.to_datetime(df[['year', 'month', 'day']])
    try:
        # The lowest edge is dropped so the remaining edges are bin upper bounds.
        df['bin_no'] = np.digitize(df['flow'], equiprobable_bin_edges[1:], right=True)
    except ValueError as err:
        print(f'Error at {stn}: ')
        print(f'         {err}')
        print(f'         {equiprobable_bin_edges}')
        return None

    df[['datetime', 'bin_no']].to_csv(folder + output_filepath, index=False)
    return None
    


In [None]:

# Process every streamflow file serially and report the total wall-clock time.
# (A multiprocessing Pool().map(process_files, ...) run is the parallel
# alternative; it is not used here.)
t_start = time.time()

for filepath in all_filepaths:
    process_files(filepath)

t_tot = time.time() - t_start
print(f'Time to generate {len(all_filepaths)} files: {t_tot:.1f}s')