In [1]:
### Environment setup
import sys
sys.path.append('/content')
from src.fluvius import WaterData, WaterStation
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import geopandas as gpd
import fsspec
from pystac_client import Client
import planetary_computer as pc
import os

import matplotlib.pyplot as plt
# Set the environment variable PC_SDK_SUBSCRIPTION_KEY, or set it here.
# The Hub sets PC_SDK_SUBSCRIPTION_KEY automatically.
# pc.settings.set_subscription_key(<YOUR API Key>)
# Load KEY = VALUE pairs from the project's .env file into os.environ.
# The file is read with plain Python rather than shelling out to `cat`, so a
# missing file surfaces as FileNotFoundError instead of a confusing parse error.
with open('/content/.env') as env_file:
    env_vars = env_file.read().splitlines()

for line_no, var in enumerate(env_vars, start=1):
    entry = var.strip()
    # Blank lines and `#` comments carry no setting.
    if not entry or entry.startswith('#'):
        continue
    # Split on the first '=' only, so values that themselves contain '='
    # (e.g. base64-padded keys) stay intact.
    key, sep, value = entry.partition('=')
    if not sep:
        # Report the line number, not the content: the line may hold a secret.
        raise ValueError(
            f'/content/.env line {line_no}: expected "KEY = VALUE"'
        )
    os.environ[key.strip()] = value.strip()

#################  set up ####################
# Name of the data source; its blob container is '<data_source>-data'.
data_source = 'itv'
container = '{}-data'.format(data_source)

# Azure storage credentials are taken from the process environment; a missing
# ACCOUNT_NAME or BLOB_KEY raises KeyError here.
storage_options = {
    'account_name': os.environ['ACCOUNT_NAME'],
    'account_key': os.environ['BLOB_KEY'],
}

# fsspec filesystem handle for the 'az' protocol, built from the same credentials.
fs = fsspec.filesystem('az', **storage_options)

In [65]:
import datetime

def datestdtojd(stddate, fmt='%Y-%m-%d'):
    """Return the 1-based day of the year for a date string.

    Despite the "jd" in the name, the result is the ordinal day within the
    calendar year (1 for 1 January, 366 for 31 December of a leap year),
    not an astronomical Julian Date.

    Parameters
    ----------
    stddate : str
        Date text to parse.
    fmt : str, optional
        ``datetime.strptime`` format describing ``stddate``. Defaults to
        ``'%Y-%m-%d'``, which was previously the only supported layout.

    Returns
    -------
    int
        Day of the year.

    Raises
    ------
    ValueError
        If ``stddate`` does not match ``fmt``.
    """
    parsed = datetime.datetime.strptime(stddate, fmt)
    return parsed.timetuple().tm_yday

# Multiplier applied to discharge columns labelled "(cfs)" to express them in m3/s.
_CFS_TO_M3S = 0.028316847

# Per-source schema: (column renames onto the shared 'SSC (mg/L)' / 'Q (m3/s)'
# names, whether the renamed discharge column still needs the cfs -> m3/s factor).
_SOURCE_SCHEMAS = {
    'ana': (
        {'Suspended Sediment Concentration (mg/L)': 'SSC (mg/L)',
         'Discharge': 'Q (m3/s)'},
        False,
    ),
    'itv': (
        {'SSC (mg/l)': 'SSC (mg/L)',
         'Q (m³/s)': 'Q (m3/s)'},
        False,
    ),
    'usgs': (
        {'Computed instantaneous suspended sediment (mg/L)': 'SSC (mg/L)',
         'Instantaneous computed discharge (cfs)': 'Q (m3/s)'},
        True,
    ),
    'usgsi': (
        {'Instantaneous suspended sediment (mg/L)': 'SSC (mg/L)',
         'Instantaneous computed discharge (cfs)': 'Q (m3/s)'},
        True,
    ),
}


def process_df(df, datatype):
    """Normalise one source's table onto the shared modelling schema.

    Renames the source-specific sediment and discharge columns to
    'SSC (mg/L)' and 'Q (m3/s)', applies the cfs -> m3/s factor for the USGS
    style sources, adds a 'julian' day-of-year column derived from
    'Date-Time', and returns only the modelling columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. It is not modified; a new frame is returned.
    datatype : str
        One of 'ana', 'itv', 'usgs' or 'usgsi'. Any other value is handled
        with the 'usgsi' schema, as it always has been.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: region, site_no, sample_id, julian, Date-Time,
        Q (m3/s), SSC (mg/L), Chip Cloud Pct, sentinel-2-l2a_R/G/B.

    Raises
    ------
    KeyError
        If an expected column is absent after renaming.
    ValueError
        If a 'Date-Time' value does not match '%Y-%m-%d'.
    """
    renames, discharge_in_cfs = _SOURCE_SCHEMAS.get(
        datatype, _SOURCE_SCHEMAS['usgsi']
    )
    # rename() returns a new frame, so the caller's table is left untouched.
    df = df.rename(columns=renames)
    if discharge_in_cfs:
        df['Q (m3/s)'] = _CFS_TO_M3S * df['Q (m3/s)']

    # Vectorised day-of-year. Missing dates become NaN instead of raising, so
    # a later dropna() can discard those rows. Assigned as a plain array so the
    # result is positional and never re-aligned on a non-unique index.
    dates = pd.to_datetime(df['Date-Time'], format='%Y-%m-%d')
    df['julian'] = dates.dt.dayofyear.to_numpy()

    selection = ['region', 'site_no', 'sample_id', 'julian',
                 'Date-Time', 'Q (m3/s)', 'SSC (mg/L)',
                 'Chip Cloud Pct', 'sentinel-2-l2a_R', 'sentinel-2-l2a_G', 'sentinel-2-l2a_B']
    return df[selection]
    
container_list = ['ana', 'itv', 'usgsi', 'usgs']

# For every region, read each per-station CSV whose file name contains
# 'processed', tag the rows with their region and site number, normalise them
# with process_df, and stack everything into one table.
region_frames = []
for con in container_list:
    processed_list = []
    # fs.walk yields (dirpath, dirnames, filenames). Only the file names are
    # candidates; a directory whose name happens to contain 'processed' must
    # not be handed to read_csv.
    for dirpath, _dirnames, filenames in fs.walk(f'{con}-data/stations'):
        for filename in filenames:
            if 'processed' in filename:
                file = f'az://{dirpath}/{filename}'
                processed_list.append(pd.read_csv(file, storage_options=storage_options))
    if not processed_list:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No processed CSV files found under '{con}-data/stations'")
    df = pd.concat(processed_list)
    # site_no is the first 8 characters of sample_id; inserted as a plain array
    # because the concatenated index is not unique.
    df.insert(0, 'site_no', df['sample_id'].str[:8].to_numpy())
    df.insert(0, 'region', con)
    region_frames.append(process_df(df, con))

# Rows with any missing value are dropped from the combined table.
db = pd.concat(region_frames).dropna().reset_index(drop=True)

In [68]:
# Write the combined table to blob storage as CSV, without the row index.
# JSON alternative, left disabled:
#db.to_json('az://modeling-data/fluvius_data.json',storage_options=storage_options)
output_url = 'az://modeling-data/fluvius_data.csv'
db.to_csv(output_url, index=False, storage_options=storage_options)