In [1]:
### Environment setup
from pystac_client import Client
import planetary_computer as pc
import os

# Set the environment variable PC_SDK_SUBSCRIPTION_KEY, or set it here.
# The Hub sets PC_SDK_SUBSCRIPTION_KEY automatically.
# pc.settings.set_subscription_key(<YOUR API Key>)
env_vars = !cat /content/.env

for var in env_vars:
    key, value = var.split(' = ')
    os.environ[key] = value

In [2]:
import pandas as pd
import fsspec
import numpy as np
import geopandas as gpd
import rasterio
import matplotlib.pyplot as plt

# Data sources; each one lives in a container named '<src>-data'.
data_srcs = ['itv', 'ana', 'usgs']
src = data_srcs[0]
container = f'{src}-data'

# Storage account credentials come from the environment, never from the notebook.
storage_options = {
    'account_name': os.environ['ACCOUNT_NAME'],
    'account_key': os.environ['BLOB_KEY'],
}
# The same credentials are handed to the 'az' filesystem.
fs = fsspec.filesystem('az', **storage_options)

In [21]:
# Count the .tif and .csv entries in every station folder of every data source.
total_tifs = []
total_csvs = []  # never filled in this cell; kept so the name stays defined
for src in data_srcs:
    container = f'{src}-data'
    station_url = f'az://{container}/{src}_station_metadata.csv'
    df = pd.read_csv(station_url, storage_options=storage_options)
    folder = f'{container}/stations'
    # Station folders are named with the site number zero-padded to 8 characters.
    station_names = [str(f).zfill(8) for f in df.site_no]
    # iterate on stations
    tif_count = []
    for station_folder in station_names:
        s = f'{folder}/{station_folder}'
        s_list = fs.ls(s)
        if s_list is not None:
            # Counts occurrences of the extension text in each listed path.
            total = sum(path.count('.tif') for path in s_list)
            total_csv = sum(path.count('.csv') for path in s_list)
            tif_count.append({'site_no':station_folder, 'tif_count':total, 'csv_count':total_csv})
    # Explicit columns keep the attribute access below valid even when a source
    # produced no station rows (an empty list would give a column-less frame).
    tif_count = pd.DataFrame(tif_count, columns=['site_no', 'tif_count', 'csv_count'])
    # NOTE(review): the tif total is halved here — presumably two .tif files
    # exist per image; confirm against the container layout.
    print(f'{src} data has {np.sum(tif_count.tif_count)/2} tifs and {np.sum(tif_count.csv_count)} csvs')
    total_tifs.append(tif_count)
total_tifs = pd.concat(total_tifs)

itv data has 198.0 tifs and 16 csvs
ana data has 143.0 tifs and 64 csvs
usgs data has 6315.0 tifs and 69 csvs


In [18]:
# Read the first entry of the most recent station listing (s_list) and
# record the span between its first and last 'Date-Time' values.
first_station_file = f'az://{s_list[0]}'
sdf = pd.read_csv(first_station_file, storage_options=storage_options)
timestamps = sdf['Date-Time']
date_range = f"{timestamps.iloc[0]}-{timestamps.iloc[-1]}"

In [19]:
# Display the first-to-last 'Date-Time' span string built in the previous cell.
date_range

'07/01/2015 01:00-04/26/2017 13:00'

In [8]:
# NOTE(review): img_df is not defined in any cell visible above this one;
# it presumably comes from a removed or out-of-order cell — confirm before
# relying on Restart & Run All.
img_df

Unnamed: 0,site_no,site_name,Latitude,Longitude,geometry,cloudless_img_count,date_range
0,1632900,"Smith Creek Near New Market, VA",38.6935,-78.6428,"POLYGON ((-78.63279999999999 38.7035, -78.6327...",100,2015-11-16/2019-12-30
1,1645704,"Difficult Run Above Fox Lake Near Fairfax, VA",38.8847,-77.3324,"POLYGON ((-77.3224 38.8947, -77.3224 38.8747, ...",83,2015-08-05/2019-12-30
2,1645762,S F Little Difficult Run Above Mouth NR Vienna...,38.9089,-77.3383,"POLYGON ((-77.3283 38.9189, -77.3283 38.8989, ...",86,2015-08-05/2019-12-30
3,1646000,"Difficult Run Near Great Falls, VA",38.9759,-77.2458,"POLYGON ((-77.2358 38.9859, -77.2358 38.9659, ...",156,2015-08-05/2019-12-30
4,1646305,"Dead Run at Whann Avenue Near Mclean, VA",38.9598,-77.1757,"POLYGON ((-77.1657 38.9698, -77.1657 38.9498, ...",31,2015-08-05/2016-12-30
5,1649190,"Paint Branch Near College Park, MD",39.0331,-76.9643,"POLYGON ((-76.95429999999999 39.0431, -76.9542...",72,2015-08-05/2019-12-25
6,1649500,"Northeast Branch Anacostia River at Riverdale, MD",38.9603,-76.926,"POLYGON ((-76.916 38.97029999999999, -76.916 3...",24,2015-08-05/2017-12-30
7,1654000,"Accotink Creek Near Annandale, VA",38.8129,-77.2283,"POLYGON ((-77.2183 38.8229, -77.2183 38.8029, ...",277,2016-04-01/2019-12-30
8,1656903,Flatlick Branch Above Frog Branch at Chantilly...,38.8824,-77.4319,POLYGON ((-77.42189999999999 38.89239999999999...,147,2015-08-05/2019-12-30
9,1673000,"Pamunkey River Near Hanover, VA",37.7676,-77.3322,"POLYGON ((-77.3222 37.7776, -77.3222 37.7576, ...",101,2015-07-26/2019-12-25


## Assumes 500 MB per 3-channel Sentinel-2 image, including the downsampled SCL

In [12]:
# Storage estimate: 500 per image, divided by 1,000,000 to express it in TB.
n_cloudless = np.sum(img_df.cloudless_img_count)
total_tb = n_cloudless * 500 / 1_000_000

In [17]:
# Report the image count next to the storage estimate computed earlier.
usgs_image_count = np.sum(img_df.cloudless_img_count)
print(f'USGS total memory needed for {usgs_image_count} images: {total_tb} TB')

USGS total memory needed for 3564 images: 1.782 TB


In [18]:
# NOTE: this estimate does not include prediction.

In [21]:
# 29 images at 500 each, divided by 1,000,000.
# NOTE(review): presumably the ANA image count in MB -> TB; confirm where 29 comes from.
ana = (29 * 500) / 1_000_000

In [23]:
# Combined estimate: the USGS total (total_tb) plus the `ana` figure.
total_tb + ana

1.7965

In [4]:
### Environment setup
from pystac_client import Client
import planetary_computer as pc
import os

# Set the environment variable PC_SDK_SUBSCRIPTION_KEY, or set it here.
# The Hub sets PC_SDK_SUBSCRIPTION_KEY automatically.
# pc.settings.set_subscription_key(<YOUR API Key>)
env_vars = !cat /content/.env

for var in env_vars:
    key, value = var.split(' = ')
    os.environ[key] = value

In [76]:
import pandas as pd
import fsspec
import numpy as np
import geopandas as gpd
import rasterio
from rasterio import windows
from rasterio import features
from rasterio import warp

import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

container = 'usgs-data'

# Credentials for the account the station files are read from.
storage_options = {
    'account_name': os.environ['CSP_ACCOUNT_NAME'],
    'account_key': os.environ['CSP_BLOB_KEY'],
}
fs = fsspec.filesystem('az', **storage_options)
# Everything directly under the container's stations folder.
fs_list = fs.ls(f'{container}/stations')

# Credentials for the second account, used as the write target further down.
fluvius_storage = {
    'account_name': os.environ['ACCOUNT_NAME'],
    'account_key': os.environ['BLOB_KEY'],
}

In [78]:
# Keep only the listed paths whose text contains the search term.
search_term = 'csv'
fs_query = list(filter(lambda path: search_term in path, fs_list))

In [79]:
# Copy each matched CSV from the source account to the same path in the
# account described by fluvius_storage.
for f in fs_query:
    src = f'az://{f}'
    out = src  # same container/path; only the storage account differs
    # Read every field as text with NA parsing disabled, so the copy does not
    # rewrite values through dtype inference (zero-padded ids losing their
    # padding, integer columns with blanks turning into floats).
    df = pd.read_csv(src, storage_options=storage_options, dtype=str, keep_default_na=False)
    df.to_csv(out, storage_options=fluvius_storage, index=False)