In [1]:
from pathlib import Path
import pandas as pd 
import xml.etree.ElementTree as et 
%matplotlib inline

# Build one metadata row per ARD tile XML file and collect them in `out_df`,
# indexed by acquisition date, with the XML path and scene path attached.
root_dir_sr = Path("/mnt/cropmaskperm/unpacked_ard_landsat_downloads/ARDSR/")
root_dir_xml = Path("/mnt/cropmaskperm/unpacked_ard_landsat_downloads/ARDxml/")

# Both listings are sorted because scene i is paired with XML file i purely by
# position when the path columns are attached below.
scene_paths = sorted(root_dir_sr.glob("*"))
xml_paths = sorted(root_dir_xml.glob("*"))
if len(scene_paths) != len(xml_paths):
    # Fail before parsing anything instead of at the column assignment.
    raise ValueError(
        "Found %d scene paths but %d XML files; they are paired by position and must match."
        % (len(scene_paths), len(xml_paths))
    )

# XML namespace prefix used by every tag looked up in the tile metadata.
ard_ns = "{https://landsat.usgs.gov/ard/v1}"

# Tags copied from the metadata; the ones in `numeric_cols` are parsed as floats,
# the rest are kept as the raw text.
df_cols = ["cloud_cover", "cloud_shadow", "snow_ice", "fill", "instrument", "level1_collection", "ard_version"]
numeric_cols = ["cloud_cover", "cloud_shadow", "snow_ice", "fill"]

rows = []
for xml_path in xml_paths:
    # First grandchild of the document root holds the per-tile metadata tags read below.
    tile_meta_global = list(et.parse(xml_path).getroot())[0][0]

    tile_grid = tile_meta_global.find(ard_ns + "tile_grid")
    acquired = tile_meta_global.find(ard_ns + "acquisition_date")
    row = {
        "h": tile_grid.attrib["h"],
        "v": tile_grid.attrib["v"],
        "acquisition_date": pd.to_datetime(acquired.text, format="%Y-%m-%d"),
    }

    for col in df_cols:
        text = tile_meta_global.find(ard_ns + col).text
        # Convert locally rather than writing the float back into the XML tree.
        row[col] = float(text) if col in numeric_cols else text
    rows.append(row)

# Explicit column list: `list.extend` returns None, so the previous
# `columns=df_cols.extend(...)` passed None and mutated `df_cols`. Spelling the
# columns out also keeps `set_index` working when no XML files were found.
out_df = pd.DataFrame(rows, columns=["h", "v", "acquisition_date"] + df_cols)
out_df = out_df.set_index("acquisition_date")

out_df["xml_paths"] = xml_paths
out_df["scene_paths"] = scene_paths

out_df

Unnamed: 0_level_0,h,v,cloud_cover,cloud_shadow,snow_ice,fill,instrument,level1_collection,ard_version,xml_paths,scene_paths
acquisition_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2005-05-15,012,007,0.4535,0.0105,0.0020,35.2225,TM,01,01,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-06-16,012,007,0.6347,0.2153,0.0021,33.8039,TM,01,01,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-07-02,012,007,0.5160,0.0849,0.0014,34.1859,TM,01,01,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-09-11,012,007,0.0599,0.0242,0.0000,22.6643,TM,01,01,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-09-20,012,007,0.4365,0.0186,0.0017,30.7791,TM,01,01,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
...,...,...,...,...,...,...,...,...,...,...,...
2005-10-28,017,008,0.4467,0.0387,0.0021,22.1750,TM,01,01,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-05-21,017,009,7.7989,0.2351,0.0005,2.5132,TM,01,01,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-06-22,017,009,0.7452,0.4263,0.0005,1.9832,TM,01,01,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-09-10,017,009,0.1065,0.0120,0.0005,0.8915,TM,01,01,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...


In [2]:
# Keep one scene per tile (h, v) from July-August 2005. Rows are ordered by
# `fill` first and `cloud_cover` second (both ascending), and drop_duplicates
# keeps the first row of each (h, v) pair, i.e. the lowest-fill scene, with
# cloud cover as the tie-breaker.
#
# The date window is applied with a boolean mask instead of
# out_df['2005-07-01':'2005-08-31']: out_df's index is not in date order, and
# label-slicing a non-monotonic DatetimeIndex with endpoints that are not
# present in the index raises KeyError on pandas >= 2.0.
window_start = pd.Timestamp("2005-07-01")
window_end = pd.Timestamp("2005-09-01")  # exclusive bound, so 2005-08-31 is included
acquisition_dates = pd.DatetimeIndex(out_df.index)
in_jul_aug = (acquisition_dates >= window_start) & (acquisition_dates < window_end)

least_cloudy_jul_aug_df = (
    out_df[in_jul_aug]
    .sort_values(["fill", "cloud_cover"])
    .drop_duplicates(["h", "v"])
)

In [3]:
# Display the per-tile selection (one row per (h, v) pair after drop_duplicates).
least_cloudy_jul_aug_df

Unnamed: 0_level_0,h,v,cloud_cover,cloud_shadow,snow_ice,fill,instrument,level1_collection,ard_version,xml_paths,scene_paths
acquisition_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2005-08-28,13,8,0.0847,0.0445,0.0,0.0389,TM,1,1,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-08-30,15,7,0.0786,0.0331,0.0001,0.0515,TM,1,1,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-08-05,14,8,0.127,0.0283,0.0,1.8143,TM,1,1,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-08-07,16,7,0.1174,0.0066,0.0,2.021,TM,1,1,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-08-19,12,8,2.2159,1.8556,0.0006,3.2831,TM,1,1,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-08-05,14,7,0.1623,0.0,0.0007,5.6604,TM,1,1,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-08-30,15,8,0.4191,0.2053,0.0,10.1315,TM,1,1,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-07-11,13,9,1.1113,0.5699,0.0,11.6855,TM,1,1,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-08-28,13,7,0.3151,0.0033,0.0016,17.0922,TM,1,1,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...
2005-07-22,15,9,1.2552,0.7,0.0007,21.9231,TM,1,1,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...,/mnt/cropmaskperm/unpacked_ard_landsat_downloa...


In [4]:
# cloud_qa = sorted(least_cloudy_jul_aug_df.iloc[-1]['scene_paths'].glob("*"))[-1]

# least_cloudy_jul_aug_df.iloc[-1]['scene_paths']

# import rasterio
# import skimage.io as skio
# from rasterio.plot import reshape_as_image
# import numpy as np
# with rasterio.open(cloud_qa.as_posix()) as src:
#     arr = src.read()


# img = reshape_as_image(arr)[:,:,0]
# skio.imshow(np.where(img==2,1,0))

In [5]:
# Plain-string versions of the selected scene paths, in the table's row order.
least_cloudy_jul_aug_lst = [
    str(scene_path) for scene_path in least_cloudy_jul_aug_df['scene_paths']
]

In [6]:
from cropmask.preprocess import PreprocessWorkflow, setup_dirs
import time
import dask

# Build one lazy (dask.delayed) preprocessing chain per selected scene and submit
# all of them to a dask.distributed client. Nothing in the loop executes eagerly;
# work only starts at client.compute(...) at the bottom of the cell.
param_path = "/home/ryan/work/CropMask_RCNN/cropmask/test_preprocess_config.yaml"

# selected scenes with almost no clouds that occurred as well outside of the frost season as possible (ends in February-March)
scene_list = least_cloudy_jul_aug_lst
labels_path = "/mnt/cropmaskperm/external/nebraska_pivots_projected.geojson"

# Project helper; presumably prepares the directories named in the config — TODO confirm.
setup_dirs(param_path)


# this is just to get the train dir path
# NOTE(review): this eager instance is never read in this cell; `wflow` is rebound
# to a Delayed object on the first loop iteration below.
wflow = PreprocessWorkflow(param_path, 
                             scene_list[0],
                             labels_path)

results = []
for scene_path in scene_list:

    # Delayed constructor call: every method call on `wflow` below adds a task to
    # the graph instead of running immediately.
    wflow = dask.delayed(PreprocessWorkflow)(param_path, scene_path, labels_path)

#     wflow = PreprocessWorkflow(param_path, scene_path, labels_path)
    
    band_list = wflow.yaml_to_band_index()
        
    product_list = wflow.get_product_paths(band_list)
        
    # NOTE(review): the next two steps call workflow methods on the *return value*
    # of the previous step (`a`, `b`), not on `wflow`. That only works if each method
    # returns the workflow object — confirm against cropmask.preprocess.
    a = wflow.load_meta_and_bounds(product_list)
        
    b = a.stack_and_save_bands()
        
    # NOTE(review): the recorded output of the following cell is
    # "TypeError: tile_scene_and_vector() missing 2 required positional arguments:
    # 'neg_buffer' and 'small_area_filter'", and this call passes neither.
    # Appropriate values are not visible in this notebook.
    result = b.tile_scene_and_vector()

    results.append(result)
  

# https://docs.dask.org/en/stable/delayed-best-practices.html
from dask.distributed import Client

client = Client()  # use dask.distributed by default

# NOTE(review): `scheduler=` is an option of dask.compute(); confirm whether
# Client.compute() gives it any effect here.
x = client.compute(results, scheduler="processes")  # start computation in the background



In [7]:
# Wait for the first submitted scene and return its result; if the task failed,
# its exception is re-raised here (see the TypeError recorded below).
x[0].result()

TypeError: tile_scene_and_vector() missing 2 required positional arguments: 'neg_buffer' and 'small_area_filter'

In [None]:
import skimage.io as skio

# Read a single chip's label TIFF (the *_label.tif under the chip's mask/ folder) into `arr`.
arr = skio.imread("/mnt/cropmaskperm/test-landsat/chips/LT050320312005040601T1-SC20190418222326_tile_1024_3072/mask/LT050320312005040601T1-SC20190418222326_tile_1024_3072_label.tif")

In [None]:
import numpy as np
# Copy of `arr` cast to unsigned 8-bit, used below to compare memory footprints.
# NOTE(review): values outside 0-255 are not preserved by this cast — confirm the
# label values fit before using `smaller_arr` for anything beyond the size check.
smaller_arr = arr.astype(np.uint8)

In [None]:
# In-memory size of the original array; %d truncates to whole megabytes.
arr_megabytes = arr.nbytes / (1e6)
print("%d megabytes" % arr_megabytes)

In [None]:
# In-memory size of the uint8 copy; %d truncates to whole megabytes.
smaller_arr_megabytes = smaller_arr.nbytes / (1e6)
print("%d megabytes" % smaller_arr_megabytes)

In [None]:
# Show the dimensions of the loaded label array.
arr.shape