This notebook must be run from the JupyterLab installed in the cropmask environment; otherwise kernel restart errors occur.

In [None]:
from pathlib import Path
import pandas as pd 
import xml.etree.ElementTree as et 
%matplotlib inline

# Unpacked Landsat ARD surface-reflectance scenes and their XML metadata files.
root_dir_sr = Path("/mnt/cropmaskperm/unpacked_ard_landsat_downloads/ARDSR/")
root_dir_xml = Path("/mnt/cropmaskperm/unpacked_ard_landsat_downloads/ARDxml/")

# XML namespace that qualifies every tag looked up in the ARD metadata.
ARD_NAMESPACE = "{https://landsat.usgs.gov/ard/v1}"

# Metadata tags copied into the table; the ones in numeric_cols are parsed as floats.
df_cols = ["cloud_cover", "cloud_shadow", "snow_ice", "fill", "instrument", "level1_collection", "ard_version"]
numeric_cols = ("cloud_cover", "cloud_shadow", "snow_ice", "fill")


def read_tile_metadata(xml_path):
    """Parse one ARD XML file into a flat record for the metadata table.

    Parameters
    ----------
    xml_path : str or pathlib.Path
        ARD metadata XML file to read.

    Returns
    -------
    dict
        Keys ``h`` and ``v`` (tile grid attributes, kept as strings),
        ``acquisition_date`` (pandas Timestamp) and one entry per tag in
        ``df_cols``; the tags listed in ``numeric_cols`` are converted to float.

    Raises
    ------
    ValueError
        If one of the expected tags is missing from the file.
    """
    # All looked-up tags are children of the first grandchild of the document root.
    tile_meta_global = list(et.parse(xml_path).getroot())[0][0]

    def find_required(tag):
        element = tile_meta_global.find(ARD_NAMESPACE + tag)
        if element is None:
            raise ValueError("{} has no <{}> element".format(xml_path, tag))
        return element

    tile_grid = find_required("tile_grid")
    record = {
        "h": tile_grid.attrib["h"],
        "v": tile_grid.attrib["v"],
        "acquisition_date": pd.to_datetime(find_required("acquisition_date").text, format="%Y-%m-%d"),
    }
    for col in df_cols:
        text = find_required(col).text
        # Convert a local value rather than writing a float back into the XML element.
        record[col] = float(text) if col in numeric_cols else text
    return record


# Keep the full sorted listings: the table needs one scene directory per row.
# NOTE(review): rows are paired by position, which assumes both directories hold
# the same scenes in the same sorted order -- confirm against the download layout.
scene_paths = sorted(root_dir_sr.glob("*"))
xml_paths = sorted(root_dir_xml.glob("*"))
if len(scene_paths) != len(xml_paths):
    raise ValueError(
        "found {} scene directories but {} XML files; they cannot be paired row by row".format(
            len(scene_paths), len(xml_paths)
        )
    )

rows = [read_tile_metadata(xml_path) for xml_path in xml_paths]

# list.extend returns None, so the column list is built with + instead.
out_df = pd.DataFrame(rows, columns=df_cols + ["acquisition_date", "h", "v"])
out_df = out_df.set_index("acquisition_date")

out_df["xml_paths"] = xml_paths
out_df["scene_paths"] = scene_paths

out_df

In [None]:
# Keep, for every tile (h, v), its least cloudy acquisition from July-August 2005.
# The index is sorted first because label-based date slicing needs a monotonic
# DatetimeIndex. The stable ascending sort on cloud_cover then puts the clearest
# acquisition of each tile first, which is the row drop_duplicates keeps.
least_cloudy_jul_aug_df = (
    out_df.sort_index()
    .loc["2005-07-01":"2005-08-31"]
    .sort_values("cloud_cover", kind="mergesort")
    .drop_duplicates(["h", "v"])
)

In [None]:
# Scene directory stored in the last row of the cloud_cover-sorted selection.
last_scene_dir = least_cloudy_jul_aug_df.iloc[-1]['scene_paths']
scene_files = sorted(last_scene_dir.glob("*"))
# NOTE(review): presumably the last file name in sorted order is the cloud QA band -- confirm.
cloud_qa = scene_files[-1]

In [None]:
# Show which file was picked so the selection can be checked by eye.
cloud_qa

In [None]:
from __future__ import print_function
import sys; print(sys.version)
import platform; print(platform.platform())
import skimage; print("scikit-image version: {}".format(skimage.__version__))
import numpy; print("numpy version: {}".format(numpy.__version__))

In [None]:
import rasterio
import skimage.io as skio
from IPython.display import display

# Both readers are given the same POSIX path string, so build it once.
cloud_qa_file = cloud_qa.as_posix()

# Read the raster through rasterio (src.read() called without arguments).
with rasterio.open(cloud_qa_file) as src:
    arr = src.read()

# Only the last expression of a cell is displayed automatically, so the
# rasterio result is shown explicitly instead of as a bare mid-cell `arr`.
display(arr)

skio.imshow(cloud_qa_file)

# Last expression of the cell: the same file decoded by scikit-image, for comparison.
skio.imread(cloud_qa_file)

In [None]:
import skimage.io as skio



In [None]:
from cropmask.preprocess import PreprocessWorkflow, setup_dirs
import time
import dask

param_path = "/home/ryan/work/CropMask_RCNN/cropmask/test_preprocess_config.yaml"

# Scenes selected for having almost no clouds and for falling as far outside
# the frost season (which ends in February-March) as possible.
scene_list = [
    "/mnt/cropmaskperm/unpacked_landsat_downloads/LT050320312005082801T1-SC20190418222350",
    "/mnt/cropmaskperm/unpacked_landsat_downloads/LT050320312005040601T1-SC20190418222326",
    "/mnt/cropmaskperm/unpacked_landsat_downloads/LT050290312005031601T1-SC20190818204935",
    "/mnt/cropmaskperm/unpacked_landsat_downloads/LT050300312005020301T1-SC20190818205734",
    "/mnt/cropmaskperm/unpacked_landsat_downloads/LT050300322005020301T1-SC20190818205733",
    "/mnt/cropmaskperm/unpacked_landsat_downloads/LT050290322005031601T1-SC20190818205024",
]
labels_path = "/mnt/cropmaskperm/external/nebraska_pivots_projected.geojson"

setup_dirs(param_path)

# Eager workflow, kept only to read the train dir path from it. The delayed
# workflows below use a separate name so this object is not overwritten.
wflow = PreprocessWorkflow(param_path, scene_list[0], labels_path)


def build_preprocess_task(scene_path):
    """Build the lazy preprocessing graph for one scene.

    Nothing is executed here: every step is a dask ``Delayed`` method call
    chained on the result of the previous one.

    Parameters
    ----------
    scene_path : str
        Directory of the unpacked Landsat scene to preprocess.

    Returns
    -------
    dask.delayed.Delayed
        The final ``labels_to_pngs`` step; computing it runs the whole chain.
    """
    delayed_wflow = dask.delayed(PreprocessWorkflow)(param_path, scene_path, labels_path)

    band_list = delayed_wflow.yaml_to_band_index()
    product_list = delayed_wflow.get_product_paths(band_list)

    loaded = delayed_wflow.load_meta_and_bounds(product_list)
    stacked = loaded.stack_and_save_bands()
    # NOTE(review): -31 and 100 are presumably the negative buffer distance and the
    # small-object size threshold -- confirm units against cropmask.preprocess.
    filtered = stacked.negative_buffer_and_small_filter(-31, 100)
    gridded = filtered.grid_images()
    image_pngs = gridded.imgs_to_pngs()
    components = image_pngs.connected_components()
    return components.labels_to_pngs()


# One delayed graph per scene; they are computed in a later cell.
results = [build_preprocess_task(scene_path) for scene_path in scene_list]
  

In [None]:
# Execute the delayed preprocessing graphs collected in `results`.
# Reference: https://docs.dask.org/en/stable/delayed-best-practices.html
from dask.distributed import Client

# Starts a dask.distributed client with default settings.
client = Client()

# NOTE(review): the explicit scheduler="single-threaded" argument appears to bypass the
# client created above, and dask.compute blocks until every result is ready rather than
# running in the background -- confirm which scheduler is actually intended.
x = dask.compute(*results, scheduler="single-threaded")



In [None]:
# Eager (non-delayed) workflow for the first scene; only needed to reach the train dir path.
wflow = PreprocessWorkflow(param_path, scene_list[0], labels_path)

In [None]:
# Run the first preprocessing steps directly on `wflow` so the intermediate object can be inspected.
# NOTE(review): the bare `wflow` below is not the last expression of the cell, so it displays nothing.
wflow
# Band selection is read from the workflow, then resolved to product paths.
band_list = wflow.yaml_to_band_index()

product_list = wflow.get_product_paths(band_list)

# `a` is inspected in the next cell via `a.meta`.
a = wflow.load_meta_and_bounds(product_list)

In [None]:
# Inspect the `meta` attribute of the object returned by load_meta_and_bounds.
a.meta

In [None]:
    
    # Time the computation of one mean per band listed in wflow.band_list.
    # NOTE(review): get_arr_channel_mean is not imported in any visible cell -- confirm its origin.
    # NOTE(review): the whole cell is indented one level; IPython tolerates this, plain Python does not.
    start = time.time()

    means = []
    for band_id in wflow.band_list:
        # Presumably shifts a 1-based band id to a 0-based channel index -- confirm.
        band_mean = get_arr_channel_mean(wflow.TRAIN, int(band_id) - 1)
        means.append(band_mean)
        print("Band index {} mean for COCO normalization: ".format(band_id), band_mean)

    stop = time.time()
    elapsed = stop - start

    print(elapsed, " seconds for this number of scenes: " + str(len(scene_list)))

In [None]:
import skimage.io as skio

# Label mask of one chip, read from disk for the memory comparison below.
label_tif_path = "/mnt/cropmaskperm/test-landsat/chips/LT050320312005040601T1-SC20190418222326_tile_1024_3072/mask/LT050320312005040601T1-SC20190418222326_tile_1024_3072_label.tif"
arr = skio.imread(label_tif_path)

In [None]:
import warnings

import numpy as np

# astype(np.uint8) wraps silently for values outside 0..255, which would corrupt
# label ids; warn in that case so the size comparison is not mistaken for a
# lossless conversion. The cast itself is unchanged.
uint8_range = np.iinfo(np.uint8)
if arr.size and (arr.min() < uint8_range.min or arr.max() > uint8_range.max):
    warnings.warn(
        "arr holds values outside the uint8 range [0, 255]; they wrap around in smaller_arr",
        RuntimeWarning,
    )
smaller_arr = arr.astype(np.uint8)

In [None]:
# Size of the original label array in whole megabytes (1e6 bytes).
print("%d megabytes" % (arr.nbytes / 1e6))

In [None]:
# Size of the uint8 copy in whole megabytes (1e6 bytes), for comparison with `arr`.
print("%d megabytes" % (smaller_arr.nbytes / 1e6))

In [None]:
# Dimensions of the label array read above.
arr.shape