In [None]:
from pathlib import Path
import pandas as pd 
import xml.etree.ElementTree as et 
%matplotlib inline

# Build one metadata row per Landsat ARD tile XML file, pair each row with its
# scene directory, and keep only the acquisition dates / WRS path-rows that
# were used for the original labels.
root_dir_sr = Path("/mnt/cropmaskperm/unpacked_ard_landsat_downloads/ARDSR/")
root_dir_xml = Path("/mnt/cropmaskperm/unpacked_ard_landsat_downloads/ARDxml/")

# Both listings are sorted because the i-th XML file is paired with the i-th
# scene directory further down. NOTE(review): this assumes the two folders hold
# matching entries whose names sort identically - confirm.
scene_paths = sorted(root_dir_sr.glob("*"))
xml_paths = sorted(root_dir_xml.glob("*"))
if len(scene_paths) != len(xml_paths):
    raise ValueError(
        "Found {} scene folders but {} XML files; they must pair up one-to-one.".format(
            len(scene_paths), len(xml_paths)
        )
    )

ard_ns = "{https://landsat.usgs.gov/ard/v1}"
df_cols = ["cloud_cover", "cloud_shadow", "snow_ice", "fill", "instrument", "level1_collection", "ard_version"]
# Subset of df_cols whose text is converted to float; the rest stay as strings.
float_cols = ["cloud_cover", "cloud_shadow", "snow_ice", "fill"]
key_cols = ['h', 'v', 'path', 'row', 'acquisition_date']
rows = []

for xml_path in xml_paths:
    xml_root = et.parse(xml_path).getroot()
    # Positional lookups: first grandchild of the first child, and second
    # grandchild of the second child, of the document root.
    tile_meta_ard = list(xml_root)[0][0]
    tile_meta_global = list(xml_root)[1][1]

    tile_grid = tile_meta_ard.find(ard_ns + "tile_grid")
    wrs = tile_meta_global.find(ard_ns + "wrs")
    acquisition_date = pd.to_datetime(
        tile_meta_ard.find(ard_ns + "acquisition_date").text, format="%Y-%m-%d"
    )

    dataframe_dict = {
        'h': tile_grid.attrib['h'],
        'v': tile_grid.attrib['v'],
        'path': int(wrs.attrib['path']),
        'row': int(wrs.attrib['row']),
        'acquisition_date': acquisition_date,
    }
    for col in df_cols:
        # Read the value without writing the parsed float back into the XML tree.
        text = tile_meta_ard.find(ard_ns + col).text
        dataframe_dict[col] = float(text) if col in float_cols else text
    rows.append(dataframe_dict)

# list.extend() returns None, so build the full column list with `+` instead;
# otherwise `columns=None` is passed and the requested order is ignored.
out_df = pd.DataFrame(rows, columns=df_cols + key_cols)

out_df = out_df.set_index("acquisition_date")

# Positional pairing of rows with their source files (see the note on sorting above).
out_df['xml_paths'] = xml_paths
out_df['scene_paths'] = scene_paths

# original paths and rows used, in (path, row) form
og_path_rows = [(29, 31), (29, 32), (30, 31), (30, 32), (31, 31), (31, 32), (32, 31), (32, 32), (33, 31), (33, 32)]
# Acquisition dates of the original scenes. Which date belongs to which
# path/row is not documented in the label metadata, so dates and path/rows are
# filtered independently below.
og_dates = pd.to_datetime(["2005/06/20", '2005/07/22', '2005/09/08', '2005/06/27', '2005/08/30', '2005/09/15', '2005/08/05', '2005/09/06', '2005/07/11', '2005/08/28', '2005/07/02', '2005/08/19', '2005/09/20'])
# 2005-07-20 was missing, could be a mistake in the metadata, maybe they meant 2005-07-22

# Rows come back in og_dates order, so the index is no longer sorted after this.
out_df = out_df.loc[og_dates]

# Keep only rows whose (path, row) pair is one of the original path/rows.
out_df = out_df[out_df[['path', 'row']].apply(tuple, axis=1).isin(og_path_rows)]
out_df

In [None]:
# Pick one scene per ARD tile (h, v) inside the 2005-06-20 .. 2005-09-20 window.
# Rows are sorted by fill first and cloud cover second, so drop_duplicates keeps
# the row with the lowest fill / cloud cover for each tile and drops the rest.
#
# The window is selected with a boolean mask rather than a string label slice:
# out_df's DatetimeIndex is not sorted, and label slicing on an unsorted index
# fails when either bound date is absent from the index.
window_start = pd.Timestamp("2005-06-20")
window_end = pd.Timestamp("2005-09-20")
in_window = (out_df.index >= window_start) & (out_df.index <= window_end)

least_cloudy_june_sept_df = (
    out_df[in_window]
    .sort_values(["fill", "cloud_cover"])
    .drop_duplicates(['h', 'v'])
)

# Scene directories as plain strings, in the sorted order above.
least_cloudy_june_sept_lst = least_cloudy_june_sept_df['scene_paths'].apply(str).to_list()

In [None]:
from cropmask.preprocess import PreprocessWorkflow, setup_dirs
import time
import dask

# Run the full preprocessing chain once per selected scene and collect what
# the final step returns for each of them.
param_path = "/home/ryan/work/CropMask_RCNN/cropmask/test_preprocess_config.yaml"
labels_path = "/mnt/cropmaskperm/external/nebraska_pivots_projected.geojson"

# Scenes chosen in the cell above: almost cloud free and as far outside the
# frost season (ends in February-March) as possible.
scene_list = least_cloudy_june_sept_lst

setup_dirs(param_path)

results = []
for scene_path in scene_list:
    # Lazy alternative, currently disabled:
    #     wflow = dask.delayed(PreprocessWorkflow)(param_path, scene_path, labels_path)
    wflow = PreprocessWorkflow(param_path, scene_path, labels_path)

    band_list = wflow.yaml_to_band_index()
    product_list = wflow.get_product_paths(band_list)

    # Each step is called on the return value of the previous one.
    result = (
        wflow.load_meta_and_bounds(product_list)
        .stack_and_save_bands()
        .tile_scene_and_vector()
        .geojsons_to_masks()  # not needed for the coco conversion step if using solaris
    )

    results.append(result)
  

# # # https://docs.dask.org/en/stable/delayed-best-practices.html
# from dask.distributed import Client

# client = Client()  # use dask.distributed by default

# x = client.compute(results, scheduler="processes", num_workers=3)  # start computation in the background

In [None]:
# `x` was only ever assigned by the dask `client.compute(...)` call, which is
# commented out in the previous cell, so referencing it raises NameError on a
# fresh kernel. Show the outputs collected by the sequential loop instead.
results

# Single Case

In [None]:
%load_ext autoreload
%autoreload 2
from cropmask.preprocess import PreprocessWorkflow, setup_dirs
import time
import dask

# Re-run the first preprocessing steps for one scene in isolation.
param_path = "/home/ryan/work/CropMask_RCNN/cropmask/test_preprocess_config.yaml"
labels_path = "/mnt/cropmaskperm/external/nebraska_pivots_projected.geojson"

# The batch scene list is intentionally not used here:
# scene_list = least_cloudy_june_sept_lst

setup_dirs(param_path)

# problem path
problem_scene_path = "/mnt/cropmaskperm/unpacked_ard_landsat_downloads/ARDSR/LT05_CU_016008_20050620_20190102_C01_V01_SR"
wflow = PreprocessWorkflow(param_path, problem_scene_path, labels_path)

band_list = wflow.yaml_to_band_index()
product_list = wflow.get_product_paths(band_list)

# `a` and `b` are kept as separate names; `b` is used again in a later cell.
a = wflow.load_meta_and_bounds(product_list)
b = a.stack_and_save_bands()

In [None]:
import geopandas as gpd

# Load the label polygons into `_`, which the following cells inspect.
# NOTE(review): `_` is also IPython's "last output" variable; a descriptive
# name would be safer, but the later cells reference `_` directly.
_ = gpd.read_file("/mnt/cropmaskperm/external/nebraska_pivots_projected.geojson")

In [None]:
# Take the first invalid label geometry and check whether intersecting it with
# itself yields a valid geometry.
first_invalid_geom = _.loc[~_.is_valid, 'geometry'].iloc[0]
first_invalid_geom.intersection(first_invalid_geom).is_valid

In [None]:
# Replace every invalid label geometry with its buffer(0) result.
# The assignment goes through a single .loc call: the earlier
# `_[mask]['geometry'] = ...` form assigned into a temporary copy produced by
# the first indexing step, so `_` itself was never modified.
invalid_mask = ~_.is_valid
_.loc[invalid_mask, 'geometry'] = _.loc[invalid_mask, 'geometry'].buffer(0)

In [None]:
# Display the label GeoDataFrame that was read into `_` earlier in the notebook.
_

In [None]:
# Show the buffer(0) result for the first entry of `all_not_valid`.
# NOTE(review): `all_not_valid` is never assigned in the visible notebook, so
# this fails on a fresh kernel; presumably it is the subset of `_` with invalid
# geometries - confirm and define it explicitly.
all_not_valid.buffer(0).iloc[0]

In [None]:
# Show the untouched geometry of the first entry of `all_not_valid`.
# NOTE(review): `all_not_valid` is not defined anywhere in the visible notebook;
# presumably the invalid-geometry subset of the labels - confirm.
all_not_valid.iloc[0]['geometry']

In [None]:
# buffer(0) applied to the whole geometry column of `all_not_valid`.
# NOTE(review): `all_not_valid` has no visible definition in this notebook;
# presumably the invalid-geometry subset of the labels - confirm.
all_not_valid['geometry'].buffer(0)

In [None]:
# Intersect the convex hull of entry 4 with the buffer(0) version of that same
# entry's geometry.
# NOTE(review): `all_not_valid` has no visible definition in this notebook;
# presumably the invalid-geometry subset of the labels - confirm.
all_not_valid['geometry'].convex_hull.iloc[4].intersection(all_not_valid.iloc[4]['geometry'].buffer(0))

In [None]:
# Continue the single-case workflow: `b` is the value returned by
# stack_and_save_bands() in the "Single Case" cell. Presumably this tiles the
# stacked scene and the label vectors (going by the method name) - confirm.
b.tile_scene_and_vector()

In [None]:
# import types
# wflow.geojsons_to_masks = types.MethodType(geojsons_to_masks, wflow )

Rasterizing 915 512x512 tiles that have anywhere from 0 to 100 instances took 18 minutes and 32 seconds.

In [None]:
# import geopandas as gpd
# import solaris as sol
# import os
# from tqdm import tqdm
# Run the mask-generation step on the single-case workflow object. Presumably
# this is the slow rasterization step timed in the note above - confirm.
wflow.geojsons_to_masks()

In [None]:
import xarray as xa
import rioxarray
# Open the first rasterized label tile (paths sorted for a deterministic pick).
# xarray.open_rasterio is deprecated in favour of rioxarray and has been removed
# from newer xarray releases, so the rioxarray reader imported above is used.
label = rioxarray.open_rasterio(sorted(wflow.rasterized_label_paths)[0])

In [None]:
# Open the first image tile (paths sorted for a deterministic pick).
# xarray.open_rasterio is deprecated in favour of rioxarray and has been removed
# from newer xarray releases; rioxarray is imported in the previous cell.
rast = rioxarray.open_rasterio(sorted(wflow.raster_tile_paths)[0])

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Draw the image tile, then overlay the footprint of all label instances.
fig, ax = plt.subplots()

# Values <= 0 are masked out before plotting the raster.
rast.where(rast > 0).plot.imshow(ax=ax, robust=True)

# Collapse the first axis of the label array so any labelled pixel is shown once.
label_footprint = label.any(axis=0)
label_footprint.where(label_footprint > 0).plot.imshow(ax=ax, alpha=.5, add_colorbar=False)
# for i in np.arange(label.shape[0]):
#     label[i].plot.imshow(ax=ax, alpha=.5, add_colorbar=False)

Code graveyard: shape-preserving tiling, but with non-uniform tile shapes.

In [None]:
from shapely.geometry import box, Polygon, MultiPolygon, GeometryCollection

def katana(geometry, approx_tile_size, use_projection_size=False, transform=None, recursion_max = 5000, count = 0):
    """Recursively halve a geometry until every piece fits the tile size.

    At each level the bounding box of the geometry is cut in half across its
    longer side and the geometry is intersected with both halves; each
    polygonal piece is split again until the longer side of its bounding box
    is no larger than the tile size (or `recursion_max` levels were reached).

    Arguments
    ---------
    geometry : shapely Polygon or MultiPolygon, str, or list-like
        The geometry to split. A str is treated as the path to a GeoJSON file
        containing exactly one feature; a list, tuple or numpy array is treated
        as a bounding box shaped like [left, bottom, right, top].
    approx_tile_size : number
        Largest allowed bounding-box side of a returned piece. Interpreted in
        pixel units unless `use_projection_size` is True.
    use_projection_size : bool, optional
        Set to True when `approx_tile_size` is already in the units of the
        geometry's coordinates. When False (default) the size is multiplied by
        `transform[0]`.
    transform : optional
        Required when `use_projection_size` is False. Only `transform[0]` is
        read and it is used as the pixel size. NOTE(review): that matches a
        rasterio/affine transform; confirm no GDAL-style tuple is passed.
    recursion_max : int, optional
        Maximum recursion depth; splitting stops there regardless of size.
    count : int, optional
        Current recursion depth. Internal - callers should leave it at 0.

    Returns
    -------
    list or None
        The pieces (multi-part results are flattened into single polygons at
        the top level). None is returned, after printing a message, when the
        geometry type is unsupported or a needed transform is missing.

    Raises
    ------
    ValueError
        If the GeoJSON file does not contain exactly one feature.

    Adapted from @lossyrob's Gist https://gist.github.com/lossyrob/7b620e6d2193cb55fbd0bffacf27f7f2
    """
    if isinstance(geometry, str):
        # Imported here because this notebook cell does not import them.
        import json
        from shapely.geometry import shape

        with open(geometry) as geojson_file:
            features = json.load(geojson_file)['features']
        if len(features) != 1:
            raise ValueError('Feature collection must only contain one feature')
        geometry = shape(features[0]['geometry'])

    elif isinstance(geometry, (list, tuple, np.ndarray)):
        assert len(geometry) == 4
        geometry = box(*geometry)

    elif isinstance(geometry, (Polygon, MultiPolygon)) is False:
        print("geometry must be one of type list, numpy.ndarray or shapely.geometry.Polygon")
        return

    bounds = geometry.bounds
    width = bounds[2] - bounds[0]
    height = bounds[3] - bounds[1]
    if use_projection_size is False:
        if transform is None:
            print("""Error: A transform is needed to convert pixel units to 
                  projection units if use_projection_size is False""")
            return
        approx_tile_size = approx_tile_size * transform[0]
    if max(width, height) <= approx_tile_size or count >= recursion_max:
        # either the max dimension of the polygon is smaller than the threshold,
        # or the maximum number of recursions has been reached
        return [geometry]
    if height >= width:
        # taller than wide: cut into a lower and an upper half
        first_half = box(bounds[0], bounds[1], bounds[2], bounds[1]+height/2)
        second_half = box(bounds[0], bounds[1]+height/2, bounds[2], bounds[3])
    else:
        # wider than tall: cut into a left and a right half
        first_half = box(bounds[0], bounds[1], bounds[0]+width/2, bounds[3])
        second_half = box(bounds[0]+width/2, bounds[1], bounds[2], bounds[3])
    result = []
    for half in (first_half, second_half):
        clipped = geometry.intersection(half)
        # `.geoms` works on both Shapely 1.x and 2.x; iterating a collection
        # directly is no longer supported in Shapely 2.
        parts = clipped.geoms if isinstance(clipped, GeometryCollection) else [clipped]
        for part in parts:
            # Lines/points left over from the intersection are discarded.
            if isinstance(part, (Polygon, MultiPolygon)) and not part.is_empty:
                # approx_tile_size is already in projection units at this point,
                # so the recursive call must not scale it by the transform again.
                result.extend(
                    katana(
                        part,
                        approx_tile_size,
                        use_projection_size=True,
                        transform=transform,
                        recursion_max=recursion_max,
                        count=count+1,
                    )
                )
    if count > 0:
        return result
    # convert multipart into singlepart
    final_result = []
    for piece in result:
        if isinstance(piece, MultiPolygon):
            final_result.extend(piece.geoms)
        else:
            final_result.append(piece)
    return final_result

In [None]:
# Split `neb` into pieces with katana, using a tile size of 15360 given directly
# in projection units (use_projection_size=True, so no transform is needed).
# NOTE(review): `neb` is not defined anywhere in the visible notebook;
# presumably a Nebraska boundary polygon - confirm and define it explicitly.
shapes = katana(neb, 15360, use_projection_size = True)

# Plot the un-split geometry.
gpd.GeoDataFrame(geometry=[neb]).plot()


# Area of the single piece at position 400.
gpd.GeoDataFrame(geometry=shapes)[400:401].area

In [None]:
import dask
# Minimal dask.delayed example: build one lazy `sum([i, 1])` task per input
# value, then evaluate all of them with a single dask.compute call.
lst = [1, 2, 3, 4, 5, 6]
output_lst = [dask.delayed(sum)([value, 1]) for value in lst]
results = dask.compute(*output_lst)

results

In [None]:
import geopandas as gpd
# Compare the CRS of the label file with the CRS of a raster from the last
# processed scene, then stamp the raster CRS onto the labels.
f = gpd.read_file("/mnt/cropmaskperm/external/nebraska_pivots_projected.geojson")
f.crs

import xarray
import rioxarray
# xarray.open_rasterio is deprecated in favour of rioxarray and has been removed
# from newer xarray releases, so the rioxarray reader is used to get the CRS.
# NOTE(review): `scene_path` is whatever the batch loop earlier in the notebook
# left behind, and index 6 of an unsorted glob is arbitrary - confirm that any
# file at that position is a raster carrying the scene CRS.
crs = rioxarray.open_rasterio(list(Path(scene_path).glob("*"))[6]).rio.crs

crs

# NOTE(review): assigning `.crs` relabels the coordinates without reprojecting
# them, which makes the to_crs() call below a no-op. If the labels really are
# in a different CRS, drop this assignment and keep only to_crs() - confirm
# which behaviour is intended.
f.crs = crs

f = f.to_crs(crs)

f.crs