In [3]:
import geopandas as gpd
from shapely.geometry import Polygon,shape
import numpy as np
import os
import fiona
import rasterio
from rasterio.mask import mask
from pathlib import Path

np.random.seed(42)

cutline_path = "/home/jovyan/work/satellite_data/ku_sync/South_Africa/cutlines/"
data_path = "/home/jovyan/work/satellite_data/ku_sync/South_Africa/all/"
years = list(range(2008,2021))


out_dir = "shapes/V6_all_data/"

# ncols = 100
# nrows = 100
patch_size = 256

test_ratio = 0.0000025

In [2]:
shapes = {}
Path(out_dir).mkdir(parents=True, exist_ok=True)
for year in years:
    polygons = []
    cut_df = gpd.read_file(os.path.join(cutline_path,str(year)+".geojson"))
    
    for i,row in cut_df.iterrows():
        f = row["id"]
        if os.path.isfile(os.path.join(data_path,f+"_"+str(year)+".tif")):
            filename = f+"_"+str(year)+".tif"
        elif os.path.isfile(os.path.join(data_path,f+"_"+str(year)+".jp2.tif")):
            filename = f+"_"+str(year)+".jp2.tif"
        elif os.path.isfile(os.path.join(data_path,f+"_"+str(year)+".jp2")):
            filename = f+"_"+str(year)+".jp2"
        else:
            print(f"{f} is missing!")
            continue
        with rasterio.open(os.path.join(data_path,filename)) as img:
            if img.crs != 4326:
                print(f"{filename} not in CRS EPSG:4326!")
            
            xmin, ymin, xmax, ymax = img.bounds
            width = xmax-xmin
            height = ymax-ymin

            x = xmin
            y = ymin
            
            stepsize = patch_size * img.meta["transform"][0]
            ncols = int(np.ceil(width / stepsize))
            nrows = int(np.ceil(height / stepsize))
            for r in range(nrows):
                for c in range(ncols):        
                    if (c % ncols) == 0:
                        x = xmin

                    p = Polygon([(x,y), (x+stepsize, y), (x+stepsize, y+stepsize), (x, y+stepsize)])
                    polygons.append(p)

                    x += stepsize
                y += stepsize
                
    sample_size = int(np.ceil(test_ratio * len(polygons)))
    sample_idxs = np.random.choice(range(len(polygons)),sample_size,replace=False)

    idxs = []
    idxs = sample_idxs
            
    if len(idxs) > 0:    
        grid = gpd.GeoDataFrame({'geometry':polygons})
        grid.crs = img.crs
        sampled_grid =  grid.iloc[idxs] 
        sampled_grid["idx"] = sampled_grid.index.values

        sampled_grid.to_file(out_dir+str(year)+'.geojson')#, driver='ESRI Shapefile')
        shapes[year] =sampled_grid
        print("Year: ",year)
        print("Number of files: ",len(cut_df))
        print("Number patches: ",len(grid))
        print("Number of samples: ",len(sampled_grid))
    else:
        print(f"No additional patches for year: {year}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Year:  2008
Number of files:  96
Number patches:  3761829
Number of samples:  10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Year:  2009
Number of files:  567
Number patches:  22669862
Number of samples:  57


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Year:  2010
Number of files:  463
Number patches:  18824878
Number of samples:  48
2727CB is missing!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Year:  2011
Number of files:  527
Number patches:  22150336
Number of samples:  56
3224BB is missing!
2331AA is missing!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Year:  2012
Number of files:  316
Number patches:  13335335
Number of samples:  34
3326CB is missing!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Year:  2013
Number of files:  791
Number patches:  32169478
Number of samples:  81


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Year:  2014
Number of files:  570
Number patches:  23531697
Number of samples:  59


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Year:  2015
Number of files:  665
Number patches:  27379135
Number of samples:  69


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Year:  2016
Number of files:  733
Number patches:  29329615
Number of samples:  74


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Year:  2017
Number of files:  384
Number patches:  63405505
Number of samples:  159
2427AC is missing!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Year:  2018
Number of files:  346
Number patches:  57422719
Number of samples:  144


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Year:  2019
Number of files:  53
Number patches:  8735446
Number of samples:  22
Year:  2020
Number of files:  19
Number patches:  2942719
Number of samples:  8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


# Old

In [None]:
#V1
shapes_V1 = {}
for i in years:
    shps = []
    f = fiona.open("shapes/"+str(i)+".shp")
    for s in f.values():
        sh = shape(s["geometry"])
        shps.append(sh)
    shapes_V1[i] = shps
    f.close()

In [3]:
shapes = {}
for year in years:
    polygons = []
    files = [i for i in os.listdir(path) if i.endswith(str(year)+".tif" or if i.endswith(str(y)+".jp2")]
    #files = [os.path.join(path,str(year)+".vrt")]
    
    for f in files:
        with rasterio.open(os.path.join(path,f)) as img:
            xmin, ymin, xmax, ymax = img.bounds
            width = xmax-xmin
            height = ymax-ymin

            x = xmin
            y = ymin
            
            stepsize = patch_size * img.meta["transform"][0]
            ncols = int(np.ceil(width / stepsize))
            nrows = int(np.ceil(height / stepsize))
            for r in range(nrows):
                for c in range(ncols):        
                    if (c % ncols) == 0:
                        x = xmin

                    p = Polygon([(x,y), (x+stepsize, y), (x+stepsize, y+stepsize), (x, y+stepsize)])
                    polygons.append(p)

                    x += stepsize
                y += stepsize
                
    sample_size = int(np.ceil(test_ratio * len(polygons)))
    sample_idxs = np.random.choice(range(len(polygons)),sample_size,replace=False)

    idxs = []
    idxs = sample_idxs
    # for i in sample_idxs:
    #     p = polygons[i]
    #     equals = False
    #     for p2 in shapes_V1[year]:
    #         if p.intersects(p2):
    #             equals = True
    #             break
    #     if not equals:
    #         idxs.append(i)
            
    if len(idxs) > 0:    
        grid = gpd.GeoDataFrame({'geometry':polygons})
        grid.crs = img.crs
        sampled_grid =  grid.iloc[idxs] 
        sampled_grid["idx"] = sampled_grid.index.values

        sampled_grid.to_file(out_dir+str(year)+'.geojson')#, driver='ESRI Shapefile')
        shapes[year] =sampled_grid
        print("Year: ",year)
        print("Number of files: ",len(files))
        print("Number patches: ",len(grid))
        print("Number of samples: ",len(sampled_grid))
    else:
        print(f"No additional patches for year: {year}")
    
    



KeyboardInterrupt



In [None]:
shapes = {}
for year in years:
    polygons = []
    files = [i for i in os.listdir(path) if i.endswith(str(year)+".tif")]
    
    for f in files:
        with rasterio.open(os.path.join(path,f)) as img:
            xmin, ymin, xmax, ymax = img.bounds
            width = xmax-xmin
            height = ymax-ymin

            wide = width / ncols
            length = height / nrows

            x = xmin
            y = ymin
            for r in range(nrows):
                if r== nrows-1:
                    ystep = length#+(height % nrows)
                else:
                    ystep = length

                for c in range(ncols):        
                    if (c % ncols) == 0:
                        x = xmin
                    if c == ncols-1:
                        xstep = wide#+(width % ncols)
                    else:
                        xstep = wide
                    p = Polygon([(x,y), (x+xstep, y), (x+xstep, y+ystep), (x, y+ystep)])
                    polygons.append(p)

                    x += xstep
                y += ystep
                
    sample_size = int(np.ceil(test_ratio * len(polygons)))
    sample_idxs = np.random.choice(range(len(polygons)),sample_size,replace=False)

    idxs = []
    for i in sample_idxs:
        p = polygons[i]
        equals = False
        for p2 in shapes_V1[year]:
            if p.intersect(p2):
                equals = True
                break
        if not equals:
            idxs.append(i)
            
    if len(idxs) > 0:    
        grid = gpd.GeoDataFrame({'geometry':polygons})
        grid.crs = img.crs
        sampled_grid =  grid.iloc[idxs] 
        sampled_grid["idx"] = sampled_grid.index.values

        sampled_grid.to_file(out_dir+str(year)+'.geojson')#, driver='ESRI Shapefile')
        shapes[year] =sampled_grid
        print("Year: ",year)
        print("Number of files: ",len(files))
        print("Number patches: ",len(grid))
        print("Number of samples: ",len(sampled_grid))
    else:
        print(f"No additional patches for year: {year}")
    
    
