# Filter pixel classification results by segments

To Do:
- multiband inputs to rsgislib.shepherdseg ?
- custom statistics to zonal stats? https://pythonhosted.org/rasterstats/manual.html#user-defined-statistics


        def mymean(x):
           return np.ma.mean(x)

        zonal_stats("tests/data/polygons.shp",
                    "tests/data/slope.tif",
                    stats="count",
                     add_stats={'mymean':mymean})


In [1]:
import os
import sys
import gdal
import shutil
import xarray as xr
import geopandas as gpd
import subprocess as sp
from rsgislib.segmentation import segutils

sys.path.append('../../Scripts')
from deafrica_classificationtools import HiddenPrints
from deafrica_spatialtools import xr_rasterize, xr_vectorize, zonal_stats_parallel

%load_ext autoreload
%autoreload 2

  return f(*args, **kwds)


# Analysis Parameters

In [2]:
# Vector file with the testing-tile polygons (read with geopandas below)
test_shapefile = 'data/eastern_testing_sites.geojson'

# Folder the per-tile rasters are read from and the outputs are written to
results = 'results/classifications/'

# Model tag that appears in the pixel-prediction raster file names
model_type = 'gm_mads_two_seasons'

# Minimum segment size passed to the segmentation, in number of pixels
min_seg_size = 100

### Open testing tile shapefile

In [3]:
# Load the testing-tile polygons; the 'GRID_ID' column of this frame drives the
# per-tile loop further down.
gdf = gpd.read_file(test_shapefile)

## Image segmentation

1. Generate image segments
2. Vectorize segments
3. Calculate zonal 'majority' statistic over vectorized segments
4. Rasterize segments using 'majority' as pixel values.
5. Write object-based classification to disk

### Automatically find the number of cpus

In [4]:
def parse_ncpus(env_text):
    """
    Extract a CPU count from ``env``-style text.

    Parameters
    ----------
    env_text : str
        Zero or more ``NAME=value`` lines, e.g. the output of
        ``env | grep CPU``.

    Returns
    -------
    int or None
        The value of the last line whose right-hand side parses as a number,
        truncated to an integer and clamped to at least 1. ``None`` when no
        line carries a numeric value.
    """
    # Walk the lines from the end so the last numeric entry wins
    for line in reversed(env_text.splitlines()):
        _, sep, value = line.partition('=')
        if not sep:
            continue
        try:
            return max(1, int(float(value)))
        except (ValueError, OverflowError):
            continue
    return None


# Manual override for the number of workers. Set to None to use the detected
# value instead. Slicing a fixed number of trailing characters (the previous
# approach) mis-reads values such as 'CPU_LIMIT=15.00', whose last four
# characters are '5.00'.
ncpus_override = 15

if ncpus_override is not None:
    ncpus = ncpus_override
else:
    # Fall back to the machine's core count when no CPU variable is numeric
    ncpus = parse_ncpus(sp.getoutput('env | grep CPU')) or os.cpu_count() or 1

# Report the value that is actually used by the cells below
print('ncpus = '+str(ncpus))


In [5]:
%%time
for g_id in gdf['GRID_ID'][5:6].values:
    print('working on grid: ' + g_id)
    
    #store temp files somewhere
    directory=results+'tmp_'+g_id
    if not os.path.exists(directory):
        os.mkdir(directory)
    
    tmp='tmp_'+g_id+'/'
    
    #inputs to image seg
    tiff_to_segment = results+'Eastern_tile_'+g_id+'_NDVI_S1.tif'
    kea_file = results+'Eastern_tile_'+g_id+'_NDVI_S1.kea'
    segmented_kea_file = results+'Eastern_tile_'+g_id+'_segmented.kea'

    #convert tiff to kea
    gdal.Translate(destName=kea_file,
                   srcDS=tiff_to_segment,
                   format='KEA',
                   outputSRS='EPSG:6933')
    
    #run image seg
    print('   image segmentation...')
    with HiddenPrints():
        segutils.runShepherdSegmentation(inputImg=kea_file,
                                         outputClumps=segmented_kea_file,
                                         tmpath=tmp,
                                         numClusters=60,
                                         minPxls=min_seg_size)
    
    #open segments
    da=xr.open_rasterio(segmented_kea_file).squeeze()

    #convert to polygons and export to disk
    print('   writing segments to shapefile...')
    with HiddenPrints():
        gdf_seg = xr_vectorize(da, attribute_col='attribute')
    print("Number of segments: "+str(len(gdf_seg)))
    gdf_seg.to_file(results+tmp+'Eastern_tile_'+g_id+'_segments.shp')
    
#     #calculate zonal-stats
#     print('   zonal statistics...')
#     zonal_stats_parallel(shp=results+tmp+'Eastern_tile_'+g_id+'_segments.shp',
#            raster=results+ 'Eastern_tile_'+g_id+'_prediction_pixel_'+model_type+'.tif',
#            statistics=['majority'],
#            out_shp=results+tmp+'Eastern_tile_'+g_id+"_zonal_stats.shp",
#            ncpus=ncpus
#                )
    
#     #rasterize the zonal-stats
#     with HiddenPrints():
#         gdf_zs=gpd.read_file(results+tmp+'Eastern_tile_'+g_id+"_zonal_stats.shp")
#         predict_zs = xr_rasterize(gdf_zs, da, attribute_col='majority')
    
#     #write to disk
#     write_cog(predict_zs, results+ 'Eastern_tile_'+g_id+'_prediction_object_'+model_type+'.tif', overwrite=True)
    
#     #remove the tmp folder
#     shutil.rmtree(results+tmp)
#     os.remove(kea_file)
#     os.remove(segmented_kea_file)

working on grid: E-14
   image segmentation...


  projstring = _prepare_from_string(projparams)


Number of segments: 348908
CPU times: user 9min 3s, sys: 6.74 s, total: 9min 9s
Wall time: 9min 11s


In [6]:
# Inspect the vectorized segments produced by the loop above (for the last
# tile processed): one row per polygon, with its 'attribute' value and geometry.
gdf_seg

Unnamed: 0,attribute,geometry
0,12.0,"POLYGON ((3624200.000 121740.000, 3624200.000 ..."
1,19.0,"POLYGON ((3620220.000 121720.000, 3620220.000 ..."
2,7.0,"POLYGON ((3622320.000 121740.000, 3622320.000 ..."
3,81.0,"POLYGON ((3602340.000 121620.000, 3602340.000 ..."
4,86.0,"POLYGON ((3603440.000 121620.000, 3603440.000 ..."
...,...,...
348903,348882.0,"POLYGON ((3471080.000 -66840.000, 3471080.000 ..."
348904,348854.0,"POLYGON ((3463060.000 -66760.000, 3463060.000 ..."
348905,348902.0,"POLYGON ((3465820.000 -66980.000, 3465820.000 ..."
348906,348812.0,"POLYGON ((3463640.000 -66680.000, 3463640.000 ..."


In [None]:
%time
zonal_stats_parallel(shp=results+tmp+'Eastern_tile_'+g_id+'_segments.shp',
           raster=results+ 'Eastern_tile_'+g_id+'_prediction_pixel_'+model_type+'.tif',
           statistics=['majority'],
           out_shp=results+tmp+'Eastern_tile_'+g_id+"_zonal_stats.shp",
           ncpus=ncpus,
           nodata=-1
               )

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


  return self.session.__getitem__(item)


In [None]:
# Extra keyword arguments for the zonal statistics call. No trailing comma:
# with one, `kwargs` becomes a 1-tuple wrapping the dict instead of the dict.
kwargs = {'nodata': -1}

In [None]:
# Build the object-based prediction before plotting it: read the zonal-stats
# shapefile written by the zonal statistics cell and burn the per-segment
# 'majority' value onto the grid of the segment raster `da`.
# Uses `tmp`, `g_id` and `da` as left by the segmentation loop.
with HiddenPrints():
    gdf_zs = gpd.read_file(results+tmp+'Eastern_tile_'+g_id+"_zonal_stats.shp")
    predict_zs = xr_rasterize(gdf_zs, da, attribute_col='majority')

predict_zs.plot(size=8);

***
## RSGISlib Shepherd Seg

### single cpu, tiled

In [None]:
%time
# #run the segmentation
with HiddenPrints():
    tiledsegsingle.performTiledSegmentation(kea_file,
                                    segmented_kea_file,
                                    tmpDIR=temp,
                                    numClusters=60,
                                    validDataThreshold=validDataTileFraction, 
                                    tileWidth=width,
                                    tileHeight=height,
                                    minPxls=9)

In [None]:
# Attribute segments with zonal mean of input image and output as geotiff
# NOTE(review): `meanImage`, `rsgislib` and `segments_zonal_mean` are not
# defined or imported in the cells shown here - confirm before running.
meanImage(tiff_to_segment, segmented_kea_file, segments_zonal_mean, "GTIFF",rsgislib.TYPE_32FLOAT)

### n cpus, tiled

In [None]:
# %time
#run the segmentation
# Same tiled segmentation call as the single-cpu cell, with `ncpus` passed in.
# NOTE(review): `tiledSegParallel`, `temp`, `validDataTileFraction`, `width`
# and `height` are not defined in the cells shown here - confirm before running.
with HiddenPrints():
    tiledSegParallel.performTiledSegmentation(kea_file,
                                segmented_kea_file,
                                tmpDIR=temp,
                                numClusters=60,
                                validDataThreshold=validDataTileFraction, 
                                tileWidth=width,
                                tileHeight=height,
                                minPxls=9,
                                ncpus=ncpus)

In [None]:
# Attribute segments with zonal mean of input image and output as geotiff
# NOTE(review): `meanImage`, `rsgislib` and `segments_zonal_mean` are not
# defined or imported in the cells shown here - confirm before running.
meanImage(tiff_to_segment, segmented_kea_file, segments_zonal_mean, "GTIFF",rsgislib.TYPE_32FLOAT)

Dask parallel zonal stats (seems really slow)

In [None]:
# %%time
# pred = xr.open_rasterio(results+ 'Eastern_tile_'+g_id+'_prediction_pixel_'+model_type+'.tif').squeeze()
# arr = pred.values
# affine = pred.geobox.affine

# #convert geopandas dataframe to dask dataframe
# data_dd = dd.from_pandas(z, npartitions=15)

# #find majority of pixels
# res = data_dd.map_partitions(lambda df: df.assign(majority=pd.DataFrame(zonal_stats(vectors=z['geometry'],
#                                            affine=affine,
#                                            raster=arr,
#                                            stats='majority'))['majority']),
#                                             meta=data_dd).compute()
# print(res)