In [32]:
import geopandas as gpd
import pandas as pd
from pathlib import Path

# Read the input files
java_neg = gpd.read_file('gs://demeter-labs/tea/samples/java_neg_built_tree_samples_2000.geojson').rename({'remapped': 'class'}, axis=1)
sumatra_neg = gpd.read_file('gs://demeter-labs/tea/samples/sumatra_neg_built_tree_samples_2000.geojson').rename({'remapped': 'class'}, axis=1)
coffee_neg = gpd.read_file('gs://demeter-labs/tea/geometries/ra_data/subsets/Indonesia_Coffee_polygons.geojson').rename({'crop': 'class'}, axis=1)
cocoa_neg = gpd.read_file('gs://demeter-labs/tea/geometries/ra_data/subsets/Indonesia_Cocoa_polygons.geojson').rename({'crop': 'class'}, axis=1)
neg_samples = gpd.GeoDataFrame(pd.concat([java_neg, sumatra_neg, coffee_neg, cocoa_neg]), geometry='geometry')

# First save locally, then copy to GCS
local_path = '/tmp/neg_samples_2000.geojson'
neg_samples.to_file(local_path)

# GCSFS fails with current env... need to update geopandas probably.
from google.cloud import storage
client = storage.Client()
bucket = client.bucket('demeter-labs')
blob = bucket.blob('tea/samples/neg_samples_lc_2000_coffee_cocoa.geojson')
blob.upload_from_filename(local_path)

In [6]:
import glob
from joblib import Parallel, delayed
from tqdm import tqdm

def process_parquet_file(file):
    mgrs_id = Path(file).stem[:5]
    gdf = gpd.read_parquet(file)
    union = gpd.GeoDataFrame(geometry=[gdf.geometry.unary_union], crs=gdf.crs).to_crs('epsg:4326').geometry[0]
    return mgrs_id, union

parquet_files = glob.glob('/home/christopher.x.ren/embeddings/ra_tea/tiles/*.parquet')

# Process files in parallel with progress bar
results = Parallel(n_jobs=-1, verbose=20)(
    delayed(process_parquet_file)(file) 
    for file in tqdm(parquet_files)
)

# Unzip results
mgrs_ids, tile_geometries = zip(*results)

tiles_df = gpd.GeoDataFrame(
    {'mgrs_id': mgrs_ids, 'geometry': tile_geometries}, 
    geometry='geometry'
)

tiles_df = tiles_df.set_crs('EPSG:4326')


  0%|          | 0/63 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend LokyBackend with 30 concurrent workers.


 48%|████▊     | 30/63 [00:00<00:00, 72.67it/s][Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.7s
100%|██████████| 63/63 [00:02<00:00, 23.23it/s]
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done   8 out of  63 | elapsed:    6.4s remaining:   44.0s
[Parallel(n_jobs=-1)]: Done  12 out of  63 | elapsed:    8.8s remaining:   37.4s
[Parallel(n_jobs=-1)]: Done  16 out of  63 | elapsed:   10.7s remaining:   31.4s
[Parallel(n_jobs=-1)]: Done  20 out of  63 | elapsed:   11.6s remaining:   25.0s
[Parallel(n_jobs=-1)]: Done  24 out of  63 | elapsed:   12.0s remaining:   19.5s
[Parallel(n_jobs=-1)]: Done  28 out of  63 | elapsed:   12.3s remaining:   15.3s
[Parallel(n_jobs=-1)]: Done  32 out of  63 | elapsed:   12.4s remaining:   12.0s
[Parallel(n_jobs=-1)]: Done  36 out of  63 | elapsed:   13.8s remaining:   10.3s
[Parallel

In [9]:
# First save locally, then copy to GCS
local_path = '/tmp/ra_roi_mgrs_tiles.geojson'
tiles_df.to_file(local_path)

# Upload to GCS using storage client
client = storage.Client()
bucket = client.bucket('demeter-labs')
blob = bucket.blob('tea/geometries/ra_roi_mgrs_tiles.geojson')
blob.upload_from_filename(local_path)

In [33]:
neg_samples = neg_samples.drop(columns=['area_ha', 'id', 'aoi', 'index_right', 'GID_0', 'country'])
# Perform spatial join between neg_samples and tiles_df
neg_samples_with_mgrs = gpd.sjoin(neg_samples, tiles_df, how='left', predicate='within')

# Display results showing neg_samples with their corresponding MGRS tile IDs
neg_samples_with_mgrs[['geometry', 'mgrs_id']]

Unnamed: 0,geometry,mgrs_id
0,POINT (106.11763 -6.65041),48MXT
1,POINT (106.17512 -6.44056),48MXT
2,POINT (105.85029 -6.71652),48MWT
3,POINT (106.47120 -6.82576),48MXT
4,POINT (106.52295 -6.78839),48MXT
...,...,...
3589,"MULTIPOLYGON (((105.53245 -5.23794, 105.53294 ...",48MWV
3590,"MULTIPOLYGON (((98.93662 2.91585, 98.93661 2.9...",47NMD
3591,"MULTIPOLYGON (((105.55091 -5.17357, 105.55091 ...",48MWV
3592,"MULTIPOLYGON (((105.56647 -5.29036, 105.56634 ...",48MWV


In [34]:
import os
from joblib import Parallel, delayed

def process_mgrs_group(mgrs_id, group, tile_dir):
    parquet_directory = tile_dir
    parquet_file = [f for f in os.listdir(parquet_directory) if f.split('_')[0] == mgrs_id]
    tile_gdf = gpd.read_parquet(parquet_directory + parquet_file[0])
    tile_gdf.to_crs('epsg:4326', inplace=True)
    tile_gdf['tile_geometry'] = tile_gdf.geometry
    
    joined = gpd.sjoin(group, tile_gdf, how='left', predicate='within')
    return joined

# iterate over mgrs_ids and get tiles intersecting points
points_with_tile_id = gpd.GeoDataFrame(neg_samples_with_mgrs[['geometry', 'mgrs_id', 'class']])
tile_dir = '/home/christopher.x.ren/embeddings/ra_tea/tiles/'

# Process groups in parallel
groups = [(mgrs_id, group) for mgrs_id, group in points_with_tile_id.groupby('mgrs_id')]
results = Parallel(n_jobs=-1, verbose=20)(
    delayed(process_mgrs_group)(mgrs_id, group, tile_dir)
    for mgrs_id, group in tqdm(groups)
)

# Concatenate all results into a single GeoDataFrame
result_df = gpd.GeoDataFrame(pd.concat(results, ignore_index=True))

# # save points with tile id
# result_df.drop(columns=['index_right'], inplace=True)
# result_df.drop('tile_geometry', axis=1).to_file('points_with_tile_id_intersecting_indonesia_aoi.geojson', driver='GeoJSON')

# # save tiles intersecting points 
# tiles = result_df.drop('geometry', axis=1).rename(columns={'tile_geometry': 'geometry'}).drop_duplicates(subset=['tile_id'])
# tiles.to_file('tiles_intersecting_points.geojson', driver='GeoJSON')

  0%|          | 0/57 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend LokyBackend with 30 concurrent workers.
100%|██████████| 57/57 [00:00<00:00, 26907.75it/s]


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   4 out of  57 | elapsed:    1.2s remaining:   16.3s
[Parallel(n_jobs=-1)]: Done   7 out of  57 | elapsed:    1.5s remaining:   10.5s
[Parallel(n_jobs=-1)]: Done  10 out of  57 | elapsed:    1.7s remaining:    7.9s
[Parallel(n_jobs=-1)]: Done  13 out of  57 | elapsed:    2.2s remaining:    7.5s
[Parallel(n_jobs=-1)]: Done  16 out of  57 | elapsed:    3.4s remaining:    8.6s
[Parallel(n_jobs=-1)]: Done  19 out of  57 | elapsed:    3.7s remaining:    7.5s
[Parallel(n_jobs=-1)]: Done  22 out of  57 | elapsed:    3.9s remaining:    6.2s
[Parallel(n_jobs=-1)]: Done  25 out of  57 | elapsed:    4.0s remaining:    5.2s
[Parallel(n_jobs=-1)]: Done  28 out of  57 | elapsed:    4.1s remaining:    4.2s
[Parallel(n_jobs=-1)]: Done  31 out of  57 | elapsed:    4.3s remaining:    3.6s
[Parallel(n_jobs=-1)]: Done  34 out of  57 | elapsed:    4.6s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  37 out of  57 | e

In [40]:
filtered_df = result_df.drop_duplicates(subset=['tile_id'])
filtered_df['class'] = filtered_df['class'].replace({2: 'Trees', 5: 'Built'})
filtered_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Unnamed: 0,geometry,mgrs_id,class,index_right,tile_id,tile_geometry
0,POINT (100.51789 -0.92131),47MPU,Trees,81605.0,47MPU_32_16_10_613_310,"POLYGON ((100.51884 -0.92033, 100.51884 -0.923..."
1,POINT (100.51789 -0.92131),47MPU,Trees,81606.0,47MPU_32_16_10_614_310,"POLYGON ((100.51884 -0.91888, 100.51884 -0.921..."
2,POINT (100.51789 -0.92131),47MPU,Trees,82057.0,47MPU_32_16_10_613_311,"POLYGON ((100.52028 -0.92033, 100.52028 -0.923..."
3,POINT (100.51789 -0.92131),47MPU,Trees,82058.0,47MPU_32_16_10_614_311,"POLYGON ((100.52028 -0.91888, 100.52028 -0.921..."
4,POINT (100.78810 -1.06504),47MPU,Built,192006.0,47MPU_32_16_10_513_498,"POLYGON ((100.78916 -1.06489, 100.78916 -1.067..."
...,...,...,...,...,...,...
28504,"MULTIPOLYGON (((110.16052 -7.17065, 110.16051 ...",49MDN,Coffee,15591.0,49MDN_32_16_10_46_46,"POLYGON ((110.16240 -7.16868, 110.16240 -7.171..."
28509,"MULTIPOLYGON (((110.09433 -7.13777, 110.09429 ...",49MDN,Coffee,68.0,49MDN_32_16_10_68_0,"POLYGON ((110.09581 -7.13671, 110.09581 -7.139..."
28515,"MULTIPOLYGON (((110.17902 -7.23242, 110.17895 ...",49MDN,Coffee,20061.0,49MDN_32_16_10_3_59,"POLYGON ((110.18113 -7.23094, 110.18112 -7.233..."
28523,"MULTIPOLYGON (((110.18939 -7.22589, 110.18976 ...",49MDN,Coffee,22491.0,49MDN_32_16_10_7_66,"POLYGON ((110.19128 -7.22517, 110.19128 -7.228..."


In [41]:
# First save locally, then copy to GCS
local_path = '/tmp/neg_esri_lc_tiles.parquet'
filtered_df.to_parquet(local_path)

# Upload to GCS using storage client
client = storage.Client()
bucket = client.bucket('demeter-labs')
blob = bucket.blob('tea/samples/neg_esri_lc_cocoa_coffee_tiles.parquet')
blob.upload_from_filename(local_path)