In [None]:
from datetime import datetime
import os 

import geopandas as gpd
import numpy as np
from tqdm import tqdm

# Directory holding the per-source geojson files read below; the collected
# output files are written to this same directory.
WORK_DIR = '../data/sampling_locations'

In [None]:
# Three parallel lists with one entry per input geojson file:
#   data_files[i]        file name
#   data_file_labels[i]  label for the file (1 for the positives lists, 0 for the negatives lists)
#   dates[i]             (start_date, end_date) pair for the file
data_files, data_file_labels, dates = [], [], []

# --- 2019-01-01 .. 2020-01-01 ---
pos2019 = [
    'amazonas_2020_thresh_0.8_sumbsample_3_positives.geojson',
    'MinesPos2018-2020Sentinel_points.geojson',
    'v2.4_amazon_positives.geojson',
    'bolivar_2020_thresh_0.8_sumbsample_5_positives_cleaned.geojson',
]
data_files.extend(pos2019)
data_file_labels.extend([1] * len(pos2019))
dates.extend([('2019-01-01', '2020-01-01')] * len(pos2019))

neg2019 = [
    'bolivar_2020_thresh_0.8_1_negatives.geojson',
    'amazonas_2020_thresh_0.5_2_negatives.geojson',
    'full_amazon_v9_negatives.geojson',
    'v2.0_bolivar_negatives.geojson',
    'v2.1.1_bolivar_negatives.geojson',
    'v2.4_amazonas_negatives.geojson',
    'v2.4_amazon_negatives.geojson',
    'v2.6_amazon_thresh_0.8_negatives.geojson',
    'v2.6_amazon_negatives.geojson',
    'v2.6_amazon_negatives_v2.geojson',
]
data_files.extend(neg2019)
data_file_labels.extend([0] * len(neg2019))
dates.extend([('2019-01-01', '2020-01-01')] * len(neg2019))

# --- 2023-01-01 .. 2024-01-01 ---
pos2023 = [
    'amazon_all_48px_v3.1_2023_positives_0.999_cleaned.geojson',
    'v3.3_2023_positives.geojson',
    'v3.6_2023_positives.geojson',
    'v0.5-0.8-0.9ensemble_2023_positives.geojson',
]
data_files.extend(pos2023)
data_file_labels.extend([1] * len(pos2023))
dates.extend([('2023-01-01', '2024-01-01')] * len(pos2023))

neg2023 = [
    'v3.1_2023_negatives.geojson',
    'v3.2_2023_negatives.geojson',
    'v3.4_2023_negatives.geojson',
    'v3.5_2023_negatives.geojson',
    'v0.3SSL4EO-MLP_2023_negatives.geojson',
    'v0.5-0.8-0.9ensemble_2023_negatives.geojson',
    'v1.0.1SSL4EO-MLP_2023_negatives.geojson',
]
data_files.extend(neg2023)
data_file_labels.extend([0] * len(neg2023))
dates.extend([('2023-01-01', '2024-01-01')] * len(neg2023))

# Single file with its own date window.
data_files.append('dredge_mining_positives2025-08.geojson')
data_file_labels.append(1)
dates.append(('2024-08-01', '2024-11-01'))

# --- 2024-01-01 .. 2025-01-01 ---
pos2024 = [
    'ACAexpanded_amazon_positives2025-08.geojson',
    'BD_MineriaAurifera_Peru-curated2025-09.geojson',
    'v3.2-3.7ensemble_positives2025-08.geojson',
]
data_files.extend(pos2024)
data_file_labels.extend([1] * len(pos2024))
dates.extend([('2024-01-01', '2025-01-01')] * len(pos2024))

neg2024 = [
    'ACAexpanded_amazon_negatives2025-08.geojson',
    'industrial_negatives2025-08.geojson',
    'aquaculture-faziendas-negatives2025-08.geojson',
    'landslide_negatives2025-08.geojson',
    'random_negatives25k_2025-08.geojson',
]
data_files.extend(neg2024)
data_file_labels.extend([0] * len(neg2024))
dates.extend([('2024-01-01', '2025-01-01')] * len(neg2024))

# --- Single negative files, each with its own date window ---
data_files.append('riverbank_negatives_v2025-08.geojson')
data_file_labels.append(0)
dates.append(('2024-02-01', '2024-05-01'))

data_files.append('riverbank_negatives_bolivia_v2025-08.geojson')
data_file_labels.append(0)
dates.append(('2024-08-01', '2024-11-01'))

data_files.append('v0.4-6SSL4EO-MLP_bolivia_2023_negatives.geojson')
data_file_labels.append(0)
dates.append(('2023-08-01', '2023-11-01'))






In [None]:
# Overview table: one row per input file with its label and date window.
file_rows = [
    (name, label, start, end)
    for name, label, (start, end) in zip(data_files, data_file_labels, dates)
]
df = gpd.pd.DataFrame(
    file_rows,
    columns=["source_file", "label", "start_date", "end_date"],
)
df

In [None]:
# Read every source file, reproject it to EPSG:4326, tag each row with the
# file-level metadata, then stack everything into a single GeoDataFrame.

# zip() silently stops at the shortest input, so fail loudly if the three
# parallel lists have drifted out of sync.
assert len(data_files) == len(data_file_labels) == len(dates), (
    "data_files, data_file_labels and dates must have the same length"
)

gdfs = []
# zip() has no len(), so pass the total explicitly to let tqdm show progress.
for f, l, (start, end) in tqdm(
    zip(data_files, data_file_labels, dates), total=len(data_files)
):
    gdf = gpd.read_file(os.path.join(WORK_DIR, f))
    gdf = gdf.to_crs("EPSG:4326")

    gdf["source_file"] = f
    gdf["label"] = l
    gdf["start_date"] = start
    gdf["end_date"] = end

    # Keep only the geometry and the metadata columns added above.
    gdfs.append(gdf.loc[:, ["geometry", "source_file", "label", "start_date", "end_date"]])

gdf = gpd.GeoDataFrame(gpd.pd.concat(gdfs, ignore_index=True), crs=gdfs[0].crs)

In [None]:
# Number of points per label.
gdf["label"].value_counts()

In [None]:
# Assign every row to 'train' (p=0.8) or 'val' (p=0.2) with a fixed seed,
# then show the resulting counts.
rng = np.random.default_rng(27)
gdf["split"] = rng.choice(["train", "val"], size=len(gdf), p=[0.8, 0.2])
gdf["split"].value_counts()

#### Separate some negatives for cloudier scenes

A clear_threshold of 0.1 yields a handful of fully clouded scenes and a fair number with haze or wispy cloud artefacts. A clear_threshold of 0 yields predominantly clouds.

In [None]:
# Default clear_threshold applied to every row.
gdf["clear_threshold"] = 0.75

In [None]:
# Draw 1000 rows (fixed random_state) from the random-negatives file and
# lower their clear_threshold to 0.1.
is_random_negative = gdf["source_file"] == 'random_negatives25k_2025-08.geojson'
cloudy_samples = gdf.loc[is_random_negative].sample(n=1000, random_state=37).index
gdf.loc[cloudy_samples, "clear_threshold"] = 0.1

In [None]:
# Number of rows per clear_threshold value.
gdf["clear_threshold"].value_counts()

In [None]:
# Minute-resolution timestamp ("YYYY-MM-DDTHH:MM") used as the file-name suffix.
# timespec='minutes' replaces isoformat()[:-10]: isoformat() omits the
# fractional seconds when microsecond == 0, and the fixed-width slice would
# then cut into the date instead of just dropping the seconds.
now = datetime.today().isoformat(timespec='minutes')
gdf.to_file(os.path.join(WORK_DIR, f'collected_locations{now}.geojson'), index=False)

#### Add some data

Add to data_files / gdf above, then follow this pattern.  

In [None]:
# Load a previously written collection. The directory comes from WORK_DIR
# (as for every other read/write in this notebook) rather than a second
# hard-coded copy of the path, so changing WORK_DIR moves this read too.
existing = gpd.read_file(os.path.join(WORK_DIR, 'collected_locations2025-10-21T13:25.geojson'))
existing

In [None]:
# Rows from the newly added source file(s). Take an explicit copy: columns are
# assigned on `new` afterwards, and doing that on a filtered slice of `gdf`
# triggers pandas' SettingWithCopyWarning.
new = gdf[gdf.source_file.isin(['v1.0.1SSL4EO-MLP_2023_negatives.geojson'])].copy()
new

In [None]:
# Give the new rows the default clear_threshold and a seeded 80/20
# train/val assignment, then show the split counts.
new["clear_threshold"] = 0.75

rng = np.random.default_rng(27)
new["split"] = rng.choice(["train", "val"], size=len(new), p=[0.8, 0.2])
new["split"].value_counts()

In [None]:
# Append the new rows to the existing collection. ignore_index=True gives the
# result a fresh 0..n-1 index; without it the index labels of `existing` and
# `new` can collide, leaving duplicate labels in `updated`.
updated = gpd.pd.concat([existing, new], ignore_index=True)
updated

In [None]:
# Minute-resolution timestamp ("YYYY-MM-DDTHH:MM") used as the file-name suffix.
# timespec='minutes' replaces isoformat()[:-10]: isoformat() omits the
# fractional seconds when microsecond == 0, and the fixed-width slice would
# then cut into the date instead of just dropping the seconds.
now = datetime.today().isoformat(timespec='minutes')
updated.to_file(os.path.join(WORK_DIR, f'collected_locations{now}.geojson'), index=False)