In [38]:
import os

import geopandas as gpd
import numpy as np
import pandas as pd
from tqdm import tqdm

# Directory the per-source GeoJSON files are read from; the combined output file
# is written back into this same directory.
WORK_DIR = '../data/sampling_locations'

In [39]:
# Three parallel lists: entry i of each one describes the same source file
# (file name, 1/0 label, and the (start, end) date pair attached to its points).
data_files, data_file_labels, dates = [], [], []


def _register(file_names, label, date_range):
    """Append `file_names` to the manifest, all sharing one label and one date range."""
    count = len(file_names)
    data_files.extend(file_names)
    data_file_labels.extend([label] * count)
    dates.extend([date_range] * count)


_window_2019 = ('2019-01-01', '2020-01-01')
_window_2023 = ('2023-01-01', '2024-01-01')
_window_2024 = ('2024-01-01', '2025-01-01')

# --- 2019 window ---
pos2019 = [
    'amazonas_2020_thresh_0.8_sumbsample_3_positives.geojson',
    'MinesPos2018-2020Sentinel_points.geojson',
    'v2.4_amazon_positives.geojson',
    'bolivar_2020_thresh_0.8_sumbsample_5_positives_cleaned.geojson',
]
_register(pos2019, 1, _window_2019)

neg2019 = [
    'bolivar_2020_thresh_0.8_1_negatives.geojson',
    'amazonas_2020_thresh_0.5_2_negatives.geojson',
    'full_amazon_v9_negatives.geojson',
    'v2.0_bolivar_negatives.geojson',
    'v2.1.1_bolivar_negatives.geojson',
    'v2.4_amazonas_negatives.geojson',
    'v2.4_amazon_negatives.geojson',
    'v2.6_amazon_thresh_0.8_negatives.geojson',
    'v2.6_amazon_negatives.geojson',
    'v2.6_amazon_negatives_v2.geojson',
]
_register(neg2019, 0, _window_2019)

# --- 2023 window ---
pos2023 = [
    'amazon_all_48px_v3.1_2023_positives_0.999_cleaned.geojson',
    'v3.3_2023_positives.geojson',
    'v3.6_2023_positives.geojson',
]
_register(pos2023, 1, _window_2023)

neg2023 = [
    'v3.1_2023_negatives.geojson',
    'v3.2_2023_negatives.geojson',
    'v3.4_2023_negatives.geojson',
    'v3.5_2023_negatives.geojson',
]
_register(neg2023, 0, _window_2023)

# --- 2024 window ---
pos2024 = [
    'ACAexpanded_amazon_positives2025-08.geojson',
    'BD_MineriaAurifera_Peru-cleaned_2025-08samp500.geojson',
    'punino_positives2025-08.geojson',
    'v3.2-3.7ensemble_positives2025-08.geojson',
]
_register(pos2024, 1, _window_2024)

neg2024 = [
    'ACAexpanded_amazon_negatives2025-08.geojson',
    'industrial_negatives2025-08.geojson',
    'aquaculture-faziendas-negatives2025-08.geojson',
    'landslides2025-08.geojson',
    'random_negatives25k_2025-08.geojson',
]
_register(neg2024, 0, _window_2024)

# --- Riverbank negatives: each file carries its own shorter date range ---
_register(['riverbank_negatives_v2025-08.geojson'], 0, ('2024-02-01', '2024-05-01'))
_register(['riverbank_negatives_bolivia_v2025-08.geojson'], 0, ('2024-08-01', '2024-11-01'))



In [40]:
# Manifest overview: one row per source file with its label and date range.
# pandas is used directly (`pd`) rather than through `gpd.pd`, which only exists
# as a side effect of geopandas' own imports and is not a documented attribute.
df = pd.DataFrame(
    [
        (file_name, label, start, end)
        for file_name, label, (start, end) in zip(data_files, data_file_labels, dates)
    ],
    columns=["source_file", "label", "start_date", "end_date"],
)
df

Unnamed: 0,source_file,label,start_date,end_date
0,amazonas_2020_thresh_0.8_sumbsample_3_positive...,1,2019-01-01,2020-01-01
1,MinesPos2018-2020Sentinel_points.geojson,1,2019-01-01,2020-01-01
2,v2.4_amazon_positives.geojson,1,2019-01-01,2020-01-01
3,bolivar_2020_thresh_0.8_sumbsample_5_positives...,1,2019-01-01,2020-01-01
4,bolivar_2020_thresh_0.8_1_negatives.geojson,0,2019-01-01,2020-01-01
5,amazonas_2020_thresh_0.5_2_negatives.geojson,0,2019-01-01,2020-01-01
6,full_amazon_v9_negatives.geojson,0,2019-01-01,2020-01-01
7,v2.0_bolivar_negatives.geojson,0,2019-01-01,2020-01-01
8,v2.1.1_bolivar_negatives.geojson,0,2019-01-01,2020-01-01
9,v2.4_amazonas_negatives.geojson,0,2019-01-01,2020-01-01


In [None]:
# Load every source file, reproject it to EPSG:4326, tag each row with the
# file-level metadata, and stack everything into one GeoDataFrame.

# zip() stops at the shortest input, so a manifest list that drifted out of sync
# would silently drop files; fail loudly instead.
assert len(data_files) == len(data_file_labels) == len(dates), \
    "data_files, data_file_labels and dates must have the same length"

keep_columns = ["geometry", "source_file", "label", "start_date", "end_date"]

gdfs = []
# zip() has no len(), so tqdm needs `total` to show progress against a known count.
for f, l, (start, end) in tqdm(zip(data_files, data_file_labels, dates), total=len(data_files)):
    # Use a dedicated name for the per-file frame so it never shadows the final `gdf`.
    part = gpd.read_file(os.path.join(WORK_DIR, f))
    part = part.to_crs("EPSG:4326")

    part["source_file"] = f
    part["label"] = l
    part["start_date"] = start
    part["end_date"] = end

    # Drop any file-specific attribute columns; only the shared schema is kept.
    gdfs.append(part.loc[:, keep_columns])

# Every part was reprojected above, so the first part's CRS applies to the whole stack.
gdf = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=gdfs[0].crs)

In [42]:
# Class balance: number of rows per label (1 = positive files, 0 = negative files).
gdf["label"].value_counts()

label
0    28505
1     3172
Name: count, dtype: int64

In [47]:
# Assign each row independently to 'train' (p=0.8) or 'val' (p=0.2).
# The generator is seeded so the assignment is reproducible for a given row order.
rng = np.random.default_rng(seed=27)
split_names = ['train', 'val']
split_probs = [0.80, 0.20]
gdf['split'] = rng.choice(split_names, size=len(gdf), p=split_probs)
gdf['split'].value_counts()

split
train    25335
val       6342
Name: count, dtype: int64

#### Separate some negatives for cloudier scenes

A `clear_threshold` of 0.1 yields a handful of fully clouded scenes and a fair number with haze or wispy cloud artefacts. A `clear_threshold` of 0 yields predominantly clouds.

In [48]:
# Every row starts with the same clear_threshold; a subset is lowered in a later cell.
default_clear_threshold = 0.75
gdf["clear_threshold"] = default_clear_threshold

In [49]:
# Draw 1000 rows (fixed seed) from the random-negatives file and lower their
# clear_threshold to 0.1 so that cloudier scenes are selected for them.
from_random_negatives = gdf['source_file'] == 'random_negatives25k_2025-08.geojson'
cloudy_samples = gdf[from_random_negatives].sample(n=1000, random_state=37).index
gdf.loc[cloudy_samples, 'clear_threshold'] = 0.1

In [50]:
# Sanity check: how many rows carry each clear_threshold value.
gdf["clear_threshold"].value_counts()

clear_threshold
0.75    30677
0.10     1000
Name: count, dtype: int64

In [52]:
# Write the combined, annotated points to a single GeoJSON in WORK_DIR.
# The driver is named explicitly so the output format does not depend on
# geopandas inferring it from the file extension.
gdf.to_file(os.path.join(WORK_DIR, 'collected_locations2025-08.geojson'), driver="GeoJSON")