In [1]:
import ee
import os
import pandas as pd
import geopandas as gpd
import numba
import numpy as np
from tqdm.notebook import tqdm


In [None]:
# Run the Earth Engine authentication flow, then bind the session to the project.
ee.Authenticate()
ee.Initialize(project="ee-gtikhonov")

# Round-trip a server-side string as a simple connectivity check.
greeting = ee.String("Hello from the Earth Engine servers!")
print(greeting.getInfo())

In [2]:
# Root folder of the GeoLifeCLEF 2025 data. The GLC25_DATA_DIR environment
# variable overrides the default so the notebook is not tied to one machine;
# when it is unset the original location is used unchanged.
path_data = os.environ.get("GLC25_DATA_DIR", "/home/gt/DATA/geolifeclef-2025")

# Training metadata table, one row per occurrence record.
file_path_po = os.path.join(path_data, "GLC25_P0_metadata_train.csv")
df_po = pd.read_csv(file_path_po)


In [None]:
def _join_unique_as_int(values):
    """Comma-join the distinct values of a group after casting them to int."""
    return ",".join(values.unique().astype(int).astype(str))


def _join_unique(values):
    """Comma-join the distinct values of a group using their plain string form."""
    return ",".join(values.unique().astype(str))


# Collapse the occurrence table to one row per distinct (lat, lon) pair,
# keeping the survey and species ids seen at that location as comma-separated
# strings, and attach a running integer `id` to every location.
df_un = (
    df_po
    .groupby(["lat", "lon"])
    .agg({"surveyId": _join_unique_as_int, "speciesId": _join_unique})
    .reset_index()
    .assign(id=lambda frame: np.arange(len(frame)))
)

In [None]:
# Build point geometries from the lon/lat columns (declared as EPSG:4326),
# then reproject the resulting GeoDataFrame to EPSG:3857.
point_geometry = gpd.points_from_xy(x=df_un["lon"], y=df_un["lat"])
gdf_un = gpd.GeoDataFrame(df_un, geometry=point_geometry, crs="EPSG:4326").to_crs(epsg=3857)

In [None]:
def row_to_feature(row):
    """Turn one table row into an Earth Engine point feature.

    Args:
        row: Mapping-like object exposing the keys "lon", "lat" and "id".

    Returns:
        An ``ee.Feature`` whose geometry is the point (lon, lat) and whose
        only property is ``id``.
    """
    lon, lat = row["lon"], row["lat"]
    geometry = ee.Geometry.Point([lon, lat])
    properties = {"id": row["id"]}
    return ee.Feature(geometry, properties)

# `grouped` is just another name for the GeoDataFrame of unique locations.
grouped = gdf_un

# Convert every row into an Earth Engine feature, with a progress bar.
row_iter = tqdm(grouped.iterrows(), total=len(grouped))
features = [row_to_feature(record) for _, record in row_iter]

In [None]:
# Number of points sent per request.
# NOTE(review): presumably chosen to stay below Earth Engine's per-request
# element limit for getInfo() -- confirm before changing.
chunk_size = 4500
chunks = [features[i:i + chunk_size] for i in range(0, len(features), chunk_size)]

# Load WorldCover and keep only the land-cover class band.
worldcover = ee.Image("ESA/WorldCover/v100/2020")
cover = worldcover.select("Map")

results = {}        # point id -> value of the "Map" band at that point
failed_chunks = []  # indices of chunks whose request raised, so they can be re-run
for i, chunk in enumerate(tqdm(chunks, desc="batch-requests")):
    fc = ee.FeatureCollection(chunk)
    sampled = cover.sampleRegions(collection=fc, scale=10, geometries=False)
    try:
        data = sampled.getInfo()
    except Exception as e:
        # Best effort: keep going, but remember which chunk was lost instead
        # of only printing it, since its points end up without a class.
        print(f"error chunk {i}: {e}")
        failed_chunks.append(i)
        continue

    for feature in data["features"]:
        props = feature["properties"]
        results[props["id"]] = props["Map"]

if failed_chunks:
    print(f"{len(failed_chunks)} of {len(chunks)} chunks failed: {failed_chunks}")

# Attach the sampled class to every location; ids absent from `results`
# (failed chunks or points not returned by the sampling) become NaN.
grouped["class"] = grouped["id"].map(results)
n_missing = int(grouped["class"].isna().sum())
if n_missing:
    print(f"{n_missing} of {len(grouped)} points have no WorldCover class")

In [None]:
# `res` is the same object as `df_un`, so the "class" column is added to
# `df_un` as well.
res = df_un
res["class"] = res["id"].map(results)

# Make sure the output folder exists; to_csv does not create missing directories.
out_dir = os.path.join(path_data, "worldcover")
os.makedirs(out_dir, exist_ok=True)

# The helper "id" column is dropped from the export; the DataFrame index is
# still written as the first column (to_csv default).
res.drop("id", axis=1).to_csv(os.path.join(out_dir, "po_train_survey_points_with_worldcover.csv"))

In [None]:
# Display the comma-joined survey ids stored for each location.
res["surveyId"]

In [22]:
# Number of occurrence rows per surveyId, most frequent first.
tmp = df_po.value_counts(subset=["surveyId"])

# Show every row that belongs to the survey at the head of `tmp`.
is_top_survey = df_po["surveyId"] == tmp.index[0]
df_po.loc[is_top_survey]

Unnamed: 0,publisher,year,month,day,lat,lon,geoUncertaintyInM,taxonRank,date,dayOfYear,speciesId,surveyId
2321490,iNaturalist.org,2020,11.0,1.0,44.83757,7.240146,31.0,SPECIES,2020-11-01,306,6753.0,1790060
2321491,iNaturalist.org,2020,5.0,2.0,44.83757,7.240146,31.0,SPECIES,2020-05-02,123,11028.0,1790060
2321492,iNaturalist.org,2020,4.0,12.0,44.83757,7.240146,31.0,SPECIES,2020-04-12,103,4748.0,1790060
2321493,iNaturalist.org,2020,5.0,14.0,44.83757,7.240146,31.0,SPECIES,2020-05-14,135,2852.0,1790060
2321494,iNaturalist.org,2020,5.0,3.0,44.83757,7.240146,31.0,SPECIES,2020-05-03,124,1842.0,1790060
...,...,...,...,...,...,...,...,...,...,...,...,...
2321909,iNaturalist.org,2020,4.0,24.0,44.83757,7.240146,31.0,SPECIES,2020-04-24,115,6612.0,1790060
2321910,iNaturalist.org,2020,6.0,14.0,44.83757,7.240146,31.0,SPECIES,2020-06-14,166,2249.0,1790060
2321911,iNaturalist.org,2020,7.0,1.0,44.83757,7.240146,31.0,SPECIES,2020-07-01,183,8410.0,1790060
2321912,iNaturalist.org,2020,5.0,11.0,44.83757,7.240146,31.0,SPECIES,2020-05-11,132,9317.0,1790060


In [19]:
# Distinct species ids recorded for the survey at the head of `tmp`.
df_po.loc[df_po["surveyId"] == tmp.index[0], "speciesId"].unique()

array([ 6753., 11028.,  4748.,  2852.,  1842.,  1041.,  6686.,  2783.,
        1683.,   254.,  8383.,  3747.,   394.,  8654.,  8549.,  6612.,
         499.,  3451.,   477.,  6273.,  4341.,  2122.,  1472.,  2249.,
       10320.,  6171.,  8746.,  4871.,  7817.,  1124.,  4558.,   791.,
        7837.,  4102.,  7648.,  1793.,  2237., 11140.,  3438.,   976.,
        9669.,  3483.,  3792.,  3918.,  5445.,  7395., 10711.,  9024.,
        8224.,  3850.,  5071.,  4728.,  9610.,  6358., 10427.,  7576.,
       10852.,  8760.,  5850.,  6989.,  3969.,  2474.,  5704.,  1424.,
        7582.,   140.,  6666., 10113.,  5003.,  7149.,  7109.,  1201.,
        1746.,  2643.,  2893.,  9376.,  2142.,   740.,  2753.,  6634.,
        9388.,  3530.,  4609.,   981., 10683.,   423.,   130.,  6716.,
        3049.,  9317.,  6190.,  2250.,  7852.,  7880.,  6388., 10778.,
        4498.,   848.,  2885.,  5093.,  5499.,  8084.,  2386.,  8508.,
        9515.,  3594.,  1020.,  8818.,  7327.,  1995.,  9028.,  2569.,
      

In [None]:
# First 10000 entries of the count table.
tmp.head(10000)

In [None]:
# Index entry of the first row of `tmp`; value_counts sorts by count in
# descending order by default, so this is the most frequent key.
tmp.index[0]

In [None]:
# NOTE(review): `gdf_points_po` is not assigned in any cell visible in this
# notebook; on a fresh kernel this cell would raise NameError -- confirm
# where it is defined or remove the cell.
gdf_points_po

In [20]:
# Rebind `tmp` to the number of occurrence rows per (lat, lon) pair.
tmp = df_po.value_counts(subset=["lat", "lon"])

In [21]:
# Show every row whose (lat, lon) equals the pair at the head of `tmp`.
coords = df_po[["lat", "lon"]]
at_top_location = (coords == tmp.index[0]).all(axis=1)
df_po.loc[at_top_location]

Unnamed: 0,publisher,year,month,day,lat,lon,geoUncertaintyInM,taxonRank,date,dayOfYear,speciesId,surveyId
297957,iNaturalist.org,2021,9.0,9.0,44.83757,7.240146,31.0,SPECIES,2021-09-09,252,8383.0,230642
297958,iNaturalist.org,2021,8.0,13.0,44.83757,7.240146,31.0,SPECIES,2021-08-13,225,6171.0,230642
297959,iNaturalist.org,2021,5.0,20.0,44.83757,7.240146,31.0,SPECIES,2021-05-20,140,7149.0,230642
297960,iNaturalist.org,2021,4.0,7.0,44.83757,7.240146,31.0,SPECIES,2021-04-07,97,2122.0,230642
297961,iNaturalist.org,2021,3.0,10.0,44.83757,7.240146,31.0,SPECIES,2021-03-10,69,6171.0,230642
...,...,...,...,...,...,...,...,...,...,...,...,...
2321909,iNaturalist.org,2020,4.0,24.0,44.83757,7.240146,31.0,SPECIES,2020-04-24,115,6612.0,1790060
2321910,iNaturalist.org,2020,6.0,14.0,44.83757,7.240146,31.0,SPECIES,2020-06-14,166,2249.0,1790060
2321911,iNaturalist.org,2020,7.0,1.0,44.83757,7.240146,31.0,SPECIES,2020-07-01,183,8410.0,1790060
2321912,iNaturalist.org,2020,5.0,11.0,44.83757,7.240146,31.0,SPECIES,2020-05-11,132,9317.0,1790060


In [None]:
# Element-wise comparison of the lat/lon columns with the first key of `tmp`.
# NOTE(review): `gdf_points_po` is not assigned in any cell visible in this
# notebook -- confirm where it is defined or remove the cell.
gdf_points_po.loc[:,["lat","lon"]] == tmp.index[0]