In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import numpy as np
import rasterio
import tqdm
import pandas as pd
import rasterio
import random
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Build the per-survey metadata table and the surveyId -> species-list labels
# for the PA training set, plus the elevation / soil covariate tables.
path_data = "/home/gt/DATA/geolifeclef-2025"
train_metadata = pd.read_csv(os.path.join(path_data, "GLC25_PA_metadata_train.csv"))
train_metadata = train_metadata.dropna(subset="speciesId").reset_index(drop=True)
train_metadata['speciesId'] = train_metadata['speciesId'].astype(int)
# Labels must be collected before duplicate surveys are dropped below.
train_label_dict = train_metadata.groupby('surveyId')['speciesId'].apply(list).to_dict()
train_metadata = train_metadata.drop_duplicates(subset="surveyId").reset_index(drop=True).drop('speciesId', axis=1)
# Assign the filled column back: `df[col].fillna(..., inplace=True)` acts on a
# temporary under pandas Copy-on-Write and would leave the frame unchanged.
# NOTE(review): other cells of this notebook count -inf entries in areaInM2;
# if any are present the mean used here is -inf as well — confirm.
train_metadata['areaInM2'] = train_metadata['areaInM2'].fillna(train_metadata['areaInM2'].mean())
# One boolean indicator column per frequently named country, plus a catch-all.
train_metadata["conFra"] = train_metadata["country"] == "France"
train_metadata["conDen"] = train_metadata["country"] == "Denmark"
train_metadata["conNet"] = train_metadata["country"] == "Netherlands"
train_metadata["conIta"] = train_metadata["country"] == "Italy"
train_metadata["conOther"] = ~train_metadata["country"].isin(["France","Denmark","Netherlands","Italy"])
train_elevation = pd.read_csv(os.path.join(path_data, "EnvironmentalValues", "Elevation", "GLC25-PA-train-elevation.csv"))
train_elevation['Elevation'] = train_elevation['Elevation'].fillna(train_elevation['Elevation'].mean())
train_soil = pd.read_csv(os.path.join(path_data, "EnvironmentalValues", "SoilGrids", "GLC25-PA-train-soilgrids.csv"))
meta_columns = ["areaInM2", "Elevation", "conFra", "conDen", "conNet", "conIta", "conOther"]

In [None]:
# Reload the PA training metadata CSV from scratch and display it.
path_data = "/home/gt/DATA/geolifeclef-2025"
train_metadata = pd.read_csv(os.path.join(path_data, "GLC25_PA_metadata_train.csv"))
# NOTE(review): the other loading cells call dropna(subset="speciesId") before
# this cast; astype(int) raises if the column still holds NaN — confirm.
train_metadata['speciesId'] = train_metadata['speciesId'].astype(int)
train_metadata

In [None]:
# How many areaInM2 entries are -inf, and how many are missing.
display(train_metadata["areaInM2"].eq(-np.inf).sum())
display(train_metadata["areaInM2"].isna().sum())

In [None]:
# Collapse the metadata to one row per survey: first value for the location
# fields, the distinct areas / uncertainties as lists, all species ids as a list.
tmp = train_metadata.groupby("surveyId").agg({"surveyId":"first", "lat":"first", "lon":"first", "areaInM2":lambda x: list(x.unique()), "geoUncertaintyInM":lambda x: list(x.unique()), "region":"first", "country":"first", "speciesId":list})
tmp = tmp.rename(columns={"areaInM2": "area", "geoUncertaintyInM": "uncert"})
# Keep the raw list next to the scalar summary derived from it.
tmp.insert(tmp.columns.get_loc("area") + 1, "areaList", tmp["area"])
# Step 1 (diagnostic): plain mean, printed to show how many -inf / NaN survive.
tmp["area"] = tmp["areaList"].apply(np.mean)
print((tmp.area == -np.inf).sum(), tmp.area.isna().sum())
# Step 2: mean over the non-infinite entries only; 1.0 when every entry is infinite.
tmp["area"] = tmp["areaList"].apply(lambda x: 1.0 if np.isinf(x).all() else np.mean(x, where=~np.isinf(x)))
print((tmp.area == -np.inf).sum(), tmp.area.isna().sum())
# Step 3: fill what is still missing with the column mean. Assigned back rather
# than `tmp['area'].fillna(..., inplace=True)`, which acts on a temporary under
# pandas Copy-on-Write and would leave `tmp` unchanged.
tmp["area"] = tmp["area"].fillna(tmp["area"].mean())
print((tmp.area == -np.inf).sum(), tmp.area.isna().sum())
tmp.insert(tmp.columns.get_loc("uncert") + 1, "uncertList", tmp["uncert"])
tmp["uncert"] = tmp["uncert"].apply(np.mean)
tmp["speciesId"] = tmp["speciesId"].apply(np.sort)
# spLenOrig counts every record, spLen only the distinct species ids.
tmp["spLenOrig"] = tmp["speciesId"].apply(len)
tmp["spLen"] = tmp["speciesId"].apply(np.unique).apply(len)
tmp = tmp.sort_values(["spLen","area"], ascending=[True,False])

In [None]:
# Load the WorldCover class per survey point, index it by surveyId, and put the
# class next to the per-survey summary table.
train_worldcover = (
    pd.read_csv(os.path.join(path_data, "worldcover", "s2_pa_train_survey_points_with_worldcover.csv"))
    .astype({"surveyId": int})
    .set_index("surveyId")
)
comb = pd.concat(
    [tmp.sort_index(), train_worldcover.sort_values("surveyId")["class"]],
    axis=1,
)
comb

In [None]:
# Surveys whose WorldCover class is 100, ordered by country and then latitude
# (up to 100 rows rendered).
with pd.option_context('display.max_rows', 100):
    display(comb.loc[comb["class"] == 100].sort_values(by=["country", "lat"]))

In [None]:
# Per-country count of surveys that list exactly one species, then the first 20
# Polish surveys after ordering by richness (asc), area (desc), country (asc).
with pd.option_context('display.max_rows', 20):
    print(comb[comb["spLen"] == 1].value_counts("country"))
    tmp = comb.sort_values(by=["spLen", "area", "country"], ascending=[True, False, True])
    display(tmp[tmp["country"] == "Poland"].head(20))

In [None]:
# Line plot: how many surveys have each species count (order of appearance in tmp).
fig, ax = plt.subplots()
ax.plot(tmp["spLen"].value_counts(sort=False))
ax.set_xlabel("number of species")
ax.set_ylabel("counts in PA train")
plt.show()

In [None]:
# Selected land-cover columns plus float-valued one-hot WorldCover classes,
# concatenated column-wise.
train_landcover = pd.read_csv(
    os.path.join(path_data, "EnvironmentalValues", "LandCover", "GLC25-PA-train-landcover.csv"),
    index_col=0,
).iloc[:, [0, 2, 3, 5, 8, 11, 12]]
# Adding 0.0 turns the indicator columns into floats.
train_wcdummy = pd.get_dummies(train_worldcover["class"], prefix="wc") + 0.0
covercomb = pd.concat([train_landcover, train_wcdummy], axis=1)

In [None]:
# Number of missing values in each selected train land-cover column.
train_landcover.isna().sum(0)

In [None]:
# Load the test land-cover table (first CSV column as index) and count the
# missing values per column.
test_landcover = pd.read_csv(os.path.join(path_data, "EnvironmentalValues", "LandCover", "GLC25-PA-test-landcover.csv"), index_col=0)
test_landcover.isna().sum(0)

In [None]:
# Look up a hand-picked list of surveyIds in the combined table.
comb.loc[comb.surveyId.isin([277331, 797067, 1295919, 2081447, 2436311, 2820287, 3373101, 3496885])]

In [None]:
# Spearman correlation matrix of the land-cover / WorldCover covariates, drawn
# as a heat map with the numeric column names as tick labels.
numeric_cols = covercomb.select_dtypes(['number']).columns
tick_positions = range(len(numeric_cols))
f = plt.figure(figsize=(19, 15))
plt.matshow(covercomb.corr("spearman"), fignum=f.number, vmin=-1, vmax=1, cmap="RdBu")
plt.xticks(tick_positions, numeric_cols, fontsize=14, rotation=45, ha="left")
plt.yticks(tick_positions, numeric_cols, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);
plt.show()


In [None]:
# Log-log scatter of the `area` column against the species count per row of tmp.
fig, ax = plt.subplots()
ax.scatter(tmp["area"], tmp["spLen"])
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel("area")
ax.set_ylabel("richness")

In [None]:
# Log-log scatter of the record count per survey (spLenOrig) against the number
# of distinct species (spLen), with two reference lines.
fig = plt.figure()
ax = plt.gca()
ax.scatter(tmp.spLenOrig, tmp.spLen)
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel("original richness")
ax.set_ylabel("actual richness")
# Identity line y = x. It is anchored at (1, 1) and (10, 10) rather than at the
# origin: 0 has no finite position on a log axis, so a line through (0, 0)
# cannot be placed on these axes.
plt.axline([1,1],[10,10], color="gray")
# Dashed reference y = x / 10.
plt.axline([10,1],[100,10], color="gray", linestyle="dashed")

In [None]:
# Rows of the train WorldCover table whose class equals 70.
train_worldcover[train_worldcover["class"]==70]

In [None]:
# Re-read the train WorldCover table (surveyId kept as a regular column), show
# the class frequencies, and build one indicator column per class keyed by surveyId.
train_worldcover = pd.read_csv(
    os.path.join(path_data, "worldcover", "s2_pa_train_survey_points_with_worldcover.csv")
)
display(train_worldcover.value_counts("class"))
train_wcdummy = pd.get_dummies(train_worldcover["class"], prefix="wc")
train_wcdummy = train_wcdummy.set_index(train_worldcover["surveyId"])
train_wcdummy

In [None]:
# Class frequencies of the test WorldCover table. The table is read here so the
# cell works on a fresh kernel: no earlier cell in the notebook defines
# `test_worldcover`.
test_worldcover = pd.read_csv(os.path.join(path_data, "worldcover", "pa_test_survey_points_with_worldcover.csv"))
test_worldcover.value_counts("class")

In [None]:
# One indicator column per WorldCover class of the test points, keyed by surveyId.
test_worldcover = pd.read_csv(os.path.join(path_data, "worldcover", "pa_test_survey_points_with_worldcover.csv"))
tmp = pd.get_dummies(test_worldcover["class"], prefix="wc").set_index(test_worldcover.surveyId)
# Add an all-False wc_70 column when the test data has no class 70. The position
# is derived from the class codes instead of a fixed index, so the column lands
# in numeric order whatever classes are present, and nothing is inserted (and
# nothing raises) when wc_70 already exists.
if "wc_70" not in tmp.columns:
    wc70_position = sum(float(col.split("_", 1)[1]) < 70 for col in tmp.columns)
    tmp.insert(wc70_position, "wc_70", False)
tmp

In [None]:
# Display the test WorldCover table.
test_worldcover

In [None]:
# Re-encode species ids as consecutive class codes, pooling rare species.
path_data = "/home/gt/DATA/geolifeclef-2025"
# Defined here because this is the first cell that uses it; without it a
# top-to-bottom run fails with a NameError (the only other definition comes later).
pa_presence_threshold = 10
train_metadata = pd.read_csv(os.path.join(path_data, "GLC25_PA_metadata_train.csv"))
train_metadata = train_metadata.dropna(subset="speciesId").reset_index(drop=True)
train_metadata['speciesId'] = train_metadata['speciesId'].astype(int)
train_metadata["speciesIdOrig"] = train_metadata['speciesId']
# Boolean Series indexed by species id: True where the species has at least
# `pa_presence_threshold` records.
tmp = train_metadata["speciesId"].value_counts() >= pa_presence_threshold
# Every species below the threshold is pooled under the id -1.
train_metadata.loc[~train_metadata["speciesId"].isin(tmp[tmp].index), "speciesId"] = -1
sp_categorical = train_metadata["speciesId"].astype("category").values
num_classes = len(sp_categorical.categories)
# Replace the ids by their integer category codes.
train_metadata['speciesId'] = sp_categorical.codes
train_label_series = train_metadata.groupby('surveyId')['speciesId'].apply(list)
train_label_series

In [None]:
# Directories of the three training modalities, then the training metadata with
# rows lacking a species id removed and the id cast to int.
train_path_sentinel, train_path_landsat, train_path_bioclim = (
    os.path.join(path_data, subdir)
    for subdir in (
        "SatelitePatches/PA-train",
        "SateliteTimeSeries-Landsat/cubes/PA-train",
        "BioclimTimeSeries/cubes/PA-train",
    )
)
train_metadata = (
    pd.read_csv(os.path.join(path_data, "GLC25_PA_metadata_train.csv"))
    .dropna(subset="speciesId")
    .reset_index(drop=True)
    .astype({"speciesId": int})
)

In [None]:
# For every species id, the list of surveyIds it occurs in, as a Series and as
# a one-column DataFrame.
train_label_series = train_metadata.groupby('speciesId')['surveyId'].apply(list)
species_label = train_label_series.to_frame()

In [None]:
# Keep only the records of species that have at least `pa_presence_threshold` rows.
pa_presence_threshold = 10
# Boolean Series indexed by species id: True where the species is frequent enough.
tmp = train_metadata["speciesId"].value_counts() >= pa_presence_threshold
# .copy() makes the filtered frame independent of the original, so the column
# assignment done on it in a later cell is not a write into a slice.
train_metadata = train_metadata.loc[train_metadata["speciesId"].isin(tmp[tmp].index)].copy()
train_metadata

In [None]:
# Convert speciesId to a pandas Categorical and show its distinct categories.
sp_categorical = train_metadata["speciesId"].astype("category").values
sp_categorical.categories

In [None]:
# Map every category code back to its category value and store the result as
# a new column, speciesIdNew.
train_metadata["speciesIdNew"] = sp_categorical.categories[sp_categorical.codes]

In [None]:
# Scratch check: integer value of the float literal 1e6.
int(1e6)

In [None]:
# Display the current train_metadata frame.
train_metadata

In [None]:
# Load the train elevation table, mean-impute missing elevations, and print
# shape plus per-column counts of -inf, +inf and NaN.
train_elevation = pd.read_csv(os.path.join(path_data, "EnvironmentalValues", "Elevation", "GLC25-PA-train-elevation.csv"))
# Assigned back: an inplace fillna on `df[col]` acts on a temporary under
# pandas Copy-on-Write and would leave the frame unchanged.
train_elevation['Elevation'] = train_elevation['Elevation'].fillna(train_elevation['Elevation'].mean())
# The second count checks +inf (the -inf count used to be printed twice).
print(train_elevation.shape, train_elevation.isin([-np.inf]).sum(0).values, train_elevation.isin([np.inf]).sum(0).values, train_elevation.isna().sum(0).values)

In [None]:
# Load the test elevation table, mean-impute missing elevations, and print
# shape plus per-column counts of -inf, +inf and NaN.
test_elevation = pd.read_csv(os.path.join(path_data, "EnvironmentalValues", "Elevation", "GLC25-PA-test-elevation.csv"))
# Assigned back: an inplace fillna on `df[col]` acts on a temporary under
# pandas Copy-on-Write and would leave the frame unchanged.
test_elevation['Elevation'] = test_elevation['Elevation'].fillna(test_elevation['Elevation'].mean())
# The second count checks +inf (the -inf count used to be printed twice).
print(test_elevation.shape, test_elevation.isin([-np.inf]).sum(0).values, test_elevation.isin([np.inf]).sum(0).values, test_elevation.isna().sum(0).values)

In [None]:
# Load the train SoilGrids table, mean-impute every column, and print shape plus
# per-column counts of -inf, +inf and NaN (one array per line).
train_soil = pd.read_csv(os.path.join(path_data, "EnvironmentalValues", "SoilGrids", "GLC25-PA-train-soilgrids.csv"))
# Each column is assigned back: an inplace fillna on `df[col]` acts on a
# temporary under pandas Copy-on-Write and would leave the frame unchanged.
for column in train_soil.columns:
    train_soil[column] = train_soil[column].fillna(train_soil[column].mean())
# The second count checks +inf (the -inf count used to be printed twice).
print(train_soil.shape, train_soil.isin([-np.inf]).sum(0).values, train_soil.isin([np.inf]).sum(0).values, train_soil.isna().sum(0).values, sep='\n')

In [None]:
# Load the test SoilGrids table, mean-impute every column, and print shape plus
# per-column counts of -inf, +inf and NaN (one array per line).
test_soil = pd.read_csv(os.path.join(path_data, "EnvironmentalValues", "SoilGrids", "GLC25-PA-test-soilgrids.csv"))
# Each column is assigned back: an inplace fillna on `df[col]` acts on a
# temporary under pandas Copy-on-Write and would leave the frame unchanged.
for column in test_soil.columns:
    test_soil[column] = test_soil[column].fillna(test_soil[column].mean())
# The second count checks +inf (the -inf count used to be printed twice).
print(test_soil.shape, test_soil.isin([-np.inf]).sum(0).values, test_soil.isin([np.inf]).sum(0).values, test_soil.isna().sum(0).values, sep='\n')

In [None]:
# Attach elevation and soil covariates to the metadata rows by surveyId.
# A column-wise concat would pair rows by their index labels, which only works
# if all three frames share the same row index; joining on surveyId is correct
# whatever the row order or row count of each table.
# The first column of the elevation CSV is used as the survey key (the same
# column another cell of this notebook loads as the index and compares against
# surveyId).
elevation_by_survey = train_elevation.set_index(train_elevation.columns[0])["Elevation"]
soil_by_survey = train_soil.set_index("surveyId")
combined = (
    train_metadata
    .join(elevation_by_survey, on="surveyId")
    .join(soil_by_survey, on="surveyId")
)
print(combined.isna().sum(0))
combined

In [None]:
# Covariate names: area, elevation, then every soil column except the first one.
meta_columns = ["areaInM2", "Elevation", *train_soil.columns[1:]]
meta_columns

In [None]:
# Number of rows for each (areaInM2, country) combination.
train_metadata.value_counts(["areaInM2", "country"])

In [None]:
# Reload the raw metadata and list the species ids recorded for surveyId 212.
train_metadata = pd.read_csv(os.path.join(path_data, "GLC25_PA_metadata_train.csv"))
species_by_survey = train_metadata.groupby('surveyId')['speciesId'].apply(list)
species_by_survey[212]

In [None]:
# Shape of the second element of the first dataset sample.
# NOTE(review): in the visible notebook `train_dataset` is only assigned in a
# later cell, so this fails on a fresh top-to-bottom run — confirm cell order.
print(train_dataset[0][1].shape)
# Disabled scan over the dataset; kept as left by the author.
#val = np.zeros([len(train_dataset)])
#for i, d in enumerate(tqdm.tqdm(train_dataset)):
#    val[i] = torch.sum(torch.sum(torch.reshape(torch.permute(d[1], [0,2,1]), [6, -1]), -2) == 0).numpy()

In [None]:
# Record count per species id (sorted by id), restricted to species with at
# least `prev_min` records. The former first statement of this cell built a
# sorted list of distinct ids and discarded it, so it has been dropped.
prev_min = 10
train_metadata.value_counts("speciesId").sort_index().pipe(lambda x: x[x>=prev_min])

In [None]:
# Plot log10 of the per-species row counts, in the order value_counts returns them.
plt.plot(np.log10(train_metadata.value_counts("speciesId").values))

In [None]:
# Load the PA test metadata CSV and display it.
test_metadata = pd.read_csv(os.path.join(path_data, "GLC25_PA_metadata_test.csv"))
test_metadata

In [None]:
# The ten countries with the most rows in train_metadata, most frequent first.
# The result is bound to a name because the following cell reads
# `most_train_countries`, which was otherwise never assigned. The former
# cumsum / division step is dropped: it changed only the values, and only the
# index labels are kept here.
most_train_countries = train_metadata["country"].value_counts().index[:10].values
most_train_countries

In [None]:

# Number of test rows whose country is contained in `most_train_countries`.
# NOTE(review): this cell does not assign `most_train_countries`; it must
# already exist in the kernel namespace — presumably a list of the most
# frequent training countries, verify where it is defined.
test_metadata.country.isin(most_train_countries).sum()

In [None]:
# Display the current train_metadata frame.
train_metadata

In [None]:
# One row per survey, then the number and share of surveys per country.
train_metadata_surv = train_metadata.drop_duplicates(subset="surveyId")
train_summary = train_metadata_surv.value_counts("country").to_frame()
train_summary["prop"] = train_summary["count"] / train_summary["count"].sum()
train_summary

In [None]:
# Number, share and running share of test rows per country.
test_summary = test_metadata.value_counts("country").to_frame()
test_summary["prop"] = test_summary["count"] / test_summary["count"].sum()
test_summary["cumprop"] = test_summary["prop"].cumsum()
test_summary

In [None]:
# Put the test and train country summaries side by side, matched on the index;
# columns present in both get the _test / _train suffix.
test_summary.join(train_summary, lsuffix="_test", rsuffix="_train")

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU, plus all CUDA devices when CUDA is available). With CUDA
    present, cuDNN is also switched to deterministic mode and its benchmark
    autotuner is disabled.

    Args:
        seed: Integer seed passed to each generator.
    """
    random.seed(seed)        # Python's built-in generator (previously left unseeded)
    np.random.seed(seed)     # NumPy's global generator
    torch.manual_seed(seed)  # PyTorch generator
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # Deterministic cuDNN behaviour; benchmark mode is turned off with it.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# Assemble the covariate table, normalise it, split train / validation and
# build the datasets and loaders.
# NOTE(review): validation_prop, batch_size, num_workers, the transform_*
# objects, TrainDataset and DataLoader are not defined in the visible part of
# the notebook; they must already exist in the kernel namespace.
set_seed(42)
path_data = "/home/gt/DATA/geolifeclef-2025"
train_path_sentinel = os.path.join(path_data, "SatelitePatches/PA-train")
train_path_landsat = os.path.join(path_data, "SateliteTimeSeries-Landsat/cubes/PA-train")
train_path_bioclim = os.path.join(path_data, "BioclimTimeSeries/cubes/PA-train")
train_metadata = pd.read_csv(os.path.join(path_data, "GLC25_PA_metadata_train.csv"))
train_metadata = train_metadata.dropna(subset="speciesId").reset_index(drop=True)
train_metadata['speciesId'] = train_metadata['speciesId'].astype(int)
train_metadata["speciesIdOrig"] = train_metadata['speciesId']
# Species with fewer than pa_presence_threshold records are pooled under id -1,
# then all ids are replaced by consecutive category codes.
tmp = train_metadata["speciesId"].value_counts() >= pa_presence_threshold
train_metadata.loc[~train_metadata["speciesId"].isin(tmp[tmp].index), "speciesId"] = -1
sp_categorical = train_metadata["speciesId"].astype("category").values
num_classes = len(sp_categorical.categories)
train_metadata['speciesId'] = sp_categorical.codes

# One row per survey: first value of the location fields, distinct areas as a
# list, all species codes as a list.
tmp = train_metadata.groupby("surveyId").agg({"surveyId":"first", "lat":"first", "lon":"first", "areaInM2":lambda x: list(x.unique()), "region":"first", "country":"first", "speciesId":list})
train_label_series = tmp.set_index("surveyId").speciesId
train_metadata = tmp.drop(columns=["speciesId"]).set_index("surveyId", drop=False)
# Mean of the non-infinite areas; 1.0 when every recorded area is infinite.
train_metadata["area"] = train_metadata["areaInM2"].apply(lambda x: 1.0 if np.isinf(x).all() else np.mean(x, where=~np.isinf(x)))
train_metadata["areaLog"] = np.log10(train_metadata["area"])

# Filled columns are assigned back: `df[col].fillna(..., inplace=True)` acts on
# a temporary under pandas Copy-on-Write and would leave the frame unchanged.
train_metadata['area'] = train_metadata['area'].fillna(train_metadata['area'].mean())
train_metadata['areaLog'] = train_metadata['areaLog'].fillna(train_metadata['areaLog'].mean())
train_metadata["conFra"] = train_metadata["country"] == "France"
train_metadata["conDen"] = train_metadata["country"] == "Denmark"
train_metadata["conNet"] = train_metadata["country"] == "Netherlands"
train_metadata["conIta"] = train_metadata["country"] == "Italy"
train_metadata["conAus"] = train_metadata["country"] == "Austria"
train_metadata["conOther"] = ~train_metadata["country"].isin(["France","Denmark","Netherlands","Italy","Austria"])
train_elevation = pd.read_csv(os.path.join(path_data, "EnvironmentalValues", "Elevation", "GLC25-PA-train-elevation.csv"), index_col=0)
train_elevation['Elevation'] = train_elevation['Elevation'].fillna(train_elevation['Elevation'].mean())
train_soil = pd.read_csv(os.path.join(path_data, "EnvironmentalValues", "SoilGrids", "GLC25-PA-train-soilgrids.csv"), index_col=0)
for column in train_soil.columns:
    train_soil[column] = train_soil[column].fillna(train_soil[column].mean())
train_worldcover = pd.read_csv(os.path.join(path_data, "worldcover", "s2_pa_train_survey_points_with_worldcover.csv"), index_col=0)
train_wcdummy = pd.get_dummies(train_worldcover["class"], prefix="wc")
train_wcdummy.drop(columns="wc_70", inplace=True)
train_wcdummy.drop(columns="wc_100", inplace=True)
train_landcover = pd.read_csv(os.path.join(path_data, "EnvironmentalValues", "LandCover", "GLC25-PA-train-landcover.csv"), index_col=0)
landcover_col_ind = [0,2,3,5,8,11,12]
train_landcover = train_landcover.iloc[:, landcover_col_ind]

# The column-wise concat below relies on every table sharing the same index.
print("All rows match: ", (train_metadata.index==train_elevation.index).all() and (train_metadata.index==train_soil.index).all() \
     and (train_metadata.index==train_worldcover.index).all() and (train_metadata.index==train_landcover.index).all())
cov_columns = ["areaLog", "Elevation", "conFra", "conDen", "conNet", "conIta", "conAus", "conOther"] + list(train_soil.columns) + list(train_wcdummy.columns) + list(train_landcover.columns)
train_combined = pd.concat([train_metadata, train_elevation.Elevation, train_soil, train_wcdummy, train_landcover], axis=1)
cov_norm_coef = train_combined.loc[:,cov_columns].agg(['mean', 'std'])
dummy_columns = ["conFra","conDen","conNet","conIta","conAus","conOther"] + list(train_wcdummy.columns)
# Indicator columns keep their 0/1 values: mean 0 and std 1 make the scaling a no-op.
cov_norm_coef.loc["mean",dummy_columns] = 0
cov_norm_coef.loc["std",dummy_columns] = 1
# Replace the columns wholesale instead of writing through `.loc[:, cols] = ...`:
# the scaled values are floats, and writing floats into the existing boolean
# indicator columns is an incompatible-dtype assignment.
train_combined[cov_columns] = (train_combined[cov_columns] - cov_norm_coef.loc["mean"]) / cov_norm_coef.loc["std"]

# Random validation split by surveyId; groupby on the boolean mask yields the
# False group (train) first and the True group (validation) second.
val_ind = np.sort(train_combined.surveyId.sample(frac=validation_prop).values)
train_data, val_data = [x.reset_index(drop=True) for _, x in train_combined.groupby(train_combined.surveyId.isin(val_ind))]
train_label_dict = train_label_series[train_data.surveyId].to_dict()
val_label_dict = train_label_series[val_data.surveyId].to_dict()
train_dataset = TrainDataset(train_path_sentinel, train_path_landsat, train_path_bioclim, train_data, cov_columns, train_label_dict, 
                             subset="train", num_classes=num_classes, transform_sentinel=transform_sentinel, transform_landsat=transform_landsat)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_dataset = TrainDataset(train_path_sentinel, train_path_landsat, train_path_bioclim, val_data, cov_columns, val_label_dict,
                           subset="train", num_classes=num_classes, transform_sentinel=transform_sentinel_test, transform_landsat=transform_landsat_test)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
print(train_dataset[0][0].shape, val_dataset[0][2].shape)

In [None]:
# Put a saved prediction file next to the test metadata (both indexed by a CSV
# column) and keep the rows that have no prediction.
pred = pd.read_csv(
    "prithvi/0424_225841/0425_012850_e075_vloss0.008629_vf0.3079.csv",
    index_col=0,
)
test_metadata = pd.read_csv(
    os.path.join(path_data, "GLC25_PA_metadata_test.csv"),
    index_col=-1,  # last CSV column becomes the index
)
pred = pd.concat([test_metadata, pred], axis=1)
pred_na = pred[pred["predictions"].isna()]

In [None]:
# Display the rows that have no prediction.
pred_na

In [None]:
# Per country: number of test rows next to the number of rows without a
# prediction; countries missing from either count get 0.
pd.concat([test_metadata.value_counts("country"), pred_na.value_counts("country")], axis=1).fillna(0).astype(int)

In [None]:
# Load the train snow-cover table (first CSV column as index), sort it by index
# and display it.
train_snow = pd.read_csv(os.path.join(path_data, "EnvironmentalValues", "chelsa_snow", "pa_train_snowcover_chelsa_scd.csv"), index_col=0).sort_index()
train_snow

In [None]:
# Load the test snow-cover table (first CSV column as index), sort it by index
# and display it.
test_snow = pd.read_csv(os.path.join(path_data, "EnvironmentalValues", "chelsa_snow", "pa_test_snowcover_chelsa_scd.csv"), index_col=0).sort_index()
test_snow

In [None]:
# Load a table into `train_human` and display it (the cell used to display
# `train_snow`, i.e. not the frame it had just loaded).
# TODO(review): the path still points at the CHELSA snow-cover CSV, identical to
# the one loaded into `train_snow`; replace it with the intended file.
train_human = pd.read_csv(os.path.join(path_data, "EnvironmentalValues", "chelsa_snow", "pa_train_snowcover_chelsa_scd.csv"), index_col=0).sort_index()
train_human