In [83]:
import numpy as np
import pandas as pd
from pathlib import Path
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from skimage.io import imread
from tqdm.auto import tqdm

In [69]:
path = Path("../data")
obs_fr = pd.read_csv(path / "observations" / f"observations_fr_train.csv", sep=";")
obs_us = pd.read_csv(path / "observations" / f"observations_us_train.csv", sep=";")
obs = pd.concat([obs_fr, obs_us])

obs_fr_test = pd.read_csv(path / "observations" / f"observations_fr_test.csv", sep=";")
obs_us_test = pd.read_csv(path / "observations" / f"observations_us_test.csv", sep=";")
obs_test = pd.concat([obs_fr_test, obs_us_test])

In [57]:
obs

Unnamed: 0,observation_id,latitude,longitude,species_id,subset
0,10561949,45.705116,1.424622,241,train
1,10131188,45.146973,6.416794,101,train
2,10799362,46.783695,-2.072855,700,train
3,10392536,48.604866,-2.825003,1456,train
4,10335049,48.815567,-0.161431,157,train
...,...,...,...,...,...
956226,22068171,29.602327,-94.555860,5041,train
956227,22068172,38.029580,-122.883995,8688,train
956228,22068173,37.913720,-121.948814,12154,train
956229,22068174,37.206974,-122.067154,5309,val


In [5]:
obs.head(2)

Unnamed: 0,observation_id,latitude,longitude,species_id,subset
0,10561949,45.705116,1.424622,241,train
1,10131188,45.146973,6.416794,101,train


In [6]:
data_train = obs[obs["subset"] == "train"]
data_val = obs[obs["subset"] == "val"]
data_test = obs_test

In [7]:
# Environmental variables

df_env = pd.read_csv(path / "pre-extracted" / "environmental_vectors.csv", sep=";", index_col="observation_id")

In [8]:
len(df_env.columns)

27

In [9]:
# get train, val, test bioclimatic data
X_train = df_env.loc[data_train.observation_id.values]
X_val = df_env.loc[data_val.observation_id.values]
X_test = df_env.loc[data_test.observation_id.values]


In [10]:
# Landcover dothot encoding

def get_patch(sample, path="../data"):
    country_id = str(sample)[0]
    country = "fr" if country_id == "1" else "us"
    subfolder = str(sample)[-2:]
    subsubfolder = str(sample)[-4:-2]
    return path + "/patches-" + country + "/" + subfolder + "/" + subsubfolder

f_dothot = lambda land, v: (land == v).sum() / 65536

def land_dothot(obs_id):
    patch = get_patch(obs_id)
    land = patch + "/" + str(obs_id) + "_landcover.tif"
    land = imread(land)
    land_dothot = np.zeros(34, dtype="np.float32")
    land_dothot[np.unique(land)] += [f_dothot(land, v) for v in np.unique(land)]
    return land_dothot
    

In [26]:
#land_dothot_list = [land_dothot(obs_id) for obs_id in tqdm(obs["observation_id"].values)]

100%|██████████| 1627475/1627475 [1:19:48<00:00, 339.89it/s]


In [29]:
len(land_dothot_list)

1627475

In [30]:
df_dothot = pd.DataFrame(land_dothot_list) 
df_dothot["observation_id"] = obs["observation_id"].values
df_dothot.to_csv("land_dothot.csv", index=False)
df_dothot

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,observation_id
0,0.0,0.005493,0.220032,0.000000,0.000000,0.002747,0.000000,0.000000,0.076355,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,10561949
1,0.0,0.000000,0.000000,0.075684,0.022583,0.870605,0.000000,0.000000,0.028076,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,10131188
2,0.0,0.013428,0.000000,0.000000,0.000000,0.004272,0.009155,0.004578,0.121521,0.227417,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,10799362
3,0.0,0.011902,0.012512,0.001526,0.040588,0.000000,0.004578,0.000000,0.895630,0.033264,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,10392536
4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,10335049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1627470,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.799362,22068171
1627471,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.014664,0.460693,0.0,0.0,0.0,0.373840,22068172
1627472,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.802841,0.197159,0.000000,0.0,0.0,0.0,0.000000,22068173
1627473,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.493912,0.124237,0.000000,0.039688,0.0,0.0,0.0,0.000000,22068174


In [11]:
df_dothot = pd.read_csv("land_dothot.csv")
df_dothot.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,observation_id
0,0.0,0.005493,0.220032,0.0,0.0,0.002747,0.0,0.0,0.076355,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10561949
1,0.0,0.0,0.0,0.075684,0.022583,0.870605,0.0,0.0,0.028076,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10131188
2,0.0,0.013428,0.0,0.0,0.0,0.004272,0.009155,0.004578,0.121521,0.227417,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10799362
3,0.0,0.011902,0.012512,0.001526,0.040588,0.0,0.004578,0.0,0.89563,0.033264,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10392536
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10335049


In [10]:
# Altitude data

def alt_extraction(obs_id):
    patch = get_patch(obs_id)
    alt = patch + "/" + str(obs_id) + "_altitude.tif"
    alt = imread(alt)
    dif_alt, mean_alt = (alt.max() - alt.min()) / 5000., alt.mean() / 5000.       # min-max normalization max altitude in France is about 4000m and in USA i about 6000m
    return np.array([mean_alt, dif_alt], dtype=np.float32)

In [11]:
alt_extraction(10000000)

array([0.44829172, 0.0248    ], dtype=float32)

In [13]:
#alt_list = [alt_extraction(obs_id) for obs_id in tqdm(obs["observation_id"].values)]

100%|██████████| 1627475/1627475 [25:23<00:00, 1068.16it/s]


In [14]:
df_alt = pd.DataFrame(alt_list) 
df_alt["observation_id"] = obs["observation_id"].values
df_alt.to_csv("alt.csv", index=False)
df_alt

Unnamed: 0,0,1,observation_id
0,0.076467,0.0032,10561949
1,0.316419,0.0140,10131188
2,0.001000,0.0022,10799362
3,0.007368,0.0040,10392536
4,0.041398,0.0014,10335049
...,...,...,...
1627470,0.000242,0.0008,22068171
1627471,0.001234,0.0044,22068172
1627472,0.047532,0.0112,22068173
1627473,0.166207,0.0098,22068174


In [12]:
df_alt = pd.read_csv("alt.csv")

In [50]:
X_train.shape[0] + X_val.shape[0]

1627475

In [48]:
len(alt_list)

1627475

In [63]:
import torch

In [79]:
obs.loc[10561949, "multi_labels"].pop()

1

In [75]:
obs.at[10561949, "multi_labels"] = obs.loc[10561949, "multi_labels"] + [1,2,3]

In [70]:
obs["multi_labels"] = list(map(lambda x: [x], list(obs["species_id"].values)))
obs.set_index("observation_id", inplace=True)
obs

Unnamed: 0_level_0,latitude,longitude,species_id,subset,multi_labels
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10561949,45.705116,1.424622,241,train,[241]
10131188,45.146973,6.416794,101,train,[101]
10799362,46.783695,-2.072855,700,train,[700]
10392536,48.604866,-2.825003,1456,train,[1456]
10335049,48.815567,-0.161431,157,train,[157]
...,...,...,...,...,...
22068171,29.602327,-94.555860,5041,train,[5041]
22068172,38.029580,-122.883995,8688,train,[8688]
22068173,37.913720,-121.948814,12154,train,[12154]
22068174,37.206974,-122.067154,5309,val,[5309]


In [82]:
# US
for lat0, lat1 in tqdm(zip(range(25, 50, 5), range(30, 55, 5))):
    for lon0, lon1 in tqdm(zip(range(-125, -65, 5), range(-120, -60, 5)), leave=False):
        sub_group = obs[(obs["latitude"].between(lat0, lat1) & obs["longitude"].between(lon0, lon1))]
        if not sub_group.empty:
            for i, _obs in sub_group.iterrows():
                _lat, _lon = _obs["latitude"], _obs["longitude"]
                neighbours = sub_group[(sub_group["latitude"].between(_lat - 0.02, _lat + 0.02) & sub_group["longitude"].between(_lon - 0.02, _lon + 0.02))]
                if not neighbours.empty:    
                    neighbours_labels = neighbours["species_id"].values
                    ml_i = obs.loc[i, "multi_labels"].copy()
                    ml_i = ml_i + list(neighbours_labels)
                    obs.at[i, "multi_labels"] = list(set(ml_i))

2it [00:00, 118.43it/s]
2it [00:00, 118.44it/s]
2it [00:01,  1.14it/s]
2it [01:05, 32.59s/it]
2it [00:00,  6.97it/s]
5it [01:07, 13.45s/it]


In [84]:
# FR
for lat0, lat1 in tqdm(zip(range(40, 55, 5), range(45, 60, 5))):
    for lon0, lon1 in tqdm(zip(range(-5, 10, 5), range(0, 15, 5)), leave=False):
        sub_group = obs[(obs["latitude"].between(lat0, lat1) & obs["longitude"].between(lon0, lon1))]
        if not sub_group.empty:
            for i, _obs in sub_group.iterrows():
                _lat, _lon = _obs["latitude"], _obs["longitude"]
                neighbours = sub_group[(sub_group["latitude"].between(_lat - 0.02, _lat + 0.02) & sub_group["longitude"].between(_lon - 0.02, _lon + 0.02))]
                if not neighbours.empty:    
                    neighbours_labels = neighbours["species_id"].values
                    ml_i = obs.loc[i, "multi_labels"].copy()
                    ml_i = ml_i + list(neighbours_labels)
                    obs.at[i, "multi_labels"] = list(set(ml_i))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [90]:
l_labels = list(map(lambda x: len(x), obs["multi_labels"].values))

In [98]:
np.mean(l_labels)

99.05204626799183

In [87]:
obs["multi_labels"].to_csv("obs_multi_labels.csv")

In [14]:
obs[(obs["latitude"].between(35, 40) & obs["longitude"].between(-80, -75))]

Unnamed: 0_level_0,latitude,longitude,species_id,subset,multi_labels
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20000206,38.965546,-77.155220,4933,train,[4933]
20000222,39.263607,-76.584236,4946,train,[4946]
20000232,38.725403,-77.388695,4952,train,[4952]
20000267,38.935577,-76.991300,3072,val,[3072]
20000270,38.021440,-78.498490,2524,train,[2524]
...,...,...,...,...,...
22068101,39.693670,-76.109900,5273,train,[5273]
22068138,39.492905,-76.687065,33,train,[33]
22068144,35.889545,-78.584910,605,train,[605]
22068147,39.824284,-75.674380,6562,train,[6562]


In [None]:
# Extra code to compute class weights numpy file to be used in the model

from sklearn.utils import class_weight
import numpy as np
import torch

y = obs["species_id"].values
c_w = class_weight.compute_class_weight("balanced", classes=np.unique(y), y=y)
class_weights=class_weight.compute_class_weight('balanced',classes=np.unique(y),y=y)
class_weights = (c_w - 1) / 10 + 1
np.savetxt("class_weights.csv", class_weights, delimiter=",")
data = np.loadtxt("class_weights.csv", delimiter=",")