In [None]:
import os

import duckdb
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Location of the DuckDB database file. Set the OBSQ_DUCKDB_PATH environment
# variable to point the notebook at another copy of the database; when it is
# unset the original developer-local file is used, so existing runs behave
# exactly as before.
path = os.environ.get(
    "OBSQ_DUCKDB_PATH",
    "/home/etienne/projects/obsq/work/dev/data/data.duckdb",
)
# Name of the label column that is separated from the features further down.
target = 'expert_match'
# Render floats with five decimal places in pandas output.
pd.set_option('display.float_format', lambda x: '%.5f' % x)


In [None]:
# Columns that drive the stratified split.
species_col = 'species'
spatial_col = 'spatial_cluster'
time_col = 'tempo_year'

# Fraction of rows assigned to the train / validation / test partitions.
train_size, val_size, test_size = 0.7, 0.15, 0.15

# Feature scaler instance; it is only created here, not fitted.
scaler = StandardScaler()

In [14]:
# Bin counts for the quantile-based discretisation of the stratification
# variables. n_time_bins is passed to pd.qcut for the time column;
# n_spatial_bins is presumably meant for the spatial column (verify at use site).
n_spatial_bins, n_time_bins = 6, 10

In [9]:
# Load the complete feature table from DuckDB into a pandas DataFrame.
table = 'combined'
schema = 'features'
con = duckdb.connect(path)
try:
    df = con.execute(f"""SELECT* FROM {schema}.{table}""" ).df()
finally:
    # Release the database handle even when the query fails, so the file is
    # not left open by a half-executed cell.
    con.close()
# Disabled step kept from the original notebook:
#df = df.drop(columns= 'geom')

In [40]:
# Separate the label column (y) from every other column (X).
y = df[target]
X = df.drop(columns=target)

In [41]:
# Create spatial bins (quantile-based, so rows are spread as evenly as the
# data allows). The number of quantiles comes from the n_spatial_bins setting
# instead of a literal, matching how the time column uses n_time_bins.
# duplicates='drop' merges identical bin edges, so fewer bins than requested
# may come back.
spatial_bins = pd.qcut(
    df[spatial_col],
    q=n_spatial_bins,
    labels=False,
    duplicates='drop',
)
spatial_bins.head()

0    0
1    0
2    2
3    0
4    0
Name: spatial_cluster, dtype: int64

In [21]:
# Discretise the time column for stratification.
# Any numeric dtype (int32, nullable Int64, float32, ...) is quantile-binned;
# the previous check only accepted int64/float64 and left time_bins undefined
# for everything else.
if pd.api.types.is_numeric_dtype(df[time_col]):
    time_bins = pd.qcut(df[time_col], q=n_time_bins, labels=False, duplicates='drop')
else:
    # Non-numeric values cannot be quantile-binned: give every distinct value
    # its own integer code (sorted order; missing values get -1).
    time_bins = pd.Series(
        pd.factorize(df[time_col], sort=True)[0],
        index=df.index,
        name=time_col,
    )
time_bins.head()

0    2
1    3
2    3
3    3
4    2
Name: tempo_year, dtype: int64

In [30]:
# Encode species for stratification, pooling infrequent species together.
# Work on an object-typed view so the 'rare_species' label can be written
# regardless of the column's original dtype.
species_values = df[species_col].astype(object)
species_counts = species_values.value_counts()
rare_threshold = len(df) * 0.01  # Species with <1% of observations

# Per-row frequency of that row's species. Missing species are absent from
# value_counts, so they map to NaN here, fail the comparison below and land in
# the rare bucket instead of raising KeyError as a per-row lookup would.
species_freq = species_values.map(species_counts)
species_mapped = species_values.where(species_freq >= rare_threshold, 'rare_species')

# Integer codes (NumPy array) in order of first appearance.
species_labels = pd.factorize(species_mapped)[0]

In [33]:
# Build one combined stratification label per row of the form
# "<spatial bin>_<species code>_<time bin>".
spatial_part = spatial_bins.astype(str)
species_part = species_labels.astype(str)
time_part = time_bins.astype(str)

strat_var = spatial_part + '_' + species_part + '_' + time_part
strat_var.head()

0    0_0_2
1    0_0_3
2    2_0_3
3    0_0_3
4    0_0_2
dtype: object

In [39]:
# Pool strata that are too small to survive two consecutive stratified splits.
# scikit-learn refuses to stratify on a class with a single member. A stratum
# with only two rows can lose one of them to the test set, leaving a singleton
# in the train+val pool and breaking the second split. The minimum size is
# therefore chosen so that the share left after removing the test fraction is
# still expected to hold at least two rows.
min_stratum_size = max(2, int(np.floor(2 / (1 - test_size))) + 1)

strat_counts = strat_var.value_counts()
small_strata = strat_counts[strat_counts < min_stratum_size].index
strat_var = strat_var.where(~strat_var.isin(small_strata), 'mixed_stratum')

# The pooled bucket can itself end up below the minimum (e.g. one lone
# singleton stratum). In that case fold it into the largest regular stratum so
# that no undersized class remains.
n_mixed = int((strat_var == 'mixed_stratum').sum())
regular_counts = strat_counts.drop(small_strata)
if 0 < n_mixed < min_stratum_size and len(regular_counts) > 0:
    strat_var = strat_var.replace('mixed_stratum', regular_counts.idxmax())

strat_var.head(10)

0    0_0_2
1    0_0_3
2    2_0_3
3    0_0_3
4    0_0_2
5    2_0_2
6    1_0_3
7    2_0_3
8    0_0_2
9    1_0_2
dtype: object

In [42]:
# First split: hold out the test partition, stratified on the combined label.
first_split = train_test_split(
    X,
    y,
    test_size=test_size,
    stratify=strat_var,
    random_state=42,
)
X_temp, X_test, y_temp, y_test = first_split

# Stratification labels of the rows left over for the train/validation split.
strat_temp = strat_var.loc[X_temp.index]

In [43]:
# Second split: carve the validation partition out of the remaining rows.
# val_ratio rescales val_size to the train+val pool, since the test rows are
# already gone.
val_ratio = val_size / (train_size + val_size)
second_split = train_test_split(
    X_temp,
    y_temp,
    test_size=val_ratio,
    stratify=strat_temp,
    random_state=42,
)
X_train, X_val, y_train, y_val = second_split

In [44]:
# Class-imbalance weight for the positive class: negatives per positive in the
# training partition.
# NOTE(review): assumes the target is encoded as 0/1 (or boolean) so that the
# sum counts the positives - confirm against the feature table.
n_samples = len(y_train)
n_pos = y_train.sum()
n_neg = n_samples - n_pos
if n_pos == 0:
    # Without positives the ratio is undefined (division by zero / inf).
    raise ValueError(
        "y_train contains no positive samples; cannot compute scale_pos_weight"
    )
scale_pos_weight = n_neg / n_pos