In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from skmap.misc import find_files, GoogleSheet, ttprint
import warnings
import multiprocess as mp
import time
from scipy.special import expit, logit
import warnings
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, cross_val_score, HalvingGridSearchCV, KFold, GroupKFold, cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
import joblib
import pickle
from sklearn.metrics import r2_score, mean_squared_error, make_scorer#, root_mean_squared_error
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
# from cubist import Cubist
from sklearn.base import BaseEstimator, TransformerMixin
from pathlib import Path
from trees_rf import TreesRandomForestRegressor
from model_fit import read_features, cfi_calc, parameter_fine_tuning, calc_ccc, separate_data, rscfi 
from model_fit import accuracy_plot, plot_top_features, pdp_hexbin, plot_histogram, calc_metrics, accuracy_strata_plot
import warnings
warnings.filterwarnings('ignore')
import os
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import json
from datetime import datetime

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------

# Version tag embedded in the file names of the inputs and outputs used below.
version = '20250204'

# Root of the project tree; the point data, the per-property folders and the
# production models are all addressed relative to this path.
folder_path = '/mnt/ripley/global_soc/scikit-map/global-soil-mapping'

# Target-space transform per soil property: 'log1p' means the model is fitted
# on np.log1p(value); None means the property is modelled in its original space.
transforms_dict = {
    'ocd':          'log1p',
    'soc':          'log1p',
    'bulk.density': None,
    'ph.h2o':       None,
    'coarse':       'log1p',
}

# Properties processed in this run. 'coarse' has a transform configured above
# but is currently left out; add it to this list to model it as well.
prop_list = [
    'ocd',
    'soc',
    'bulk.density',
    'ph.h2o',
]

# Fit the production model for each property using all of the available data.

In [3]:
# Fit one production model per soil property on ALL available samples.
#
# For every property in `prop_list` this cell:
#   1. builds the target column (optionally log1p / Box-Cox transformed),
#   2. loads the selected covariates and the previously saved model object,
#   3. drops samples without a target and median-fills missing covariates,
#   4. refits the model on the full data set and reports the training R²,
#   5. stores the fitted model in `<folder_path>/production/`.

# The point table is identical for every property, so read it once instead of
# once per iteration; each iteration works on its own copy.
soil_pnts = pd.read_parquet(f'{folder_path}/material/soil_pnts_organized_v{version}.pq')

# Create the destination up front so a missing folder cannot make joblib.dump
# fail after the expensive fit has already finished.
production_folder = f'{folder_path}/production'
os.makedirs(production_folder, exist_ok=True)

for prop in prop_list:
    print(f'\n{prop}--------------------------------------------------------------')
    df = soil_pnts.copy()  # per-property copy; the shared table is never mutated
    space = transforms_dict[prop]
    output_folder = folder_path+'/'+prop
    os.makedirs(output_folder, exist_ok=True)

    # Build the modelling target in the configured space.
    if space == 'log1p':
        tgt = f'{prop}_log1p'
        df[tgt] = np.log1p(df[prop])
    elif space == 'boxcox':
        tgt = f'{prop}_boxcox'
        # Fit Box-Cox on the observed values of the *current* property only
        # (scipy's boxcox expects a 1-D array of positive values); rows without
        # an observation keep NaN and are dropped below.
        observed = df[prop].notna()
        transformed, fitted_lambda = boxcox(df.loc[observed, prop].to_numpy(), lmbda=None)
        df[tgt] = np.nan
        df.loc[observed, tgt] = transformed
        # The lambda is required to back-transform predictions (inv_boxcox),
        # so make sure it is at least visible in the cell output.
        print(f"Fitted Box-Cox lambda for {prop}: {fitted_lambda}")
    else:
        tgt = prop

    covs = read_features(f'{output_folder}/feature_selected_{prop}_v{version}.txt')  # Read in

    # Load the previously saved model object for this property; it is refitted below.
    # NOTE(review): joblib.load unpickles the file, so it must come from a trusted source.
    model = joblib.load(f'{output_folder}/model_rf.{prop}_ccc_v{version}.joblib')
    model.n_jobs = 90  # NOTE(review): hard-coded worker count — confirm it suits the host

    # Drop rows with NaN values in the target column
    df = df.dropna(subset=[tgt])
    df = df.reset_index(drop=True)
    y = df[tgt]
    X = df[covs]

    # Report the covariates with the largest share of missing values (top 12).
    nan_percentage = (X.isna().sum() / len(X)) * 100
    nan_percentage = nan_percentage.sort_values(ascending=False)
    print("\nNaN Percentage per Column (Sorted):")
    for col, perc in nan_percentage.head(12).items():
        print(f"{col}: {perc:.2f}%")

    print(f"Number of samples: {X.shape[0]}")
    X_np = X.to_numpy()
    print(f"Number of covariates: {len(covs)}")
    print(f"Percentage of NaN in X: {np.isnan(X_np).mean() * 100:.2f}%, filling them with median")

    # Impute missing covariate values with the per-column median of this data set.
    # NOTE(review): the medians are not saved with the model; confirm how NaN
    # covariates are handled at prediction time.
    X = X.fillna(X.median())
    model.fit(X, y)

    # In-sample fit only (same data as training, in the transformed target space).
    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)

    print(f"R² score on the training set: {r2:.4f}")

    joblib.dump(model, f'{production_folder}/model_rf.{prop}_production_v{version}.joblib')


ocd--------------------------------------------------------------

NaN Percentage per Column (Sorted):
ndvi_glad.swa.ard2_p25_30m_s_YYYY0101_YYYY1231_go_epsg.4326_v1: 1.70%
blue_glad.swa.ard2_p25_30m_s_YYYY0101_YYYY1231_go_epsg.4326_v1: 1.70%
green_glad.swa.ard2_p25_30m_s_YYYY0101_YYYY1231_go_epsg.4326_v1: 1.70%
savi_glad.swa.ard2_p25_30m_s_YYYY0101_YYYY1231_go_epsg.4326_v1: 1.70%
ndti_glad.swa.ard2_p25_30m_s_YYYY0101_YYYY1231_go_epsg.4326_v1: 1.70%
ndsi_glad.swa.ard2_p25_30m_s_YYYY0101_YYYY1231_go_epsg.4326_v1: 1.70%
red_glad.swa.ard2_p25_30m_s_YYYY0101_YYYY1231_go_epsg.4326_v1: 1.70%
gpw_ugpp_lue.model_m_30m_s_YYYY0101_YYYY1231_go_epsg.4326_v1: 1.42%
gpw_ugpp.daily_lue.model_m_30m_s_YYYY0101_YYYY0228_go_epsg.4326_v1: 1.42%
gpw_ugpp.daily_lue.model_m_30m_s_YYYY0501_YYYY0630_go_epsg.4326_v1: 1.42%
gpw_ugpp.daily_lue.model_m_30m_s_YYYY0901_YYYY1031_go_epsg.4326_v1: 1.42%
gpw_ugpp.daily_lue.model_m_30m_s_YYYY0701_YYYY0831_go_epsg.4326_v1: 1.42%
Number of samples: 215936
Number of covari