In [2]:
import json, os, glob
import random
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import optuna
from data_preparation.dataset import YieldDataset
import torch
import pickle as pkl
from models.utils import evalMetrics

## Prepare data: experiment configuration (paths, year splits, region filters)

In [7]:

# Root directory of the project tree. Defaults to the original container layout;
# set the SEASONAL_CLIMATE_ROOT environment variable to run from another location.
PROJECT_ROOT = os.environ.get('SEASONAL_CLIMATE_ROOT', '/app/dev/Seasonal_Climate')

# Experiment settings shared by the training routines in this notebook.
config = {
    # --- file locations (all derived from PROJECT_ROOT)
    'results_path': PROJECT_ROOT + '/results',
    'predictor_path': PROJECT_ROOT + '/onedrive/cy_bench_8daybins_wheat_US.csv',
    'yield_path': PROJECT_ROOT + '/cybench/cybench-data/wheat/US/yield_wheat_US.csv',
    # --- options forwarded unchanged to YieldDataset
    'feature_selector': None,
    'max_timesteps': 46,
    'temporal_truncation': None,
    'proportion': 100,
    'state_selector': ['US-08', 'US-20', 'US-31', 'US-40', 'US-46', 'US-48'],  # alternative: ['BR41', 'BR42', 'BR43']
    'aez_selector': None,
    # --- year splits
    'train_years': list(range(2004, 2018)),  # 2004..2017 inclusive
    'val_years': [2018, 2019, 2020],
    'test_years': [2021, 2022],
    # --- settings not read by rf_train in this section
    'display_step': 20,
    'input_dim': 19,
    'num_workers': 4,
    # --- list of seeds; rf_train tunes hyperparameters on the first one only
    'seed': [3407],
    'optimizer_switch': 'ADAM',
    'epochs': 100,
    # --- number of Optuna trials run by rf_train
    'num_trials': 1,
}

## Random forest: Optuna hyperparameter tuning and evaluation

In [10]:
def _build_yield_dataset(config, years, norm):
    """Build a ``YieldDataset`` for ``years`` using the shared settings in ``config``."""
    return YieldDataset(
        predictor_path=config['predictor_path'],
        yield_path=config['yield_path'],
        norm=norm,
        years=years,
        feature_selector=config['feature_selector'],
        max_timesteps=config['max_timesteps'],
        temporal_truncation=config['temporal_truncation'],
        proportion=config['proportion'],
        state_selector=config['state_selector'],
        aez_selector=config['aez_selector'],
    )


def _flatten_samples(dataset):
    """Return ``(X, y)``: ``truncated_data`` reshaped to one row per sample, and ``target``."""
    data = dataset.truncated_data
    return data.reshape(data.shape[0], -1), dataset.target


def rf_train(config, exp_name='rf', save_metrics=False):
    """Tune a ``RandomForestRegressor`` with Optuna and optionally evaluate it per seed.

    For every seed in ``config['seed']`` the train/validation/test datasets are
    rebuilt (validation and test reuse ``train_dataset.norm_values``).
    Hyperparameters are tuned once, on the first seed, by minimising the
    validation mean squared error. The trial table is always written to
    ``<results_path>/<exp_name>/study_params_<exp_name>.csv``.

    Parameters
    ----------
    config : dict
        Experiment settings. Keys read here: ``results_path``, ``seed``,
        ``num_trials``, ``train_years``, ``val_years``, ``test_years`` and the
        ``YieldDataset`` options (``predictor_path``, ``yield_path``,
        ``feature_selector``, ``max_timesteps``, ``temporal_truncation``,
        ``proportion``, ``state_selector``, ``aez_selector``).
    exp_name : str, default 'rf'
        Name of the output sub-directory and key of the returned parameters.
    save_metrics : bool, default False
        When True, additionally refit the tuned forest for every seed and
        write to the output directory: ``<exp_name>.csv`` (train/validation/
        test metrics from ``evalMetrics``), ``params.json``,
        ``y_true_test_data.pkl`` / ``y_pred_test_data.pkl`` (overwritten on
        every seed, so the last seed's values remain), ``test_geoid.pkl`` and
        ``test_years.pkl``.

    Returns
    -------
    dict
        ``{exp_name: best_params}``; empty if ``config['seed']`` is empty.
    """
    exp_params = {}

    # make directory for experiment
    output_dir = os.path.join(config['results_path'], exp_name)
    os.makedirs(output_dir, exist_ok=True)

    # -- collectors for metrics (only filled when save_metrics is True)
    metrics_train = []
    metrics_val = []
    metrics_test = []

    best_params = None
    test_dataset = None

    for seed_idx, s in enumerate(config['seed']):
        np.random.seed(s)
        random.seed(s)
        torch.manual_seed(s)

        # Normalisation values come from the training split and are reused
        # for validation and test.
        train_dataset = _build_yield_dataset(config, config['train_years'], norm=None)
        val_dataset = _build_yield_dataset(
            config, config['val_years'], norm=train_dataset.norm_values)
        test_dataset = _build_yield_dataset(
            config, config['test_years'], norm=train_dataset.norm_values)

        # ----- flatten every sample to a 1-D feature vector for scikit-learn
        X_t, y_t = _flatten_samples(train_dataset)
        X_v, y_v = _flatten_samples(val_dataset)
        X_d, y_d = _flatten_samples(test_dataset)

        # tune only once, on the first seed
        if seed_idx == 0:

            def objective(trial):
                params = {
                    'n_estimators': trial.suggest_int('n_estimators', 100, 500),
                    'max_depth': trial.suggest_int('max_depth', 3, 10),
                    'min_samples_split': trial.suggest_int('min_samples_split', 5, 15),
                    'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
                    # Single-choice categoricals: fixed values that are still
                    # recorded in study.best_params and the trial table.
                    'n_jobs': trial.suggest_categorical('n_jobs', [-1]),
                    'random_state': trial.suggest_categorical('random_state', [s]),
                }

                clf = RandomForestRegressor(**params)
                clf.fit(X_t, y_t)

                # No intermediate values are reported via trial.report(), so
                # there is nothing for a pruner to act on; the former
                # trial.should_prune() check could never fire and was dropped.
                return mean_squared_error(y_v, clf.predict(X_v))

            # Seed the sampler so the sequence of suggested hyperparameters is
            # reproducible; seeding numpy/random/torch does not affect Optuna.
            study = optuna.create_study(
                direction="minimize",
                sampler=optuna.samplers.TPESampler(seed=s),
            )
            study.optimize(objective, n_trials=config['num_trials'])
            best_params = study.best_params
            print(best_params)

            # save study
            study_df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
            study_df.to_csv(os.path.join(output_dir, 'study_params_{}.csv'.format(exp_name)))

            # keep best parameters
            exp_params[exp_name] = best_params

        if not save_metrics:
            continue

        # Refit with the tuned parameters. best_params carries the first
        # seed's random_state, so override it to make each seed's run distinct.
        rf_opt = RandomForestRegressor(**{**best_params, 'random_state': s})
        rf_opt.fit(X_t, y_t)

        y_pred_t = rf_opt.predict(X_t)
        y_pred_v = rf_opt.predict(X_v)
        y_pred_d = rf_opt.predict(X_d)

        with open(os.path.join(output_dir, 'y_true_test_data.pkl'), 'wb') as file:
            pkl.dump(y_d, file)
        with open(os.path.join(output_dir, 'y_pred_test_data.pkl'), 'wb') as file:
            pkl.dump(y_pred_d, file)

        # evalMetrics is a project helper; assumed to return
        # (mape, rmse, nrmse, r2, r) in that order -- TODO confirm.
        for mode, collector, y_true, y_pred in (
            ('train', metrics_train, y_t, y_pred_t),
            ('validation', metrics_val, y_v, y_pred_v),
            ('test', metrics_test, y_d, y_pred_d),
        ):
            mape, rmse, nrmse, r2, r = evalMetrics(y_true, y_pred)
            collector.append([exp_name, mode, str(s), mape, rmse, nrmse, r2, r])

    # test_dataset stays None when config['seed'] is empty: nothing to save.
    if save_metrics and test_dataset is not None:
        columns = ['exp_name', 'mode', 'seed', 'MAPE', 'RMSE', 'NRMSE', 'R2', 'r']
        df_combined = pd.concat(
            [
                pd.DataFrame(metrics_train, columns=columns),
                pd.DataFrame(metrics_val, columns=columns),
                pd.DataFrame(metrics_test, columns=columns),
            ],
            ignore_index=True,
        )
        df_combined.to_csv(os.path.join(output_dir, '{}.csv'.format(exp_name)))

        # save exp parameters
        with open(os.path.join(output_dir, 'params.json'), 'w') as file:
            file.write(json.dumps(exp_params, indent=4))

        # save test ids
        with open(os.path.join(output_dir, 'test_geoid.pkl'), 'wb') as file:
            pkl.dump(test_dataset.ids, file)

        # save test years
        with open(os.path.join(output_dir, 'test_years.pkl'), 'wb') as file:
            pkl.dump(test_dataset.years, file)

        print(exp_name, 'done')

    return exp_params

In [11]:
# Run the random-forest experiment; outputs go to <results_path>/rf.
rf_train(config=config, exp_name='rf')

[32m[I 2024-08-26 06:22:38,867][0m A new study created in memory with name: no-name-d8fea0d2-51ac-4fd0-8aa5-b306451a0db6[0m
[32m[I 2024-08-26 06:23:02,458][0m Trial 0 finished with value: 0.727690694519767 and parameters: {'n_estimators': 428, 'max_depth': 8, 'min_samples_split': 13, 'bootstrap': False, 'n_jobs': -1, 'random_state': 3407}. Best is trial 0 with value: 0.727690694519767.[0m


{'n_estimators': 428, 'max_depth': 8, 'min_samples_split': 13, 'bootstrap': False, 'n_jobs': -1, 'random_state': 3407}
rf done
