In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
import optuna
import os
import numpy as np

## load data

In [2]:
country = 'sudan'
mode = 'Train'
feature = 'indices'

path = '/app/stella/dev/GeoITU/data'

X_indices = np.load(os.path.join(path, '{}_{}_{}.npy'.format(country, mode, feature)), allow_pickle=True)
y_labels = np.load(os.path.join(path, '{}_{}_labels.npy'.format(country, mode)), allow_pickle=True)

            
print(X_indices.shape, y_labels.shape)
assert X_indices.shape[0] == y_labels.shape[0]

X_indices = X_indices.reshape(X_indices.shape[0], -1)

(500, 51, 15) (500,)


## rf

In [11]:
# Setting up the stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []  # List to store accuracy of each fold
f1_scores = []
fold_num = 0
num_trial = 50

# 5-fold Stratified Cross Validation loop
for train_index, test_index in skf.split(X_indices, y_labels):
    fold_num += 1

    # Splitting the dataset for this fold
    X_train, X_test = [X_indices[i] for i in train_index], [X_indices[i] for i in test_index]
    y_train_labels, y_test_labels = [y_labels[i] for i in train_index], [
        y_labels[i] for i in test_index
    ]
    
    
    # optimize params for single fold
    if fold_num == 1:  


        def objective(trial):
            params = {
            'n_estimators' : trial.suggest_int('n_estimators', 100, 1000),
            'max_depth' : trial.suggest_int('max_depth', 3, 15),
            # 'max_features' : trial.suggest_categorical('max_features', ['auto', 'sqrt']), 
            'min_samples_split' : trial.suggest_int('min_samples_split', 5, 32),
            'bootstrap' : trial.suggest_categorical('bootstrap', [True, False]),
            'n_jobs' : trial.suggest_categorical('n_jobs', [-1]) #fixed. use all cpus
            }

            clf = RandomForestClassifier(**params)
            clf.fit(X_train, y_train_labels)

            # Making predictions on the test set
            y_pred = clf.predict(X_test)

            # Calculating and reporting the accuracy
            accuracy = accuracy_score(y_test_labels, y_pred)
            return accuracy


        # optimize study
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=num_trial) 
        print(study.best_params)
                        
    # Model training
    clf = RandomForestClassifier(**study.best_params)  # change classifier here
    clf.fit(X_train, y_train_labels)
    
    # clf = xgb.XGBClassifier()
    # clf.fit(X_train, le.transform(y_train_labels))

    # Making predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculating and reporting the accuracy
    accuracy = accuracy_score(y_test_labels, y_pred)
    accuracies.append(accuracy)  # Storing the accuracy
    
    # Calculating and reporting the fscore
    f_score = f1_score(y_test_labels, y_pred, average='weighted')
    f_scores.append(f_score)  # Storing the accuracy
    print(f"Fold {fold_num} Accuracy: {accuracy}")
    print(f"Fold {fold_num} F_score: {f_score}")

# Reporting the final results
avg_accuracy = np.mean(accuracies)
avg_fscore = np.mean(f_scores)
print(f"Average Accuracy across all folds: {avg_accuracy:.4f}")
print(f"Average Fscore across all folds: {avg_fscore:.4f}")


[32m[I 2023-10-03 17:07:18,590][0m A new study created in memory with name: no-name-90a34b10-6fe1-499c-8aab-ed0e11326611[0m
[32m[I 2023-10-03 17:07:20,856][0m Trial 0 finished with value: 0.98 and parameters: {'n_estimators': 668, 'max_depth': 12, 'min_samples_split': 16, 'bootstrap': True, 'n_jobs': -1}. Best is trial 0 with value: 0.98.[0m
[32m[I 2023-10-03 17:07:21,514][0m Trial 1 finished with value: 0.99 and parameters: {'n_estimators': 167, 'max_depth': 14, 'min_samples_split': 31, 'bootstrap': True, 'n_jobs': -1}. Best is trial 1 with value: 0.99.[0m
[32m[I 2023-10-03 17:07:23,227][0m Trial 2 finished with value: 0.99 and parameters: {'n_estimators': 684, 'max_depth': 6, 'min_samples_split': 26, 'bootstrap': False, 'n_jobs': -1}. Best is trial 1 with value: 0.99.[0m
[32m[I 2023-10-03 17:07:24,692][0m Trial 3 finished with value: 0.99 and parameters: {'n_estimators': 573, 'max_depth': 11, 'min_samples_split': 30, 'bootstrap': False, 'n_jobs': -1}. Best is trial 1 wi

{'n_estimators': 167, 'max_depth': 14, 'min_samples_split': 31, 'bootstrap': True, 'n_jobs': -1}
Fold 1 Accuracy: 0.99
Fold 1 F_score: 0.98999899989999
Fold 2 Accuracy: 0.99
Fold 2 F_score: 0.98999899989999
Fold 3 Accuracy: 0.98
Fold 3 F_score: 0.98
Fold 4 Accuracy: 0.99
Fold 4 F_score: 0.98999899989999
Fold 5 Accuracy: 0.97
Fold 5 F_score: 0.9699729756781104
Average Accuracy across all folds: 0.9840
Average Fscore across all folds: 0.9800
