In [7]:
import pandas as pd
from pathlib import Path
import os
import warnings
warnings.filterwarnings(action="ignore", category=UserWarning)
import optuna
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
import joblib


In [8]:
# Directory holding the "gold" data files used by this notebook:
# <parent of the current working directory>/data/gold
data_path = Path.cwd().parent.joinpath("data", "gold")

In [9]:
# Load the four training feature sets (manual, TF-IDF, MPNet, DistilRoBERTa).
def _read_feature_set(csv_name):
    """Read one training CSV from ``data_path`` and split it.

    Returns a ``(frame, X, y)`` tuple where ``X`` is every column except the
    last one and ``y`` is the last column (used as the target by the
    objectives further down).
    """
    frame = pd.read_csv(os.path.join(data_path, csv_name))
    return frame, frame.iloc[:, :-1], frame.iloc[:, -1]


df_manual, X_manual, y_manual = _read_feature_set('manual_features_train.csv')

df_tfidf, X_tfidf, y_tfidf = _read_feature_set('tfidf_features_train.csv')

df_mpnet, X_mpnet, y_mpnet = _read_feature_set('mpnet_features_train.csv')

df_distilroberta, X_distilroberta, y_distilroberta = _read_feature_set(
    'distilroberta_features_train.csv'
)

### Random Forest

##### Manual

In [11]:
def objective_manual_rf(trial, seed=42):
    """Optuna objective: mean 3-fold F1 of a random forest on the manual features.

    Args:
        trial: Optuna trial used to sample ``rf_n_estimators`` (10-1000),
            ``rf_criterion`` (``gini`` / ``entropy``) and ``rf_max_depth``
            (2-1000).
        seed: ``random_state`` handed to the forest so that the same
            hyper-parameters always produce the same score; without it the
            objective is noisy and trials cannot be reproduced.

    Returns:
        The mean of the per-fold F1 scores computed on ``X_manual`` /
        ``y_manual``.
    """
    forest_params = {
        "n_estimators": trial.suggest_int("rf_n_estimators", 10, 1000),
        "criterion": trial.suggest_categorical('rf_criterion', ['gini', 'entropy']),
        "max_depth": trial.suggest_int("rf_max_depth", 2, 1000),
    }
    classifier_obj = RandomForestClassifier(random_state=seed, **forest_params)

    # Score with 3-fold cross-validation; folds are evaluated in parallel.
    fold_scores = model_selection.cross_val_score(
        classifier_obj, X_manual, y_manual, n_jobs=-1, cv=3, scoring="f1"
    )
    return fold_scores.mean()

# Resume the persisted "manual_rf" study when its pickle already exists,
# otherwise start a fresh one. Creating the study only when needed avoids
# building (and annotating) a throwaway study on every re-run of this cell.
study_path = os.path.join(data_path, 'manual_rf.pkl')
if os.path.isfile(study_path):
    # NOTE: joblib.load unpickles the file; only load pickles written by this notebook.
    study = joblib.load(study_path)
else:
    study = optuna.create_study(direction="maximize", study_name="manual_rf",
                                sampler=optuna.samplers.NSGAIISampler(seed=42))
    # Adding attributes to the study
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'Manual')

try:
    study.optimize(objective_manual_rf, n_trials=100)
finally:
    # Persist in every case so that trials finished before an interruption
    # (e.g. KeyboardInterrupt) or an error are not lost.
    joblib.dump(study, study_path)

[I 2024-06-26 05:14:29,077] A new study created in memory with name: manual_rf
[I 2024-06-26 05:14:29,889] Trial 100 finished with value: 0.5973599965430418 and parameters: {'rf_n_estimators': 282, 'rf_criterion': 'gini', 'rf_max_depth': 704}. Best is trial 1 with value: 0.6902527492618468.
[I 2024-06-26 05:14:47,889] Trial 101 finished with value: 0.6041289476462776 and parameters: {'rf_n_estimators': 5887, 'rf_criterion': 'gini', 'rf_max_depth': 8320}. Best is trial 1 with value: 0.6902527492618468.
[I 2024-06-26 05:15:11,632] Trial 102 finished with value: 0.6042503735445656 and parameters: {'rf_n_estimators': 6487, 'rf_criterion': 'entropy', 'rf_max_depth': 2418}. Best is trial 1 with value: 0.6902527492618468.
[I 2024-06-26 05:15:41,840] Trial 103 finished with value: 0.6033670771393015 and parameters: {'rf_n_estimators': 9879, 'rf_criterion': 'gini', 'rf_max_depth': 6144}. Best is trial 1 with value: 0.6902527492618468.
[I 2024-06-26 05:16:01,932] Trial 104 finished with value: 0

KeyboardInterrupt: 

##### TF-IDF

In [12]:
def objective_tfidf_rf(trial, seed=42):
    """Optuna objective: mean 3-fold F1 of a random forest on the TF-IDF features.

    Args:
        trial: Optuna trial used to sample ``rf_n_estimators`` (10-1000),
            ``rf_criterion`` (``gini`` / ``entropy``) and ``rf_max_depth``
            (2-1000).
        seed: ``random_state`` handed to the forest so that the same
            hyper-parameters always produce the same score; without it the
            objective is noisy and trials cannot be reproduced.

    Returns:
        The mean of the per-fold F1 scores computed on ``X_tfidf`` /
        ``y_tfidf``.
    """
    forest_params = {
        "n_estimators": trial.suggest_int("rf_n_estimators", 10, 1000),
        "criterion": trial.suggest_categorical('rf_criterion', ['gini', 'entropy']),
        "max_depth": trial.suggest_int("rf_max_depth", 2, 1000),
    }
    classifier_obj = RandomForestClassifier(random_state=seed, **forest_params)

    # Score with 3-fold cross-validation; folds are evaluated in parallel.
    fold_scores = model_selection.cross_val_score(
        classifier_obj, X_tfidf, y_tfidf, n_jobs=-1, cv=3, scoring="f1"
    )
    return fold_scores.mean()

# Resume the persisted "tfidf_rf" study when its pickle already exists,
# otherwise start a fresh one. Creating the study only when needed avoids
# building (and annotating) a throwaway study on every re-run of this cell.
study_path = os.path.join(data_path, 'tfidf_rf.pkl')
if os.path.isfile(study_path):
    # NOTE: joblib.load unpickles the file; only load pickles written by this notebook.
    study = joblib.load(study_path)
else:
    study = optuna.create_study(direction="maximize", study_name="tfidf_rf",
                                sampler=optuna.samplers.NSGAIISampler(seed=42))
    # Adding attributes to the study
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'tfidf')

try:
    study.optimize(objective_tfidf_rf, n_trials=100)
finally:
    # Persist in every case so that trials finished before an interruption
    # (e.g. KeyboardInterrupt) or an error are not lost.
    joblib.dump(study, study_path)

[I 2024-06-26 05:17:20,569] A new study created in memory with name: tfidf_rf
[I 2024-06-26 05:17:25,227] Trial 0 finished with value: 0.7409969664443287 and parameters: {'rf_n_estimators': 430, 'rf_criterion': 'gini', 'rf_max_depth': 19}. Best is trial 0 with value: 0.7409969664443287.
[I 2024-06-26 05:17:27,359] Trial 1 finished with value: 0.7285736115686757 and parameters: {'rf_n_estimators': 54, 'rf_criterion': 'entropy', 'rf_max_depth': 719}. Best is trial 0 with value: 0.7409969664443287.
[I 2024-06-26 05:17:44,599] Trial 2 finished with value: 0.7401123905880231 and parameters: {'rf_n_estimators': 856, 'rf_criterion': 'gini', 'rf_max_depth': 632}. Best is trial 0 with value: 0.7409969664443287.
[I 2024-06-26 05:18:02,200] Trial 3 finished with value: 0.737647614804691 and parameters: {'rf_n_estimators': 817, 'rf_criterion': 'entropy', 'rf_max_depth': 495}. Best is trial 0 with value: 0.7409969664443287.
[I 2024-06-26 05:18:06,779] Trial 4 finished with value: 0.734441272069226 

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\tfidf_rf.pkl']

In [13]:
def objective_mpnet_rf(trial, seed=42):
    """Optuna objective: mean 3-fold F1 of a random forest on the MPNet features.

    Args:
        trial: Optuna trial used to sample ``rf_n_estimators`` (10-1000),
            ``rf_criterion`` (``gini`` / ``entropy``) and ``rf_max_depth``
            (2-1000).
        seed: ``random_state`` handed to the forest so that the same
            hyper-parameters always produce the same score; without it the
            objective is noisy and trials cannot be reproduced.

    Returns:
        The mean of the per-fold F1 scores computed on ``X_mpnet`` /
        ``y_mpnet``.
    """
    forest_params = {
        "n_estimators": trial.suggest_int("rf_n_estimators", 10, 1000),
        "criterion": trial.suggest_categorical('rf_criterion', ['gini', 'entropy']),
        "max_depth": trial.suggest_int("rf_max_depth", 2, 1000),
    }
    classifier_obj = RandomForestClassifier(random_state=seed, **forest_params)

    # Score with 3-fold cross-validation; folds are evaluated in parallel.
    fold_scores = model_selection.cross_val_score(
        classifier_obj, X_mpnet, y_mpnet, n_jobs=-1, cv=3, scoring="f1"
    )
    return fold_scores.mean()

# Resume the persisted "mpnet_rf" study when its pickle already exists,
# otherwise start a fresh one. Creating the study only when needed avoids
# building (and annotating) a throwaway study on every re-run of this cell.
study_path = os.path.join(data_path, 'mpnet_rf.pkl')
if os.path.isfile(study_path):
    # NOTE: joblib.load unpickles the file; only load pickles written by this notebook.
    study = joblib.load(study_path)
else:
    study = optuna.create_study(direction="maximize", study_name="mpnet_rf",
                                sampler=optuna.samplers.NSGAIISampler(seed=42))
    # Adding attributes to the study
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'mpnet')

try:
    study.optimize(objective_mpnet_rf, n_trials=100)
finally:
    # Persist in every case so that trials finished before an interruption
    # (e.g. KeyboardInterrupt) or an error are not lost.
    joblib.dump(study, study_path)

[I 2024-06-26 08:34:24,373] A new study created in memory with name: mpnet_rf
[I 2024-06-26 08:34:59,426] Trial 0 finished with value: 0.7883702569583727 and parameters: {'rf_n_estimators': 865, 'rf_criterion': 'gini', 'rf_max_depth': 866}. Best is trial 0 with value: 0.7883702569583727.
[I 2024-06-26 08:35:38,172] Trial 1 finished with value: 0.7882284030021772 and parameters: {'rf_n_estimators': 981, 'rf_criterion': 'gini', 'rf_max_depth': 99}. Best is trial 0 with value: 0.7883702569583727.
[I 2024-06-26 08:35:47,971] Trial 2 finished with value: 0.7827437712251072 and parameters: {'rf_n_estimators': 228, 'rf_criterion': 'gini', 'rf_max_depth': 837}. Best is trial 0 with value: 0.7883702569583727.
[I 2024-06-26 08:36:07,117] Trial 3 finished with value: 0.7873829766032507 and parameters: {'rf_n_estimators': 458, 'rf_criterion': 'gini', 'rf_max_depth': 579}. Best is trial 0 with value: 0.7883702569583727.
[I 2024-06-26 08:36:43,051] Trial 4 finished with value: 0.7815142110763031 and

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\mpnet_rf.pkl']

In [14]:
def objective_distilroberta_rf(trial, seed=42):
    """Optuna objective: mean 3-fold F1 of a random forest on the DistilRoBERTa features.

    Args:
        trial: Optuna trial used to sample ``rf_n_estimators`` (10-1000),
            ``rf_criterion`` (``gini`` / ``entropy``) and ``rf_max_depth``
            (2-1000).
        seed: ``random_state`` handed to the forest so that the same
            hyper-parameters always produce the same score; without it the
            objective is noisy and trials cannot be reproduced.

    Returns:
        The mean of the per-fold F1 scores computed on ``X_distilroberta`` /
        ``y_distilroberta``.
    """
    forest_params = {
        "n_estimators": trial.suggest_int("rf_n_estimators", 10, 1000),
        "criterion": trial.suggest_categorical('rf_criterion', ['gini', 'entropy']),
        "max_depth": trial.suggest_int("rf_max_depth", 2, 1000),
    }
    classifier_obj = RandomForestClassifier(random_state=seed, **forest_params)

    # Score with 3-fold cross-validation; folds are evaluated in parallel.
    fold_scores = model_selection.cross_val_score(
        classifier_obj, X_distilroberta, y_distilroberta, n_jobs=-1, cv=3, scoring="f1"
    )
    return fold_scores.mean()

# Resume the persisted "distilroberta_rf" study when its pickle already exists,
# otherwise start a fresh one. Creating the study only when needed avoids
# building (and annotating) a throwaway study on every re-run of this cell.
study_path = os.path.join(data_path, 'distilroberta_rf.pkl')
if os.path.isfile(study_path):
    # NOTE: joblib.load unpickles the file; only load pickles written by this notebook.
    study = joblib.load(study_path)
else:
    study = optuna.create_study(direction="maximize", study_name="distilroberta_rf",
                                sampler=optuna.samplers.NSGAIISampler(seed=42))
    # Adding attributes to the study
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'distilroberta')

try:
    study.optimize(objective_distilroberta_rf, n_trials=100)
finally:
    # Persist in every case so that trials finished before an interruption
    # (e.g. KeyboardInterrupt) or an error are not lost.
    joblib.dump(study, study_path)

[I 2024-06-26 09:24:03,327] A new study created in memory with name: distilroberta_rf
[I 2024-06-26 09:24:36,905] Trial 0 finished with value: 0.7925157629356789 and parameters: {'rf_n_estimators': 826, 'rf_criterion': 'gini', 'rf_max_depth': 72}. Best is trial 0 with value: 0.7925157629356789.
[I 2024-06-26 09:24:45,199] Trial 1 finished with value: 0.7875852634765493 and parameters: {'rf_n_estimators': 603, 'rf_criterion': 'gini', 'rf_max_depth': 3}. Best is trial 0 with value: 0.7925157629356789.
[I 2024-06-26 09:25:24,193] Trial 2 finished with value: 0.7979639651002582 and parameters: {'rf_n_estimators': 913, 'rf_criterion': 'entropy', 'rf_max_depth': 731}. Best is trial 2 with value: 0.7979639651002582.
[I 2024-06-26 09:25:54,747] Trial 3 finished with value: 0.7946011018032619 and parameters: {'rf_n_estimators': 720, 'rf_criterion': 'entropy', 'rf_max_depth': 39}. Best is trial 2 with value: 0.7979639651002582.
[I 2024-06-26 09:25:55,440] Trial 4 finished with value: 0.734078940

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\distilroberta_rf.pkl']

In [15]:
# Paths of the persisted random-forest studies. Match the exact "_rf.pkl"
# suffix used when the studies are dumped: a bare substring test on "rf" would
# also pick up any unrelated file whose name merely contains those letters.
# Sorted so the order does not depend on the directory listing.
rf_studies = [
    os.path.join(data_path, file_name)
    for file_name in sorted(os.listdir(data_path))
    if file_name.endswith("_rf.pkl")
]

In [16]:
# Gather the trials table of every persisted study, tagging each row with the
# study's "dataset" attribute. The frames are collected in a list and
# concatenated once instead of growing a DataFrame inside the loop, and the
# loop variable no longer overwrites the global `study_path`.
rf_trial_frames = []
for rf_study_path in rf_studies:
    # NOTE: joblib.load unpickles the file; only load pickles written by this notebook.
    cur_study = joblib.load(rf_study_path)
    rf_trial_frames.append(
        cur_study.trials_dataframe().assign(dataset=cur_study.user_attrs["dataset"])
    )

# pd.concat raises on an empty list, so fall back to an empty frame when no
# study files were found.
rf_experiments = pd.concat(rf_trial_frames) if rf_trial_frames else pd.DataFrame()


In [17]:
# Highest objective value (mean cross-validated F1) reached for each feature set.
rf_experiments.groupby("dataset")["value"].agg("max")

dataset
Manual           0.690253
distilroberta    0.804725
mpnet            0.792190
tfidf            0.747701
Name: value, dtype: float64