In [5]:
import pandas as pd
from pathlib import Path
import os
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
import optuna
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
import joblib
from xgboost import XGBClassifier

In [6]:
# The curated ("gold") feature files are read from <parent of the working directory>/data/gold.
data_path = Path(os.getcwd()).parent.joinpath("data", "gold")

In [7]:
def _read_feature_set(file_name):
    """Read one training CSV from ``data_path`` and split it into features and target.

    Args:
        file_name: Name of the CSV file inside ``data_path``.

    Returns:
        A tuple ``(frame, X, y)``: the full frame, every column except the last,
        and the last column.
    """
    frame = pd.read_csv(os.path.join(data_path, file_name))
    return frame, frame.iloc[:, :-1], frame.iloc[:, -1]


# One training table per feature set; the last column of each file is split off as y.
df_manual, X_manual, y_manual = _read_feature_set('manual_features_train.csv')
df_tfidf, X_tfidf, y_tfidf = _read_feature_set('tfidf_features_train.csv')
df_mpnet, X_mpnet, y_mpnet = _read_feature_set('mpnet_features_train.csv')
df_distilroberta, X_distilroberta, y_distilroberta = _read_feature_set('distilroberta_features_train.csv')

### XGBoost

##### Manual

In [11]:
def objective_manual_xgboost(trial):
    """Optuna objective: mean 3-fold cross-validated F1 of XGBoost on the manual features.

    Args:
        trial: Optuna trial used to sample ``learning_rate`` (log scale),
            ``max_depth`` and ``n_estimators``.

    Returns:
        Mean of the per-fold F1 scores computed on ``X_manual`` / ``y_manual``.
    """
    # Sampled in a fixed order: learning_rate, max_depth, n_estimators.
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.4, log=True),
        "max_depth": trial.suggest_int("max_depth", 10, 1000),
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
    }
    model = XGBClassifier(objective='binary:logistic', device='cuda', **sampled)

    # NOTE(review): n_jobs=-1 runs the folds in parallel while every fold targets
    # the same CUDA device -- confirm this is intended.
    fold_scores = model_selection.cross_val_score(
        model, X_manual, y_manual, scoring="f1", cv=3, n_jobs=-1
    )
    return fold_scores.mean()

# Resume the persisted study when one exists; otherwise start a fresh one.
# NOTE: joblib.load unpickles the file, so only load study files this project wrote itself.
study_path = os.path.join(data_path,'manual_xgboost.pkl')
if os.path.isfile(study_path):
    study = joblib.load(study_path)
else:
    # NOTE(review): no sampler seed is set, so runs are not reproducible -- confirm this is acceptable.
    study = optuna.create_study(direction="maximize", study_name="manual_xgboost",
                                sampler=optuna.samplers.NSGAIISampler())
    # Metadata stored with the study; 'dataset' is read back when the studies are aggregated.
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'Manual')

try:
    study.optimize(objective_manual_xgboost, n_trials=100)
finally:
    # Persist even if the search is interrupted or a trial raises, so the
    # trials that already finished are not lost.
    joblib.dump(study, study_path)

[I 2024-06-26 10:51:04,913] A new study created in memory with name: manual_xgboost
[I 2024-06-26 10:51:27,957] Trial 0 finished with value: 0.5880252903080302 and parameters: {'learning_rate': 0.02469228085809638, 'max_depth': 337, 'n_estimators': 663}. Best is trial 0 with value: 0.5880252903080302.
[I 2024-06-26 10:51:43,863] Trial 1 finished with value: 0.6183945797826244 and parameters: {'learning_rate': 0.003492300301119997, 'max_depth': 295, 'n_estimators': 536}. Best is trial 1 with value: 0.6183945797826244.
[I 2024-06-26 10:51:49,826] Trial 2 finished with value: 0.5842940679555225 and parameters: {'learning_rate': 0.25517514284028475, 'max_depth': 168, 'n_estimators': 165}. Best is trial 1 with value: 0.6183945797826244.
[I 2024-06-26 10:52:02,792] Trial 3 finished with value: 0.5801584130858497 and parameters: {'learning_rate': 0.12295296782150383, 'max_depth': 380, 'n_estimators': 392}. Best is trial 1 with value: 0.6183945797826244.
[I 2024-06-26 10:52:14,352] Trial 4 fin

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\manual_xgboost.pkl']

##### TFIDF

In [18]:
def objective_tfidf_xgboost(trial):
    """Optuna objective: mean 3-fold cross-validated F1 of XGBoost on the TF-IDF features.

    Args:
        trial: Optuna trial used to sample ``learning_rate`` (log scale),
            ``max_depth`` and ``n_estimators``.

    Returns:
        Mean of the per-fold F1 scores computed on ``X_tfidf`` / ``y_tfidf``.
    """
    # Sampled in a fixed order: learning_rate, max_depth, n_estimators.
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.4, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 1000),
        "n_estimators": trial.suggest_int("n_estimators", 2, 1000),
    }
    model = XGBClassifier(objective='binary:logistic', device='cuda', **sampled)

    # NOTE(review): n_jobs=-1 runs the folds in parallel while every fold targets
    # the same CUDA device -- confirm this is intended.
    fold_scores = model_selection.cross_val_score(
        model, X_tfidf, y_tfidf, scoring="f1", cv=3, n_jobs=-1
    )
    return fold_scores.mean()

# Resume the persisted study when one exists; otherwise start a fresh one.
# NOTE: joblib.load unpickles the file, so only load study files this project wrote itself.
study_path = os.path.join(data_path,'tfidf_xgboost.pkl')
if os.path.isfile(study_path):
    study = joblib.load(study_path)
else:
    # NOTE(review): no sampler seed is set, so runs are not reproducible -- confirm this is acceptable.
    study = optuna.create_study(direction="maximize", study_name="tfidf_xgboost",
                                sampler=optuna.samplers.NSGAIISampler())
    # Metadata stored with the study; 'dataset' is read back when the studies are aggregated.
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'tfidf')

try:
    study.optimize(objective_tfidf_xgboost, n_trials=10)
finally:
    # Persist even if the search is interrupted or a trial raises, so the
    # trials that already finished are not lost.
    joblib.dump(study, study_path)

[I 2024-06-26 12:55:45,012] A new study created in memory with name: tfidf_xgboost
[I 2024-06-26 12:57:44,474] Trial 0 finished with value: 0.6931127871885815 and parameters: {'learning_rate': 0.006064090298562169, 'max_depth': 362, 'n_estimators': 339}. Best is trial 0 with value: 0.6931127871885815.
[I 2024-06-26 13:03:22,192] Trial 1 finished with value: 0.6749624665992404 and parameters: {'learning_rate': 0.0008923476476256534, 'max_depth': 707, 'n_estimators': 948}. Best is trial 0 with value: 0.6931127871885815.
[I 2024-06-26 13:08:00,104] Trial 2 finished with value: 0.7046896080324946 and parameters: {'learning_rate': 0.00014046003573146124, 'max_depth': 400, 'n_estimators': 808}. Best is trial 2 with value: 0.7046896080324946.
[I 2024-06-26 13:10:09,090] Trial 3 finished with value: 0.6846548253620962 and parameters: {'learning_rate': 0.0008516260922581012, 'max_depth': 911, 'n_estimators': 366}. Best is trial 2 with value: 0.7046896080324946.
[I 2024-06-26 13:13:00,342] Trial

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\tfidf_xgboost.pkl']

In [17]:
def objective_mpnet_xgboost(trial):
    """Optuna objective: mean 3-fold cross-validated F1 of XGBoost on the mpnet features.

    Args:
        trial: Optuna trial used to sample ``learning_rate`` (log scale),
            ``max_depth`` and ``n_estimators``.

    Returns:
        Mean of the per-fold F1 scores computed on ``X_mpnet`` / ``y_mpnet``.
    """
    # Sampled in a fixed order: learning_rate, max_depth, n_estimators.
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.4, log=True),
        "max_depth": trial.suggest_int("max_depth", 10, 1000),
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
    }
    model = XGBClassifier(objective='binary:logistic', device='cuda', **sampled)

    # NOTE(review): n_jobs=-1 runs the folds in parallel while every fold targets
    # the same CUDA device -- confirm this is intended.
    fold_scores = model_selection.cross_val_score(
        model, X_mpnet, y_mpnet, scoring="f1", cv=3, n_jobs=-1
    )
    return fold_scores.mean()

# Resume the persisted study when one exists; otherwise start a fresh one.
# NOTE: joblib.load unpickles the file, so only load study files this project wrote itself.
study_path = os.path.join(data_path,'mpnet_xgboost.pkl')
if os.path.isfile(study_path):
    study = joblib.load(study_path)
else:
    # NOTE(review): no sampler seed is set, so runs are not reproducible -- confirm this is acceptable.
    study = optuna.create_study(direction="maximize", study_name="mpnet_xgboost",
                                sampler=optuna.samplers.NSGAIISampler())
    # Metadata stored with the study; 'dataset' is read back when the studies are aggregated.
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'mpnet')

try:
    study.optimize(objective_mpnet_xgboost, n_trials=10)
finally:
    # Persist even if the search is interrupted or a trial raises, so the
    # trials that already finished are not lost.
    joblib.dump(study, study_path)

[I 2024-06-26 11:45:00,412] A new study created in memory with name: mpnet_xgboost
[I 2024-06-26 11:53:41,949] Trial 0 finished with value: 0.7718134228556068 and parameters: {'learning_rate': 0.003261971368266794, 'max_depth': 870, 'n_estimators': 647}. Best is trial 0 with value: 0.7718134228556068.
[I 2024-06-26 12:01:05,975] Trial 1 finished with value: 0.775559456304039 and parameters: {'learning_rate': 0.0029077866359644234, 'max_depth': 778, 'n_estimators': 871}. Best is trial 1 with value: 0.775559456304039.
[I 2024-06-26 12:02:22,761] Trial 2 finished with value: 0.7908173821072445 and parameters: {'learning_rate': 0.044360495126544996, 'max_depth': 433, 'n_estimators': 635}. Best is trial 2 with value: 0.7908173821072445.
[I 2024-06-26 12:04:00,037] Trial 3 finished with value: 0.7896686009492303 and parameters: {'learning_rate': 0.036987173238199195, 'max_depth': 345, 'n_estimators': 246}. Best is trial 2 with value: 0.7908173821072445.
[I 2024-06-26 12:10:07,863] Trial 4 fi

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\mpnet_xgboost.pkl']

In [19]:
def objective_distilroberta_xgboost(trial):
    """Optuna objective: mean 3-fold cross-validated F1 of XGBoost on the distilroberta features.

    Args:
        trial: Optuna trial used to sample ``learning_rate`` (log scale),
            ``max_depth`` and ``n_estimators``.

    Returns:
        Mean of the per-fold F1 scores computed on ``X_distilroberta`` / ``y_distilroberta``.
    """
    # Sampled in a fixed order: learning_rate, max_depth, n_estimators.
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.4, log=True),
        "max_depth": trial.suggest_int("max_depth", 10, 1000),
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
    }
    model = XGBClassifier(objective='binary:logistic', device='cuda', **sampled)

    # NOTE(review): n_jobs=-1 runs the folds in parallel while every fold targets
    # the same CUDA device -- confirm this is intended.
    fold_scores = model_selection.cross_val_score(
        model, X_distilroberta, y_distilroberta, scoring="f1", cv=3, n_jobs=-1
    )
    return fold_scores.mean()

# Resume the persisted study when one exists; otherwise start a fresh one.
# NOTE: joblib.load unpickles the file, so only load study files this project wrote itself.
study_path = os.path.join(data_path,'distilroberta_xgboost.pkl')
if os.path.isfile(study_path):
    study = joblib.load(study_path)
else:
    # NOTE(review): no sampler seed is set, so runs are not reproducible -- confirm this is acceptable.
    study = optuna.create_study(direction="maximize", study_name="distilroberta_xgboost",
                                sampler=optuna.samplers.NSGAIISampler())
    # Metadata stored with the study; 'dataset' is read back when the studies are aggregated.
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'distilroberta')

try:
    study.optimize(objective_distilroberta_xgboost, n_trials=10)
finally:
    # Persist even if the search is interrupted or a trial raises, so the
    # trials that already finished are not lost.
    joblib.dump(study, study_path)

[I 2024-06-26 13:21:16,426] A new study created in memory with name: distilroberta_xgboost
[I 2024-06-26 13:22:40,800] Trial 0 finished with value: 0.8021432752231589 and parameters: {'learning_rate': 0.18671982240811918, 'max_depth': 782, 'n_estimators': 935}. Best is trial 0 with value: 0.8021432752231589.
[I 2024-06-26 13:37:57,128] Trial 1 finished with value: 0.7412384536392022 and parameters: {'learning_rate': 0.0015832746633105547, 'max_depth': 329, 'n_estimators': 466}. Best is trial 0 with value: 0.8021432752231589.
[I 2024-06-26 13:43:26,886] Trial 2 finished with value: 0.7397994539181134 and parameters: {'learning_rate': 0.0006192987925892433, 'max_depth': 939, 'n_estimators': 713}. Best is trial 0 with value: 0.8021432752231589.
[I 2024-06-26 13:49:54,732] Trial 3 finished with value: 0.7157111067210571 and parameters: {'learning_rate': 0.00010126147272962402, 'max_depth': 944, 'n_estimators': 534}. Best is trial 0 with value: 0.8021432752231589.
[I 2024-06-26 13:50:36,635

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\distilroberta_xgboost.pkl']

In [20]:
# Paths of the persisted XGBoost studies found in data_path.
# Only ".pkl" files are kept so that any other file with "xgboost" in its name
# is never handed to joblib.load, and the list is sorted because os.listdir
# returns entries in arbitrary order.
xgboost_studies = sorted(
    os.path.join(data_path, file_name)
    for file_name in os.listdir(data_path)
    if "xgboost" in file_name and file_name.endswith(".pkl")
)

In [21]:
# Build one table holding every trial of every persisted XGBoost study, tagged
# with the dataset recorded in the study's user attributes.
# The per-study frames are collected first and concatenated once, instead of
# re-copying the accumulated rows on every loop iteration.
study_frames = []
for study_path in xgboost_studies:
    # NOTE: joblib.load unpickles the file; only load study files this project wrote itself.
    cur_study = joblib.load(study_path)
    cur_study_df = cur_study.trials_dataframe()
    cur_study_df["dataset"] = cur_study.user_attrs["dataset"]
    study_frames.append(cur_study_df)

# pd.concat raises on an empty list, so fall back to an empty frame when no study was found.
xgboost_experiments = pd.concat(study_frames) if study_frames else pd.DataFrame()


In [22]:
# Highest objective value (the "value" column of the trials table) reached per dataset.
best_value_by_dataset = xgboost_experiments.groupby("dataset")["value"].max()
best_value_by_dataset

dataset
Manual           0.701863
distilroberta    0.802143
mpnet            0.795932
tfidf            0.704690
Name: value, dtype: float64