In [1]:
import pandas as pd
from pathlib import Path
import os
import warnings
warnings.filterwarnings(action="ignore", category=UserWarning)
import optuna
import pandas as pd
import lightgbm as lgb
from sklearn import model_selection
import joblib

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Directory that holds the feature CSVs and the persisted studies:
# <parent of the current working directory>/data/gold.
# NOTE(review): the original comment also mentioned selecting the variables
# that differ between individuals with and without stress; no such selection
# happens in this cell.
data_path = Path.cwd().parent / "data" / "gold"

In [7]:
# Load the four training feature sets (manual, tfidf, mpnet, distilroberta).
def _load_feature_set(name):
    """Read ``<name>_features_train.csv`` from ``data_path`` and split it.

    Parameters
    ----------
    name : str
        File-name prefix of the feature set, e.g. ``"manual"``.

    Returns
    -------
    tuple
        ``(frame, X, y)`` where ``frame`` is the table as returned by
        ``pd.read_csv``, ``X`` is every column except the last one and ``y``
        is the last column.
    """
    frame = pd.read_csv(os.path.join(data_path, f"{name}_features_train.csv"))
    # Assumes the last column holds the target -- TODO confirm against the
    # step that writes these CSVs.
    return frame, frame.iloc[:, :-1], frame.iloc[:, -1]


df_manual, X_manual, y_manual = _load_feature_set("manual")
df_tfidf, X_tfidf, y_tfidf = _load_feature_set("tfidf")
df_mpnet, X_mpnet, y_mpnet = _load_feature_set("mpnet")
df_distilroberta, X_distilroberta, y_distilroberta = _load_feature_set("distilroberta")

### LightGBM

##### Manual

In [9]:
def objective_manual_lgbm(trial):
    """Optuna objective for a LightGBM classifier on the manual feature set.

    Samples ``num_leaves``, ``n_estimators`` and the learning rate (registered
    under the name ``"lrate"``) from ``trial``, then scores a binary
    ``LGBMClassifier`` on ``X_manual`` / ``y_manual`` with 3-fold
    cross-validation.

    Parameters
    ----------
    trial
        Object providing ``suggest_int`` and ``suggest_float``.

    Returns
    -------
    The mean of the per-fold scores returned by ``cross_val_score`` with
    ``scoring="f1"``.
    """
    # The dict literal is evaluated top to bottom, so the suggest calls run in
    # the order num_leaves -> n_estimators -> lrate.
    sampled_params = {
        "num_leaves": trial.suggest_int("num_leaves", 10, 1000),
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "learning_rate": trial.suggest_float("lrate", 0.0001, 0.4, log=True),
    }
    model = lgb.LGBMClassifier(objective="binary", **sampled_params)

    fold_scores = model_selection.cross_val_score(
        model, X_manual, y_manual, n_jobs=-1, cv=3, scoring="f1"
    )
    return fold_scores.mean()

# Resume the persisted study when its pickle already exists; otherwise create
# a fresh one, tag it, and persist it before the first trial runs.
study_path = os.path.join(data_path,'manual_lgbm.pkl')
if os.path.isfile(study_path):
    # NOTE(security): joblib.load unpickles arbitrary objects; only load study
    # files that were written by this notebook.
    study = joblib.load(study_path)
else:
    study = optuna.create_study(direction="maximize",study_name="manual_lgbm",
                                sampler=optuna.samplers.NSGAIISampler())
    # Metadata stored on the study; 'dataset' is read back later when the
    # persisted studies are compared.
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'Manual')
    joblib.dump(study, study_path)

# Every execution of this cell adds another 100 trials to the study.
study.optimize(objective_manual_lgbm, n_trials=100)
# The trailing ';' stops the notebook from echoing the absolute pickle path
# returned by joblib.dump as the cell output.
joblib.dump(study, study_path);

[I 2024-06-26 16:07:04,696] A new study created in memory with name: manual_lgbm
[I 2024-06-26 16:07:08,588] Trial 0 finished with value: 0.6996852249437046 and parameters: {'num_leaves': 651, 'n_estimators': 351, 'lrate': 0.00033413903360545155}. Best is trial 0 with value: 0.6996852249437046.
[I 2024-06-26 16:07:11,498] Trial 1 finished with value: 0.652230270135048 and parameters: {'num_leaves': 455, 'n_estimators': 299, 'lrate': 0.003258689273030405}. Best is trial 0 with value: 0.6996852249437046.
[I 2024-06-26 16:07:17,507] Trial 2 finished with value: 0.6484798530625747 and parameters: {'num_leaves': 623, 'n_estimators': 802, 'lrate': 0.0012492292739613475}. Best is trial 0 with value: 0.6996852249437046.
[I 2024-06-26 16:07:21,262] Trial 3 finished with value: 0.5786170301975617 and parameters: {'num_leaves': 942, 'n_estimators': 474, 'lrate': 0.2002727976377556}. Best is trial 0 with value: 0.6996852249437046.
[I 2024-06-26 16:07:22,244] Trial 4 finished with value: 0.66568410

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\manual_lgbm.pkl']

##### TFIDF

In [10]:
def objective_tfidf_lgbm(trial):
    """Optuna objective for a LightGBM classifier on the tfidf feature set.

    Samples ``num_leaves``, ``n_estimators`` and the learning rate (registered
    under the name ``"lrate"``) from ``trial``, then scores a binary
    ``LGBMClassifier`` on ``X_tfidf`` / ``y_tfidf`` with 3-fold
    cross-validation.

    Parameters
    ----------
    trial
        Object providing ``suggest_int`` and ``suggest_float``.

    Returns
    -------
    The mean of the per-fold scores returned by ``cross_val_score`` with
    ``scoring="f1"``.
    """
    # The dict literal is evaluated top to bottom, so the suggest calls run in
    # the order num_leaves -> n_estimators -> lrate.
    sampled_params = {
        "num_leaves": trial.suggest_int("num_leaves", 10, 1000),
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "learning_rate": trial.suggest_float("lrate", 0.0001, 0.4, log=True),
    }
    model = lgb.LGBMClassifier(objective="binary", **sampled_params)

    fold_scores = model_selection.cross_val_score(
        model, X_tfidf, y_tfidf, n_jobs=-1, cv=3, scoring="f1"
    )
    return fold_scores.mean()

# Resume the persisted study when its pickle already exists; otherwise create
# a fresh one, tag it, and persist it before the first trial runs.
study_path = os.path.join(data_path,'tfidf_lgbm.pkl')
if os.path.isfile(study_path):
    # NOTE(security): joblib.load unpickles arbitrary objects; only load study
    # files that were written by this notebook.
    study = joblib.load(study_path)
else:
    study = optuna.create_study(direction="maximize",study_name="tfidf_lgbm",
                                sampler=optuna.samplers.NSGAIISampler())
    # Metadata stored on the study; 'dataset' is read back later when the
    # persisted studies are compared.
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'tfidf')
    joblib.dump(study, study_path)

# Every execution of this cell adds another 100 trials to the study.
study.optimize(objective_tfidf_lgbm, n_trials=100)
# The trailing ';' stops the notebook from echoing the absolute pickle path
# returned by joblib.dump as the cell output.
joblib.dump(study, study_path);

[I 2024-06-26 16:11:52,310] A new study created in memory with name: tfidf_lgbm
[I 2024-06-26 16:11:58,833] Trial 0 finished with value: 0.709943367999224 and parameters: {'num_leaves': 41, 'n_estimators': 960, 'lrate': 0.002578693326304843}. Best is trial 0 with value: 0.709943367999224.
[I 2024-06-26 16:12:04,957] Trial 1 finished with value: 0.7103865889177402 and parameters: {'num_leaves': 441, 'n_estimators': 540, 'lrate': 0.007087496636211073}. Best is trial 1 with value: 0.7103865889177402.
[I 2024-06-26 16:12:09,088] Trial 2 finished with value: 0.6939764194941977 and parameters: {'num_leaves': 129, 'n_estimators': 374, 'lrate': 0.0005015592522345143}. Best is trial 1 with value: 0.7103865889177402.
[I 2024-06-26 16:12:14,278] Trial 3 finished with value: 0.6992138585973514 and parameters: {'num_leaves': 338, 'n_estimators': 844, 'lrate': 0.17939366819843866}. Best is trial 1 with value: 0.7103865889177402.
[I 2024-06-26 16:12:15,806] Trial 4 finished with value: 0.691337786170

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\tfidf_lgbm.pkl']

In [11]:
def objective_mpnet_lgbm(trial):
    """Optuna objective for a LightGBM classifier on the mpnet feature set.

    Samples ``num_leaves``, ``n_estimators`` and the learning rate (registered
    under the name ``"lrate"``) from ``trial``, then scores a binary
    ``LGBMClassifier`` on ``X_mpnet`` / ``y_mpnet`` with 3-fold
    cross-validation.

    Parameters
    ----------
    trial
        Object providing ``suggest_int`` and ``suggest_float``.

    Returns
    -------
    The mean of the per-fold scores returned by ``cross_val_score`` with
    ``scoring="f1"``.
    """
    # The dict literal is evaluated top to bottom, so the suggest calls run in
    # the order num_leaves -> n_estimators -> lrate.
    sampled_params = {
        "num_leaves": trial.suggest_int("num_leaves", 10, 1000),
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "learning_rate": trial.suggest_float("lrate", 0.0001, 0.4, log=True),
    }
    model = lgb.LGBMClassifier(objective="binary", **sampled_params)

    fold_scores = model_selection.cross_val_score(
        model, X_mpnet, y_mpnet, n_jobs=-1, cv=3, scoring="f1"
    )
    return fold_scores.mean()

# Resume the persisted study when its pickle already exists; otherwise create
# a fresh one, tag it, and persist it before the first trial runs.
study_path = os.path.join(data_path,'mpnet_lgbm.pkl')
if os.path.isfile(study_path):
    # NOTE(security): joblib.load unpickles arbitrary objects; only load study
    # files that were written by this notebook.
    study = joblib.load(study_path)
else:
    study = optuna.create_study(direction="maximize",study_name="mpnet_lgbm",
                                sampler=optuna.samplers.NSGAIISampler())
    # Metadata stored on the study; 'dataset' is read back later when the
    # persisted studies are compared.
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'mpnet')
    joblib.dump(study, study_path)

# Every execution of this cell adds another 100 trials to the study.
study.optimize(objective_mpnet_lgbm, n_trials=100)
# The trailing ';' stops the notebook from echoing the absolute pickle path
# returned by joblib.dump as the cell output.
joblib.dump(study, study_path);

[I 2024-06-26 16:20:05,199] A new study created in memory with name: mpnet_lgbm
[I 2024-06-26 16:20:36,597] Trial 0 finished with value: 0.784976238522264 and parameters: {'num_leaves': 921, 'n_estimators': 406, 'lrate': 0.017746881926804122}. Best is trial 0 with value: 0.784976238522264.
[I 2024-06-26 16:20:47,136] Trial 1 finished with value: 0.7867799710518426 and parameters: {'num_leaves': 844, 'n_estimators': 818, 'lrate': 0.2660766202412685}. Best is trial 1 with value: 0.7867799710518426.
[I 2024-06-26 16:21:04,762] Trial 2 finished with value: 0.7952122734436564 and parameters: {'num_leaves': 679, 'n_estimators': 223, 'lrate': 0.08345307640255689}. Best is trial 2 with value: 0.7952122734436564.
[I 2024-06-26 16:22:05,202] Trial 3 finished with value: 0.7734903480006405 and parameters: {'num_leaves': 250, 'n_estimators': 899, 'lrate': 0.002168509994000225}. Best is trial 2 with value: 0.7952122734436564.
[I 2024-06-26 16:22:39,310] Trial 4 finished with value: 0.76095511808986

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\mpnet_lgbm.pkl']

In [12]:
def objective_distilroberta_lgbm(trial):
    """Optuna objective for a LightGBM classifier on the distilroberta feature set.

    Samples ``num_leaves``, ``n_estimators`` and the learning rate (registered
    under the name ``"lrate"``) from ``trial``, then scores a binary
    ``LGBMClassifier`` on ``X_distilroberta`` / ``y_distilroberta`` with
    3-fold cross-validation.

    Parameters
    ----------
    trial
        Object providing ``suggest_int`` and ``suggest_float``.

    Returns
    -------
    The mean of the per-fold scores returned by ``cross_val_score`` with
    ``scoring="f1"``.
    """
    # The dict literal is evaluated top to bottom, so the suggest calls run in
    # the order num_leaves -> n_estimators -> lrate.
    sampled_params = {
        "num_leaves": trial.suggest_int("num_leaves", 10, 1000),
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "learning_rate": trial.suggest_float("lrate", 0.0001, 0.4, log=True),
    }
    model = lgb.LGBMClassifier(objective="binary", **sampled_params)

    fold_scores = model_selection.cross_val_score(
        model, X_distilroberta, y_distilroberta, n_jobs=-1, cv=3, scoring="f1"
    )
    return fold_scores.mean()

# Resume the persisted study when its pickle already exists; otherwise create
# a fresh one, tag it, and persist it before the first trial runs.
study_path = os.path.join(data_path,'distilroberta_lgbm.pkl')
if os.path.isfile(study_path):
    # NOTE(security): joblib.load unpickles arbitrary objects; only load study
    # files that were written by this notebook.
    study = joblib.load(study_path)
else:
    study = optuna.create_study(direction="maximize",study_name="distilroberta_lgbm",
                                sampler=optuna.samplers.NSGAIISampler())
    # Metadata stored on the study; 'dataset' is read back later when the
    # persisted studies are compared.
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'distilroberta')
    joblib.dump(study, study_path)

# Every execution of this cell adds another 100 trials to the study.
study.optimize(objective_distilroberta_lgbm, n_trials=100)
# The trailing ';' stops the notebook from echoing the absolute pickle path
# returned by joblib.dump as the cell output.
joblib.dump(study, study_path);

[I 2024-06-26 17:07:20,901] A new study created in memory with name: distilroberta_lgbm
[I 2024-06-26 17:07:24,727] Trial 0 finished with value: 0.6880262098607804 and parameters: {'num_leaves': 66, 'n_estimators': 58, 'lrate': 0.00011490361945642318}. Best is trial 0 with value: 0.6880262098607804.
[I 2024-06-26 17:07:26,462] Trial 1 finished with value: 0.7642948142008711 and parameters: {'num_leaves': 898, 'n_estimators': 11, 'lrate': 0.16928615748493983}. Best is trial 1 with value: 0.7642948142008711.
[I 2024-06-26 17:07:29,637] Trial 2 finished with value: 0.6880262098607804 and parameters: {'num_leaves': 283, 'n_estimators': 38, 'lrate': 0.00013816908319016423}. Best is trial 1 with value: 0.7642948142008711.
[I 2024-06-26 17:07:47,188] Trial 3 finished with value: 0.7439415596020785 and parameters: {'num_leaves': 227, 'n_estimators': 252, 'lrate': 0.00042434608531811323}. Best is trial 1 with value: 0.7642948142008711.
[I 2024-06-26 17:07:57,659] Trial 4 finished with value: 0.

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\distilroberta_lgbm.pkl']

In [16]:
# Paths of the persisted LightGBM studies found in data_path.
# Only '.pkl' files are kept so that other files whose name happens to contain
# "lgbm" are not handed to joblib.load later on; sorting makes the order
# independent of the arbitrary order returned by os.listdir.
lgbm_studies = sorted(
    os.path.join(data_path, file_name)
    for file_name in os.listdir(data_path)
    if "lgbm" in file_name and file_name.endswith(".pkl")
)

In [17]:
# Build one trials table per persisted study, tagging every row with the
# study's 'dataset' user attribute, and concatenate them in a single call
# instead of re-copying the accumulated frame on every iteration.
study_frames = []
for study_path in lgbm_studies:
    # NOTE(security): joblib.load unpickles arbitrary objects; only load study
    # files from a trusted location.
    cur_study = joblib.load(study_path)
    cur_study_df = cur_study.trials_dataframe()
    cur_study_df["dataset"] = cur_study.user_attrs["dataset"]
    study_frames.append(cur_study_df)

# pd.concat raises on an empty list, so keep the original empty-frame result
# when no study file was found.
lgbm_experiments = pd.concat(study_frames) if study_frames else pd.DataFrame()


In [18]:
# Highest "value" recorded across all trials, grouped by dataset.
best_value_per_dataset = lgbm_experiments.groupby("dataset")["value"].max()
best_value_per_dataset

dataset
Manual           0.702077
distilroberta    0.807987
mpnet            0.795212
tfidf            0.715452
Name: value, dtype: float64