In [1]:
import pandas as pd
from pathlib import Path
import os
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
import utils
warnings.filterwarnings(action='ignore', category=UserWarning)
import optuna
import pandas as pd
from sklearn import linear_model
from sklearn import ensemble
from sklearn import datasets
from sklearn import model_selection
import joblib

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the prepared ("gold") feature files:
# <parent of the current working directory>/data/gold.
# NOTE(review): the earlier comment on this cell said the data are read and the
# variables that differ between stressed and non-stressed individuals are selected.
# This cell only builds the path, so that selection presumably happens upstream — confirm.
_project_root = Path.cwd().parent
data_path = _project_root / "data" / "gold"

In [3]:
# Load the four training feature sets stored as CSV files under data_path.
def _read_feature_set(file_name):
    """Read one feature CSV from data_path and split it into (frame, X, y).

    The last column of the file is used as the target ``y``; every column
    before it is used as a feature in ``X``.
    """
    frame = pd.read_csv(os.path.join(data_path, file_name))
    return frame, frame.iloc[:, :-1], frame.iloc[:, -1]


df_manual, X_manual, y_manual = _read_feature_set('manual_features_train.csv')
df_tfidf, X_tfidf, y_tfidf = _read_feature_set('tfidf_features_train.csv')
df_mpnet, X_mpnet, y_mpnet = _read_feature_set('mpnet_features_train.csv')
df_distilroberta, X_distilroberta, y_distilroberta = _read_feature_set(
    'distilroberta_features_train.csv'
)

### Logistic Regression

##### Manual

In [4]:
def objective_manual_logreg(trial):
    """Optuna objective for logistic regression on the manual feature set.

    Samples the ``C`` parameter of ``LogisticRegression`` log-uniformly from
    [1e-10, 1e10] under the name ``"logreg_c"`` and evaluates the model with
    3-fold cross-validation on the module-level ``X_manual`` / ``y_manual``.

    Args:
        trial: Optuna trial used to sample the hyperparameter.

    Returns:
        Mean of the per-fold F1 scores.
    """
    c_value = trial.suggest_float("logreg_c", 1e-10, 1e10, log=True)
    model = linear_model.LogisticRegression(C=c_value)

    # F1 per fold, with the folds evaluated in parallel on all available cores.
    fold_scores = model_selection.cross_val_score(
        model, X_manual, y_manual, scoring="f1", cv=3, n_jobs=-1
    )
    return fold_scores.mean()

# Run (or resume) the hyperparameter search for logistic regression on the manual
# features, persisting the Optuna study as a pickle next to the feature files.
study_path = os.path.join(data_path, 'manual_logreg.pkl')

if os.path.isfile(study_path):
    # Resume the stored study so trials from earlier runs are kept. Each run of this
    # cell adds another 1000 trials to whatever the stored study already holds.
    # joblib.load unpickles the file: only load study files written by this notebook.
    study = joblib.load(study_path)
else:
    # First run: create the study only when there is nothing to resume, so no
    # throwaway study (and no misleading "new study created" log line) is produced.
    # NOTE(review): the sampler is not seeded, so the search is not reproducible run to run.
    study = optuna.create_study(
        direction="maximize",
        study_name="manual_logistic_regression",
        sampler=optuna.samplers.NSGAIISampler(),
    )
    # Metadata stored with the study; 'dataset' is read back when studies are compared.
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'Manual')

study.optimize(objective_manual_logreg, n_trials=1000)

# Trailing ';' keeps the cell from echoing joblib.dump's return value
# (the absolute local path of the pickle) into the notebook output.
joblib.dump(study, study_path);

[I 2024-06-26 03:34:17,600] A new study created in memory with name: manual_logistic_regression
[I 2024-06-26 03:34:18,765] Trial 0 finished with value: 0.6880262098607804 and parameters: {'logreg_c': 8.048753605376842e-07}. Best is trial 0 with value: 0.6880262098607804.
[I 2024-06-26 03:34:19,506] Trial 1 finished with value: 0.6880262098607804 and parameters: {'logreg_c': 4.187646535662168e-09}. Best is trial 0 with value: 0.6880262098607804.
[I 2024-06-26 03:34:20,279] Trial 2 finished with value: 0.6748218273415384 and parameters: {'logreg_c': 0.05401402275821551}. Best is trial 0 with value: 0.6880262098607804.
[I 2024-06-26 03:34:20,991] Trial 3 finished with value: 0.7059478374120198 and parameters: {'logreg_c': 0.0006234145234575011}. Best is trial 3 with value: 0.7059478374120198.
[I 2024-06-26 03:34:21,007] Trial 4 finished with value: 0.6731833113308987 and parameters: {'logreg_c': 0.047486832911573504}. Best is trial 3 with value: 0.7059478374120198.
[I 2024-06-26 03:34:21

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\manual_logreg.pkl']

##### TFIDF

In [5]:
def objective_tfidf_logreg(trial):
    """Optuna objective for logistic regression on the TF-IDF feature set.

    Samples the ``C`` parameter of ``LogisticRegression`` log-uniformly from
    [1e-10, 1e10] under the name ``"logreg_c"`` and evaluates the model with
    3-fold cross-validation on the module-level ``X_tfidf`` / ``y_tfidf``.

    Args:
        trial: Optuna trial used to sample the hyperparameter.

    Returns:
        Mean of the per-fold F1 scores.
    """
    c_value = trial.suggest_float("logreg_c", 1e-10, 1e10, log=True)
    model = linear_model.LogisticRegression(C=c_value)

    # F1 per fold, with the folds evaluated in parallel on all available cores.
    fold_scores = model_selection.cross_val_score(
        model, X_tfidf, y_tfidf, scoring="f1", cv=3, n_jobs=-1
    )
    return fold_scores.mean()

# Run (or resume) the hyperparameter search for logistic regression on the TF-IDF
# features, persisting the Optuna study as a pickle next to the feature files.
study_path = os.path.join(data_path, 'tfidf_logreg.pkl')

if os.path.isfile(study_path):
    # Resume the stored study so trials from earlier runs are kept. Each run of this
    # cell adds another 1000 trials to whatever the stored study already holds.
    # joblib.load unpickles the file: only load study files written by this notebook.
    study = joblib.load(study_path)
else:
    # First run: create the study only when there is nothing to resume, so no
    # throwaway study (and no misleading "new study created" log line) is produced.
    # NOTE(review): the sampler is not seeded, so the search is not reproducible run to run.
    study = optuna.create_study(
        direction="maximize",
        study_name="tfidf_logistic_regression",
        sampler=optuna.samplers.NSGAIISampler(),
    )
    # Metadata stored with the study; 'dataset' is read back when studies are compared.
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'TFIDF')

study.optimize(objective_tfidf_logreg, n_trials=1000)

# Trailing ';' keeps the cell from echoing joblib.dump's return value
# (the absolute local path of the pickle) into the notebook output.
joblib.dump(study, study_path);


[I 2024-06-26 03:34:39,250] A new study created in memory with name: tfidf_logistic_regression
[I 2024-06-26 03:34:39,684] Trial 0 finished with value: 0.6880262098607804 and parameters: {'logreg_c': 6.63104200861721e-07}. Best is trial 0 with value: 0.6880262098607804.
[I 2024-06-26 03:34:40,170] Trial 1 finished with value: 0.7266670215551753 and parameters: {'logreg_c': 168424661.36866713}. Best is trial 1 with value: 0.7266670215551753.
[I 2024-06-26 03:34:40,688] Trial 2 finished with value: 0.6880262098607804 and parameters: {'logreg_c': 3.7262702976553616e-06}. Best is trial 1 with value: 0.7266670215551753.
[I 2024-06-26 03:34:41,095] Trial 3 finished with value: 0.6880262098607804 and parameters: {'logreg_c': 2.637483187610834e-07}. Best is trial 1 with value: 0.7266670215551753.
[I 2024-06-26 03:34:41,602] Trial 4 finished with value: 0.7249739644494807 and parameters: {'logreg_c': 724.46448023705}. Best is trial 1 with value: 0.7266670215551753.
[I 2024-06-26 03:34:42,090] T

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\tfidf_logreg.pkl']

In [6]:
def objective_mpnet_logreg(trial):
    """Optuna objective for logistic regression on the MPNet feature set.

    Samples the ``C`` parameter of ``LogisticRegression`` log-uniformly from
    [1e-10, 1e10] under the name ``"logreg_c"`` and evaluates the model with
    3-fold cross-validation on the module-level ``X_mpnet`` / ``y_mpnet``.

    Args:
        trial: Optuna trial used to sample the hyperparameter.

    Returns:
        Mean of the per-fold F1 scores.
    """
    c_value = trial.suggest_float("logreg_c", 1e-10, 1e10, log=True)
    model = linear_model.LogisticRegression(C=c_value)

    # F1 per fold, with the folds evaluated in parallel on all available cores.
    fold_scores = model_selection.cross_val_score(
        model, X_mpnet, y_mpnet, scoring="f1", cv=3, n_jobs=-1
    )
    return fold_scores.mean()

# Run (or resume) the hyperparameter search for logistic regression on the MPNet
# features, persisting the Optuna study as a pickle next to the feature files.
study_path = os.path.join(data_path, 'mpnet_logreg.pkl')

if os.path.isfile(study_path):
    # Resume the stored study so trials from earlier runs are kept. Each run of this
    # cell adds another 1000 trials to whatever the stored study already holds.
    # joblib.load unpickles the file: only load study files written by this notebook.
    study = joblib.load(study_path)
else:
    # First run: create the study only when there is nothing to resume, so no
    # throwaway study (and no misleading "new study created" log line) is produced.
    # NOTE(review): the sampler is not seeded, so the search is not reproducible run to run.
    study = optuna.create_study(
        direction="maximize",
        study_name="mpnet_logistic_regression",
        sampler=optuna.samplers.NSGAIISampler(),
    )
    # Metadata stored with the study; 'dataset' is read back when studies are compared.
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'MPNET')

study.optimize(objective_mpnet_logreg, n_trials=1000)

# Trailing ';' keeps the cell from echoing joblib.dump's return value
# (the absolute local path of the pickle) into the notebook output.
joblib.dump(study, study_path);

[I 2024-06-26 03:42:25,321] A new study created in memory with name: mpnet_logistic_regression
[I 2024-06-26 03:42:25,520] Trial 0 finished with value: 0.6880262098607804 and parameters: {'logreg_c': 5.7846409758341016e-08}. Best is trial 0 with value: 0.6880262098607804.
[I 2024-06-26 03:42:25,821] Trial 1 finished with value: 0.6880262098607804 and parameters: {'logreg_c': 4.109236134605843e-08}. Best is trial 0 with value: 0.6880262098607804.
[I 2024-06-26 03:42:26,007] Trial 2 finished with value: 0.7895776815080038 and parameters: {'logreg_c': 0.07183196082395338}. Best is trial 2 with value: 0.7895776815080038.
[I 2024-06-26 03:42:26,196] Trial 3 finished with value: 0.6880262098607804 and parameters: {'logreg_c': 5.3634436716685207e-08}. Best is trial 2 with value: 0.7895776815080038.
[I 2024-06-26 03:42:26,541] Trial 4 finished with value: 0.7127996940637097 and parameters: {'logreg_c': 8502.907526332512}. Best is trial 2 with value: 0.7895776815080038.
[I 2024-06-26 03:42:26,7

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\mpnet_logreg.pkl']

In [7]:
def objective_distilroberta_logreg(trial):
    """Optuna objective for logistic regression on the DistilRoBERTa feature set.

    Samples the ``C`` parameter of ``LogisticRegression`` log-uniformly from
    [1e-10, 1e10] under the name ``"logreg_c"`` and evaluates the model with
    3-fold cross-validation on the module-level ``X_distilroberta`` /
    ``y_distilroberta``.

    Args:
        trial: Optuna trial used to sample the hyperparameter.

    Returns:
        Mean of the per-fold F1 scores.
    """
    c_value = trial.suggest_float("logreg_c", 1e-10, 1e10, log=True)
    model = linear_model.LogisticRegression(C=c_value)

    # F1 per fold, with the folds evaluated in parallel on all available cores.
    fold_scores = model_selection.cross_val_score(
        model, X_distilroberta, y_distilroberta, scoring="f1", cv=3, n_jobs=-1
    )
    return fold_scores.mean()

# Run (or resume) the hyperparameter search for logistic regression on the DistilRoBERTa
# features, persisting the Optuna study as a pickle next to the feature files.
study_path = os.path.join(data_path, 'distilroberta_logreg.pkl')

if os.path.isfile(study_path):
    # Resume the stored study so trials from earlier runs are kept. Each run of this
    # cell adds another 1000 trials to whatever the stored study already holds.
    # joblib.load unpickles the file: only load study files written by this notebook.
    study = joblib.load(study_path)
else:
    # First run: create the study only when there is nothing to resume, so no
    # throwaway study (and no misleading "new study created" log line) is produced.
    # NOTE(review): the sampler is not seeded, so the search is not reproducible run to run.
    study = optuna.create_study(
        direction="maximize",
        study_name="distilroberta_logistic_regression",
        sampler=optuna.samplers.NSGAIISampler(),
    )
    # Metadata stored with the study; 'dataset' is read back when studies are compared.
    study.set_user_attr('contributors', ['Francisco'])
    study.set_user_attr('dataset', 'distilroberta')

study.optimize(objective_distilroberta_logreg, n_trials=1000)

# Trailing ';' keeps the cell from echoing joblib.dump's return value
# (the absolute local path of the pickle) into the notebook output.
joblib.dump(study, study_path);

[I 2024-06-26 03:46:29,239] A new study created in memory with name: distilroberta_logistic_regression
[I 2024-06-26 03:46:29,544] Trial 0 finished with value: 0.7908241985210408 and parameters: {'logreg_c': 62.20928954562191}. Best is trial 0 with value: 0.7908241985210408.
[I 2024-06-26 03:46:29,732] Trial 1 finished with value: 0.6880262098607804 and parameters: {'logreg_c': 7.249190770284022e-07}. Best is trial 0 with value: 0.7908241985210408.
[I 2024-06-26 03:46:29,918] Trial 2 finished with value: 0.6880262098607804 and parameters: {'logreg_c': 2.2199520471673302e-06}. Best is trial 0 with value: 0.7908241985210408.
[I 2024-06-26 03:46:30,103] Trial 3 finished with value: 0.6880262098607804 and parameters: {'logreg_c': 8.821790971157276e-07}. Best is trial 0 with value: 0.7908241985210408.
[I 2024-06-26 03:46:30,450] Trial 4 finished with value: 0.7488811018622578 and parameters: {'logreg_c': 70712.2024759635}. Best is trial 0 with value: 0.7908241985210408.
[I 2024-06-26 03:46:

['c:\\Users\\franc.FJMOYAO\\Desktop\\Programming Projects\\depression_detection\\data\\gold\\distilroberta_logreg.pkl']

In [8]:
# Paths of the pickled logistic-regression studies stored under data_path.
# Only '.pkl' entries whose name contains "logreg" are kept, so any other file that
# happens to contain "logreg" is not handed to joblib.load later on; the list is
# sorted because os.listdir returns entries in arbitrary order.
logreg_studies = sorted(
    os.path.join(data_path, entry)
    for entry in os.listdir(data_path)
    if "logreg" in entry and entry.endswith(".pkl")
)

In [9]:
# Gather the trials of every stored logistic-regression study into one frame,
# tagging each row with the 'dataset' user attribute of the study it came from.
study_frames = []
for study_path in logreg_studies:
    # joblib.load unpickles the file: only load study files written by this notebook.
    cur_study = joblib.load(study_path)
    cur_study_df = cur_study.trials_dataframe()
    cur_study_df["dataset"] = cur_study.user_attrs["dataset"]
    study_frames.append(cur_study_df)

# Concatenate once instead of growing the frame inside the loop: this avoids
# re-copying the accumulated rows on every iteration and avoids concatenating
# with an empty seed frame. pd.concat raises on an empty list, so fall back to
# an empty frame when no study was found.
logreg_experiments = pd.concat(study_frames) if study_frames else pd.DataFrame()


In [10]:
# Highest trial value recorded for each dataset across the collected studies.
best_value_by_dataset = logreg_experiments.groupby("dataset")["value"].max()
best_value_by_dataset

dataset
MPNET            0.802525
Manual           0.707575
TFIDF            0.762898
distilroberta    0.814122
Name: value, dtype: float64