In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score

# Read/rename data

In [None]:
# Load the CSV and, in the same chain, replace the verbose headers
# (which embed units/ranges such as "(1-10)") with identifier-style names.
df = (
    pd.read_csv("content/mental_health_diagnosis_treatment_.csv")
    .rename(
        columns={
            'Therapy Type': 'TherapyType',
            'Symptom Severity (1-10)': 'SymptomSeverity',
            'Mood Score (1-10)': 'MoodScore',
            'Sleep Quality (1-10)': 'SleepQuality',
            'Physical Activity (hrs/week)': 'PhysicalActivity',
            'Treatment Duration (weeks)': 'TreatmentDuration',
            'Stress Level (1-10)': 'StressLevel',
            'Treatment Progress (1-10)': 'TreatmentProgress',
            'Adherence to Treatment (%)': 'TreatmentAdherence',
        }
    )
)

# Cell output: the dtype pandas inferred for each column.
df.dtypes


# Convert classes to Id

In [None]:

def classToId(df: pd.DataFrame, featureName):
  """Build a lookup table that assigns an integer id to each distinct value.

  Args:
    df: frame containing the column ``featureName``.
    featureName: name of the column whose distinct values are enumerated.

  Returns:
    A DataFrame with two columns: ``featureName`` (distinct values in
    ascending order) and ``f"{featureName}Id"`` (0-based position in that
    order).
  """
  lookup = (
    df[featureName]
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
    .to_frame()
  )
  # After the reset, the positional index doubles as the class id.
  lookup[f"{featureName}Id"] = lookup.index
  return lookup

# One value -> id lookup table per categorical column.
genderMap, diagnosisMap, medicationMap, therapyTypeMap = (
    classToId(df[[column]], column)
    for column in ("Gender", "Diagnosis", "Medication", "TherapyType")
)

# Outcome ids are assigned by hand rather than by sort order:
# Deteriorated = -1, No Change = 0, Improved = 1.
outcomeMap = pd.DataFrame({"Outcome": ["Deteriorated", "No Change", "Improved"], "OutcomeId": [-1, 0, 1]})

# Left-merge each lookup so every original row is kept and gains its *Id column.
df = (
    df
    .merge(right=genderMap, how="left", on="Gender")
    .merge(right=diagnosisMap, how="left", on="Diagnosis")
    .merge(right=medicationMap, how="left", on="Medication")
    .merge(right=therapyTypeMap, how="left", on="TherapyType")
    .merge(right=outcomeMap, how="left", on="Outcome")
)


# Analysis

In [None]:
# Cell output: pandas' default summary statistics for df.
df.describe()

# Classify age

In [None]:
def ageGroupClassify(age):
    """Map an age to one of three group ids.

    The observed age span 18..60 is split into three bands of width 14:
    [18, 32) -> 1 and [32, 46) -> 2. Every other value (46 and above, but
    also anything below 18) falls through to group 3.
    """
    bands = ((18, 32), (32, 46))
    for groupId, (lower, upper) in enumerate(bands, start=1):
        if lower <= age < upper:
            return groupId
    return 3

# Derive the age-group id for every row from its Age value.
df["AgeGroupId"] = df["Age"].map(ageGroupClassify)

# Therapy score by attribute

In [None]:

def getTherapyByAttributeScore(attribute, data=None):
    """Score every (attribute value, therapy type) pair by its improvement rate.

    Args:
        attribute: name of the grouping column (e.g. "DiagnosisId").
        data: frame holding the columns ``attribute``, "TherapyTypeId",
            "OutcomeId" and "Outcome". Defaults to the notebook-level ``df``
            so existing single-argument calls keep working.

    Returns:
        A DataFrame with one row per (attribute, TherapyTypeId) pair and the
        columns:
          * ``attribute`` and "TherapyTypeId" - the pair itself;
          * "TotalOutcomes" - number of non-null outcomes recorded for the pair;
          * ``f"{attribute}ImprovedRate"`` - share of those outcomes whose
            OutcomeId is 1;
          * ``f"{attribute}ByTherapyScore"`` - the rate plus
            log2(TotalOutcomes), so pairs backed by more rows score higher.
    """
    if data is None:
        data = df  # fall back to the notebook-level frame

    groupKeys = [attribute, "TherapyTypeId"]
    rateColumn = f"{attribute}ImprovedRate"
    scoreColumn = f"{attribute}ByTherapyScore"

    # Non-null outcome count per (attribute, therapy, outcome id).
    outcomeCounts = (
        data.groupby(by=groupKeys + ["OutcomeId"])
        .agg({"Outcome": "count"})
        .reset_index()
    )
    # Keep the count only on the "improved" rows (OutcomeId == 1) so both
    # totals can be produced by a single grouped sum instead of re-filtering
    # the whole table once per row.
    outcomeCounts["ImprovedCount"] = outcomeCounts["Outcome"].where(outcomeCounts["OutcomeId"] == 1, 0)

    dfTherapyByScore = outcomeCounts.groupby(by=groupKeys, as_index=False).agg(
        TotalOutcomes=("Outcome", "sum"),
        ImprovedCount=("ImprovedCount", "sum"),
    )
    dfTherapyByScore[rateColumn] = dfTherapyByScore["ImprovedCount"] / dfTherapyByScore["TotalOutcomes"]
    dfTherapyByScore[scoreColumn] = dfTherapyByScore[rateColumn] + np.log2(dfTherapyByScore["TotalOutcomes"])
    return dfTherapyByScore.drop(columns="ImprovedCount")


# Attributes for which a per-therapy improvement score is attached to df.
featuresClasses = ["DiagnosisId", "AgeGroupId", "MedicationId"]
for feature in featuresClasses:
    scoreColumns = [f"{feature}ImprovedRate", f"{feature}ByTherapyScore"]
    featureScore = getTherapyByAttributeScore(attribute=feature)
    # Drop score columns left over from a previous run of this cell; otherwise
    # the merge would emit "_x"/"_y" suffixed duplicates and later lookups by
    # the plain column names would fail.
    df = df.drop(columns=scoreColumns, errors="ignore").merge(
        right=featureScore[[feature, "TherapyTypeId"] + scoreColumns],
        on=[feature, "TherapyTypeId"],
    )


In [None]:
# Pearson correlation matrix over the raw measurements and the encoded ids.
df_corr = df.loc[:, [
    "Age",
    "SymptomSeverity",
    "MoodScore",
    "SleepQuality",
    "PhysicalActivity",
    "TreatmentDuration",
    "StressLevel",
    "TreatmentProgress",
    "TreatmentAdherence",
    "GenderId",
    "DiagnosisId",
    "MedicationId",
    "TherapyTypeId",
    "OutcomeId",
]].corr(method="pearson")

def maxCorrelation(row):
    """Return the strongest absolute correlation in ``row``, ignoring the row's own entry.

    Args:
        row: one row of a correlation matrix as a Series whose ``name`` is the
            label of the feature the row belongs to (this is what
            ``DataFrame.apply(..., axis=1)`` passes).

    Returns:
        The largest absolute value among the other entries, or ``None`` when
        no other entry exists.
    """
    # Remove the self-correlation by label rather than by value: filtering on
    # "abs != 1" would also hide a genuine perfect correlation with another
    # feature, and would keep a diagonal entry that is not exactly 1.0 because
    # of floating-point rounding.
    others = row.drop(labels=[row.name]) if row.name in row.index else row
    strengths = others.abs()
    if not strengths.empty:
        return strengths.max()
    return None

# Evaluate maxCorrelation on every row and attach the result as a new column.
df_corr = df_corr.assign(MaxCorrelation=df_corr.apply(maxCorrelation, axis=1))

df_corr

# Set Features

In [None]:
# Alternative setup kept for reference: predict the outcome from the raw columns.
# X = df[["Age", "SymptomSeverity", "MoodScore", "SleepQuality", "PhysicalActivity", "TreatmentDuration", "StressLevel", "TreatmentProgress", "TreatmentAdherence", "GenderId", "MedicationId", "TherapyTypeId", "DiagnosisId"]].values
# y = df["OutcomeId"].values

# Keep only the rows whose OutcomeId is 1.
dfSuccessOutcome = df.loc[df["OutcomeId"] == 1]

# Two derived columns per scored attribute: "<attr>ImprovedRate" and "<attr>ByTherapyScore".
scoreFeatures = np.array([
    columnName
    for feature in featuresClasses
    for columnName in (f"{feature}ImprovedRate", f"{feature}ByTherapyScore")
])

# Alternative feature set kept for reference: raw columns instead of the scores.
# X = dfSuccessOutcome[["Age", "SymptomSeverity", "MoodScore", "SleepQuality", "PhysicalActivity", "TreatmentDuration", "StressLevel", "TreatmentProgress", "TreatmentAdherence", "GenderId", "MedicationId", "DiagnosisId"]].values
# NOTE(review): the score columns are looked up per (attribute, TherapyTypeId)
# pair and computed from outcome counts over all of df, while TherapyTypeId is
# the target below - this looks like target leakage; confirm it is intended.
X = dfSuccessOutcome.loc[:, scoreFeatures].values
y = dfSuccessOutcome.loc[:, "TherapyTypeId"].values



# Set Hyperparams

In [None]:
# Hyperparameter search space for each model, keyed by model name.
models_hyperparams = dict(
    NaiveBayes=dict(
        var_smoothing=[1e-9, 1e-8, 1e-7],  # variance smoothing of GaussianNB
    ),
    DecisionTree=dict(
        criterion=['gini', 'entropy'],     # split quality measure
        max_depth=[None, 10, 20, 30],      # cap on tree depth (limits overfitting)
        min_samples_split=[2, 5, 10],      # samples needed before a node may split
        min_samples_leaf=[1, 2, 4],        # samples required at a leaf
    ),
    SVM=dict(
        C=[0.1, 1, 10, 100],               # regularization strength
        kernel=['linear', 'rbf', 'poly'],  # kernel family
        gamma=['scale'],                   # kernel coefficient ('auto' currently left out)
    ),
    KNN=dict(
        n_neighbors=[3, 5, 7, 9],          # neighbourhood size
        weights=['uniform', 'distance'],   # neighbour weighting
        metric=['euclidean', 'manhattan'], # distance function
    ),
    NeuralNetwork=dict(
        hidden_layer_sizes=[(50,), (100,), (100, 50), (50, 50, 50)],  # layer layouts
        activation=['relu', 'tanh', 'logistic'],                      # activation function
        solver=['adam', 'sgd'],                                       # optimizer
        learning_rate=['constant', 'adaptive'],                       # learning-rate schedule
        max_iter=[200, 500, 1000],                                    # iteration cap
        alpha=[0.0001, 0.001, 0.01],                                  # L2 penalty
    ),
)

def optimize_model(model, param_grid, X_train, y_train, scoring='f1_macro', cv=3):
    """Grid-search ``param_grid`` for ``model`` and return the best fitted pipeline.

    Args:
        model: unfitted estimator; it is wrapped as the single step "model"
            of a Pipeline.
        param_grid: mapping of estimator parameter name -> candidate values
            (without the "model__" prefix, which is added here).
        X_train: training features.
        y_train: training labels.
        scoring: scorer name handed to GridSearchCV. Defaults to a
            classification metric; the previous hard-coded
            'neg_mean_squared_error' is a regression metric and ranks
            candidates by the numeric distance between class ids.
        cv: cross-validation setting handed to GridSearchCV.

    Returns:
        ``best_estimator_`` of the fitted search.
    """
    pipeline = Pipeline(steps=[('model', model)])
    # GridSearchCV addresses parameters of a pipeline step as "<step>__<param>".
    param_grid_pipeline = {f"model__{key}": value for key, value in param_grid.items()}

    # Run the grid search over the pipeline, using all available cores.
    grid_search = GridSearchCV(pipeline, param_grid_pipeline, cv=cv, scoring=scoring, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_


# Train Models

In [None]:
# Candidate classifiers, keyed by the same names as models_hyperparams.
models = {
    "NaiveBayes": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "NeuralNetwork": MLPClassifier()
}

cycles = 1
# Fraction of rows held out for evaluation. The previous value of 0.7 left
# only 30% of the rows for the grid search and fit.
testSize = 0.3
results = {name: {"accuracy": [], "f1_score": []} for name in models.keys()}
for cycle in range(cycles):
    print(f"--- Cycle {cycle+1} ---")
    # Vary the seed per cycle so repeated cycles evaluate different splits
    # (cycle 0 still uses seed 42); a constant seed made every cycle identical.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize, random_state=42 + cycle)
    for model_name in models:
        print(f"Training {model_name}...")
        # optimize_model returns GridSearchCV's best_estimator_, which the
        # search has already refit on X_train, so no extra fit is needed here.
        best_model = optimize_model(models[model_name], models_hyperparams[model_name], X_train, y_train)
        y_pred = best_model.predict(X_test)

        results[model_name]["accuracy"].append( accuracy_score(y_test, y_pred) )
        results[model_name]["f1_score"].append( f1_score(y_test, y_pred, average='macro') )

print("Train done.")

In [None]:
# Average each model's per-cycle metrics into one row of the summary table.
summary = {
    "Model": list(results),
    "Accuracy": [np.mean(scores["accuracy"]) for scores in results.values()],
    "F1-score": [np.mean(scores["f1_score"]) for scores in results.values()],
}

summary_df = pd.DataFrame(summary)
display(summary_df)