In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score

# Read/rename data

In [56]:
df = pd.read_csv("content/mental_health_diagnosis_treatment_.csv")

df = df.drop(["Patient ID", "AI-Detected Emotional State", "Treatment Start Date"], axis=1)

df = df.rename(columns={
    'Therapy Type': 'TherapyType',
    'Symptom Severity (1-10)': 'SymptomSeverity',
    'Mood Score (1-10)': 'MoodScore',
    'Sleep Quality (1-10)': 'SleepQuality',
    'Physical Activity (hrs/week)': 'PhysicalActivity',
    'Treatment Duration (weeks)': 'TreatmentDuration',
    'Stress Level (1-10)': 'StressLevel',
    'Treatment Progress (1-10)': 'TreatmentProgress',
    'Adherence to Treatment (%)': 'TreatmentAdherence',
})

df.dtypes


Age                    int64
Gender                object
Diagnosis             object
SymptomSeverity        int64
MoodScore              int64
SleepQuality           int64
PhysicalActivity       int64
Medication            object
TherapyType           object
TreatmentDuration      int64
StressLevel            int64
Outcome               object
TreatmentProgress      int64
TreatmentAdherence     int64
dtype: object

# Analysis

In [57]:
df.describe()

Unnamed: 0,Age,SymptomSeverity,MoodScore,SleepQuality,PhysicalActivity,TreatmentDuration,StressLevel,TreatmentProgress,TreatmentAdherence
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,38.708,7.478,5.482,6.472,5.216,12.11,7.542,7.436,75.454
std,12.712433,1.706265,1.707486,1.668167,2.829374,2.440864,1.709409,1.725067,9.08697
min,18.0,5.0,3.0,4.0,1.0,8.0,5.0,5.0,60.0
25%,28.0,6.0,4.0,5.0,3.0,10.0,6.0,6.0,67.0
50%,38.0,8.0,5.0,6.0,5.0,12.0,8.0,7.0,76.0
75%,50.0,9.0,7.0,8.0,8.0,14.0,9.0,9.0,84.0
max,60.0,10.0,8.0,9.0,10.0,16.0,10.0,10.0,90.0


In [58]:
df_corr = df[["Age", "SymptomSeverity", "MoodScore", "SleepQuality", "PhysicalActivity", "TreatmentDuration", "StressLevel", "TreatmentProgress", "TreatmentAdherence"]].corr(method="pearson")

def maxCorrelation(row):
    filtered_row = row[(row.abs() != 1)].abs()  # Exclude 1
    if not filtered_row.empty:
        return filtered_row.max()  # Return max value if there's any value left
    return None  # Return None if no value other than 1 is found

# Apply the function to each row
df_corr['MaxCorrelation'] = df_corr.apply(maxCorrelation, axis=1)

df_corr

Unnamed: 0,Age,SymptomSeverity,MoodScore,SleepQuality,PhysicalActivity,TreatmentDuration,StressLevel,TreatmentProgress,TreatmentAdherence,MaxCorrelation
Age,1.0,-0.029307,0.033086,-0.015412,0.000531,-0.107012,-0.038351,0.014498,-0.022374,0.107012
SymptomSeverity,-0.029307,1.0,0.050077,0.037451,-0.003165,0.008522,-0.018921,-0.005586,-0.04931,0.050077
MoodScore,0.033086,0.050077,1.0,0.064199,0.021962,-0.078622,-0.075265,0.052336,0.041277,0.078622
SleepQuality,-0.015412,0.037451,0.064199,1.0,-0.007632,0.007402,0.036606,0.082247,0.018225,0.082247
PhysicalActivity,0.000531,-0.003165,0.021962,-0.007632,1.0,-0.043202,0.013451,0.002838,-0.053863,0.053863
TreatmentDuration,-0.107012,0.008522,-0.078622,0.007402,-0.043202,1.0,0.037554,-0.001418,-0.065593,0.107012
StressLevel,-0.038351,-0.018921,-0.075265,0.036606,0.013451,0.037554,1.0,0.001933,-0.030839,0.075265
TreatmentProgress,0.014498,-0.005586,0.052336,0.082247,0.002838,-0.001418,0.001933,1.0,-0.028505,0.082247
TreatmentAdherence,-0.022374,-0.04931,0.041277,0.018225,-0.053863,-0.065593,-0.030839,-0.028505,1.0,0.065593


# Convert classes to Id

In [59]:

def classToId(df: pd.DataFrame, featureName):
  classes = pd.DataFrame({featureName: df[featureName].unique()})
  classes = classes.sort_values(by=[featureName]).reset_index(drop=True)
  classes[f"{featureName}Id"] = classes.index
  return classes

genderMap = classToId(df[["Gender"]], "Gender")
diagnosisMap = classToId(df[["Diagnosis"]], "Diagnosis")
medicationMap = classToId(df[["Medication"]], "Medication")
therapyTypeMap = classToId(df[["TherapyType"]], "TherapyType")

# Outcome OutcomeId
outcomeMap = pd.DataFrame({"Outcome": ["Deteriorated", "No Change", "Improved"], "OutcomeId": [-1, 0, 1]})

df = df.merge(right=genderMap, how="left", on="Gender")
df = df.merge(right=diagnosisMap, how="left", on="Diagnosis")
df = df.merge(right=medicationMap, how="left", on="Medication")
df = df.merge(right=therapyTypeMap, how="left", on="TherapyType")
df = df.merge(right=outcomeMap, how="left", on="Outcome")


# Set Hyperparams

In [60]:
# Definir espaços de hiperparâmetros para cada modelo
models_hyperparams = {
    'NaiveBayes': {
        'var_smoothing': [1e-9, 1e-8, 1e-7]  # Controls variance in GaussianNB
    },
    'DecisionTree': {
        'criterion': ['gini', 'entropy'],  # Measure for quality of a split
        'max_depth': [None, 10, 20, 30],   # Limits tree depth to avoid overfitting
        'min_samples_split': [2, 5, 10],   # Minimum samples required to split
        'min_samples_leaf': [1, 2, 4]      # Minimum samples at a leaf node
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],            # Regularization parameter
        'kernel': ['linear', 'rbf', 'poly'],  # Kernel types
        'gamma': ['scale'],        # Kernel coefficient
        # 'gamma': ['scale', 'auto'],        # Kernel coefficient
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],       # Number of neighbors
        'weights': ['uniform', 'distance'], # Weight function
        'metric': ['euclidean', 'manhattan'] # Distance metric
    },
    'NeuralNetwork': {
        'hidden_layer_sizes': [(50,), (100,), (100, 50), (50, 50, 50)],  # Various architectures
        'activation': ['relu', 'tanh', 'logistic'],                      # Activation functions
        'solver': ['adam', 'sgd'],                                       # Optimizers
        'learning_rate': ['constant', 'adaptive'],                       # Learning rate schedule
        'max_iter': [200, 500, 1000],                                    # Iteration limits
        'alpha': [0.0001, 0.001, 0.01],                                  # L2 regularization term
    }
}

def optimize_model(model, param_grid, X_train, y_train):
    pipeline = Pipeline(steps=[('model', model)])
    param_grid_pipeline = {f"model__{key}": value for key, value in param_grid.items()}

    # Executa o GridSearchCV no pipeline
    grid_search = GridSearchCV(pipeline, param_grid_pipeline, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_


# Set Features

In [61]:
dfSuccessOutcome = df[df["OutcomeId"] == 1]

X = dfSuccessOutcome[["Age", "SymptomSeverity", "MoodScore", "SleepQuality", "PhysicalActivity", "TreatmentDuration", "StressLevel", "TreatmentProgress", "TreatmentAdherence", "GenderId", "MedicationId", "DiagnosisId", "OutcomeId"]].values
y = dfSuccessOutcome["TherapyTypeId"].values


# Train Models

In [62]:
models = {
    "NaiveBayes": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "NeuralNetwork": MLPClassifier()
}

cycles = 3
results = {name: {"accuracy": []} for name in models.keys()}
for cycle in range(cycles):
    print(f"--- Cycle {cycle+1} ---")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)
    for model_name in models:
        print(f"Training {model_name}...")
        best_model = optimize_model(models[model_name], models_hyperparams[model_name], X_train, y_train)
        best_model.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)

        results[model_name]["accuracy"].append( accuracy_score(y_test, y_pred) )

print("Train done.")

--- Cycle 1 ---
Training NaiveBayes...
Training DecisionTree...
Training SVM...
Training KNN...
Training NeuralNetwork...




--- Cycle 2 ---
Training NaiveBayes...
Training DecisionTree...
Training SVM...
Training KNN...
Training NeuralNetwork...




--- Cycle 3 ---
Training NaiveBayes...
Training DecisionTree...
Training SVM...
Training KNN...
Training NeuralNetwork...
Train done.




In [63]:
summary = {
    "Model": [],
    "Accuracy": []
}

for name, metrics in results.items():
    summary["Model"].append(name)
    summary["Accuracy"].append(np.mean(metrics["accuracy"]))

summary_df = pd.DataFrame(summary)
display(summary_df)

Unnamed: 0,Model,Accuracy
0,NaiveBayes,0.268908
1,DecisionTree,0.260504
2,SVM,0.285714
3,KNN,0.235294
4,NeuralNetwork,0.282913


# Classify age

In [64]:
def ageGroupClassify(age):
    # Min age 18
    # Max age 60
    # (60 - 18) / 3 groups = range 14
    if 18 <= age < 32:
        return 1
    elif 32 <= age < 46:
        return 2
    else:
        return 3

df["AgeGroupId"] = df["Age"].apply(ageGroupClassify)

# Therapy by

In [65]:

def getTherapyByAttributeScore(attribute):
    dfTherapyBy = df.groupby(by=[attribute, "TherapyTypeId", "OutcomeId"]).agg({"Outcome": "count"}).reset_index()

    def getTotalOutcomes(row):
        return dfTherapyBy[(dfTherapyBy[attribute] == row[attribute]) & (dfTherapyBy["TherapyTypeId"] == row["TherapyTypeId"])]["Outcome"].sum()

    def getImprovedRate(row):
        successCount = dfTherapyBy[(dfTherapyBy[attribute] == row[attribute]) & (dfTherapyBy["TherapyTypeId"] == row["TherapyTypeId"]) & (dfTherapyBy["OutcomeId"] == 1)]["Outcome"].sum()
        return successCount / row["TotalOutcomes"]

    dfTherapyByScore = dfTherapyBy[[attribute, "TherapyTypeId"]].drop_duplicates()
    dfTherapyByScore["TotalOutcomes"] = dfTherapyByScore.apply(getTotalOutcomes, axis=1)
    dfTherapyByScore[f"{attribute}ImprovedRate"] = dfTherapyByScore.apply(getImprovedRate, axis=1)
    dfTherapyByScore[f"{attribute}ByTherapyScore"] = dfTherapyByScore[f"{attribute}ImprovedRate"] + np.log2(dfTherapyByScore["TotalOutcomes"])
    return dfTherapyByScore


featuresClasses = ["DiagnosisId", "AgeGroupId", "MedicationId"]
for feature in featuresClasses:
    featureScore = getTherapyByAttributeScore(attribute=feature)
    df = df.merge(right=featureScore[[feature, "TherapyTypeId", f"{feature}ImprovedRate", f"{feature}ByTherapyScore"]], on=[feature, "TherapyTypeId"])


# Set Features 2

In [66]:
dfSuccessOutcome = df[df["OutcomeId"] == 1]
scoreFeatures = np.concatenate([ [f"{feature}ImprovedRate", f"{feature}ByTherapyScore"] for feature in featuresClasses ])

X = dfSuccessOutcome[scoreFeatures].values
y = dfSuccessOutcome["TherapyTypeId"].values

# Train Models 2

In [67]:
models = {
    "NaiveBayes": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "NeuralNetwork": MLPClassifier()
}

cycles = 3
results = {name: {"accuracy": []} for name in models.keys()}
for cycle in range(cycles):
    print(f"--- Cycle {cycle+1} ---")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)
    for model_name in models:
        print(f"Training {model_name}...")
        best_model = optimize_model(models[model_name], models_hyperparams[model_name], X_train, y_train)
        best_model.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)

        results[model_name]["accuracy"].append( accuracy_score(y_test, y_pred) )

print("Train done.")

--- Cycle 1 ---
Training NaiveBayes...
Training DecisionTree...
Training SVM...
Training KNN...
Training NeuralNetwork...
--- Cycle 2 ---
Training NaiveBayes...
Training DecisionTree...
Training SVM...
Training KNN...
Training NeuralNetwork...




--- Cycle 3 ---
Training NaiveBayes...
Training DecisionTree...
Training SVM...
Training KNN...
Training NeuralNetwork...
Train done.




In [68]:
summary = {
    "Model": [],
    "Accuracy": []
}

for name, metrics in results.items():
    summary["Model"].append(name)
    summary["Accuracy"].append(np.mean(metrics["accuracy"]))

summary_df = pd.DataFrame(summary)
display(summary_df)

Unnamed: 0,Model,Accuracy
0,NaiveBayes,0.705882
1,DecisionTree,0.826331
2,SVM,0.689076
3,KNN,0.773109
4,NeuralNetwork,0.537815
