In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score

# Read/rename data

In [5]:
df = pd.read_csv("content/mental_health_diagnosis_treatment_.csv")

df = df.rename(columns={
    'Therapy Type': 'TherapyType',
    'Symptom Severity (1-10)': 'SymptomSeverity',
    'Mood Score (1-10)': 'MoodScore',
    'Sleep Quality (1-10)': 'SleepQuality',
    'Physical Activity (hrs/week)': 'PhysicalActivity',
    'Treatment Duration (weeks)': 'TreatmentDuration',
    'Stress Level (1-10)': 'StressLevel',
    'Treatment Progress (1-10)': 'TreatmentProgress',
    'Adherence to Treatment (%)': 'TreatmentAdherence',
})


# Convert classes to Id

In [6]:

def classToId(df: pd.DataFrame, featureName):
  classes = pd.DataFrame({featureName: df[featureName].unique()})
  classes = classes.sort_values(by=[featureName]).reset_index(drop=True)
  classes[f"{featureName}Id"] = classes.index
  return classes

genderMap = classToId(df[["Gender"]], "Gender")
diagnosisMap = classToId(df[["Diagnosis"]], "Diagnosis")
medicationMap = classToId(df[["Medication"]], "Medication")
therapyTypeMap = classToId(df[["TherapyType"]], "TherapyType")

# Outcome	OutcomeId
outcomeMap = pd.DataFrame({"Outcome": ["Deteriorated", "No Change", "Improved"], "OutcomeId": [-1, 0, 1]})

df = df.merge(right=genderMap, how="left", on="Gender")
df = df.merge(right=diagnosisMap, how="left", on="Diagnosis")
df = df.merge(right=medicationMap, how="left", on="Medication")
df = df.merge(right=therapyTypeMap, how="left", on="TherapyType")
df = df.merge(right=outcomeMap, how="left", on="Outcome")


In [53]:
# Associar Diagnostico e Terapia para comparar o sucesso em cada combinacao e descobrir qual melhor terapia baseado no diagnostico

dfDiagnoseTherapy = df.groupby(by=["DiagnosisId", "TherapyTypeId", "OutcomeId"]).agg({"Outcome": "count"}).reset_index()

# dfDiagnoseTherapyScore = dfDiagnoseTherapy.groupby(by=["DiagnosisId", "TherapyTypeId"]).agg({"Outcome": "sum"}).reset_index().rename(columns={"Outcome": "TotalOutcomes"})

def getTotalOutcomes(row):
    return dfDiagnoseTherapy[(dfDiagnoseTherapy["DiagnosisId"] == row["DiagnosisId"]) & (dfDiagnoseTherapy["TherapyTypeId"] == row["TherapyTypeId"])]["Outcome"].sum()

def getImprovedRate(row):
    successCount = dfDiagnoseTherapy[(dfDiagnoseTherapy["DiagnosisId"] == row["DiagnosisId"]) & (dfDiagnoseTherapy["TherapyTypeId"] == row["TherapyTypeId"]) & (dfDiagnoseTherapy["OutcomeId"] == 1)]["Outcome"].sum()
    return successCount / row["TotalOutcomes"]

dfDiagnoseTherapyScore = dfDiagnoseTherapy[["DiagnosisId", "TherapyTypeId"]].drop_duplicates()
dfDiagnoseTherapyScore["TotalOutcomes"] = dfDiagnoseTherapyScore.apply(getTotalOutcomes, axis=1)
dfDiagnoseTherapyScore["ImprovedRate"] = dfDiagnoseTherapyScore.apply(getImprovedRate, axis=1)
dfDiagnoseTherapyScore["DiagnosisTherapyScore"] = dfDiagnoseTherapyScore["ImprovedRate"] + np.log2(dfDiagnoseTherapyScore["TotalOutcomes"])


display(dfDiagnoseTherapyScore)



Unnamed: 0,DiagnosisId,TherapyTypeId,TotalOutcomes,ImprovedRate,DiagnosisTherapyScore
0,0,0,24,0.291667,4.876629
3,0,1,30,0.4,5.306891
6,0,2,30,0.333333,5.240224
9,0,3,40,0.375,5.696928
12,1,0,24,0.416667,5.001629
15,1,1,37,0.324324,5.533778
18,1,2,37,0.405405,5.614859
21,1,3,37,0.432432,5.641886
24,2,0,35,0.371429,5.500712
27,2,1,30,0.233333,5.140224


# Analysis

In [4]:
df.describe()

Unnamed: 0,Patient ID,Age,SymptomSeverity,MoodScore,SleepQuality,PhysicalActivity,TreatmentDuration,StressLevel,TreatmentProgress,TreatmentAdherence,GenderId,DiagnosisId,MedicationId,TherapyTypeId,OutcomeId
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,38.708,7.478,5.482,6.472,5.216,12.11,7.542,7.436,75.454,0.532,1.466,2.584,1.534,0.976
std,144.481833,12.712433,1.706265,1.707486,1.668167,2.829374,2.440864,1.709409,1.725067,9.08697,0.499475,1.100576,1.710203,1.113249,0.812863
min,1.0,18.0,5.0,3.0,4.0,1.0,8.0,5.0,5.0,60.0,0.0,0.0,0.0,0.0,0.0
25%,125.75,28.0,6.0,4.0,5.0,3.0,10.0,6.0,6.0,67.0,0.0,1.0,1.0,1.0,0.0
50%,250.5,38.0,8.0,5.0,6.0,5.0,12.0,8.0,7.0,76.0,1.0,1.0,3.0,2.0,1.0
75%,375.25,50.0,9.0,7.0,8.0,8.0,14.0,9.0,9.0,84.0,1.0,2.0,4.0,3.0,2.0
max,500.0,60.0,10.0,8.0,9.0,10.0,16.0,10.0,10.0,90.0,1.0,3.0,5.0,3.0,2.0


In [5]:
df_corr = df[["Age", "SymptomSeverity",	"MoodScore",	"SleepQuality",	"PhysicalActivity",	"TreatmentDuration",	"StressLevel",	"TreatmentProgress",	"TreatmentAdherence",	"GenderId",	"DiagnosisId",	"MedicationId",	"TherapyTypeId",	"OutcomeId"]].corr(method="pearson")

def maxCorrelation(row):
    filtered_row = row[(row.abs() != 1)].abs()  # Exclude 1
    if not filtered_row.empty:
        return filtered_row.max()  # Return max value if there's any value left
    return None  # Return None if no value other than 1 is found

# Apply the function to each row
df_corr['MaxCorrelation'] = df_corr.apply(maxCorrelation, axis=1)

df_corr

Unnamed: 0,Age,SymptomSeverity,MoodScore,SleepQuality,PhysicalActivity,TreatmentDuration,StressLevel,TreatmentProgress,TreatmentAdherence,GenderId,DiagnosisId,MedicationId,TherapyTypeId,OutcomeId,MaxCorrelation
Age,1.0,-0.029307,0.033086,-0.015412,0.000531,-0.107012,-0.038351,0.014498,-0.022374,-0.0263,-0.059438,0.014496,-0.016006,-0.120919,0.120919
SymptomSeverity,-0.029307,1.0,0.050077,0.037451,-0.003165,0.008522,-0.018921,-0.005586,-0.04931,-0.073244,-0.066564,0.027075,0.074246,-0.003271,0.074246
MoodScore,0.033086,0.050077,1.0,0.064199,0.021962,-0.078622,-0.075265,0.052336,0.041277,-0.054543,0.022068,0.002921,-0.055553,0.034341,0.078622
SleepQuality,-0.015412,0.037451,0.064199,1.0,-0.007632,0.007402,0.036606,0.082247,0.018225,-0.10475,-0.055642,-0.045535,-0.021608,-0.010842,0.10475
PhysicalActivity,0.000531,-0.003165,0.021962,-0.007632,1.0,-0.043202,0.013451,0.002838,-0.053863,0.039059,-0.022092,0.039729,-0.074231,-0.056993,0.074231
TreatmentDuration,-0.107012,0.008522,-0.078622,0.007402,-0.043202,1.0,0.037554,-0.001418,-0.065593,0.012723,-0.031056,-0.032703,-0.080661,0.054865,0.107012
StressLevel,-0.038351,-0.018921,-0.075265,0.036606,0.013451,0.037554,1.0,0.001933,-0.030839,-0.019181,0.006087,-0.004979,-0.080786,-0.02956,0.080786
TreatmentProgress,0.014498,-0.005586,0.052336,0.082247,0.002838,-0.001418,0.001933,1.0,-0.028505,0.051224,0.093322,-0.009043,-0.013996,0.083222,0.093322
TreatmentAdherence,-0.022374,-0.04931,0.041277,0.018225,-0.053863,-0.065593,-0.030839,-0.028505,1.0,-0.02418,-0.054861,0.018883,0.007485,0.025353,0.065593
GenderId,-0.0263,-0.073244,-0.054543,-0.10475,0.039059,0.012723,-0.019181,0.051224,-0.02418,1.0,0.025679,-0.019575,0.021466,-0.007976,0.10475


# Set Features

In [7]:
X = df[["Age", "SymptomSeverity",	"MoodScore",	"SleepQuality",	"PhysicalActivity",	"TreatmentDuration",	"StressLevel",	"TreatmentProgress",	"TreatmentAdherence",	"GenderId",	"MedicationId",	"TherapyTypeId",	"DiagnosisId"]].values
y = df["OutcomeId"].values



# Set Hyperparams

In [8]:
# Definir espaços de hiperparâmetros para cada modelo
models_hyperparams = {
    'NaiveBayes': {
        'var_smoothing': [1e-9, 1e-8, 1e-7]  # Controls variance in GaussianNB
    },
    'DecisionTree': {
        'criterion': ['gini', 'entropy'],  # Measure for quality of a split
        'max_depth': [None, 10, 20, 30],   # Limits tree depth to avoid overfitting
        'min_samples_split': [2, 5, 10],   # Minimum samples required to split
        'min_samples_leaf': [1, 2, 4]      # Minimum samples at a leaf node
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],            # Regularization parameter
        'kernel': ['linear', 'rbf', 'poly'],  # Kernel types
        'gamma': ['scale'],        # Kernel coefficient
        # 'gamma': ['scale', 'auto'],        # Kernel coefficient
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],       # Number of neighbors
        'weights': ['uniform', 'distance'], # Weight function
        'metric': ['euclidean', 'manhattan'] # Distance metric
    },
    'NeuralNetwork': {
        'hidden_layer_sizes': [(50,), (100,), (100, 50), (50, 50, 50)],  # Various architectures
        'activation': ['relu', 'tanh', 'logistic'],                      # Activation functions
        'solver': ['adam', 'sgd'],                                       # Optimizers
        'learning_rate': ['constant', 'adaptive'],                       # Learning rate schedule
        'max_iter': [200, 500, 1000],                                    # Iteration limits
        'alpha': [0.0001, 0.001, 0.01],                                  # L2 regularization term
    }
}

def optimize_model(model, param_grid, X_train, y_train):
    pipeline = Pipeline(steps=[('model', model)])
    param_grid_pipeline = {f"model__{key}": value for key, value in param_grid.items()}

    # Executa o GridSearchCV no pipeline
    grid_search = GridSearchCV(pipeline, param_grid_pipeline, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_


# Train Models

In [12]:
models = {
    "NaiveBayes": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "NeuralNetwork": MLPClassifier()
}

cycles = 3
results = {name: {"accuracy": [], "f1_score": []} for name in models.keys()}
for cycle in range(cycles):
    print(f"--- Cycle {cycle+1} ---")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)
    for model_name in models:
        print(f"Training {model_name}...")
        best_model = optimize_model(models[model_name], models_hyperparams[model_name], X_train, y_train)
        best_model.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)

        results[model_name]["accuracy"].append( accuracy_score(y_test, y_pred) )
        results[model_name]["f1_score"].append( f1_score(y_test, y_pred, average='macro') )

        # cm = confusion_matrix(y_test, y_pred)
        # ConfusionMatrixDisplay(confusion_matrix=cm).plot()
        # mse = np.sqrt(mean_squared_error(y_test, y_pred))
        # results[model_name].append(mse)
print("Train done.")

--- Cycle 1 ---
Training NaiveBayes...
Training DecisionTree...
Training SVM...
Training KNN...
Training NeuralNetwork...
--- Cycle 2 ---
Training NaiveBayes...
Training DecisionTree...
Training SVM...
Training KNN...
Training NeuralNetwork...
--- Cycle 3 ---
Training NaiveBayes...
Training DecisionTree...
Training SVM...
Training KNN...
Training NeuralNetwork...
Train done.


In [13]:
summary = {
    "Model": [],
    "Accuracy": [],
    "F1-score": []
}

for name, metrics in results.items():
    summary["Model"].append(name)
    summary["Accuracy"].append(np.mean(metrics["accuracy"]))
    summary["F1-score"].append(np.mean(metrics["f1_score"]))

summary_df = pd.DataFrame(summary)
display(summary_df)

Unnamed: 0,Model,Accuracy,F1-score
0,NaiveBayes,0.34,0.332435
1,DecisionTree,0.319048,0.315549
2,SVM,0.334286,0.167024
3,KNN,0.374286,0.374214
4,NeuralNetwork,0.333333,0.166666
