In [4]:
import pandas as pd
import numpy as np

# Path to the raw dataset, relative to this notebook's working directory.
file_path_Alzheimer = "../../data/raw/ALZHEIMER_Dataset/Dataset/alzheimers_disease_data.csv"

# Load the dataset into a DataFrame.
data_Alzheimer = pd.read_csv(file_path_Alzheimer)

# Report the non-null count and dtype of every column.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data_Alzheimer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [8]:
## Check whether the target classes are balanced or imbalanced.
##
## This matters because a model trained mostly on patients without Alzheimer's
## will tend to favour predicting that majority class.
##
## Proportions observed for this dataset:
##   0 (no Alzheimer's): 64.6 %
##   1 (Alzheimer's):    35.4 %
diagnosis_counts = data_Alzheimer['Diagnosis'].value_counts()
print("Alzheimer Diagnosis Distribution:\n", diagnosis_counts)

Alzheimer Diagnosis Distribution:
 Diagnosis
0    1389
1     760
Name: count, dtype: int64


In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Cleaning data

# Columns removed to build the feature frame: the target ('Diagnosis') plus
# 'DoctorInCharge', 'EducationLevel' and 'PatientID'.
columns_to_drops = [
    'DoctorInCharge',
    'EducationLevel',
    'PatientID',
    'Diagnosis',
]

# Columns designated as numeric. Defined here but not used within this cell;
# presumably intended for a correlation matrix computed elsewhere - TODO confirm.
numeric_columns = [
    'Age',
    'BMI',
    'AlcoholConsumption',
    'PhysicalActivity',
    'SleepQuality',
    'DietQuality',
    'ADL',
    'FunctionalAssessment',
    'CholesterolTriglycerides',
    'CholesterolHDL',
    'CholesterolLDL',
    'CholesterolTotal',
    'DiastolicBP',
    'SystolicBP',
]

# Drop the listed columns into a new frame; data_Alzheimer itself is not modified.
data_Alzheimer_cleaned = data_Alzheimer.drop(columns_to_drops, axis='columns')

In [14]:
from sklearn.model_selection import train_test_split

# Features: an independent copy of the cleaned frame.
X_Alzheimer = data_Alzheimer_cleaned.copy()
# Target: the 'Diagnosis' column of the original, un-dropped frame.
y_Alzheimer = data_Alzheimer['Diagnosis']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
(
    X_train_alzheimer,
    X_test_alzheimer,
    y_train_alzheimer,
    y_test_alzheimer,
) = train_test_split(
    X_Alzheimer,
    y_Alzheimer,
    test_size=0.2,
    random_state=42,
)

# Preview the first training rows as the cell's displayed output.
X_train_alzheimer.head()

Unnamed: 0,Age,Gender,Ethnicity,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,FamilyHistoryAlzheimers,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
1433,87,1,2,27.764232,1,16.54317,0.281379,5.923418,7.836104,0,...,25.399206,3.085543,0,0,6.643693,0,0,0,0,0
630,70,0,0,37.098744,0,1.360202,9.24299,1.819284,5.218052,1,...,8.292136,5.61683,0,1,3.884562,0,0,0,0,1
78,82,1,3,15.908275,0,16.329031,1.915913,6.607292,6.146166,0,...,21.042238,3.662461,0,0,4.013722,0,0,1,1,0
366,76,1,0,30.302432,1,11.81403,6.28117,6.204349,6.825155,0,...,28.609438,4.648135,0,0,9.3557,1,0,1,0,1
1996,61,0,0,24.565357,1,2.273373,9.976581,2.057188,4.715534,0,...,2.629135,9.601238,1,0,8.818932,0,1,0,0,0


In [17]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Oversample the minority class with SMOTE, on the training split only, so
# that no synthetic rows end up in the test set.
#
# sampling_strategy is deliberately below 1.0: with the default ('auto') SMOTE
# already raises the minority class to the majority count, which leaves the
# undersampling step below with nothing to remove. The float is the
# minority/majority ratio after resampling; it must be larger than the ratio
# already present in the training data, otherwise SMOTE raises a ValueError.
smote = SMOTE(sampling_strategy=0.8, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_alzheimer, y_train_alzheimer)

# Randomly undersample the majority class down to the (oversampled) minority
# count, yielding a 1:1 training set.
undersampler = RandomUnderSampler(random_state=42)
X_train_balanced, y_train_balanced = undersampler.fit_resample(X_train_smote, y_train_smote)

# Diagnostics: uncomment to compare class counts before and after each step.
# print(y_train_alzheimer.value_counts())
# print(y_train_smote.value_counts())
# print(y_train_balanced.value_counts())

In [24]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


####################################################################

## DECISION TREE ##

# Build the model with basic hyperparameters and fit it on the original
# (non-resampled) training split. fit() returns the estimator itself, so the
# call is chained onto construction.
clf = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=42).fit(
    X_train_alzheimer, y_train_alzheimer
)
# Alternative training data (normalized + over- and undersampled):
# clf.fit(X_train_normalized_minmax, y_train_balanced)

# Predict on the held-out test split.
y_pred = clf.predict(X_test_alzheimer)

# Compute all evaluation metrics first, then report them in order.
accuracy = accuracy_score(y_test_alzheimer, y_pred)
conf_matrix = confusion_matrix(y_test_alzheimer, y_pred)
class_report = classification_report(y_test_alzheimer, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

Accuracy: 0.9302
Confusion Matrix:
[[267  10]
 [ 20 133]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.96      0.95       277
           1       0.93      0.87      0.90       153

    accuracy                           0.93       430
   macro avg       0.93      0.92      0.92       430
weighted avg       0.93      0.93      0.93       430

