In [24]:
import pandas as pd
import numpy as np

# Relative path to the dataset
file_path_Alzheimer = "../data/raw/ALZHEIMER_Dataset/Dataset/alzheimers_disease_data.csv"

# Load the dataset
data_Alzheimer = pd.read_csv(file_path_Alzheimer)

data_Alzheimer.drop(columns=['DoctorInCharge', 'EducationLevel', 'PatientID'], inplace=True)

print(data_Alzheimer.info()) ## Check if there's any Null Info and check variable type.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 33 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   BMI                        2149 non-null   float64
 5   Smoking                    2149 non-null   int64  
 6   AlcoholConsumption         2149 non-null   float64
 7   PhysicalActivity           2149 non-null   float64
 8   DietQuality                2149 non-null   float64
 9   SleepQuality               2149 non-null   float64
 10  FamilyHistoryAlzheimers    2149 non-null   int64  
 11  CardiovascularDisease      2149 non-null   int64  
 12  Diabetes                   2149 non-null   int64  
 13  Depression                 2149 non-null   int64

In [63]:
print("Alzheimer Diagnosis Distribution:\n", data_Alzheimer['Diagnosis'].value_counts())

Alzheimer Diagnosis Distribution:
 Diagnosis
0    1389
1     760
Name: count, dtype: int64


In [57]:
# #RESAMPLE
# from sklearn.utils import resample
#
# # Separate majority and minority classes
# data_majority = data_Alzheimer[data_Alzheimer['Diagnosis'] == 0]
# data_minority = data_Alzheimer[data_Alzheimer['Diagnosis'] == 1]
#
# # Oversample minority class
# data_minority_oversampled = resample(data_minority,
#                                      replace=True,  # sample with replacement
#                                      n_samples=len(data_majority),  # to match the majority class size
#                                      random_state=42)  # for reproducibility
#
# # Combine majority class with oversampled minority class
# data_oversampled = pd.concat([data_majority, data_minority_oversampled])
#
# print("Alzheimer Diagnosis Distribution after balancing:\n", data_oversampled['Diagnosis'].value_counts())


Alzheimer Diagnosis Distribution after balancing:
 Diagnosis
0    1389
1    1389
Name: count, dtype: int64


In [69]:
from sklearn.utils import resample

# Making copies of majority and minority classes to avoid modifying original data
data_majority = data_Alzheimer[data_Alzheimer['Diagnosis'] == 0].copy()
data_minority = data_Alzheimer[data_Alzheimer['Diagnosis'] == 1].copy()

# Oversample minority class
data_minority_oversampled = resample(data_minority,
                                     replace=True,  # sample with replacement
                                     n_samples=len(data_majority),  # to match the majority class size
                                     random_state=42)  # for reproducibility

# Combine majority class with oversampled minority class
data_oversampled = pd.concat([data_majority, data_minority_oversampled])

# Determine the number of samples to keep from each part
n_samples_to_keep = 700

# Ensure not to delete any original data by making copies for reduction
majority_for_reduction = data_majority.copy()
minority_for_reduction = data_minority_oversampled.copy()

# Randomly sample from majority and oversampled minority classes
reduced_majority_sample = resample(majority_for_reduction,
                                   replace=False,  # sample without replacement
                                   n_samples=n_samples_to_keep,  # number of samples to keep
                                   random_state=42)  # for reproducibility

reduced_minority_sample = resample(minority_for_reduction,
                                   replace=False,  # sample without replacement
                                   n_samples=n_samples_to_keep,  # number of samples to keep
                                   random_state=42)  # for reproducibility

# Combine the reduced samples to create a balanced dataset
data_balanced = pd.concat([reduced_majority_sample, reduced_minority_sample])

# Display new class counts
print("Alzheimer Diagnosis Distribution after balancing:\n", data_balanced['Diagnosis'].value_counts())

Alzheimer Diagnosis Distribution after balancing:
 Diagnosis
0    700
1    700
Name: count, dtype: int64


In [70]:
#SPLIT TRAINING AND TESTING DATA
from sklearn.model_selection import train_test_split

X_Alzheimer = data_balanced.drop(columns=['Diagnosis'])
y_Alzheimer = data_balanced['Diagnosis']

X_train_alzheimer, X_test_alzheimer, y_train_alzheimer, y_test_alzheimer = train_test_split(
    X_Alzheimer,
    y_Alzheimer,
    test_size=0.2,
    random_state=42
)

X_train_alzheimer.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
1118,5869,65,0,1,21.022235,1,15.44717,7.075594,6.792224,8.222566,...,19.960056,9.734954,1,1,9.313624,0,1,0,0,1
648,5399,63,0,0,39.056533,0,18.861933,4.360621,1.126333,6.751384,...,3.090384,5.54202,0,0,5.414518,1,0,0,1,0
1613,6364,87,0,0,26.57142,0,1.555887,5.311958,2.395883,7.922703,...,10.081491,9.631243,0,0,7.027009,0,0,1,1,1
866,5617,66,0,0,24.274355,1,14.016364,6.320907,5.228418,8.451672,...,28.709078,8.797741,0,0,1.513744,0,0,0,0,0
1473,6224,72,1,0,23.094319,0,4.181355,6.003548,0.100197,4.486343,...,6.940924,1.576872,1,0,8.472254,0,0,0,1,0


In [67]:
#LOGISTIC REGRESSION
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the training data, and transform the test data
X_train_scaled_alzheimer = scaler.fit_transform(X_train_alzheimer)
X_test_scaled_alzheimer = scaler.transform(X_test_alzheimer)
X_train_scaled_alzheimer_resampled = scaler.fit_transform(data_balanced.drop(columns=['Diagnosis']))

# Instantiate the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model_resampled = LogisticRegression(class_weight='balanced', max_iter=1000)

# Train the model
model.fit(X_train_scaled_alzheimer, y_train_alzheimer)
model_resampled.fit(X_train_scaled_alzheimer_resampled, data_balanced['Diagnosis'])

# Predict on the test set
y_pred_alzheimer = model.predict(X_test_scaled_alzheimer)
y_pred_resampled = model_resampled.predict(X_test_scaled_alzheimer)

# Evaluate accuracy
accuracy = accuracy_score(y_test_alzheimer, y_pred_alzheimer)
accuracy_resampled = accuracy_score(y_test_alzheimer, y_pred_resampled)
print(f'Accuracy: {accuracy:.4f}')
print(f'Accuracy (resampled): {accuracy_resampled:.4f}')

# Confusion matrix
conf_matrix = confusion_matrix(y_test_alzheimer, y_pred_alzheimer)
conf_matrix_resampled = confusion_matrix(y_test_alzheimer, y_pred_resampled)
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Confusion Matrix (resampled):\n{conf_matrix_resampled}')

# Classification report
class_report = classification_report(y_test_alzheimer, y_pred_alzheimer)
class_report_resampled = classification_report(y_test_alzheimer, y_pred_resampled)
print(f'Classification Report:\n{class_report}')
print(f'Classification Report (resampled):\n{class_report_resampled}')

Accuracy: 0.8214
Accuracy (resampled): 0.8643
Confusion Matrix:
[[117  23]
 [ 27 113]]
Confusion Matrix (resampled):
[[121  19]
 [ 19 121]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       140
           1       0.83      0.81      0.82       140

    accuracy                           0.82       280
   macro avg       0.82      0.82      0.82       280
weighted avg       0.82      0.82      0.82       280

Classification Report (resampled):
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       140
           1       0.86      0.86      0.86       140

    accuracy                           0.86       280
   macro avg       0.86      0.86      0.86       280
weighted avg       0.86      0.86      0.86       280



In [43]:
#SUPPORT VECTOR MACHINE
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

svm_model = SVC(kernel='linear')
svm_model_resampled = SVC(kernel='linear', class_weight='balanced')

# Train the model
svm_model.fit(X_train_scaled_alzheimer, y_train_alzheimer)
svm_model_resampled.fit(X_train_scaled_alzheimer_resampled, data_balanced['Diagnosis'])

# Predict on the test set
y_pred_alzheimer = svm_model.predict(X_test_scaled_alzheimer)
y_pred_resampled = svm_model_resampled.predict(X_test_scaled_alzheimer)

# Evaluate accuracy
accuracy = accuracy_score(y_test_alzheimer, y_pred_alzheimer)
accuracy_resampled = accuracy_score(y_test_alzheimer, y_pred_resampled)
print(f'Accuracy: {accuracy:.4f}')
print(f'Accuracy (resampled): {accuracy_resampled:.4f}')

# Confusion matrix
conf_matrix = confusion_matrix(y_test_alzheimer, y_pred_alzheimer)
conf_matrix_resampled = confusion_matrix(y_test_alzheimer, y_pred_resampled)
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Confusion Matrix (resampled):\n{conf_matrix_resampled}')

# Classification report
class_report = classification_report(y_test_alzheimer, y_pred_alzheimer)
class_report_resampled = classification_report(y_test_alzheimer, y_pred_resampled)
print(f'Classification Report:\n{class_report}')
print(f'Classification Report (resampled):\n{class_report_resampled}')


Accuracy: 0.8125
Accuracy (resampled): 0.8257
Confusion Matrix:
[[128  29]
 [ 28 119]]
Confusion Matrix (resampled):
[[130  27]
 [ 26 121]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       157
           1       0.80      0.81      0.81       147

    accuracy                           0.81       304
   macro avg       0.81      0.81      0.81       304
weighted avg       0.81      0.81      0.81       304

Classification Report (resampled):
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       157
           1       0.82      0.82      0.82       147

    accuracy                           0.83       304
   macro avg       0.83      0.83      0.83       304
weighted avg       0.83      0.83      0.83       304



In [72]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, matthews_corrcoef

# Initialize Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Instantiate models with resampling
dt_model = DecisionTreeClassifier()
dt_model_resampled = DecisionTreeClassifier(class_weight='balanced')

# Perform stratified cross-validation for the original model
accuracy_scores = cross_val_score(dt_model, X_train_scaled_alzheimer, y_train_alzheimer, cv=skf, scoring='accuracy')
y_pred_cross_val = cross_val_predict(dt_model, X_train_scaled_alzheimer, y_train_alzheimer, cv=skf)

# Perform stratified cross-validation for the resampled model
accuracy_scores_resampled = cross_val_score(dt_model_resampled, X_train_scaled_alzheimer_resampled, data_balanced['Diagnosis'], cv=skf, scoring='accuracy')
y_pred_cross_val_resampled = cross_val_predict(dt_model_resampled, X_train_scaled_alzheimer_resampled, data_balanced['Diagnosis'], cv=skf)

# Fit the original model
dt_model.fit(X_train_scaled_alzheimer, y_train_alzheimer)
y_pred_alzheimer = dt_model.predict(X_test_scaled_alzheimer)

# Fit the resampled model
dt_model_resampled.fit(X_train_scaled_alzheimer_resampled, data_balanced['Diagnosis'])
y_pred_resampled = dt_model_resampled.predict(X_test_scaled_alzheimer)

# Evaluate cross-validated accuracy
print("Cross-validated Accuracy Scores (original model):")
print(accuracy_scores)
print(f'Cross-validated Mean Accuracy: {accuracy_scores.mean():.4f}')

print("Cross-validated Accuracy Scores (resampled model):")
print(accuracy_scores_resampled)
print(f'Cross-validated Mean Accuracy (resampled): {accuracy_scores_resampled.mean():.4f}')

# Evaluate accuracy on test set
accuracy = accuracy_score(y_test_alzheimer, y_pred_alzheimer)
accuracy_resampled = accuracy_score(y_test_alzheimer, y_pred_resampled)
print(f'Accuracy: {accuracy:.4f}')
print(f'Accuracy (resampled): {accuracy_resampled:.4f}')

# Confusion matrix
conf_matrix = confusion_matrix(y_test_alzheimer, y_pred_alzheimer)
conf_matrix_resampled = confusion_matrix(y_test_alzheimer, y_pred_resampled)
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Confusion Matrix (resampled):\n{conf_matrix_resampled}')

# Classification report
class_report = classification_report(y_test_alzheimer, y_pred_alzheimer)
class_report_resampled = classification_report(y_test_alzheimer, y_pred_resampled)
print(f'Classification Report:\n{class_report}')
print(f'Classification Report (resampled):\n{class_report_resampled}')

# Additional metrics: ROC-AUC and Matthews Correlation Coefficient
roc_auc = roc_auc_score(y_test_alzheimer, y_pred_alzheimer)
roc_auc_resampled = roc_auc_score(y_test_alzheimer, y_pred_resampled)
mcc = matthews_corrcoef(y_test_alzheimer, y_pred_alzheimer)
mcc_resampled = matthews_corrcoef(y_test_alzheimer, y_pred_resampled)

print(f'ROC-AUC: {roc_auc:.4f}')
print(f'ROC-AUC (resampled): {roc_auc_resampled:.4f}')
print(f'Matthews Correlation Coefficient: {mcc:.4f}')
print(f'Matthews Correlation Coefficient (resampled): {mcc_resampled:.4f}')

Cross-validated Accuracy Scores (original model):
[0.90178571 0.95982143 0.94196429 0.94196429 0.95089286]
Cross-validated Mean Accuracy: 0.9393
Cross-validated Accuracy Scores (resampled model):
[0.95714286 0.96071429 0.95714286 0.96071429 0.96428571]
Cross-validated Mean Accuracy (resampled): 0.9600
Accuracy: 0.9500
Accuracy (resampled): 0.9964
Confusion Matrix:
[[133   7]
 [  7 133]]
Confusion Matrix (resampled):
[[139   1]
 [  0 140]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       140
           1       0.95      0.95      0.95       140

    accuracy                           0.95       280
   macro avg       0.95      0.95      0.95       280
weighted avg       0.95      0.95      0.95       280

Classification Report (resampled):
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       140
           1       0.99      1.00      1.00       140

    accu