In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
from collections import Counter
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.utils import resample
from catboost import CatBoostClassifier
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import friedmanchisquare, rankdata
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc


In [8]:
# Load the Alzheimer's CSV and drop 'PatientID' and 'DoctorInCharge', which are
# not among the features used for modelling. The drop is chained (no in-place
# mutation) so `data` is built in a single step from the file on every run.
data = (
    pd.read_csv("alzheimers_disease_data.csv")
    .drop(columns=['PatientID', 'DoctorInCharge'])
)

# Kept as the last expression of the cell so the shape is actually displayed;
# a bare expression in the middle of a cell produces no output.
data.shape

In [9]:
# The target is taken positionally: last column is the label, the rest are features.
y = data.iloc[:, -1]
X = data.iloc[:, :-1]

# Class shares (in percent) before balancing.
class_counts = y.value_counts(normalize=True) * 100
print("Original Class Distribution:\n", class_counts)

# Rows per class: label 1 is the smaller class (~35.4%), label 0 the larger (~64.6%),
# as shown by the printed distribution.
minority_class = data.loc[y == 1]
majority_class = data.loc[y == 0]

# Randomly undersample the larger class, without replacement, down to the
# size of the smaller one.
majority_downsampled = resample(
    majority_class,
    replace=False,
    n_samples=minority_class.shape[0],
    random_state=42,
)

# Stack both classes, shuffle the rows reproducibly, and renumber the index.
balanced_data = (
    pd.concat([majority_downsampled, minority_class])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Class shares (in percent) after balancing.
new_class_counts = balanced_data.iloc[:, -1].value_counts(normalize=True).mul(100)
print("Balanced Class Distribution:\n", new_class_counts)

Original Class Distribution:
 Diagnosis
0    64.634714
1    35.365286
Name: proportion, dtype: float64
Balanced Class Distribution:
 Diagnosis
0    50.0
1    50.0
Name: proportion, dtype: float64


In [10]:
# Nominal (unordered) categorical columns that get one-hot encoded.
nominal_features = ['Ethnicity', 'EducationLevel']

# Encoder that keeps every category (no reference level dropped) and returns
# a dense array.
ohe = OneHotEncoder(sparse_output=False, drop=None)
encoded_features = ohe.fit_transform(balanced_data[nominal_features])

# Wrap the encoded array in a DataFrame with generated indicator-column names.
encoded_df = pd.DataFrame(
    data=encoded_features,
    columns=ohe.get_feature_names_out(nominal_features),
)

# Replace the original categorical columns with their indicator columns.
# Both frames carry a fresh 0..n-1 index, so the column-wise concat aligns row by row.
data = pd.concat(
    [
        balanced_data.drop(columns=nominal_features).reset_index(drop=True),
        encoded_df,
    ],
    axis=1,
)

# Preview the transformed table.
print("Updated Dataset After One-Hot Encoding:")
data.head()

Updated Dataset After One-Hot Encoding:


Unnamed: 0,Age,Gender,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,FamilyHistoryAlzheimers,CardiovascularDisease,...,Forgetfulness,Diagnosis,Ethnicity_0,Ethnicity_1,Ethnicity_2,Ethnicity_3,EducationLevel_0,EducationLevel_1,EducationLevel_2,EducationLevel_3
0,80,1,16.834968,0,19.053565,4.352272,3.432055,7.361459,0,0,...,0,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,88,1,35.353244,1,0.768943,8.883326,4.085773,7.450835,0,0,...,1,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,63,0,32.72655,0,16.971929,8.569751,8.744619,9.227229,0,0,...,0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,75,1,38.66896,1,6.669039,7.328895,7.973275,9.966551,0,0,...,1,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,72,0,30.646711,0,4.452856,0.768016,4.978013,7.715735,0,1,...,0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# Feature columns used for the all-features and per-gender models.
selected_features = [
    'FunctionalAssessment',
    'ADL',
    'MemoryComplaints',
    'MMSE',
    'BehavioralProblems',
    'SleepQuality',
    'CholesterolHDL',
    'CholesterolLDL',
    'BMI',
    'CholesterolTriglycerides',
    'Age',
    'PhysicalActivity',
    'DietQuality',
    'DiastolicBP',
    'Gender',
]

In [57]:
# Reduced feature set used for the "top features" model.
top_selected_features = [
    'FunctionalAssessment',
    'ADL',
    'MemoryComplaints',
    'MMSE',
    'BehavioralProblems',
]

In [59]:
# Whole-cohort target and the two feature matrices (all selected / top subset).
y = data['Diagnosis']
X = data[selected_features]
X_top = data[top_selected_features]

# Gender-specific row subsets. Assumes Gender == 0 encodes Male and
# Gender == 1 encodes Female -- TODO confirm against the dataset documentation.
male = data.loc[data['Gender'] == 0]
female = data.loc[data['Gender'] == 1]

# Features and target for each subset.
# NOTE(review): `selected_features` contains 'Gender', which is constant inside
# each of these subsets and therefore carries no information there.
X_male, y_male = male[selected_features], male['Diagnosis']
X_female, y_female = female[selected_features], female['Diagnosis']




In [22]:
# Display the (rows, columns) shape of the male subset.
male.shape

(763, 39)

In [24]:
# Display the (rows, columns) shape of the female subset.
female.shape

(757, 39)

In [61]:
def _split_and_scale(features, target, test_size=0.2, random_state=42):
    """Hold out a test set and standardize both parts with a train-fitted scaler.

    Parameters
    ----------
    features : feature table handed to ``train_test_split``.
    target : labels aligned row-by-row with ``features``.
    test_size : fraction of rows held out for testing (0.2 -> 80/20 split).
    random_state : seed forwarded to ``train_test_split`` for a reproducible split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, scaler, X_train_scaled, X_test_scaled)``
        where ``scaler`` is a ``StandardScaler`` fitted on ``X_train`` only and
        then applied to both parts.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    fitted_scaler = StandardScaler()
    # Fit on the training rows only; the test rows are transformed with the
    # training statistics so nothing from the test set leaks into scaling.
    X_tr_scaled = fitted_scaler.fit_transform(X_tr)
    X_te_scaled = fitted_scaler.transform(X_te)
    return X_tr, X_te, y_tr, y_te, fitted_scaler, X_tr_scaled, X_te_scaled


# Every split owns its scaler. In particular the top-feature split uses
# `scaler_top` rather than the `scaler` that belongs to the full feature set,
# so this cell no longer needs a pre-existing `scaler` in the kernel.

# Top-feature subset, whole cohort
(X_train_top, X_test_top, y_train_top, y_test_top,
 scaler_top, X_train_scaled_top, X_test_scaled_top) = _split_and_scale(X_top, y)

# All selected features, whole cohort
(X_train, X_test, y_train, y_test,
 scaler, X_train_scaled, X_test_scaled) = _split_and_scale(X, y)

# Male subset
(X_male_train, X_male_test, y_male_train, y_male_test,
 scaler_male, X_male_train_scaled, X_male_test_scaled) = _split_and_scale(X_male, y_male)

# Female subset
(X_female_train, X_female_test, y_female_train, y_female_test,
 scaler_female, X_female_train_scaled, X_female_test_scaled) = _split_and_scale(X_female, y_female)

## Whole-cohort models (top-5 features, then all selected features)

In [63]:
# CatBoost on the standardized top-feature split (whole cohort).
catboost = CatBoostClassifier(
    iterations=500,
    learning_rate=0.01,
    depth=4,
    l2_leaf_reg=3,
    border_count=128,
    loss_function='Logloss',
    random_seed=42,  # fixed seed for reproducible training
    verbose=0,       # silence per-iteration logging
)
catboost.fit(X_train_scaled_top, y_train_top)

# Hard class predictions on the held-out rows.
catboost_y_pred = catboost.predict(X_test_scaled_top)

# Report accuracy, per-class metrics and the confusion matrix.
for label, value in (
    ("Accuracy:", accuracy_score(y_test_top, catboost_y_pred)),
    ("\nClassification Report:\n", classification_report(y_test_top, catboost_y_pred, digits=3)),
    ("\nConfusion Matrix:\n", confusion_matrix(y_test_top, catboost_y_pred)),
):
    print(label, value)


Accuracy: 0.9407894736842105

Classification Report:
               precision    recall  f1-score   support

           0      0.929     0.954     0.941       151
           1      0.953     0.928     0.940       153

    accuracy                          0.941       304
   macro avg      0.941     0.941     0.941       304
weighted avg      0.941     0.941     0.941       304


Confusion Matrix:
 [[144   7]
 [ 11 142]]


In [53]:
# CatBoost on the standardized all-selected-features split (whole cohort).
# NOTE(review): the recorded output of this cell is identical to the top-feature
# model's, and its execution count (53) predates the split cell (61) -- re-run
# on a fresh kernel to confirm the numbers.
catboost = CatBoostClassifier(
    iterations=500,
    learning_rate=0.01,
    depth=4,
    l2_leaf_reg=3,
    border_count=128,
    loss_function='Logloss',
    random_seed=42,  # fixed seed for reproducible training
    verbose=0,       # silence per-iteration logging
)
catboost.fit(X_train_scaled, y_train)

# Hard class predictions on the held-out rows.
catboost_y_pred = catboost.predict(X_test_scaled)

# Report accuracy, per-class metrics and the confusion matrix.
for label, value in (
    ("Accuracy:", accuracy_score(y_test, catboost_y_pred)),
    ("\nClassification Report:\n", classification_report(y_test, catboost_y_pred, digits=3)),
    ("\nConfusion Matrix:\n", confusion_matrix(y_test, catboost_y_pred)),
):
    print(label, value)


Accuracy: 0.9407894736842105

Classification Report:
               precision    recall  f1-score   support

           0      0.929     0.954     0.941       151
           1      0.953     0.928     0.940       153

    accuracy                          0.941       304
   macro avg      0.941     0.941     0.941       304
weighted avg      0.941     0.941     0.941       304


Confusion Matrix:
 [[144   7]
 [ 11 142]]


## Per-gender models (male subset, then female subset)

In [45]:
# CatBoost trained and evaluated on the male subset only.
catboost = CatBoostClassifier(
    iterations=500,
    learning_rate=0.01,
    depth=4,
    l2_leaf_reg=3,
    border_count=128,
    loss_function='Logloss',
    random_seed=42,  # fixed seed for reproducible training
    verbose=0,       # silence per-iteration logging
)
catboost.fit(X_male_train_scaled, y_male_train)

# Hard class predictions on the held-out male rows.
catboost_y_pred = catboost.predict(X_male_test_scaled)

# Report accuracy, per-class metrics and the confusion matrix.
for label, value in (
    ("Accuracy:", accuracy_score(y_male_test, catboost_y_pred)),
    ("\nClassification Report:\n", classification_report(y_male_test, catboost_y_pred, digits=3)),
    ("\nConfusion Matrix:\n", confusion_matrix(y_male_test, catboost_y_pred)),
):
    print(label, value)


Accuracy: 0.9084967320261438

Classification Report:
               precision    recall  f1-score   support

           0      0.857     0.938     0.896        64
           1      0.952     0.888     0.919        89

    accuracy                          0.908       153
   macro avg      0.904     0.913     0.907       153
weighted avg      0.912     0.908     0.909       153


Confusion Matrix:
 [[60  4]
 [10 79]]


In [43]:
# CatBoost trained and evaluated on the female subset only.
catboost = CatBoostClassifier(
    iterations=500,
    learning_rate=0.01,
    depth=4,
    l2_leaf_reg=3,
    border_count=128,
    loss_function='Logloss',
    random_seed=42,  # fixed seed for reproducible training
    verbose=0,       # silence per-iteration logging
)
catboost.fit(X_female_train_scaled, y_female_train)

# Hard class predictions on the held-out female rows.
catboost_y_pred = catboost.predict(X_female_test_scaled)

# Report accuracy, per-class metrics and the confusion matrix.
for label, value in (
    ("Accuracy:", accuracy_score(y_female_test, catboost_y_pred)),
    ("\nClassification Report:\n", classification_report(y_female_test, catboost_y_pred, digits=3)),
    ("\nConfusion Matrix:\n", confusion_matrix(y_female_test, catboost_y_pred)),
):
    print(label, value)


Accuracy: 0.9473684210526315

Classification Report:
               precision    recall  f1-score   support

           0      0.937     0.961     0.949        77
           1      0.959     0.933     0.946        75

    accuracy                          0.947       152
   macro avg      0.948     0.947     0.947       152
weighted avg      0.948     0.947     0.947       152


Confusion Matrix:
 [[74  3]
 [ 5 70]]
