In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# read data
data = pd.read_csv("alzheimers_disease_data.csv")
data.shape

(2149, 35)

In [3]:
data.drop(['PatientID', 'DoctorInCharge'], axis=1, inplace=True)

In [4]:
data.shape

(2149, 33)

### under sampling balance 

In [9]:
from sklearn.utils import resample

class_counts = data.iloc[:, -1].value_counts(normalize=True) * 100
print("Original Class Distribution:\n", class_counts)

# Splitting features and target variable
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target

# Separate majority and minority classes
majority_class = data[y == 0]  # Non-Alzheimer's (64.6%)
minority_class = data[y == 1]  # Alzheimer's (35.4%)

# Undersample the majority class to match the minority class size
majority_downsampled = resample(majority_class, 
                                replace=False,  # Without replacement
                                n_samples=len(minority_class),  # Match minority class size
                                random_state=42)

# Combine the downsampled majority class with the original minority class
balanced_data = pd.concat([majority_downsampled, minority_class])

# Shuffle the dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Check new class distribution
new_class_counts = balanced_data.iloc[:, -1].value_counts(normalize=True) * 100
print("Balanced Class Distribution:\n", new_class_counts)

Original Class Distribution:
 Diagnosis
0    64.634714
1    35.365286
Name: proportion, dtype: float64
Balanced Class Distribution:
 Diagnosis
0    50.0
1    50.0
Name: proportion, dtype: float64


In [11]:
balanced_data.shape

(1520, 33)

### over sampling 

In [48]:
from imblearn.over_sampling import SMOTE

class_counts = data.iloc[:, -1].value_counts(normalize=True) * 100
print("Original Class Distribution:\n", class_counts)

# Splitting features and target variable
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target

# Apply SMOTE for oversampling the minority class
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create a new balanced DataFrame
balanced_data = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled, columns=[y.name])], axis=1)

# Check new class distribution
new_class_counts = balanced_data.iloc[:, -1].value_counts(normalize=True) * 100
print("Balanced Class Distribution:\n", new_class_counts)


Original Class Distribution:
 Diagnosis
0    64.634714
1    35.365286
Name: proportion, dtype: float64
Balanced Class Distribution:
 Diagnosis
0    50.0
1    50.0
Name: proportion, dtype: float64


In [50]:
balanced_data.shape

(2778, 33)

In [13]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Step 1: Select the nominal categorical features
nominal_features = ['Ethnicity', 'EducationLevel']

# Step 2: Initialize One-Hot Encoder without dropping any category
ohe = OneHotEncoder(drop=None, sparse_output=False)  # Keep all categories

# Step 3: Fit and transform the categorical features
encoded_features = ohe.fit_transform(balanced_data[nominal_features])

# Step 4: Convert the encoded features into a DataFrame
encoded_df = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out(nominal_features))

# Step 5: Drop the original categorical columns and merge encoded features
data = balanced_data.drop(columns=nominal_features).reset_index(drop=True)  # Drop original categorical columns
data = pd.concat([data, encoded_df], axis=1)  # Merge encoded data

# Step 6: Verify the transformed data
print("Updated Dataset After One-Hot Encoding:")
print(data.head())


Updated Dataset After One-Hot Encoding:
   Age  Gender        BMI  Smoking  AlcoholConsumption  PhysicalActivity  \
0   80       1  16.834968        0           19.053565          4.352272   
1   88       1  35.353244        1            0.768943          8.883326   
2   63       0  32.726550        0           16.971929          8.569751   
3   75       1  38.668960        1            6.669039          7.328895   
4   72       0  30.646711        0            4.452856          0.768016   

   DietQuality  SleepQuality  FamilyHistoryAlzheimers  CardiovascularDisease  \
0     3.432055      7.361459                        0                      0   
1     4.085773      7.450835                        0                      0   
2     8.744619      9.227229                        0                      0   
3     7.973275      9.966551                        0                      0   
4     4.978013      7.715735                        0                      1   

   ...  Forgetfulness 

In [15]:
data.shape

(1520, 39)

In [40]:
selected_features = [
    'FunctionalAssessment', 'ADL', 'MemoryComplaints', 'MMSE', 'BehavioralProblems', 'SleepQuality'
]

In [9]:
# selected_features = ['FunctionalAssessment', 'ADL', 'MemoryComplaints', 'MMSE', 'BehavioralProblems', 'SleepQuality', 'CholesterolHDL', 'CholesterolLDL', 'BMI', 'CholesterolTriglycerides', 'Age', 'PhysicalActivity', 'DietQuality', 'DiastolicBP', 'Gender']

In [42]:
X = data[selected_features]  
y = data['Diagnosis']

In [44]:
X.shape

(1520, 6)

In [46]:
# Split the data into 80% train and 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
import numpy as np

# Count the occurrences of 0 and 1 in y_test
unique, counts = np.unique(y_test, return_counts=True)
count_dict = dict(zip(unique, counts))

# Print the results
print(f"Count of 0s: {count_dict.get(0, 0)}")
print(f"Count of 1s: {count_dict.get(1, 0)}")

Count of 0s: 151
Count of 1s: 153


In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit on training data and transform both train and test separately
X_train_scaled = scaler.fit_transform(X_train)  
X_test_scaled = scaler.transform(X_test)  

## DT

In [52]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

dt = DecisionTreeClassifier(random_state=42)

dt.fit(X_train_scaled, y_train)

dt_y_pred = dt.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, dt_y_pred))
print("Classification Report:\n", classification_report(y_test, dt_y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, dt_y_pred))

Accuracy: 0.9013157894736842
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.89      0.90       151
           1       0.90      0.91      0.90       153

    accuracy                           0.90       304
   macro avg       0.90      0.90      0.90       304
weighted avg       0.90      0.90      0.90       304

Confusion Matrix:
 [[135  16]
 [ 14 139]]


## RF

In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

rf = RandomForestClassifier(random_state=42, n_estimators=100)  # 100 trees in the forest

rf.fit(X_train_scaled, y_train)

rf_y_pred = rf.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, rf_y_pred))
print("\nClassification Report:\n", classification_report(y_test, rf_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, rf_y_pred))


Accuracy: 0.9407894736842105

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94       151
           1       0.96      0.92      0.94       153

    accuracy                           0.94       304
   macro avg       0.94      0.94      0.94       304
weighted avg       0.94      0.94      0.94       304


Confusion Matrix:
 [[145   6]
 [ 12 141]]


## XGBoost

In [57]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

xgb = XGBClassifier(random_state=42, n_estimators=100, eval_metric='logloss')

xgb.fit(X_train_scaled, y_train)

xgb_y_pred = xgb.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, xgb_y_pred))
print("\nClassification Report:\n", classification_report(y_test, xgb_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, xgb_y_pred))

Accuracy: 0.9342105263157895

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.95      0.94       151
           1       0.95      0.92      0.93       153

    accuracy                           0.93       304
   macro avg       0.93      0.93      0.93       304
weighted avg       0.93      0.93      0.93       304


Confusion Matrix:
 [[144   7]
 [ 13 140]]


## Catboost

In [27]:
!pip install catboost



In [59]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

catboost = CatBoostClassifier(random_state=42, iterations=100, verbose=0)  # 100 iterations, silent training

catboost.fit(X_train_scaled, y_train)

catboost_y_pred = catboost.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, catboost_y_pred))
print("\nClassification Report:\n", classification_report(y_test, catboost_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, catboost_y_pred))


Accuracy: 0.944078947368421

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94       151
           1       0.96      0.93      0.94       153

    accuracy                           0.94       304
   macro avg       0.94      0.94      0.94       304
weighted avg       0.94      0.94      0.94       304


Confusion Matrix:
 [[145   6]
 [ 11 142]]


## AdaBoost

In [61]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

adaboost = AdaBoostClassifier(random_state=42, n_estimators=100)  # 100 weak learners

adaboost.fit(X_train_scaled, y_train)

adaboost_y_pred = adaboost.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, adaboost_y_pred))
print("\nClassification Report:\n", classification_report(y_test, adaboost_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, adaboost_y_pred))




Accuracy: 0.9046052631578947

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.87      0.90       151
           1       0.88      0.94      0.91       153

    accuracy                           0.90       304
   macro avg       0.91      0.90      0.90       304
weighted avg       0.91      0.90      0.90       304


Confusion Matrix:
 [[131  20]
 [  9 144]]


## SVM

In [64]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize SVM Classifier
svm = SVC(random_state=42, kernel='rbf')  # Using RBF kernel (default)

# Train the model
svm.fit(X_train_scaled, y_train)

# Make predictions
svm_y_pred = svm.predict(X_test_scaled)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, svm_y_pred))
print("\nClassification Report:\n", classification_report(y_test, svm_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, svm_y_pred))


Accuracy: 0.9210526315789473

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.94      0.92       151
           1       0.94      0.90      0.92       153

    accuracy                           0.92       304
   macro avg       0.92      0.92      0.92       304
weighted avg       0.92      0.92      0.92       304


Confusion Matrix:
 [[142   9]
 [ 15 138]]


# Stratified cross validation

### DT

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

# Initialize Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42)

# Apply 10-Fold Stratified Cross-Validation
stratified_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Lists to store metrics for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform 10-Fold Cross-Validation
for fold, (train_idx, test_idx) in enumerate(stratified_cv.split(X_train_scaled, y_train), 1):
    X_train_fold, X_test_fold = X_train_scaled[train_idx], X_train_scaled[test_idx]
    y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # Train model on current fold
    dt.fit(X_train_fold, y_train_fold)
    y_pred_fold = dt.predict(X_test_fold)

    # Calculate metrics
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    precision = precision_score(y_test_fold, y_pred_fold, average='weighted')  # Weighted for class imbalance
    recall = recall_score(y_test_fold, y_pred_fold, average='weighted')
    f1 = f1_score(y_test_fold, y_pred_fold, average='weighted')

    # Store results
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Print metrics for current fold
    print(f"Fold {fold}:")
    print(f"  Accuracy: {accuracy:.3f}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}\n")

# Print average results across all folds
print("\nFinal Cross-Validation Results (Average Over 10 Folds):")
print(f"  Accuracy: {np.mean(accuracy_scores):.3f} ± {np.std(accuracy_scores):.3f}")
print(f"  Precision: {np.mean(precision_scores):.3f} ± {np.std(precision_scores):.3f}")
print(f"  Recall: {np.mean(recall_scores):.3f} ± {np.std(recall_scores):.3f}")
print(f"  F1-Score: {np.mean(f1_scores):.3f} ± {np.std(f1_scores):.3f}")


Fold 1:
  Accuracy: 0.924
  Precision: 0.927
  Recall: 0.924
  F1-Score: 0.925

Fold 2:
  Accuracy: 0.884
  Precision: 0.884
  Recall: 0.884
  F1-Score: 0.884

Fold 3:
  Accuracy: 0.866
  Precision: 0.866
  Recall: 0.866
  F1-Score: 0.866

Fold 4:
  Accuracy: 0.890
  Precision: 0.890
  Recall: 0.890
  F1-Score: 0.890

Fold 5:
  Accuracy: 0.913
  Precision: 0.913
  Recall: 0.913
  F1-Score: 0.913

Fold 6:
  Accuracy: 0.872
  Precision: 0.871
  Recall: 0.872
  F1-Score: 0.870

Fold 7:
  Accuracy: 0.890
  Precision: 0.889
  Recall: 0.890
  F1-Score: 0.888

Fold 8:
  Accuracy: 0.866
  Precision: 0.867
  Recall: 0.866
  F1-Score: 0.867

Fold 9:
  Accuracy: 0.930
  Precision: 0.930
  Recall: 0.930
  F1-Score: 0.930

Fold 10:
  Accuracy: 0.901
  Precision: 0.901
  Recall: 0.901
  F1-Score: 0.901


Final Cross-Validation Results (Average Over 10 Folds):
  Accuracy: 0.894 ± 0.022
  Precision: 0.894 ± 0.022
  Recall: 0.894 ± 0.022
  F1-Score: 0.893 ± 0.022


### RF

In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42, n_estimators=100)  # 100 trees in the forest

# Apply 10-Fold Stratified Cross-Validation
stratified_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Lists to store metrics for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform 10-Fold Cross-Validation
for fold, (train_idx, test_idx) in enumerate(stratified_cv.split(X_train_scaled, y_train), 1):
    X_train_fold, X_test_fold = X_train_scaled[train_idx], X_train_scaled[test_idx]
    y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # Train model on current fold
    rf.fit(X_train_fold, y_train_fold)
    y_pred_fold = rf.predict(X_test_fold)

    # Calculate metrics
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    precision = precision_score(y_test_fold, y_pred_fold, average='weighted')  # Weighted for class imbalance
    recall = recall_score(y_test_fold, y_pred_fold, average='weighted')
    f1 = f1_score(y_test_fold, y_pred_fold, average='weighted')

    # Store results
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Print metrics for current fold
    print(f"Fold {fold}:")
    print(f"  Accuracy: {accuracy:.3f}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}\n")

# Print average results across all folds
print("\nFinal Cross-Validation Results (Average Over 10 Folds):")
print(f"  Accuracy: {np.mean(accuracy_scores):.3f} ± {np.std(accuracy_scores):.3f}")
print(f"  Precision: {np.mean(precision_scores):.3f} ± {np.std(precision_scores):.3f}")
print(f"  Recall: {np.mean(recall_scores):.3f} ± {np.std(recall_scores):.3f}")
print(f"  F1-Score: {np.mean(f1_scores):.3f} ± {np.std(f1_scores):.3f}")


Fold 1:
  Accuracy: 0.951
  Precision: 0.955
  Recall: 0.951
  F1-Score: 0.951

Fold 2:
  Accuracy: 0.934
  Precision: 0.934
  Recall: 0.934
  F1-Score: 0.934

Fold 3:
  Accuracy: 0.951
  Precision: 0.951
  Recall: 0.951
  F1-Score: 0.951

Fold 4:
  Accuracy: 0.975
  Precision: 0.977
  Recall: 0.975
  F1-Score: 0.975

Fold 5:
  Accuracy: 0.967
  Precision: 0.968
  Recall: 0.967
  F1-Score: 0.967

Fold 6:
  Accuracy: 0.951
  Precision: 0.953
  Recall: 0.951
  F1-Score: 0.951

Fold 7:
  Accuracy: 0.917
  Precision: 0.921
  Recall: 0.917
  F1-Score: 0.917

Fold 8:
  Accuracy: 0.942
  Precision: 0.942
  Recall: 0.942
  F1-Score: 0.942

Fold 9:
  Accuracy: 0.926
  Precision: 0.931
  Recall: 0.926
  F1-Score: 0.925

Fold 10:
  Accuracy: 0.950
  Precision: 0.951
  Recall: 0.950
  F1-Score: 0.950


Final Cross-Validation Results (Average Over 10 Folds):
  Accuracy: 0.947 ± 0.017
  Precision: 0.948 ± 0.016
  Recall: 0.947 ± 0.017
  F1-Score: 0.946 ± 0.017


### Catboost

In [68]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

# Initialize CatBoost Classifier
catboost = CatBoostClassifier(random_state=42, iterations=100, verbose=0)  # 100 iterations, silent training

# Apply 10-Fold Stratified Cross-Validation
stratified_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Lists to store metrics for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform 10-Fold Cross-Validation
for fold, (train_idx, test_idx) in enumerate(stratified_cv.split(X_train_scaled, y_train), 1):
    X_train_fold, X_test_fold = X_train_scaled[train_idx], X_train_scaled[test_idx]
    y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # Train model on current fold
    catboost.fit(X_train_fold, y_train_fold)
    y_pred_fold = catboost.predict(X_test_fold)

    # Calculate metrics
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    precision = precision_score(y_test_fold, y_pred_fold, average='weighted')  # Weighted for class imbalance
    recall = recall_score(y_test_fold, y_pred_fold, average='weighted')
    f1 = f1_score(y_test_fold, y_pred_fold, average='weighted')

    # Store results
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Print metrics for current fold
    print(f"Fold {fold}:")
    print(f"  Accuracy: {accuracy:.3f}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}\n")

# Print average results across all folds
print("\nFinal Cross-Validation Results (Average Over 10 Folds):")
print(f"  Accuracy: {np.mean(accuracy_scores):.3f} ± {np.std(accuracy_scores):.3f}")
print(f"  Precision: {np.mean(precision_scores):.3f} ± {np.std(precision_scores):.3f}")
print(f"  Recall: {np.mean(recall_scores):.3f} ± {np.std(recall_scores):.3f}")
print(f"  F1-Score: {np.mean(f1_scores):.3f} ± {np.std(f1_scores):.3f}")


Fold 1:
  Accuracy: 0.951
  Precision: 0.955
  Recall: 0.951
  F1-Score: 0.951

Fold 2:
  Accuracy: 0.926
  Precision: 0.926
  Recall: 0.926
  F1-Score: 0.926

Fold 3:
  Accuracy: 0.951
  Precision: 0.951
  Recall: 0.951
  F1-Score: 0.951

Fold 4:
  Accuracy: 0.975
  Precision: 0.977
  Recall: 0.975
  F1-Score: 0.975

Fold 5:
  Accuracy: 0.967
  Precision: 0.968
  Recall: 0.967
  F1-Score: 0.967

Fold 6:
  Accuracy: 0.959
  Precision: 0.959
  Recall: 0.959
  F1-Score: 0.959

Fold 7:
  Accuracy: 0.934
  Precision: 0.938
  Recall: 0.934
  F1-Score: 0.934

Fold 8:
  Accuracy: 0.950
  Precision: 0.951
  Recall: 0.950
  F1-Score: 0.950

Fold 9:
  Accuracy: 0.917
  Precision: 0.921
  Recall: 0.917
  F1-Score: 0.917

Fold 10:
  Accuracy: 0.950
  Precision: 0.951
  Recall: 0.950
  F1-Score: 0.950


Final Cross-Validation Results (Average Over 10 Folds):
  Accuracy: 0.948 ± 0.017
  Precision: 0.950 ± 0.016
  Recall: 0.948 ± 0.017
  F1-Score: 0.948 ± 0.017


### XGboost

In [47]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

# Initialize XGBoost Classifier
xgb = XGBClassifier(random_state=42, n_estimators=100, use_label_encoder=False, eval_metric='logloss')

# Apply 10-Fold Stratified Cross-Validation
stratified_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Lists to store metrics for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform 10-Fold Cross-Validation
for fold, (train_idx, test_idx) in enumerate(stratified_cv.split(X_train_scaled, y_train), 1):
    X_train_fold, X_test_fold = X_train_scaled[train_idx], X_train_scaled[test_idx]
    y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # Train model on current fold
    xgb.fit(X_train_fold, y_train_fold)
    y_pred_fold = xgb.predict(X_test_fold)

    # Calculate metrics
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    precision = precision_score(y_test_fold, y_pred_fold, average='weighted')  # Weighted for class imbalance
    recall = recall_score(y_test_fold, y_pred_fold, average='weighted')
    f1 = f1_score(y_test_fold, y_pred_fold, average='weighted')

    # Store results
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Print metrics for current fold
    print(f"Fold {fold}:")
    print(f"  Accuracy: {accuracy:.3f}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}\n")

# Print average results across all folds
print("\nFinal Cross-Validation Results (Average Over 10 Folds):")
print(f"  Accuracy: {np.mean(accuracy_scores):.3f} ± {np.std(accuracy_scores):.3f}")
print(f"  Precision: {np.mean(precision_scores):.3f} ± {np.std(precision_scores):.3f}")
print(f"  Recall: {np.mean(recall_scores):.3f} ± {np.std(recall_scores):.3f}")
print(f"  F1-Score: {np.mean(f1_scores):.3f} ± {np.std(f1_scores):.3f}")


Parameters: { "use_label_encoder" } are not used.



Fold 1:
  Accuracy: 0.977
  Precision: 0.977
  Recall: 0.977
  F1-Score: 0.977



Parameters: { "use_label_encoder" } are not used.



Fold 2:
  Accuracy: 0.948
  Precision: 0.948
  Recall: 0.948
  F1-Score: 0.947



Parameters: { "use_label_encoder" } are not used.



Fold 3:
  Accuracy: 0.924
  Precision: 0.924
  Recall: 0.924
  F1-Score: 0.924



Parameters: { "use_label_encoder" } are not used.



Fold 4:
  Accuracy: 0.942
  Precision: 0.943
  Recall: 0.942
  F1-Score: 0.942



Parameters: { "use_label_encoder" } are not used.



Fold 5:
  Accuracy: 0.953
  Precision: 0.953
  Recall: 0.953
  F1-Score: 0.953



Parameters: { "use_label_encoder" } are not used.



Fold 6:
  Accuracy: 0.953
  Precision: 0.955
  Recall: 0.953
  F1-Score: 0.953



Parameters: { "use_label_encoder" } are not used.



Fold 7:
  Accuracy: 0.924
  Precision: 0.925
  Recall: 0.924
  F1-Score: 0.924



Parameters: { "use_label_encoder" } are not used.



Fold 8:
  Accuracy: 0.953
  Precision: 0.954
  Recall: 0.953
  F1-Score: 0.953



Parameters: { "use_label_encoder" } are not used.



Fold 9:
  Accuracy: 0.959
  Precision: 0.960
  Recall: 0.959
  F1-Score: 0.959



Parameters: { "use_label_encoder" } are not used.



Fold 10:
  Accuracy: 0.977
  Precision: 0.977
  Recall: 0.977
  F1-Score: 0.977


Final Cross-Validation Results (Average Over 10 Folds):
  Accuracy: 0.951 ± 0.017
  Precision: 0.951 ± 0.017
  Recall: 0.951 ± 0.017
  F1-Score: 0.951 ± 0.017


## stacking on the top 3 models

In [70]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report


# Create the individual classifiers
rf = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=100, random_state=42))
xgb = make_pipeline(StandardScaler(), XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss'))
catboost = make_pipeline(StandardScaler(), CatBoostClassifier(iterations=100, verbose=0, random_state=42))

# Create the stacking classifier
stacking_clf = StackingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('catboost', catboost)],  # Base models
    final_estimator=RandomForestClassifier(n_estimators=100, random_state=42)  # Meta learner
)


# Perform cross-validation predictions
stacking_preds = cross_val_predict(stacking_clf, X_train, y_train, cv=5)

# Train the classifiers on the full training set
stacking_clf.fit(X_train, y_train)

# Predict on the test set
stacking_test_preds = stacking_clf.predict(X_test)

# Function to print classification report
def print_classification_report(y_true, y_pred, title):
    print(f"\n{title}")
    print(classification_report(y_true, y_pred))

# Print classification reports
print_classification_report(y_test, stacking_test_preds, "Classification Report for Stacking Classifier (Test Set)")
print(confusion_matrix(y_test, stacking_test_preds))



Classification Report for Stacking Classifier (Test Set)
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       151
           1       0.94      0.92      0.93       153

    accuracy                           0.93       304
   macro avg       0.93      0.93      0.93       304
weighted avg       0.93      0.93      0.93       304

[[142   9]
 [ 12 141]]


# parameter tuning

### RF

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [10, 20, None],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required at a leaf node
    'max_features': ['sqrt', 'log2'],  # Number of features to consider for best split
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Perform Grid Search with 5-Fold Cross-Validation
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit on training data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train Random Forest with best parameters
best_rf = RandomForestClassifier(**best_params, random_state=42)
best_rf.fit(X_train_scaled, y_train)

# Make predictions
rf_y_pred = best_rf.predict(X_test_scaled)

# Print evaluation metrics
print("\nAccuracy:", accuracy_score(y_test, rf_y_pred))
print("\nClassification Report:\n", classification_report(y_test, rf_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, rf_y_pred))

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [72]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Define the hyperparameter distribution
param_dist = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [10, 20, None],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required at a leaf node
    'max_features': ['sqrt', 'log2'],  # Number of features to consider for best split
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Perform Randomized Search with 5-Fold Cross-Validation
random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=20, cv=5,
                                   scoring='accuracy', n_jobs=-1, verbose=1, random_state=42)

# Fit on training data
random_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Train Random Forest with best parameters
best_rf = RandomForestClassifier(**best_params, random_state=42)
best_rf.fit(X_train_scaled, y_train)

# Make predictions
rf_y_pred = best_rf.predict(X_test_scaled)

# Print evaluation metrics
print("\nAccuracy:", accuracy_score(y_test, rf_y_pred))
print("\nClassification Report:\n", classification_report(y_test, rf_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, rf_y_pred))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': True}

Accuracy: 0.9407894736842105

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94       151
           1       0.96      0.92      0.94       153

    accuracy                           0.94       304
   macro avg       0.94      0.94      0.94       304
weighted avg       0.94      0.94      0.94       304


Confusion Matrix:
 [[145   6]
 [ 12 141]]


### cat

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define the hyperparameter grid
param_grid = {
    'iterations': [100, 200, 500],  # Number of boosting iterations
    'learning_rate': [0.01, 0.05, 0.1],  # Step size shrinkage to prevent overfitting
    'depth': [4, 6, 8, 10],  # Maximum depth of the trees
    'l2_leaf_reg': [1, 3, 5, 7],  # L2 regularization to prevent overfitting
    'border_count': [32, 64, 128],  # Number of bins used for numeric feature quantization
}

# Initialize the CatBoost model
catboost = CatBoostClassifier(random_state=42, verbose=0)

# Perform Grid Search with 5-Fold Cross-Validation
grid_search = GridSearchCV(catboost, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit on training data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train CatBoost with best parameters
best_catboost = CatBoostClassifier(**best_params, random_state=42, verbose=0)
best_catboost.fit(X_train_scaled, y_train)

# Make predictions
catboost_y_pred = best_catboost.predict(X_test_scaled)

# Print evaluation metrics
print("\nAccuracy:", accuracy_score(y_test, catboost_y_pred))
print("\nClassification Report:\n", classification_report(y_test, catboost_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, catboost_y_pred))


Fitting 5 folds for each of 432 candidates, totalling 2160 fits


F

In [76]:
from scipy.stats import friedmanchisquare, rankdata

# Define models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'CatBoost': CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, verbose=100, random_state=42)
}

# Initialize K-Fold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize a dictionary to store scores
model_scores = {model_name: [] for model_name in models.keys()}

# Perform K-Fold Cross-Validation
for train_idx, test_idx in kfold.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = f1_score(y_test, y_pred)  # You can use other metrics like F1-score
        model_scores[model_name].append(score)

# Print scores
for model_name, scores in model_scores.items():
    print(f"{model_name} Scores: {scores}")


# Convert model scores to a matrix
scores_matrix = np.array([model_scores[model_name] for model_name in models.keys()])

# Perform Friedman Test
stat, p = friedmanchisquare(*scores_matrix)
print("Friedman Test Statistic:", stat)
print("P-value:", p)

# Rank models for each fold
ranks = np.array([rankdata(-fold_scores) for fold_scores in scores_matrix.T])
avg_ranks = ranks.mean(axis=0)

# Print model rankings
model_names = list(models.keys())
for model, rank in zip(model_names, avg_ranks):
    print(f"{model}: Average Rank = {rank:.2f}")

# Sort and display rankings
sorted_models = sorted(zip(model_names, avg_ranks), key=lambda x: x[1])
print("\nModel Rankings:")
for i, (model, rank) in enumerate(sorted_models, 1):
    print(f"{i}. {model} (Average Rank: {rank:.2f})")

Parameters: { "use_label_encoder" } are not used.



0:	learn: 0.5785310	total: 4.73ms	remaining: 4.72s
100:	learn: 0.1071573	total: 455ms	remaining: 4.05s
200:	learn: 0.0650612	total: 849ms	remaining: 3.38s
300:	learn: 0.0431047	total: 1.27s	remaining: 2.96s
400:	learn: 0.0304908	total: 1.7s	remaining: 2.54s
500:	learn: 0.0226870	total: 2.15s	remaining: 2.14s
600:	learn: 0.0177688	total: 2.59s	remaining: 1.72s
700:	learn: 0.0144000	total: 3.05s	remaining: 1.3s
800:	learn: 0.0122343	total: 3.49s	remaining: 867ms
900:	learn: 0.0104172	total: 3.94s	remaining: 433ms
999:	learn: 0.0093536	total: 4.37s	remaining: 0us


Parameters: { "use_label_encoder" } are not used.



0:	learn: 0.5733256	total: 5.31ms	remaining: 5.31s
100:	learn: 0.0982809	total: 464ms	remaining: 4.13s
200:	learn: 0.0584185	total: 921ms	remaining: 3.66s
300:	learn: 0.0366199	total: 1.36s	remaining: 3.17s
400:	learn: 0.0244561	total: 1.8s	remaining: 2.69s
500:	learn: 0.0183659	total: 2.21s	remaining: 2.2s
600:	learn: 0.0142048	total: 2.62s	remaining: 1.74s
700:	learn: 0.0116738	total: 2.98s	remaining: 1.27s
800:	learn: 0.0097143	total: 3.37s	remaining: 838ms
900:	learn: 0.0087821	total: 3.79s	remaining: 416ms
999:	learn: 0.0078476	total: 4.19s	remaining: 0us


Parameters: { "use_label_encoder" } are not used.



0:	learn: 0.5780561	total: 6.23ms	remaining: 6.22s
100:	learn: 0.1111752	total: 395ms	remaining: 3.52s
200:	learn: 0.0672155	total: 808ms	remaining: 3.21s
300:	learn: 0.0414508	total: 1.25s	remaining: 2.9s
400:	learn: 0.0293439	total: 1.7s	remaining: 2.54s
500:	learn: 0.0211616	total: 2.15s	remaining: 2.14s
600:	learn: 0.0167282	total: 2.5s	remaining: 1.66s
700:	learn: 0.0135363	total: 2.9s	remaining: 1.24s
800:	learn: 0.0114164	total: 3.25s	remaining: 807ms
900:	learn: 0.0099901	total: 3.8s	remaining: 417ms
999:	learn: 0.0092753	total: 4.25s	remaining: 0us


Parameters: { "use_label_encoder" } are not used.



0:	learn: 0.5768994	total: 7.39ms	remaining: 7.38s
100:	learn: 0.1145030	total: 371ms	remaining: 3.3s
200:	learn: 0.0714900	total: 728ms	remaining: 2.89s
300:	learn: 0.0461566	total: 1.09s	remaining: 2.52s
400:	learn: 0.0335462	total: 1.43s	remaining: 2.14s
500:	learn: 0.0247397	total: 1.82s	remaining: 1.81s
600:	learn: 0.0188524	total: 2.24s	remaining: 1.49s
700:	learn: 0.0156000	total: 2.64s	remaining: 1.13s
800:	learn: 0.0132015	total: 3.04s	remaining: 756ms
900:	learn: 0.0117447	total: 3.46s	remaining: 380ms
999:	learn: 0.0107278	total: 3.87s	remaining: 0us


Parameters: { "use_label_encoder" } are not used.



0:	learn: 0.5719751	total: 4.77ms	remaining: 4.76s
100:	learn: 0.0996137	total: 462ms	remaining: 4.11s
200:	learn: 0.0573304	total: 934ms	remaining: 3.71s
300:	learn: 0.0358949	total: 1.39s	remaining: 3.23s
400:	learn: 0.0249307	total: 1.84s	remaining: 2.75s
500:	learn: 0.0181705	total: 2.34s	remaining: 2.33s
600:	learn: 0.0142450	total: 2.82s	remaining: 1.87s
700:	learn: 0.0114224	total: 3.29s	remaining: 1.41s
800:	learn: 0.0096208	total: 3.69s	remaining: 918ms
900:	learn: 0.0084883	total: 4.11s	remaining: 451ms
999:	learn: 0.0078242	total: 4.54s	remaining: 0us
Random Forest Scores: [0.9494949494949495, 0.9324324324324325, 0.9494949494949495, 0.9459459459459459, 0.9169435215946844]
XGBoost Scores: [0.9337748344370861, 0.9220338983050848, 0.9292929292929293, 0.9395973154362416, 0.9042904290429042]
CatBoost Scores: [0.9431438127090301, 0.9403973509933775, 0.9504950495049505, 0.9530201342281879, 0.9139072847682119]
Friedman Test Statistic: 7.6000000000000085
P-value: 0.0223707718561655
R