In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# read data
data = pd.read_csv("alzheimers_disease_data.csv")
data.shape

(2149, 35)

In [3]:
data.drop(['PatientID', 'DoctorInCharge'], axis=1, inplace=True)
data.head(5)

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,...,6.518877,0,0,1.725883,0,0,0,1,0,0
1,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,...,7.118696,0,0,2.592424,0,0,0,0,1,0
2,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,9.673574,...,5.895077,0,0,7.119548,0,1,0,1,0,0
3,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,8.392554,...,8.965106,0,1,6.481226,0,0,0,0,0,0
4,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,5.597238,...,6.045039,0,0,0.014691,0,0,1,1,0,0


In [7]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Step 1: Select the nominal categorical features
nominal_features = ['Ethnicity', 'EducationLevel']

# Step 2: Initialize One-Hot Encoder without dropping any category
ohe = OneHotEncoder(drop=None, sparse_output=False)  # Keep all categories

# Step 3: Fit and transform the categorical features
encoded_features = ohe.fit_transform(data[nominal_features])

# Step 4: Convert the encoded features into a DataFrame
encoded_df = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out(nominal_features))

# Step 5: Drop the original categorical columns and merge encoded features
data = data.drop(columns=nominal_features).reset_index(drop=True)  # Drop original categorical columns
data = pd.concat([data, encoded_df], axis=1)  # Merge encoded data

# Step 6: Verify the transformed data
print("Updated Dataset After One-Hot Encoding:")
print(data.head())


Updated Dataset After One-Hot Encoding:
   Age  Gender        BMI  Smoking  AlcoholConsumption  PhysicalActivity  \
0   73       0  22.927749        0           13.297218          6.327112   
1   89       0  26.827681        0            4.542524          7.619885   
2   73       0  17.795882        0           19.555085          7.844988   
3   74       1  33.800817        1           12.209266          8.428001   
4   89       0  20.716974        0           18.454356          6.310461   

   DietQuality  SleepQuality  FamilyHistoryAlzheimers  CardiovascularDisease  \
0     1.347214      9.025679                        0                      0   
1     0.518767      7.151293                        0                      0   
2     1.826335      9.673574                        1                      0   
3     7.435604      8.392554                        0                      0   
4     0.795498      5.597238                        0                      0   

   ...  Forgetfulness 

In [9]:
data.shape

(2149, 39)

In [11]:
selected_features = [
    'FunctionalAssessment', 'ADL', 'MemoryComplaints', 'MMSE', 'BehavioralProblems', 'SleepQuality',
    'CholesterolHDL', 'Ethnicity_2', 'Hypertension', 'Ethnicity_1', 'CholesterolLDL', 'Diabetes',
    'EducationLevel_3', 'BMI', 'Ethnicity_3', 'Disorientation', 'CholesterolTriglycerides',
    'AlcoholConsumption', 'Forgetfulness', 'PersonalityChanges', 'Gender'
]

In [9]:
# selected_features = ['FunctionalAssessment', 'ADL', 'MemoryComplaints', 'MMSE', 'BehavioralProblems', 'SleepQuality', 'CholesterolHDL', 'CholesterolLDL', 'BMI', 'CholesterolTriglycerides', 'Age', 'PhysicalActivity', 'DietQuality', 'DiastolicBP', 'Gender']

In [13]:
X = data[selected_features]  
y = data['Diagnosis']

In [15]:
X.shape

(2149, 21)

In [15]:
# Split the data into 80% train and 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit on training data and transform both train and test separately
X_train_scaled = scaler.fit_transform(X_train)  
X_test_scaled = scaler.transform(X_test)  

## DT

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

dt = DecisionTreeClassifier(random_state=42)

dt.fit(X_train_scaled, y_train)

dt_y_pred = dt.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, dt_y_pred))
print("Classification Report:\n", classification_report(y_test, dt_y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, dt_y_pred))

Accuracy: 0.9
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.94      0.92       277
           1       0.88      0.83      0.86       153

    accuracy                           0.90       430
   macro avg       0.90      0.88      0.89       430
weighted avg       0.90      0.90      0.90       430

Confusion Matrix:
 [[260  17]
 [ 26 127]]


## RF

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

rf = RandomForestClassifier(random_state=42, n_estimators=100)  # 100 trees in the forest

rf.fit(X_train_scaled, y_train)

rf_y_pred = rf.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, rf_y_pred))
print("\nClassification Report:\n", classification_report(y_test, rf_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, rf_y_pred))


Accuracy: 0.9418604651162791

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.98      0.96       277
           1       0.96      0.87      0.91       153

    accuracy                           0.94       430
   macro avg       0.95      0.93      0.94       430
weighted avg       0.94      0.94      0.94       430


Confusion Matrix:
 [[272   5]
 [ 20 133]]


## XGBoost

In [28]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

xgb = XGBClassifier(random_state=42, n_estimators=100, use_label_encoder=False, eval_metric='logloss')

xgb.fit(X_train_scaled, y_train)

xgb_y_pred = xgb.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, xgb_y_pred))
print("\nClassification Report:\n", classification_report(y_test, xgb_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, xgb_y_pred))

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.9465116279069767

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96       277
           1       0.95      0.90      0.92       153

    accuracy                           0.95       430
   macro avg       0.95      0.94      0.94       430
weighted avg       0.95      0.95      0.95       430


Confusion Matrix:
 [[270   7]
 [ 16 137]]


## Catboost

In [27]:
!pip install catboost



In [31]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

catboost = CatBoostClassifier(random_state=42, iterations=100, verbose=0)  # 100 iterations, silent training

catboost.fit(X_train_scaled, y_train)

catboost_y_pred = catboost.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, catboost_y_pred))
print("\nClassification Report:\n", classification_report(y_test, catboost_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, catboost_y_pred))


Accuracy: 0.9558139534883721

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.97       277
           1       0.96      0.92      0.94       153

    accuracy                           0.96       430
   macro avg       0.96      0.95      0.95       430
weighted avg       0.96      0.96      0.96       430


Confusion Matrix:
 [[271   6]
 [ 13 140]]


## AdaBoost

In [34]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

adaboost = AdaBoostClassifier(random_state=42, n_estimators=100)  # 100 weak learners

adaboost.fit(X_train_scaled, y_train)

adaboost_y_pred = adaboost.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, adaboost_y_pred))
print("\nClassification Report:\n", classification_report(y_test, adaboost_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, adaboost_y_pred))




Accuracy: 0.8976744186046511

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.94      0.92       277
           1       0.88      0.83      0.85       153

    accuracy                           0.90       430
   macro avg       0.89      0.88      0.89       430
weighted avg       0.90      0.90      0.90       430


Confusion Matrix:
 [[259  18]
 [ 26 127]]


## SVM

In [36]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize SVM Classifier
svm = SVC(random_state=42, kernel='rbf')  # Using RBF kernel (default)

# Train the model
svm.fit(X_train_scaled, y_train)

# Make predictions
svm_y_pred = svm.predict(X_test_scaled)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, svm_y_pred))
print("\nClassification Report:\n", classification_report(y_test, svm_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, svm_y_pred))


Accuracy: 0.8534883720930233

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.92      0.89       277
           1       0.83      0.74      0.78       153

    accuracy                           0.85       430
   macro avg       0.85      0.83      0.84       430
weighted avg       0.85      0.85      0.85       430


Confusion Matrix:
 [[254  23]
 [ 40 113]]


# Stratified cross validation

### DT

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

# Initialize Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42)

# Apply 10-Fold Stratified Cross-Validation
stratified_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Lists to store metrics for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform 10-Fold Cross-Validation
for fold, (train_idx, test_idx) in enumerate(stratified_cv.split(X_train_scaled, y_train), 1):
    X_train_fold, X_test_fold = X_train_scaled[train_idx], X_train_scaled[test_idx]
    y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # Train model on current fold
    dt.fit(X_train_fold, y_train_fold)
    y_pred_fold = dt.predict(X_test_fold)

    # Calculate metrics
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    precision = precision_score(y_test_fold, y_pred_fold, average='weighted')  # Weighted for class imbalance
    recall = recall_score(y_test_fold, y_pred_fold, average='weighted')
    f1 = f1_score(y_test_fold, y_pred_fold, average='weighted')

    # Store results
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Print metrics for current fold
    print(f"Fold {fold}:")
    print(f"  Accuracy: {accuracy:.3f}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}\n")

# Print average results across all folds
print("\nFinal Cross-Validation Results (Average Over 10 Folds):")
print(f"  Accuracy: {np.mean(accuracy_scores):.3f} ± {np.std(accuracy_scores):.3f}")
print(f"  Precision: {np.mean(precision_scores):.3f} ± {np.std(precision_scores):.3f}")
print(f"  Recall: {np.mean(recall_scores):.3f} ± {np.std(recall_scores):.3f}")
print(f"  F1-Score: {np.mean(f1_scores):.3f} ± {np.std(f1_scores):.3f}")


Fold 1:
  Accuracy: 0.924
  Precision: 0.927
  Recall: 0.924
  F1-Score: 0.925

Fold 2:
  Accuracy: 0.884
  Precision: 0.884
  Recall: 0.884
  F1-Score: 0.884

Fold 3:
  Accuracy: 0.866
  Precision: 0.866
  Recall: 0.866
  F1-Score: 0.866

Fold 4:
  Accuracy: 0.890
  Precision: 0.890
  Recall: 0.890
  F1-Score: 0.890

Fold 5:
  Accuracy: 0.913
  Precision: 0.913
  Recall: 0.913
  F1-Score: 0.913

Fold 6:
  Accuracy: 0.872
  Precision: 0.871
  Recall: 0.872
  F1-Score: 0.870

Fold 7:
  Accuracy: 0.890
  Precision: 0.889
  Recall: 0.890
  F1-Score: 0.888

Fold 8:
  Accuracy: 0.866
  Precision: 0.867
  Recall: 0.866
  F1-Score: 0.867

Fold 9:
  Accuracy: 0.930
  Precision: 0.930
  Recall: 0.930
  F1-Score: 0.930

Fold 10:
  Accuracy: 0.901
  Precision: 0.901
  Recall: 0.901
  F1-Score: 0.901


Final Cross-Validation Results (Average Over 10 Folds):
  Accuracy: 0.894 ± 0.022
  Precision: 0.894 ± 0.022
  Recall: 0.894 ± 0.022
  F1-Score: 0.893 ± 0.022


### RF

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42, n_estimators=100)  # 100 trees in the forest

# Apply 10-Fold Stratified Cross-Validation
stratified_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Lists to store metrics for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform 10-Fold Cross-Validation
for fold, (train_idx, test_idx) in enumerate(stratified_cv.split(X_train_scaled, y_train), 1):
    X_train_fold, X_test_fold = X_train_scaled[train_idx], X_train_scaled[test_idx]
    y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # Train model on current fold
    rf.fit(X_train_fold, y_train_fold)
    y_pred_fold = rf.predict(X_test_fold)

    # Calculate metrics
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    precision = precision_score(y_test_fold, y_pred_fold, average='weighted')  # Weighted for class imbalance
    recall = recall_score(y_test_fold, y_pred_fold, average='weighted')
    f1 = f1_score(y_test_fold, y_pred_fold, average='weighted')

    # Store results
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Print metrics for current fold
    print(f"Fold {fold}:")
    print(f"  Accuracy: {accuracy:.3f}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}\n")

# Print average results across all folds
print("\nFinal Cross-Validation Results (Average Over 10 Folds):")
print(f"  Accuracy: {np.mean(accuracy_scores):.3f} ± {np.std(accuracy_scores):.3f}")
print(f"  Precision: {np.mean(precision_scores):.3f} ± {np.std(precision_scores):.3f}")
print(f"  Recall: {np.mean(recall_scores):.3f} ± {np.std(recall_scores):.3f}")
print(f"  F1-Score: {np.mean(f1_scores):.3f} ± {np.std(f1_scores):.3f}")


Fold 1:
  Accuracy: 0.971
  Precision: 0.971
  Recall: 0.971
  F1-Score: 0.971

Fold 2:
  Accuracy: 0.942
  Precision: 0.942
  Recall: 0.942
  F1-Score: 0.941

Fold 3:
  Accuracy: 0.930
  Precision: 0.930
  Recall: 0.930
  F1-Score: 0.930

Fold 4:
  Accuracy: 0.924
  Precision: 0.924
  Recall: 0.924
  F1-Score: 0.924

Fold 5:
  Accuracy: 0.948
  Precision: 0.948
  Recall: 0.948
  F1-Score: 0.947

Fold 6:
  Accuracy: 0.953
  Precision: 0.955
  Recall: 0.953
  F1-Score: 0.953

Fold 7:
  Accuracy: 0.919
  Precision: 0.920
  Recall: 0.919
  F1-Score: 0.917

Fold 8:
  Accuracy: 0.948
  Precision: 0.948
  Recall: 0.948
  F1-Score: 0.947

Fold 9:
  Accuracy: 0.953
  Precision: 0.954
  Recall: 0.953
  F1-Score: 0.953

Fold 10:
  Accuracy: 0.965
  Precision: 0.965
  Recall: 0.965
  F1-Score: 0.965


Final Cross-Validation Results (Average Over 10 Folds):
  Accuracy: 0.945 ± 0.016
  Precision: 0.946 ± 0.016
  Recall: 0.945 ± 0.016
  F1-Score: 0.945 ± 0.016


### Catboost

In [45]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

# Initialize CatBoost Classifier
catboost = CatBoostClassifier(random_state=42, iterations=100, verbose=0)  # 100 iterations, silent training

# Apply 10-Fold Stratified Cross-Validation
stratified_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Lists to store metrics for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform 10-Fold Cross-Validation
for fold, (train_idx, test_idx) in enumerate(stratified_cv.split(X_train_scaled, y_train), 1):
    X_train_fold, X_test_fold = X_train_scaled[train_idx], X_train_scaled[test_idx]
    y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # Train model on current fold
    catboost.fit(X_train_fold, y_train_fold)
    y_pred_fold = catboost.predict(X_test_fold)

    # Calculate metrics
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    precision = precision_score(y_test_fold, y_pred_fold, average='weighted')  # Weighted for class imbalance
    recall = recall_score(y_test_fold, y_pred_fold, average='weighted')
    f1 = f1_score(y_test_fold, y_pred_fold, average='weighted')

    # Store results
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Print metrics for current fold
    print(f"Fold {fold}:")
    print(f"  Accuracy: {accuracy:.3f}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}\n")

# Print average results across all folds
print("\nFinal Cross-Validation Results (Average Over 10 Folds):")
print(f"  Accuracy: {np.mean(accuracy_scores):.3f} ± {np.std(accuracy_scores):.3f}")
print(f"  Precision: {np.mean(precision_scores):.3f} ± {np.std(precision_scores):.3f}")
print(f"  Recall: {np.mean(recall_scores):.3f} ± {np.std(recall_scores):.3f}")
print(f"  F1-Score: {np.mean(f1_scores):.3f} ± {np.std(f1_scores):.3f}")


Fold 1:
  Accuracy: 0.971
  Precision: 0.971
  Recall: 0.971
  F1-Score: 0.971

Fold 2:
  Accuracy: 0.948
  Precision: 0.948
  Recall: 0.948
  F1-Score: 0.947

Fold 3:
  Accuracy: 0.930
  Precision: 0.930
  Recall: 0.930
  F1-Score: 0.930

Fold 4:
  Accuracy: 0.936
  Precision: 0.936
  Recall: 0.936
  F1-Score: 0.936

Fold 5:
  Accuracy: 0.953
  Precision: 0.953
  Recall: 0.953
  F1-Score: 0.953

Fold 6:
  Accuracy: 0.953
  Precision: 0.955
  Recall: 0.953
  F1-Score: 0.953

Fold 7:
  Accuracy: 0.936
  Precision: 0.936
  Recall: 0.936
  F1-Score: 0.935

Fold 8:
  Accuracy: 0.948
  Precision: 0.948
  Recall: 0.948
  F1-Score: 0.947

Fold 9:
  Accuracy: 0.971
  Precision: 0.971
  Recall: 0.971
  F1-Score: 0.971

Fold 10:
  Accuracy: 0.977
  Precision: 0.977
  Recall: 0.977
  F1-Score: 0.977


Final Cross-Validation Results (Average Over 10 Folds):
  Accuracy: 0.952 ± 0.015
  Precision: 0.953 ± 0.015
  Recall: 0.952 ± 0.015
  F1-Score: 0.952 ± 0.015


### XGboost

In [47]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

# Initialize XGBoost Classifier
xgb = XGBClassifier(random_state=42, n_estimators=100, use_label_encoder=False, eval_metric='logloss')

# Apply 10-Fold Stratified Cross-Validation
stratified_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Lists to store metrics for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform 10-Fold Cross-Validation
for fold, (train_idx, test_idx) in enumerate(stratified_cv.split(X_train_scaled, y_train), 1):
    X_train_fold, X_test_fold = X_train_scaled[train_idx], X_train_scaled[test_idx]
    y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # Train model on current fold
    xgb.fit(X_train_fold, y_train_fold)
    y_pred_fold = xgb.predict(X_test_fold)

    # Calculate metrics
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    precision = precision_score(y_test_fold, y_pred_fold, average='weighted')  # Weighted for class imbalance
    recall = recall_score(y_test_fold, y_pred_fold, average='weighted')
    f1 = f1_score(y_test_fold, y_pred_fold, average='weighted')

    # Store results
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Print metrics for current fold
    print(f"Fold {fold}:")
    print(f"  Accuracy: {accuracy:.3f}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}\n")

# Print average results across all folds
print("\nFinal Cross-Validation Results (Average Over 10 Folds):")
print(f"  Accuracy: {np.mean(accuracy_scores):.3f} ± {np.std(accuracy_scores):.3f}")
print(f"  Precision: {np.mean(precision_scores):.3f} ± {np.std(precision_scores):.3f}")
print(f"  Recall: {np.mean(recall_scores):.3f} ± {np.std(recall_scores):.3f}")
print(f"  F1-Score: {np.mean(f1_scores):.3f} ± {np.std(f1_scores):.3f}")


Parameters: { "use_label_encoder" } are not used.



Fold 1:
  Accuracy: 0.977
  Precision: 0.977
  Recall: 0.977
  F1-Score: 0.977



Parameters: { "use_label_encoder" } are not used.



Fold 2:
  Accuracy: 0.948
  Precision: 0.948
  Recall: 0.948
  F1-Score: 0.947



Parameters: { "use_label_encoder" } are not used.



Fold 3:
  Accuracy: 0.924
  Precision: 0.924
  Recall: 0.924
  F1-Score: 0.924



Parameters: { "use_label_encoder" } are not used.



Fold 4:
  Accuracy: 0.942
  Precision: 0.943
  Recall: 0.942
  F1-Score: 0.942



Parameters: { "use_label_encoder" } are not used.



Fold 5:
  Accuracy: 0.953
  Precision: 0.953
  Recall: 0.953
  F1-Score: 0.953



Parameters: { "use_label_encoder" } are not used.



Fold 6:
  Accuracy: 0.953
  Precision: 0.955
  Recall: 0.953
  F1-Score: 0.953



Parameters: { "use_label_encoder" } are not used.



Fold 7:
  Accuracy: 0.924
  Precision: 0.925
  Recall: 0.924
  F1-Score: 0.924



Parameters: { "use_label_encoder" } are not used.



Fold 8:
  Accuracy: 0.953
  Precision: 0.954
  Recall: 0.953
  F1-Score: 0.953



Parameters: { "use_label_encoder" } are not used.



Fold 9:
  Accuracy: 0.959
  Precision: 0.960
  Recall: 0.959
  F1-Score: 0.959



Parameters: { "use_label_encoder" } are not used.



Fold 10:
  Accuracy: 0.977
  Precision: 0.977
  Recall: 0.977
  F1-Score: 0.977


Final Cross-Validation Results (Average Over 10 Folds):
  Accuracy: 0.951 ± 0.017
  Precision: 0.951 ± 0.017
  Recall: 0.951 ± 0.017
  F1-Score: 0.951 ± 0.017


## stacking on the top 3 models

In [28]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report


# Create the individual classifiers
rf = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=100, random_state=42))
xgb = make_pipeline(StandardScaler(), XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss'))
catboost = make_pipeline(StandardScaler(), CatBoostClassifier(iterations=100, verbose=0, random_state=42))

# Create the stacking classifier
stacking_clf = StackingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('catboost', catboost)],  # Base models
    final_estimator=RandomForestClassifier(n_estimators=100, random_state=42)  # Meta learner
)


# Perform cross-validation predictions
stacking_preds = cross_val_predict(stacking_clf, X_train, y_train, cv=5)

# Train the classifiers on the full training set
stacking_clf.fit(X_train, y_train)

# Predict on the test set
stacking_test_preds = stacking_clf.predict(X_test)

# Function to print classification report
def print_classification_report(y_true, y_pred, title):
    print(f"\n{title}")
    print(classification_report(y_true, y_pred))

# Print classification reports
print_classification_report(y_test, stacking_test_preds, "Classification Report for Stacking Classifier (Test Set)")
print(confusion_matrix(y_test, stacking_test_preds))



Classification Report for Stacking Classifier (Test Set)
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       277
           1       0.96      0.91      0.93       153

    accuracy                           0.95       430
   macro avg       0.95      0.94      0.95       430
weighted avg       0.95      0.95      0.95       430

[[271   6]
 [ 14 139]]


# parameter tuning

### RF

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [10, 20, None],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required at a leaf node
    'max_features': ['sqrt', 'log2'],  # Number of features to consider for best split
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Perform Grid Search with 5-Fold Cross-Validation
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit on training data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train Random Forest with best parameters
best_rf = RandomForestClassifier(**best_params, random_state=42)
best_rf.fit(X_train_scaled, y_train)

# Make predictions
rf_y_pred = best_rf.predict(X_test_scaled)

# Print evaluation metrics
print("\nAccuracy:", accuracy_score(y_test, rf_y_pred))
print("\nClassification Report:\n", classification_report(y_test, rf_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, rf_y_pred))

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Hyperparameters: {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Accuracy: 0.9511627906976744

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96       277
           1       0.96      0.90      0.93       153

    accuracy                           0.95       430
   macro avg       0.95      0.94      0.95       430
weighted avg       0.95      0.95      0.95       430


Confusion Matrix:
 [[272   5]
 [ 16 137]]


### cat

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define the hyperparameter grid
param_grid = {
    'iterations': [100, 200, 500],  # Number of boosting iterations
    'learning_rate': [0.01, 0.05, 0.1],  # Step size shrinkage to prevent overfitting
    'depth': [4, 6, 8, 10],  # Maximum depth of the trees
    'l2_leaf_reg': [1, 3, 5, 7],  # L2 regularization to prevent overfitting
    'border_count': [32, 64, 128],  # Number of bins used for numeric feature quantization
}

# Initialize the CatBoost model
catboost = CatBoostClassifier(random_state=42, verbose=0)

# Perform Grid Search with 5-Fold Cross-Validation
grid_search = GridSearchCV(catboost, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit on training data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train CatBoost with best parameters
best_catboost = CatBoostClassifier(**best_params, random_state=42, verbose=0)
best_catboost.fit(X_train_scaled, y_train)

# Make predictions
catboost_y_pred = best_catboost.predict(X_test_scaled)

# Print evaluation metrics
print("\nAccuracy:", accuracy_score(y_test, catboost_y_pred))
print("\nClassification Report:\n", classification_report(y_test, catboost_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, catboost_y_pred))


Fitting 5 folds for each of 432 candidates, totalling 2160 fits


## stacking after hyper parameter tuning 

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Create the individual classifiers
rf = make_pipeline(StandardScaler(), RandomForestClassifier(
    bootstrap=False,  # Do not use bootstrap sampling
    max_depth=None,  # No maximum depth (fully grown trees)
    max_features='sqrt',  # Use square root of features for best splits
    min_samples_leaf=1,  # Minimum 1 sample per leaf
    min_samples_split=2,  # Minimum 2 samples to split a node
    n_estimators=100,  # Number of trees in the forest
    random_state=42  # Ensures reproducibility
))
catboost = make_pipeline(StandardScaler(), CatBoostClassifier(iterations=100, verbose=0, random_state=42))

# Create the stacking classifier
stacking_clf = StackingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('catboost', catboost)],  # Base models
    final_estimator=RandomForestClassifier(n_estimators=100, random_state=42)  # Meta learner
)


# Perform cross-validation predictions
stacking_preds = cross_val_predict(stacking_clf, X_train, y_train, cv=5)

# Train the classifiers on the full training set
stacking_clf.fit(X_train, y_train)

# Predict on the test set
stacking_test_preds = stacking_clf.predict(X_test)

# Function to print classification report
def print_classification_report(y_true, y_pred, title):
    print(f"\n{title}")
    print(classification_report(y_true, y_pred))

# Print classification reports
print_classification_report(y_test, stacking_test_preds, "Classification Report for Stacking Classifier (Test Set)")
print(confusion_matrix(y_test, stacking_test_preds))
