In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Load the dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)


In [3]:
# Define the Stratified K-Fold cross-validator
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store results
accuracies_original = []
accuracies_corr = []
accuracies_pca = []
accuracies_lda = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X, y):
    # Split the data into training and testing sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Feature Reduction using Correlation Coefficient
    corr_matrix = pd.DataFrame(X_train).corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
    X_train_corr = pd.DataFrame(X_train).drop(columns=to_drop)
    X_test_corr = pd.DataFrame(X_test).drop(columns=to_drop)

    # Feature Reduction using PCA
    pca = PCA(n_components=0.95)  # Preserve 95% of the variance
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Feature Reduction using LDA
    lda = LDA(n_components=1)  # LDA can have at most (number of classes - 1) components for binary classification
    X_train_lda = lda.fit_transform(X_train, y_train)
    X_test_lda = lda.transform(X_test)

    # Train and Evaluate Logistic Regression Models

    # Original Dataset
    model_original = LogisticRegression(random_state=42)
    model_original.fit(X_train, y_train)
    y_pred_original = model_original.predict(X_test)
    accuracies_original.append(accuracy_score(y_test, y_pred_original))

    # Correlation-Reduced Dataset
    model_corr = LogisticRegression(random_state=42)
    model_corr.fit(X_train_corr, y_train)
    y_pred_corr = model_corr.predict(X_test_corr)
    accuracies_corr.append(accuracy_score(y_test, y_pred_corr))

    # PCA-Reduced Dataset
    model_pca = LogisticRegression(random_state=42)
    model_pca.fit(X_train_pca, y_train)
    y_pred_pca = model_pca.predict(X_test_pca)
    accuracies_pca.append(accuracy_score(y_test, y_pred_pca))

    # LDA-Reduced Dataset
    model_lda = LogisticRegression(random_state=42)
    model_lda.fit(X_train_lda, y_train)
    y_pred_lda = model_lda.predict(X_test_lda)
    accuracies_lda.append(accuracy_score(y_test, y_pred_lda))

# Calculate the mean accuracy for each method across all folds
print("Mean Accuracy for Original Dataset:", np.mean(accuracies_original))
print("Mean Accuracy for Correlation-Reduced Dataset:", np.mean(accuracies_corr))
print("Mean Accuracy for PCA-Reduced Dataset:", np.mean(accuracies_pca))
print("Mean Accuracy for LDA-Reduced Dataset:", np.mean(accuracies_lda))


Mean Accuracy for Original Dataset: 0.9736686849868033
Mean Accuracy for Correlation-Reduced Dataset: 0.9754075454122031
Mean Accuracy for PCA-Reduced Dataset: 0.9771774569166279
Mean Accuracy for LDA-Reduced Dataset: 0.9684055270920664


In [5]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
import seaborn as sns
import matplotlib.pyplot as plt


In [6]:
# Load the dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)


In [7]:
# Define the Stratified K-Fold cross-validator
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store results
accuracies_original = []
precisions_original = []
recalls_original = []
f1s_original = []
confusion_matrices_original = []

accuracies_corr = []
precisions_corr = []
recalls_corr = []
f1s_corr = []
confusion_matrices_corr = []

accuracies_pca = []
precisions_pca = []
recalls_pca = []
f1s_pca = []
confusion_matrices_pca = []

accuracies_lda = []
precisions_lda = []
recalls_lda = []
f1s_lda = []
confusion_matrices_lda = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X, y):
    # Split the data into training and testing sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Feature Reduction using Correlation Coefficient
    corr_matrix = pd.DataFrame(X_train).corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
    X_train_corr = pd.DataFrame(X_train).drop(columns=to_drop)
    X_test_corr = pd.DataFrame(X_test).drop(columns=to_drop)

    # Feature Reduction using PCA
    pca = PCA(n_components=0.95)  # Preserve 95% of the variance
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Feature Reduction using LDA
    lda = LDA(n_components=1)  # LDA can have at most (number of classes - 1) components for binary classification
    X_train_lda = lda.fit_transform(X_train, y_train)
    X_test_lda = lda.transform(X_test)

    # Train and Evaluate Logistic Regression Models

    # Original Dataset
    model_original = LogisticRegression(random_state=42)
    model_original.fit(X_train, y_train)
    y_pred_original = model_original.predict(X_test)
    accuracies_original.append(accuracy_score(y_test, y_pred_original))
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_original, average='binary')
    precisions_original.append(precision)
    recalls_original.append(recall)
    f1s_original.append(f1)
    confusion_matrices_original.append(confusion_matrix(y_test, y_pred_original))

    # Correlation-Reduced Dataset
    model_corr = LogisticRegression(random_state=42)
    model_corr.fit(X_train_corr, y_train)
    y_pred_corr = model_corr.predict(X_test_corr)
    accuracies_corr.append(accuracy_score(y_test, y_pred_corr))
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_corr, average='binary')
    precisions_corr.append(precision)
    recalls_corr.append(recall)
    f1s_corr.append(f1)
    confusion_matrices_corr.append(confusion_matrix(y_test, y_pred_corr))

    # PCA-Reduced Dataset
    model_pca = LogisticRegression(random_state=42)
    model_pca.fit(X_train_pca, y_train)
    y_pred_pca = model_pca.predict(X_test_pca)
    accuracies_pca.append(accuracy_score(y_test, y_pred_pca))
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_pca, average='binary')
    precisions_pca.append(precision)
    recalls_pca.append(recall)
    f1s_pca.append(f1)
    confusion_matrices_pca.append(confusion_matrix(y_test, y_pred_pca))

    # LDA-Reduced Dataset
    model_lda = LogisticRegression(random_state=42)
    model_lda.fit(X_train_lda, y_train)
    y_pred_lda = model_lda.predict(X_test_lda)
    accuracies_lda.append(accuracy_score(y_test, y_pred_lda))
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_lda, average='binary')
    precisions_lda.append(precision)
    recalls_lda.append(recall)
    f1s_lda.append(f1)
    confusion_matrices_lda.append(confusion_matrix(y_test, y_pred_lda))

# Calculate mean performance metrics across all folds
print("Mean Accuracy for Original Dataset:", np.mean(accuracies_original))
print("Mean Precision for Original Dataset:", np.mean(precisions_original))
print("Mean Recall for Original Dataset:", np.mean(recalls_original))
print("Mean F1-Score for Original Dataset:", np.mean(f1s_original))
print("Confusion Matrix for Original Dataset:\n", np.sum(confusion_matrices_original, axis=0))

print("\nMean Accuracy for Correlation-Reduced Dataset:", np.mean(accuracies_corr))
print("Mean Precision for Correlation-Reduced Dataset:", np.mean(precisions_corr))
print("Mean Recall for Correlation-Reduced Dataset:", np.mean(recalls_corr))
print("Mean F1-Score for Correlation-Reduced Dataset:", np.mean(f1s_corr))
print("Confusion Matrix for Correlation-Reduced Dataset:\n", np.sum(confusion_matrices_corr, axis=0))

print("\nMean Accuracy for PCA-Reduced Dataset:", np.mean(accuracies_pca))
print("Mean Precision for PCA-Reduced Dataset:", np.mean(precisions_pca))
print("Mean Recall for PCA-Reduced Dataset:", np.mean(recalls_pca))
print("Mean F1-Score for PCA-Reduced Dataset:", np.mean(f1s_pca))
print("Confusion Matrix for PCA-Reduced Dataset:\n", np.sum(confusion_matrices_pca, axis=0))

print("\nMean Accuracy for LDA-Reduced Dataset:", np.mean(accuracies_lda))
print("Mean Precision for LDA-Reduced Dataset:", np.mean(precisions_lda))
print("Mean Recall for LDA-Reduced Dataset:", np.mean(recalls_lda))
print("Mean F1-Score for LDA-Reduced Dataset:", np.mean(f1s_lda))
print("Confusion Matrix for LDA-Reduced Dataset:\n", np.sum(confusion_matrices_lda, axis=0))


Mean Accuracy for Original Dataset: 0.9736686849868033
Mean Precision for Original Dataset: 0.9682543479911901
Mean Recall for Original Dataset: 0.9915884194053209
Mean F1-Score for Original Dataset: 0.9794339645403476
Confusion Matrix for Original Dataset:
 [[200  12]
 [  3 354]]

Mean Accuracy for Correlation-Reduced Dataset: 0.9754075454122031
Mean Precision for Correlation-Reduced Dataset: 0.9703665199364704
Mean Recall for Correlation-Reduced Dataset: 0.9915884194053209
Mean F1-Score for Correlation-Reduced Dataset: 0.9807047622856266
Confusion Matrix for Correlation-Reduced Dataset:
 [[201  11]
 [  3 354]]

Mean Accuracy for PCA-Reduced Dataset: 0.9771774569166279
Mean Precision for PCA-Reduced Dataset: 0.9782569712569714
Mean Recall for PCA-Reduced Dataset: 0.9860328638497652
Mean F1-Score for PCA-Reduced Dataset: 0.9818711966159823
Confusion Matrix for PCA-Reduced Dataset:
 [[204   8]
 [  5 352]]

Mean Accuracy for LDA-Reduced Dataset: 0.9684055270920664
Mean Precision for LDA-

In [4]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Define the Stratified K-Fold cross-validator
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store results
accuracies_original = []
precisions_original = []
recalls_original = []
f1s_original = []
confusion_matrices_original = []

accuracies_corr = []
precisions_corr = []
recalls_corr = []
f1s_corr = []
confusion_matrices_corr = []

accuracies_pca = []
precisions_pca = []
recalls_pca = []
f1s_pca = []
confusion_matrices_pca = []

accuracies_lda = []
precisions_lda = []
recalls_lda = []
f1s_lda = []
confusion_matrices_lda = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X, y):
    # Split the data into training and testing sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Feature Reduction using Correlation Coefficient
    corr_matrix = pd.DataFrame(X_train).corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
    X_train_corr = pd.DataFrame(X_train).drop(columns=to_drop)
    X_test_corr = pd.DataFrame(X_test).drop(columns=to_drop)

    # Print the number of features before and after correlation-based reduction
    print(f"Original number of features: {X_train.shape[1]}")
    print(f"Number of features after correlation reduction: {X_train_corr.shape[1]}")

    # Feature Reduction using PCA
    pca = PCA(n_components=0.95)  # Preserve 95% of the variance
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Print the number of features after PCA reduction
    print(f"Number of features after PCA reduction: {X_train_pca.shape[1]}")

    # Feature Reduction using LDA
    lda = LDA(n_components=1)  # LDA can have at most (number of classes - 1) components for binary classification
    X_train_lda = lda.fit_transform(X_train, y_train)
    X_test_lda = lda.transform(X_test)

    # Print the number of features after LDA reduction
    print(f"Number of features after LDA reduction: {X_train_lda.shape[1]}")

    # Train and Evaluate Logistic Regression Models

    # Original Dataset
    model_original = LogisticRegression(random_state=42)
    model_original.fit(X_train, y_train)
    y_pred_original = model_original.predict(X_test)
    accuracies_original.append(accuracy_score(y_test, y_pred_original))
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_original, average='binary')
    precisions_original.append(precision)
    recalls_original.append(recall)
    f1s_original.append(f1)
    confusion_matrices_original.append(confusion_matrix(y_test, y_pred_original))

    # Correlation-Reduced Dataset
    model_corr = LogisticRegression(random_state=42)
    model_corr.fit(X_train_corr, y_train)
    y_pred_corr = model_corr.predict(X_test_corr)
    accuracies_corr.append(accuracy_score(y_test, y_pred_corr))
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_corr, average='binary')
    precisions_corr.append(precision)
    recalls_corr.append(recall)
    f1s_corr.append(f1)
    confusion_matrices_corr.append(confusion_matrix(y_test, y_pred_corr))

    # PCA-Reduced Dataset
    model_pca = LogisticRegression(random_state=42)
    model_pca.fit(X_train_pca, y_train)
    y_pred_pca = model_pca.predict(X_test_pca)
    accuracies_pca.append(accuracy_score(y_test, y_pred_pca))
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_pca, average='binary')
    precisions_pca.append(precision)
    recalls_pca.append(recall)
    f1s_pca.append(f1)
    confusion_matrices_pca.append(confusion_matrix(y_test, y_pred_pca))

    # LDA-Reduced Dataset
    model_lda = LogisticRegression(random_state=42)
    model_lda.fit(X_train_lda, y_train)
    y_pred_lda = model_lda.predict(X_test_lda)
    accuracies_lda.append(accuracy_score(y_test, y_pred_lda))
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_lda, average='binary')
    precisions_lda.append(precision)
    recalls_lda.append(recall)
    f1s_lda.append(f1)
    confusion_matrices_lda.append(confusion_matrix(y_test, y_pred_lda))

# Calculate mean performance metrics across all folds
print("\nMean Accuracy for Original Dataset:", np.mean(accuracies_original))
print("Mean Precision for Original Dataset:", np.mean(precisions_original))
print("Mean Recall for Original Dataset:", np.mean(recalls_original))
print("Mean F1-Score for Original Dataset:", np.mean(f1s_original))
print("Confusion Matrix for Original Dataset:\n", np.sum(confusion_matrices_original, axis=0))

print("\nMean Accuracy for Correlation-Reduced Dataset:", np.mean(accuracies_corr))
print("Mean Precision for Correlation-Reduced Dataset:", np.mean(precisions_corr))
print("Mean Recall for Correlation-Reduced Dataset:", np.mean(recalls_corr))
print("Mean F1-Score for Correlation-Reduced Dataset:", np.mean(f1s_corr))
print("Confusion Matrix for Correlation-Reduced Dataset:\n", np.sum(confusion_matrices_corr, axis=0))

print("\nMean Accuracy for PCA-Reduced Dataset:", np.mean(accuracies_pca))
print("Mean Precision for PCA-Reduced Dataset:", np.mean(precisions_pca))
print("Mean Recall for PCA-Reduced Dataset:", np.mean(recalls_pca))
print("Mean F1-Score for PCA-Reduced Dataset:", np.mean(f1s_pca))
print("Confusion Matrix for PCA-Reduced Dataset:\n", np.sum(confusion_matrices_pca, axis=0))

print("\nMean Accuracy for LDA-Reduced Dataset:", np.mean(accuracies_lda))
print("Mean Precision for LDA-Reduced Dataset:", np.mean(precisions_lda))
print("Mean Recall for LDA-Reduced Dataset:", np.mean(recalls_lda))
print("Mean F1-Score for LDA-Reduced Dataset:", np.mean(f1s_lda))
print("Confusion Matrix for LDA-Reduced Dataset:\n", np.sum(confusion_matrices_lda, axis=0))


Original number of features: 30
Number of features after correlation reduction: 20
Number of features after PCA reduction: 10
Number of features after LDA reduction: 1
Original number of features: 30
Number of features after correlation reduction: 19
Number of features after PCA reduction: 10
Number of features after LDA reduction: 1
Original number of features: 30
Number of features after correlation reduction: 20
Number of features after PCA reduction: 10
Number of features after LDA reduction: 1
Original number of features: 30
Number of features after correlation reduction: 20
Number of features after PCA reduction: 10
Number of features after LDA reduction: 1
Original number of features: 30
Number of features after correlation reduction: 20
Number of features after PCA reduction: 10
Number of features after LDA reduction: 1

Mean Accuracy for Original Dataset: 0.9736686849868033
Mean Precision for Original Dataset: 0.9682543479911901
Mean Recall for Original Dataset: 0.991588419405