In [35]:
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.model_selection
import sklearn.decomposition
import sklearn.neighbors
import sklearn.metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from collections import Counter

In [19]:
# PLEASE CHANGE THE VALUE OF DATASET TO CHANGE THE DATASET (0 = digits, 1 = 20 newsgroups)
dataset = 0

if dataset == 0:
    # Digits: load the feature matrix and labels, split 70/30, then standardise.
    digits = sklearn.datasets.load_digits()
    X = digits.data
    y = digits.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # The scaler is fitted on the training split only and then re-used on the
    # test split, so the test data never influences the scaling statistics.
    scaler = StandardScaler()

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

elif dataset == 1:
    # 20 newsgroups (value 1, as advertised in the header comment and as
    # expected by the later cells that branch on `dataset`).
    newsgroups = fetch_20newsgroups(subset='all',
                                    remove=('headers', 'footers', 'quotes'),
                                    random_state=42)

    # Convert text to TF-IDF features
    # NOTE(review): the vectorizer is fitted on the whole corpus before the
    # split below, unlike the scaler in the digits branch which only sees the
    # training split.
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(newsgroups.data)
    y = newsgroups.target

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

else:
    # Fail here rather than with a NameError on X_train in a later cell.
    raise ValueError(f"Unknown dataset value {dataset!r}: use 0 (digits) or 1 (20 newsgroups)")

In [20]:
def compute_test(X, y, clf, cv=10):
    """Estimate the accuracy of ``clf`` with shuffled K-fold cross-validation.

    A fresh clone of ``clf`` is fitted on every fold, so the estimator passed
    in is not modified. This matters for the callers in this notebook, which
    pass an already-fitted ``best_estimator_``: fitting it in place would leave
    it trained on the last fold's subset only.

    Parameters
    ----------
    X : array-like or sparse matrix
        Samples; must support row selection with an integer index array
        (``X[indices]``).
    y : array-like
        Labels; must support selection with an integer index array.
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    cv : int, default=10
        Number of folds.

    Returns
    -------
    float
        Mean accuracy over the folds (also printed).
    """
    # Local import keeps this cell self-contained without touching the
    # notebook's import cell.
    from sklearn.base import clone

    # Same shuffling seed on every call so the folds are reproducible.
    kfold = KFold(n_splits=cv, shuffle=True, random_state=42)
    scores = []

    for train_index, test_index in kfold.split(X):
        # Split data into train and test folds
        X_train_fold, X_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]

        # Unfitted copy with the same hyper-parameters; safe=False falls back
        # to a deep copy for objects that are not scikit-learn estimators.
        fold_clf = clone(clf, safe=False)
        fold_clf.fit(X_train_fold, y_train_fold)

        # Score the held-out fold
        y_pred = fold_clf.predict(X_test_fold)
        scores.append(accuracy_score(y_test_fold, y_pred))

    # Return the mean accuracy across all folds
    mean_score = np.mean(scores)
    print(f"Mean Cross-Validation Accuracy: {mean_score}")
    return mean_score

# 1

# PCA

In [21]:
# Search space: PCA dimensionality crossed with the k-NN hyper-parameters.
param_grid = dict(
    pca__n_components=np.arange(2, 60, 2),    # candidate numbers of dimensions
    knn__n_neighbors=np.arange(1, 21, 1),     # candidate neighbourhood sizes
    knn__weights=['uniform', 'distance'],     # weighting schemes
    knn__metric=['euclidean', 'manhattan'],   # distance metrics
)

# Two-step model: project with PCA, then classify with k-NN.
pipeline = Pipeline(steps=[
    ('pca', PCA()),
    ('knn', KNeighborsClassifier()),
])

# Exhaustive 10-fold search over the grid, scored by accuracy, on all cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the winning configuration, its CV score and the refitted pipeline.
best_params_pca = grid_search.best_params_
best_accuracy_pca = grid_search.best_score_
best_model_pca = grid_search.best_estimator_

print(f"Best Parameters: {best_params_pca}")
print(f"Best Cross-Validation Accuracy: {best_accuracy_pca}")

# Held-out performance of the selected pipeline.
test_accuracy_pca = best_model_pca.score(X_test, y_test)
print(f"Test Accuracy with Best Model: {test_accuracy_pca}")

# Re-estimate the CV accuracy of the selected pipeline with compute_test.
print("\nPerforming Compute Test on the best model:")
result_pca = compute_test(X_train, y_train, best_model_pca, cv=10)



Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 4, 'knn__weights': 'distance', 'pca__n_components': 38}
Best Cross-Validation Accuracy: 0.9777333333333331
Test Accuracy with Best Model: 0.9777777777777777

Performing Compute Test on the best model:
Mean Cross-Validation Accuracy: 0.9785142857142857


# LDA

In [None]:
# Define parameter grid

# LDA can project onto at most min(n_features, n_classes - 1) dimensions.
# Derive that bound from the training data instead of hard-coding it per
# dataset: np.arange excludes its stop value, so the previous stops of 9 and 19
# silently dropped the largest valid dimension, and any other `dataset` value
# left n_components undefined.
max_lda_components = min(X_train.shape[1], len(np.unique(y_train)) - 1)
n_components = np.arange(1, max_lda_components + 1)

param_grid = {
    'lda__n_components': n_components,  # Number of dimensions to test
    'knn__n_neighbors': np.arange(1, 21, 1),   # Number of neighbors to test
    'knn__weights': ['uniform', 'distance'],  # Weighting schemes
    'knn__metric': ['euclidean', 'manhattan']  # Distance metrics
}

# Create a pipeline with LDA and k-NN
pipeline = Pipeline([
    ('lda', LDA()),  # LDA step
    ('knn', KNeighborsClassifier())  # k-NN step
])

# Initialize GridSearchCV (10-fold, accuracy, all cores)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=-1
)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Retrieve the best parameters and accuracy
best_params_lda = grid_search.best_params_
best_accuracy_lda = grid_search.best_score_

print(f"Best Parameters: {best_params_lda}")
print(f"Best Cross-Validation Accuracy: {best_accuracy_lda}")

# Evaluate the best model on the test data
best_model_lda = grid_search.best_estimator_
test_accuracy_lda = best_model_lda.score(X_test, y_test)

print(f"Test Accuracy with Best Model: {test_accuracy_lda}")

# Now, use the compute_test function to print the mean accuracy during cross-validation
print("\nPerforming Compute Test on the best model:")
result_lda = compute_test(X_train, y_train, best_model_lda, cv=10)

Best Parameters: {'knn__metric': 'manhattan', 'knn__n_neighbors': 3, 'knn__weights': 'distance', 'lda__n_components': 8}
Best Cross-Validation Accuracy: 0.9625777777777778
Test Accuracy with Best Model: 0.9611111111111111

Performing Compute Test on the best model:
Mean Cross-Validation Accuracy: 0.959415873015873


# SVD

In [24]:
# Define parameter grid
param_grid = {
    'svd__n_components': np.arange(2, 60, 2),  # Number of dimensions to test
    'knn__n_neighbors': np.arange(1, 21, 1),   # Number of neighbors to test
    'knn__weights': ['uniform', 'distance'],  # Weighting schemes
    'knn__metric': ['euclidean', 'manhattan']  # Distance metrics
}

# Create a pipeline with SVD and k-NN
# TruncatedSVD uses a randomized solver by default; seed it (42, like the
# splits and ensembles elsewhere in this notebook) so the selected
# hyper-parameters and reported accuracies are reproducible across runs.
pipeline = Pipeline([
    ('svd', TruncatedSVD(random_state=42)),  # SVD step
    ('knn', KNeighborsClassifier())  # k-NN step
])

# Initialize GridSearchCV (10-fold, accuracy, all cores)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=-1
)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Retrieve the best parameters and accuracy
best_params_svd = grid_search.best_params_
best_accuracy_svd = grid_search.best_score_

print(f"Best Parameters: {best_params_svd}")
print(f"Best Cross-Validation Accuracy: {best_accuracy_svd}")

# Evaluate the best model on the test data
best_model_svd = grid_search.best_estimator_
test_accuracy_svd = best_model_svd.score(X_test, y_test)

print(f"Test Accuracy with Best Model: {test_accuracy_svd}")

# Now, use the compute_test function to print the mean accuracy during cross-validation
print("\nPerforming Compute Test on the best model:")
result_svd = compute_test(X_train, y_train, best_model_svd, cv=10)

Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 4, 'knn__weights': 'distance', 'svd__n_components': 24}
Best Cross-Validation Accuracy: 0.9777333333333331
Test Accuracy with Best Model: 0.975925925925926

Performing Compute Test on the best model:
Mean Cross-Validation Accuracy: 0.9729460317460319


In [25]:
# Gather the metrics of the three dimensionality-reduction searches, keyed by
# method name (insertion order PCA, SVD, LDA decides ties in max() below).
results = {
    method: {
        'best_params': params,
        'best_accuracy': grid_cv_accuracy,
        'test_accuracy': held_out_accuracy,
        'cv_accuracy': kfold_accuracy,
    }
    for method, params, grid_cv_accuracy, held_out_accuracy, kfold_accuracy in (
        ('PCA', best_params_pca, best_accuracy_pca, test_accuracy_pca, result_pca),
        ('SVD', best_params_svd, best_accuracy_svd, test_accuracy_svd, result_svd),
        ('LDA', best_params_lda, best_accuracy_lda, test_accuracy_lda, result_lda),
    )
}

# The winner is the method whose selected pipeline scored highest on the test split.
best_method = max(results, key=lambda name: results[name]['test_accuracy'])
best_entry = results[best_method]

# Report the winner together with all of its recorded metrics.
print(f"\nBest Method: {best_method}")
print(f"Best Parameters for {best_method}: {best_entry['best_params']}")
print(f"Best Cross-Validation Accuracy for {best_method}: {best_entry['best_accuracy']}")
print(f"Test Accuracy for {best_method}: {best_entry['test_accuracy']}")
print(f"Cross-Validation Accuracy for {best_method}: {best_entry['cv_accuracy']}")


Best Method: PCA
Best Parameters for PCA: {'knn__metric': 'euclidean', 'knn__n_neighbors': 4, 'knn__weights': 'distance', 'pca__n_components': 38}
Best Cross-Validation Accuracy for PCA: 0.9777333333333331
Test Accuracy for PCA: 0.9777777777777777
Cross-Validation Accuracy for PCA: 0.9785142857142857


# 2

In [27]:
# Define parameter grid for MLP
param_grid_mlp = {
    'pca__n_components': np.arange(2, 60, 2),  # Number of components to test for PCA
    'mlp__hidden_layer_sizes': [(100, 100), (150, 150), (50, 50)],  # Hidden layer configurations
    'mlp__learning_rate_init': [0.001, 0.01, 0.02],  # Learning rates
    'mlp__max_iter': [10, 50, 100]  # Iterations for training
}

# Create a pipeline with PCA and MLPClassifier
# The MLP's weight initialisation and SGD shuffling are random; seed it (42,
# like the splits and ensembles elsewhere in this notebook) so the grid search
# picks the same configuration and reports the same accuracies on every run.
pipeline_mlp = Pipeline([
    ('pca', PCA()),  # PCA step
    ('mlp', MLPClassifier(activation='relu', solver='sgd', learning_rate='constant',
                          random_state=42))  # MLP step
])

# Initialize GridSearchCV for MLP (10-fold, accuracy, all cores)
grid_search_mlp = GridSearchCV(
    pipeline_mlp,
    param_grid_mlp,
    cv=10,
    scoring='accuracy',
    n_jobs=-1
)

# Perform the grid search for MLP
grid_search_mlp.fit(X_train, y_train)

# Retrieve the best parameters and accuracy for MLP
best_params_mlp = grid_search_mlp.best_params_
best_accuracy_mlp = grid_search_mlp.best_score_

print(f"Best Parameters for MLP: {best_params_mlp}")
print(f"Best Cross-Validation Accuracy for MLP: {best_accuracy_mlp}")

# Evaluate the best model on the test data for MLP
best_model_mlp = grid_search_mlp.best_estimator_
test_accuracy_mlp = best_model_mlp.score(X_test, y_test)
print(f"Test Accuracy for MLP: {test_accuracy_mlp}")

# Perform compute_test to evaluate cross-validation performance
# (compute_test prints the mean itself; the line below repeats it with an MLP label)
result_mlp = compute_test(X_train, y_train, best_model_mlp, cv=10)
print(f"Mean Cross-Validation Accuracy for MLP: {result_mlp}")

Best Parameters for MLP: {'mlp__hidden_layer_sizes': (150, 150), 'mlp__learning_rate_init': 0.02, 'mlp__max_iter': 100, 'pca__n_components': 54}
Best Cross-Validation Accuracy for MLP: 0.9753206349206348
Test Accuracy for MLP: 0.975925925925926
Mean Cross-Validation Accuracy: 0.9705650793650793
Mean Cross-Validation Accuracy for MLP: 0.9705650793650793


In [33]:
# Hyper-parameter grids: each crosses the PCA dimensionality with the settings
# of the ensemble that follows it in the pipeline.
param_grid_bagging = dict(
    pca__n_components=[2, 10, 20, 30],       # PCA components to test
    bagging__n_estimators=[10, 50, 100],     # number of base learners
    bagging__estimator__max_depth=[5, 10],   # depth limit of the decision-tree base estimator
)
param_grid_adaboost = dict(
    pca__n_components=[2, 10, 20, 30],       # PCA components to test
    adaboost__n_estimators=[50, 100],        # number of base learners
    adaboost__learning_rate=[0.01, 0.1],     # AdaBoost learning rate
)
param_grid_gb = dict(
    pca__n_components=[2, 10, 20, 30],       # PCA components to test
    gb__n_estimators=[50, 100],              # number of boosting stages
    gb__learning_rate=[0.01, 0.1],           # Gradient Boosting learning rate
    gb__max_depth=[3, 5],                    # depth limit of each tree
)

# One PCA -> ensemble pipeline per method; every ensemble is seeded with 42.
pipeline_bagging = Pipeline(steps=[
    ('pca', PCA()),
    ('bagging', BaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42)),
])
pipeline_adaboost = Pipeline(steps=[
    ('pca', PCA()),
    ('adaboost', AdaBoostClassifier(random_state=42)),
])
pipeline_gb = Pipeline(steps=[
    ('pca', PCA()),
    ('gb', GradientBoostingClassifier(random_state=42)),
])

# 5-fold accuracy-scored grid searches, run on all cores.
grid_search_bagging = GridSearchCV(
    estimator=pipeline_bagging, param_grid=param_grid_bagging,
    scoring='accuracy', cv=5, n_jobs=-1,
)
grid_search_adaboost = GridSearchCV(
    estimator=pipeline_adaboost, param_grid=param_grid_adaboost,
    scoring='accuracy', cv=5, n_jobs=-1,
)
grid_search_gb = GridSearchCV(
    estimator=pipeline_gb, param_grid=param_grid_gb,
    scoring='accuracy', cv=5, n_jobs=-1,
)

# Run the three searches (Bagging, AdaBoost, Gradient Boosting).
grid_search_bagging.fit(X_train, y_train)
grid_search_adaboost.fit(X_train, y_train)
grid_search_gb.fit(X_train, y_train)

# Winning configuration and its CV score, per method.
best_params_bagging, best_accuracy_bagging = (
    grid_search_bagging.best_params_, grid_search_bagging.best_score_)
best_params_adaboost, best_accuracy_adaboost = (
    grid_search_adaboost.best_params_, grid_search_adaboost.best_score_)
best_params_gb, best_accuracy_gb = (
    grid_search_gb.best_params_, grid_search_gb.best_score_)

print(f"Best Parameters for Bagging: {best_params_bagging}")
print(f"Best Cross-Validation Accuracy for Bagging: {best_accuracy_bagging}")
print(f"Best Parameters for AdaBoost: {best_params_adaboost}")
print(f"Best Cross-Validation Accuracy for AdaBoost: {best_accuracy_adaboost}")
print(f"Best Parameters for Gradient Boosting: {best_params_gb}")
print(f"Best Cross-Validation Accuracy for Gradient Boosting: {best_accuracy_gb}")

# Refitted best pipelines and their accuracy on the held-out test split.
best_model_bagging = grid_search_bagging.best_estimator_
best_model_adaboost = grid_search_adaboost.best_estimator_
best_model_gb = grid_search_gb.best_estimator_

test_accuracy_bagging = best_model_bagging.score(X_test, y_test)
test_accuracy_adaboost = best_model_adaboost.score(X_test, y_test)
test_accuracy_gb = best_model_gb.score(X_test, y_test)

print(f"Test Accuracy for Bagging: {test_accuracy_bagging}")
print(f"Test Accuracy for AdaBoost: {test_accuracy_adaboost}")
print(f"Test Accuracy for Gradient Boosting: {test_accuracy_gb}")

# 5-fold compute_test on each selected pipeline (each call prints its own mean).
result_bagging = compute_test(X_train, y_train, best_model_bagging, cv=5)
result_adaboost = compute_test(X_train, y_train, best_model_adaboost, cv=5)
result_gb = compute_test(X_train, y_train, best_model_gb, cv=5)

print(f"Mean Cross-Validation Accuracy for Bagging: {result_bagging}")
print(f"Mean Cross-Validation Accuracy for AdaBoost: {result_adaboost}")
print(f"Mean Cross-Validation Accuracy for Gradient Boosting: {result_gb}")



Best Parameters for Bagging: {'bagging__estimator__max_depth': 10, 'bagging__n_estimators': 100, 'pca__n_components': 20}
Best Cross-Validation Accuracy for Bagging: 0.9021532915955227
Best Parameters for AdaBoost: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50, 'pca__n_components': 20}
Best Cross-Validation Accuracy for AdaBoost: 0.5894770125845823
Best Parameters for Gradient Boosting: {'gb__learning_rate': 0.1, 'gb__max_depth': 3, 'gb__n_estimators': 100, 'pca__n_components': 30}
Best Cross-Validation Accuracy for Gradient Boosting: 0.9204357174476696
Test Accuracy for Bagging: 0.8981481481481481
Test Accuracy for AdaBoost: 0.6055555555555555
Test Accuracy for Gradient Boosting: 0.924074074074074
Mean Cross-Validation Accuracy: 0.9013216973376336




Mean Cross-Validation Accuracy: 0.5560899260102448
Mean Cross-Validation Accuracy: 0.9085088218554354
Mean Cross-Validation Accuracy for Bagging: 0.9013216973376336
Mean Cross-Validation Accuracy for AdaBoost: 0.5560899260102448
Mean Cross-Validation Accuracy for Gradient Boosting: 0.9085088218554354


In [34]:
# Test accuracy, fitted pipeline and chosen hyper-parameters per ensemble
# method (insertion order decides ties in max() below).
best_model_info = {
    label: {"accuracy": held_out_accuracy, "model": fitted_pipeline, "params": chosen_params}
    for label, held_out_accuracy, fitted_pipeline, chosen_params in (
        ("Bagging", test_accuracy_bagging, best_model_bagging, best_params_bagging),
        ("AdaBoost", test_accuracy_adaboost, best_model_adaboost, best_params_adaboost),
        ("Gradient Boosting", test_accuracy_gb, best_model_gb, best_params_gb),
    )
}

# Select the ensemble with the highest test accuracy and unpack its record.
best_model_name = max(best_model_info, key=lambda label: best_model_info[label]["accuracy"])
best_model_accuracy = best_model_info[best_model_name]["accuracy"]
best_model_params = best_model_info[best_model_name]["params"]
best_model = best_model_info[best_model_name]["model"]

print(f"\nBest Ensemble Method: {best_model_name}")
print(f"Best Parameters: {best_model_params}")
print(f"Test Accuracy: {best_model_accuracy}")

# 5-fold compute_test on the winning pipeline (the call prints the mean itself too).
result_best_model = compute_test(X_train, y_train, best_model, cv=5)
print(f"Mean Cross-Validation Accuracy for Best Model: {result_best_model}")


Best Ensemble Method: Gradient Boosting
Best Parameters: {'gb__learning_rate': 0.1, 'gb__max_depth': 3, 'gb__n_estimators': 100, 'pca__n_components': 30}
Test Accuracy: 0.924074074074074
Mean Cross-Validation Accuracy: 0.9084993359893758
Mean Cross-Validation Accuracy for Best Model: 0.9084993359893758
