In [58]:
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.model_selection
import sklearn.decomposition
import sklearn.neighbors
import sklearn.metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.datasets import load_iris
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from collections import Counter

import ssl
import certifi
import urllib.request

In [55]:
# PLEASE CHANGE THE VALUE OF DATASET TO CHANGE THE DATASET (0 = digits, 1 = 20 newsgroups)
dataset = 1

if dataset == 0:
    digits = sklearn.datasets.load_digits()
    X = digits.data
    y = digits.target

elif dataset == 1:
    iris = load_iris()
    X = iris.data  
    y = iris.target  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [56]:
def compute_test(X, y, clf, cv=10):
    kfold = KFold(n_splits=cv, shuffle=True, random_state=42)
    scores = []
    
    for train_index, test_index in kfold.split(X):
        X_train_fold, X_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        
        clf.fit(X_train_fold, y_train_fold)
        
        y_pred = clf.predict(X_test_fold)
        
        scores.append(accuracy_score(y_test_fold, y_pred))
    
    mean_score = np.mean(scores)
    print(f"Mean Cross-Validation Accuracy: {mean_score}")
    return mean_score

# 1)

# PCA

In [57]:
if dataset == 0:
    n_components = np.arange(2, 60, 2)
elif dataset == 1:
    n_components = np.arange(2, 5, 1)

# Define parameter grid
param_grid = {
    'pca__n_components': n_components,  # Number of dimensions to test
    'knn__n_neighbors': np.arange(1, 21, 1),   # Number of neighbors to test
    'knn__weights': ['uniform', 'distance'],  # Weighting schemes
    'knn__metric': ['euclidean', 'manhattan']  # Distance metrics
}

# Create a pipeline with PCA and k-NN
pipeline = Pipeline([
    ('pca', PCA()),  # PCA step
    ('knn', KNeighborsClassifier())  # k-NN step
])

# Initialize GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_params_pca = grid_search.best_params_
best_accuracy_pca = grid_search.best_score_

print(f"Best Parameters: {best_params_pca}")
print(f"Best Cross-Validation Accuracy: {best_accuracy_pca}")

best_model_pca = grid_search.best_estimator_
test_accuracy_pca = best_model_pca.score(X_test, y_test)

print(f"Test Accuracy with Best Model: {test_accuracy_pca}")

print("\nPerforming Compute Test on the best model:")
result_pca = compute_test(X_train, y_train, best_model_pca, cv=10)



Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 14, 'knn__weights': 'uniform', 'pca__n_components': 4}
Best Cross-Validation Accuracy: 0.9445454545454547
Test Accuracy with Best Model: 1.0

Performing Compute Test on the best model:
Mean Cross-Validation Accuracy: 0.9236363636363636


# LDA

In [59]:
# Define parameter grid

if dataset == 0:
    n_components = np.arange(1, 9, 1)
elif dataset == 1:
    n_components = np.arange(2, 5, 1)

param_grid = {
    'lda__n_components': n_components,  # Number of dimensions to test
    'knn__n_neighbors': np.arange(1, 21, 1),   # Number of neighbors to test
    'knn__weights': ['uniform', 'distance'],  # Weighting schemes
    'knn__metric': ['euclidean', 'manhattan']  # Distance metrics
}

# Create a pipeline with LDA and k-NN
pipeline = Pipeline([
    ('lda', LDA()),  # LDA step
    ('knn', KNeighborsClassifier())  # k-NN step
])

# Initialize GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_params_lda = grid_search.best_params_
best_accuracy_lda = grid_search.best_score_

print(f"Best Parameters: {best_params_lda}")
print(f"Best Cross-Validation Accuracy: {best_accuracy_lda}")

best_model_lda = grid_search.best_estimator_
test_accuracy_lda = best_model_lda.score(X_test, y_test)

print(f"Test Accuracy with Best Model: {test_accuracy_lda}")

print("\nPerforming Compute Test on the best model:")
result_lda = compute_test(X_train, y_train, best_model_lda, cv=10)

Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 6, 'knn__weights': 'uniform', 'lda__n_components': 2}
Best Cross-Validation Accuracy: 0.9727272727272727
Test Accuracy with Best Model: 1.0

Performing Compute Test on the best model:
Mean Cross-Validation Accuracy: 0.9718181818181819


1600 fits failed out of a total of 2400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1600 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\diego\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\diego\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\diego\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)

# SVD

In [60]:
if dataset == 0:
    n_components = np.arange(2, 60, 2)
elif dataset == 1:
    n_components = np.arange(2, 5, 1)

# Define parameter grid
param_grid = {
    'svd__n_components': np.arange(2, 5, 1),  # Number of dimensions to test
    'knn__n_neighbors': np.arange(1, 21, 1),   # Number of neighbors to test
    'knn__weights': ['uniform', 'distance'],  # Weighting schemes
    'knn__metric': ['euclidean', 'manhattan']  # Distance metrics
}

# Create a pipeline with SVD and k-NN
pipeline = Pipeline([
    ('svd', TruncatedSVD()),  # SVD step
    ('knn', KNeighborsClassifier())  # k-NN step
])

# Initialize GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_params_svd = grid_search.best_params_
best_accuracy_svd = grid_search.best_score_

print(f"Best Parameters: {best_params_svd}")
print(f"Best Cross-Validation Accuracy: {best_accuracy_svd}")

best_model_svd = grid_search.best_estimator_
test_accuracy_svd = best_model_svd.score(X_test, y_test)

print(f"Test Accuracy with Best Model: {test_accuracy_svd}")

print("\nPerforming Compute Test on the best model:")
result_svd = compute_test(X_train, y_train, best_model_svd, cv=10)

Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 14, 'knn__weights': 'uniform', 'svd__n_components': 4}
Best Cross-Validation Accuracy: 0.9445454545454547
Test Accuracy with Best Model: 1.0

Performing Compute Test on the best model:
Mean Cross-Validation Accuracy: 0.9236363636363636


In [61]:
results = {
    'PCA': {
        'best_params': best_params_pca,
        'best_accuracy': best_accuracy_pca,
        'test_accuracy': test_accuracy_pca,
        'cv_accuracy': result_pca
    },
    'SVD': {
        'best_params': best_params_svd,
        'best_accuracy': best_accuracy_svd,
        'test_accuracy': test_accuracy_svd,
        'cv_accuracy': result_svd
    },
    'LDA': {
        'best_params': best_params_lda,
        'best_accuracy': best_accuracy_lda,
        'test_accuracy': test_accuracy_lda,
        'cv_accuracy': result_lda
    }
}

best_method = max(results, key=lambda x: results[x]['test_accuracy'])

# Print out the best method and comparison results
print(f"\nBest Method: {best_method}")
print(f"Best Parameters for {best_method}: {results[best_method]['best_params']}")
print(f"Best Cross-Validation Accuracy for {best_method}: {results[best_method]['best_accuracy']}")
print(f"Test Accuracy for {best_method}: {results[best_method]['test_accuracy']}")
print(f"Cross-Validation Accuracy for {best_method}: {results[best_method]['cv_accuracy']}")


Best Method: PCA
Best Parameters for PCA: {'knn__metric': 'euclidean', 'knn__n_neighbors': 14, 'knn__weights': 'uniform', 'pca__n_components': 4}
Best Cross-Validation Accuracy for PCA: 0.9445454545454547
Test Accuracy for PCA: 1.0
Cross-Validation Accuracy for PCA: 0.9236363636363636


# 2

In [62]:
if dataset == 0:
    n_components = np.arange(2, 60, 2)
elif dataset == 1:
    n_components = np.arange(2, 5, 1)

# Define parameter grid for MLP
param_grid_mlp = {
    'pca__n_components': n_components,  # Number of components to test for PCA
    'mlp__hidden_layer_sizes': [(100, 100), (150, 150), (50, 50)],  # Hidden layer configurations
    'mlp__learning_rate_init': [0.001, 0.01, 0.02],  # Learning rates
    'mlp__max_iter': [10, 50, 100]  # Iterations for training
}

# Create a pipeline with PCA and MLPClassifier
pipeline_mlp = Pipeline([
    ('pca', PCA()),  # PCA step
    ('mlp', MLPClassifier(activation='relu', solver='sgd', learning_rate='constant'))  # MLP step
])

# Initialize GridSearchCV for MLP
grid_search_mlp = GridSearchCV(
    pipeline_mlp,
    param_grid_mlp,
    cv=10,
    scoring='accuracy',
    n_jobs=-1
)

grid_search_mlp.fit(X_train, y_train)

best_params_mlp = grid_search_mlp.best_params_
best_accuracy_mlp = grid_search_mlp.best_score_

print(f"Best Parameters for MLP: {best_params_mlp}")
print(f"Best Cross-Validation Accuracy for MLP: {best_accuracy_mlp}")

best_model_mlp = grid_search_mlp.best_estimator_
test_accuracy_mlp = best_model_mlp.score(X_test, y_test)
print(f"Test Accuracy for MLP: {test_accuracy_mlp}")

result_mlp = compute_test(X_train, y_train, best_model_mlp, cv=10)
print(f"Mean Cross-Validation Accuracy for MLP: {result_mlp}")



Best Parameters for MLP: {'mlp__hidden_layer_sizes': (150, 150), 'mlp__learning_rate_init': 0.01, 'mlp__max_iter': 100, 'pca__n_components': 4}
Best Cross-Validation Accuracy for MLP: 0.9536363636363637
Test Accuracy for MLP: 1.0




Mean Cross-Validation Accuracy: 0.9345454545454546
Mean Cross-Validation Accuracy for MLP: 0.9345454545454546




In [44]:
if dataset == 0:
    n_components = [2, 10, 20, 30]
elif dataset == 1:
    n_components = np.arange(2, 5, 1)

param_grid_bagging = {
    'pca__n_components': n_components,  # PCA components to test
    'bagging__n_estimators': [10, 50, 100],  # Number of base learners
    'bagging__estimator__max_depth': [5, 10]  # Max depth for base estimator
}

# AdaBoost Classifier
param_grid_adaboost = {
    'pca__n_components': n_components,  # PCA components to test
    'adaboost__n_estimators': [50, 100],  # Number of base learners
    'adaboost__learning_rate': [0.01, 0.1]  # Learning rate for AdaBoost
}

# Gradient Boosting Classifier
param_grid_gb = {
    'pca__n_components': n_components,  # PCA components to test
    'gb__n_estimators': [50, 100],  # Number of base learners
    'gb__learning_rate': [0.01, 0.1],  # Learning rate for Gradient Boosting
    'gb__max_depth': [3, 5]  # Max depth for trees in Gradient Boosting
}

pipeline_bagging = Pipeline([
    ('pca', PCA()),  # PCA step
    ('bagging', BaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42))  # Bagging step
])

pipeline_adaboost = Pipeline([
    ('pca', PCA()),  # PCA step
    ('adaboost', AdaBoostClassifier(random_state=42))  # AdaBoost step
])

pipeline_gb = Pipeline([
    ('pca', PCA()),  # PCA step
    ('gb', GradientBoostingClassifier(random_state=42))  # Gradient Boosting step
])

grid_search_bagging = GridSearchCV(pipeline_bagging, param_grid_bagging, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_adaboost = GridSearchCV(pipeline_adaboost, param_grid_adaboost, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_gb = GridSearchCV(pipeline_gb, param_grid_gb, cv=5, scoring='accuracy', n_jobs=-1)

# Perform the grid search for each ensemble method
grid_search_bagging.fit(X_train, y_train)
grid_search_adaboost.fit(X_train, y_train)
grid_search_gb.fit(X_train, y_train)

# Retrieve best parameters and accuracy for Bagging
best_params_bagging = grid_search_bagging.best_params_
best_accuracy_bagging = grid_search_bagging.best_score_
print(f"Best Parameters for Bagging: {best_params_bagging}")
print(f"Best Cross-Validation Accuracy for Bagging: {best_accuracy_bagging}")

# Retrieve best parameters and accuracy for AdaBoost
best_params_adaboost = grid_search_adaboost.best_params_
best_accuracy_adaboost = grid_search_adaboost.best_score_
print(f"Best Parameters for AdaBoost: {best_params_adaboost}")
print(f"Best Cross-Validation Accuracy for AdaBoost: {best_accuracy_adaboost}")

# Retrieve best parameters and accuracy for Gradient Boosting
best_params_gb = grid_search_gb.best_params_
best_accuracy_gb = grid_search_gb.best_score_
print(f"Best Parameters for Gradient Boosting: {best_params_gb}")
print(f"Best Cross-Validation Accuracy for Gradient Boosting: {best_accuracy_gb}")

# Evaluate the best model for Bagging on test data
best_model_bagging = grid_search_bagging.best_estimator_
test_accuracy_bagging = best_model_bagging.score(X_test, y_test)
print(f"Test Accuracy for Bagging: {test_accuracy_bagging}")

# Evaluate the best model for AdaBoost on test data
best_model_adaboost = grid_search_adaboost.best_estimator_
test_accuracy_adaboost = best_model_adaboost.score(X_test, y_test)
print(f"Test Accuracy for AdaBoost: {test_accuracy_adaboost}")

# Evaluate the best model for Gradient Boosting on test data
best_model_gb = grid_search_gb.best_estimator_
test_accuracy_gb = best_model_gb.score(X_test, y_test)
print(f"Test Accuracy for Gradient Boosting: {test_accuracy_gb}")

# Perform compute_test for each ensemble method
result_bagging = compute_test(X_train, y_train, best_model_bagging, cv=5)
result_adaboost = compute_test(X_train, y_train, best_model_adaboost, cv=5)
result_gb = compute_test(X_train, y_train, best_model_gb, cv=5)

print(f"Mean Cross-Validation Accuracy for Bagging: {result_bagging}")
print(f"Mean Cross-Validation Accuracy for AdaBoost: {result_adaboost}")
print(f"Mean Cross-Validation Accuracy for Gradient Boosting: {result_gb}")



Best Parameters for Bagging: {'bagging__estimator__max_depth': 5, 'bagging__n_estimators': 10, 'pca__n_components': 3}
Best Cross-Validation Accuracy for Bagging: 0.9238095238095237
Best Parameters for AdaBoost: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50, 'pca__n_components': 2}
Best Cross-Validation Accuracy for AdaBoost: 0.8952380952380953
Best Parameters for Gradient Boosting: {'gb__learning_rate': 0.1, 'gb__max_depth': 3, 'gb__n_estimators': 50, 'pca__n_components': 3}
Best Cross-Validation Accuracy for Gradient Boosting: 0.9142857142857143
Test Accuracy for Bagging: 0.9777777777777777
Test Accuracy for AdaBoost: 0.9333333333333333
Test Accuracy for Gradient Boosting: 0.9777777777777777
Mean Cross-Validation Accuracy: 0.9142857142857143
Mean Cross-Validation Accuracy: 0.8857142857142856




Mean Cross-Validation Accuracy: 0.9142857142857143
Mean Cross-Validation Accuracy for Bagging: 0.9142857142857143
Mean Cross-Validation Accuracy for AdaBoost: 0.8857142857142856
Mean Cross-Validation Accuracy for Gradient Boosting: 0.9142857142857143


In [None]:
# Store the best test accuracies and models
best_model_info = {
    "Bagging": {"accuracy": test_accuracy_bagging, "model": best_model_bagging, "params": best_params_bagging},
    "AdaBoost": {"accuracy": test_accuracy_adaboost, "model": best_model_adaboost, "params": best_params_adaboost},
    "Gradient Boosting": {"accuracy": test_accuracy_gb, "model": best_model_gb, "params": best_params_gb}
}


# Compare the test accuracies and print the best model
best_model_name = max(best_model_info, key=lambda x: best_model_info[x]["accuracy"])
best_model_accuracy = best_model_info[best_model_name]["accuracy"]
best_model_params = best_model_info[best_model_name]["params"]

print(f"\nBest Ensemble Method: {best_model_name}")
print(f"Best Parameters: {best_model_params}")
print(f"Test Accuracy: {best_model_accuracy}")

# Perform compute_test on the best model
best_model = best_model_info[best_model_name]["model"]
result_best_model = compute_test(X_train, y_train, best_model, cv=5)
print(f"Mean Cross-Validation Accuracy for Best Model: {result_best_model}")


Best Ensemble Method: Bagging
Best Parameters: {'bagging__estimator__max_depth': 5, 'bagging__n_estimators': 10, 'pca__n_components': 3}
Test Accuracy: 0.9777777777777777
Mean Cross-Validation Accuracy: 0.9142857142857143
Mean Cross-Validation Accuracy for Best Model: 0.9142857142857143
