# Part 1

In [93]:
# Import libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier


In [95]:
RS = 0 # Random State

# Problem 1 
# Step 1: load data set
dataset = datasets.load_breast_cancer()
x = dataset.data
y = dataset.target

In [97]:
# Step 2: Split the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0, stratify=y)

In [99]:
# Step 3: Define classifiers that we will test accuracy, and param grid
param_grids = {
    "KNN": {
        "n_neighbors": list(range(1, 26)), 
    },
    "Decision Tree": {
        "max_depth": list(range(1, 26)),
    },
    "SVM": {
        "kernel": ["linear", "rbf", "poly"],
    },
    "Logistic Regression": {
        "C": [0.01, 0.1, 1, 10, 100],
        "penalty": ["l2", "l1"],  
        "solver": ["liblinear", "lbfgs"],  
        "max_iter": [100, 500, 1000]
    },
    "Naive Bayes": {
        "var_smoothing": np.logspace(-9, 0, 10)  # Test smoothing values from 1e-9 to 1
    },
    "Logistic Regression": {
        "C": [0.01, 0.1, 1, 10, 100],
        "penalty": ["l2", "l1"],  
        "solver": ["liblinear", "lbfgs"],  
        "max_iter": [100, 500, 1000]
    },
    "ANN": {
        "hidden_layer_sizes": [(50,), (100,), (50, 50)], 
        "activation": ["relu", "tanh"],                  
        "solver": ["adam", "sgd"],                       
        "alpha": [0.0001, 0.001, 0.01] 
    },
}

classifiers = {
    "KNN": (KNeighborsClassifier(), param_grids['KNN'] ),
    "Decision Tree": (DecisionTreeClassifier(random_state=0), param_grids['Decision Tree']),
    "SVM": (SVC(kernel='linear', random_state=0), param_grids['SVM']),
    "Naive Bayes": (GaussianNB(), param_grids['Naive Bayes']),
    "Logistic Regression": (LogisticRegression(max_iter=5000, random_state=0), param_grids['Logistic Regression']),
    "ANN": (MLPClassifier(random_state=0, max_iter=500), param_grids['ANN'])
}


# Important parametrs 
# KNN: n_neighbors, 
# Decision Tree: max_depth,
# SVM: gamma, 
# Naive Bayes: var_smoothing
# Logistic Regression: Penalty, C, solver, max_iter


In [101]:
# Step 4: Create methods used to exexute and print classifies

# Name: Run Classifiers
# Desc: Executes the classifiers and puts them in a dict
def run_classifiers(X_train, X_test, y_train, y_test, ensemble_classifiers, param_grids):
    results = {}

    for name, (classifier, param_grid) in classifiers.items():

        # Executee 
        grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Get the best model and its parameters
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        # Make predictions
        y_pred = best_model.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Store results
        results[name] = {
            "Accuracy": accuracy,
            "Report": classification_report(y_test, y_pred),
            "Confusion Matrix": confusion_matrix(y_test, y_pred),
            "Parameters": best_params
        }

    return results
    
# Print Results
# Take a dict of results
def print_results(results):
 
    for name, result in results.items():
        print(f"Classifier: {name}")
        print(f"Accuracy: {result['Accuracy']}")
        print(f"Parameters: {result['Parameters']}")
        print("Classification Report:")
        print(result['Report'])
        print("Confusion Matrix:")
        print(result['Confusion Matrix'])
        print("-" * 50)

# Name: Print Best Classifier
# Desc: Print the best classifier based off accuracy
def print_best_classifier(results):
    # Find the classifier with the highest accuracy
    best_classifier = max(results, key=lambda x: results[x]['Accuracy'])
    
    # Print the details of the best classifier
    print(f"Best Classifier: {best_classifier}")
    print(f"Accuracy: {results[best_classifier]['Accuracy']}")
    print(f"Parameters: {results[best_classifier]['Parameters']}")
    print("Classification Report:")
    print(results[best_classifier]['Report'])
    print("Confusion Matrix:")
    print(results[best_classifier]['Confusion Matrix'])
    return best_classifier


In [103]:
# Step 5: Print the classifier method data
results1 = run_classifiers(X_train, X_test, y_train, y_test, classifiers, param_grids)  
print_results(results1)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Classifier: KNN
Accuracy: 0.9239766081871345
Parameters: {'n_neighbors': 8}
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.86      0.89        64
           1       0.92      0.96      0.94       107

    accuracy                           0.92       171
   macro avg       0.93      0.91      0.92       171
weighted avg       0.92      0.92      0.92       171

Confusion Matrix:
[[ 55   9]
 [  4 103]]
--------------------------------------------------
Classifier: Decision Tree
Accuracy: 0.9064327485380117
Parameters: {'max_depth': 2}
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.92      0.88        64
           1       0.95      0.90      0.92       107

    accuracy                           0.91       171
   macro avg       0.90      0.91      0.90       171
weighted avg       0.91      0.91      0.91       171

Confusion Matrix:
[[59  5]
 [11 96]]
-----------

In [104]:
# Step 6: Print the best Classifier
best_clf = print_best_classifier(results1)

# We can see that random forest performed the best, so we can now use this for are esemble methods with the same hyper parameters
# Lets define its parameters below
best_svm_params = {
    "kernel": "linear",
    "random_state": 0,
    "probability":True
}
base_svm = SVC(**best_svm_params)


Best Classifier: SVM
Accuracy: 0.9415204678362573
Parameters: {'kernel': 'linear'}
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92        64
           1       0.95      0.95      0.95       107

    accuracy                           0.94       171
   macro avg       0.94      0.94      0.94       171
weighted avg       0.94      0.94      0.94       171

Confusion Matrix:
[[ 59   5]
 [  5 102]]


In [105]:
# Step 7: import new libraries for esemble methods 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report

pipe_svc = Pipeline([('scl', StandardScaler()),
                     ('pca', PCA(n_components=0.8, random_state = 0)),
                     ('clf', SVC(kernel='rbf', random_state=0))])

bag = BaggingClassifier(estimator=pipe_svc,
                        n_estimators=10,
                        max_samples=0.5,
                        max_features=0.5,
                        bootstrap=True,
                        bootstrap_features=True,
                        oob_score=True,
                        warm_start=False,
                        n_jobs=-1,
                        random_state=0)


In [106]:
# Step 8: Set up Esemble Classifiesrs with SVM and best parameters
param_grids = {
    "Bagging (SVM)": {
        "n_estimators": list(range(20, 26)), 
        "max_samples": [0.5, 0.8, 1, 2],
    },
    "AdaBoost (SVM)": {
        "n_estimators": [1, 3, 5, 7, 10],
        "learning_rate": [0.001, 0.03 ,0.005, 0.07, 0.01],
    },
    "Random Forest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 10, 20],
    },
}



ensemble_classifiers = {
    "Bagging (SVM)": (BaggingClassifier(estimator=base_svm, random_state=0), param_grids["Bagging (SVM)"]),
    "AdaBoost (SVM)": (AdaBoostClassifier(estimator=base_svm, random_state=0), param_grids["AdaBoost (SVM)"]),
    "Random Forest": (RandomForestClassifier(random_state=0), param_grids["Random Forest"]),
}

In [107]:
# Step 9: Create a function too test the esemble classifier parameters
# Name: Run Tunes Esemble Classifiers
# Desc: Executes the ensemble classifiers with the given params
def run_tuned_ensemble_classifiers(X_train, X_test, y_train, y_test, ensemble_classifiers, param_grids):
    results = {}

    for name, (classifier, param_grid) in ensemble_classifiers.items():

        # Executee 
        grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Get the best model and its parameters
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        # Make predictions
        y_pred = best_model.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Store results
        results[name] = {
            "Accuracy": accuracy,
            "Report": classification_report(y_test, y_pred),
            "Confusion Matrix": confusion_matrix(y_test, y_pred),
            "Parameters": best_params
        }

    return results

In [108]:
# Step 10: Print the results of the esemble classifiers
# Run the tuned ensemble classifiers
ensemble_results = run_tuned_ensemble_classifiers(X_train, X_test, y_train, y_test, ensemble_classifiers, param_grids)

# Print ensemble results
print_results(ensemble_results)


60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/ensemble/_bagging.py", line 334, in fit
    return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Classifier: Bagging (SVM)
Accuracy: 0.9239766081871345
Parameters: {'max_samples': 0.5, 'n_estimators': 20}
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.88      0.90        64
           1       0.93      0.95      0.94       107

    accuracy                           0.92       171
   macro avg       0.92      0.91      0.92       171
weighted avg       0.92      0.92      0.92       171

Confusion Matrix:
[[ 56   8]
 [  5 102]]
--------------------------------------------------
Classifier: AdaBoost (SVM)
Accuracy: 0.9239766081871345
Parameters: {'learning_rate': 0.001, 'n_estimators': 5}
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.88      0.90        64
           1       0.93      0.95      0.94       107

    accuracy                           0.92       171
   macro avg       0.92      0.91      0.92       171
weighted avg       0.92      0.92      0.92

In [109]:
# Step 11: Print the best Classifier
# Find and print the best ensemble classifier
print_best_classifier(ensemble_results)

Best Classifier: Random Forest
Accuracy: 0.9532163742690059
Parameters: {'max_depth': None, 'n_estimators': 100}
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.92      0.94        64
           1       0.95      0.97      0.96       107

    accuracy                           0.95       171
   macro avg       0.95      0.95      0.95       171
weighted avg       0.95      0.95      0.95       171

Confusion Matrix:
[[ 59   5]
 [  3 104]]


'Random Forest'

In [115]:
# We can see that the 

# Part 2

In [117]:
# Step 1: Import Standard 
# import standard Scalar
from sklearn.preprocessing import StandardScaler

In [118]:
# Step 2: Read in the dataset 
dataset = pd.read_csv('Project3_Dataset.csv')

In [119]:
# Step 3: Seperate features (Columns 1 to 65) and target column 66
x = dataset.iloc[:, :65]
y = dataset.iloc[:, 65]

In [120]:
# Step 4: Initalize the Standard Scalar and get the data
scaler = StandardScaler()

# Normalize Features
x_scaled = scaler.fit_transform(x)

# Convert Data back to a data frame
x_scaled = pd.DataFrame(x_scaled, columns=x.columns)

# Combine scaled features with the target column
x_scaled_with_y = pd.concat([x_scaled, y], axis=1)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


In [121]:
# Display the first few rows of the result
#print(x_scaled_with_y.head())

In [122]:
# Step 5: Use our methods fo get the classifiers
results2 = run_classifiers(X_train, X_test, y_train, y_test,classifiers, param_grids )  
print_results(results2)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Classifier: KNN
Accuracy: 0.2
Parameters: {'n_neighbors': 3}
Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         3
           2       0.50      0.50      0.50         2
           3       0.17      0.50      0.25         2
           4       1.00      1.00      1.00         2
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         1
           9       1.00      0.50      0.67         2
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         2
          13       1.00      0.33      0.50         3
          14       0.33      0.50      0.40         2
          15       0.00      0.00      0.00         1
          16       1.00      1.00      1.00        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [123]:
# Step 6: print the best essemble classifier
best_clf = print_best_classifier(results2)

# We can see that is naive Bayes So lets save the param list
best_NB_params = {
    'var_smoothing': 1e-09
}

base_svm = GaussianNB(**best_NB_params)

Best Classifier: Naive Bayes
Accuracy: 0.575
Parameters: {'var_smoothing': 1e-09}
Classification Report:
              precision    recall  f1-score   support

           1       0.75      1.00      0.86         3
           2       0.00      0.00      0.00         2
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00         2
           5       0.00      0.00      0.00         2
           6       0.14      0.50      0.22         2
           7       0.67      0.67      0.67         3
           8       1.00      1.00      1.00         1
           9       1.00      0.50      0.67         2
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         2
          13       1.00      0.67      0.80         3
          14       1.00      1.00      1.00         2
          15       0.50      1.00      0.67         1
          16       0.50      1

In [126]:
# Setp 7: Now lets set up our assemble classifiers
# Define the base Naive Bayes model


# Corrected classifiers with Naive Bayes
ensemble_classifiers = {
    "Bagging (Naive Bayes)": (BaggingClassifier(estimator=base_nb, random_state=0), param_grids["Bagging (SVM)"]),
    "AdaBoost (Naive Bayes)": (AdaBoostClassifier(estimator=base_nb, random_state=0), param_grids["AdaBoost (SVM)"]),
    "Random Forest": (RandomForestClassifier(random_state=0), param_grids["Random Forest"]),
}


In [125]:
# Step 8: Print the results of the esemble classifiers
# Run the tuned ensemble classifiers
ensemble_results = run_tuned_ensemble_classifiers(X_train, X_test, y_train, y_test, ensemble_classifiers, param_grids)

# Print ensemble results
print_results(ensemble_results)

  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])


Classifier: Bagging (Naive Bayes)
Accuracy: 0.4
Parameters: {'max_samples': 0.8, 'n_estimators': 20}
Classification Report:
              precision    recall  f1-score   support

           1       1.00      0.67      0.80         3
           2       0.00      0.00      0.00         2
           3       1.00      0.50      0.67         2
           4       1.00      1.00      1.00         2
           5       0.17      0.50      0.25         2
           6       0.25      0.50      0.33         2
           7       0.50      0.33      0.40         3
           8       1.00      1.00      1.00         1
           9       1.00      0.50      0.67         2
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         3
          14       1.00      1.00      1.00         2
          15       0.25      1.00      0.40         1
          1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [139]:
# Step 9: Print the best Result
print_best_classifier(ensemble_results)

Best Classifier: Random Forest
Accuracy: 0.6
Parameters: {'max_depth': 10, 'n_estimators': 200}
Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         3
           2       0.00      0.00      0.00         2
           3       0.40      1.00      0.57         2
           4       1.00      1.00      1.00         2
           5       0.50      0.50      0.50         2
           6       0.33      0.50      0.40         2
           7       1.00      0.33      0.50         3
           8       1.00      1.00      1.00         1
           9       1.00      0.50      0.67         2
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.50      0.50      0.50         2
          13       1.00      0.67      0.80         3
          14       0.67      1.00      0.80         2
          15       0.50      1.00      0.67         1
          16    

'Random Forest'