Load the libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import cv2
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from skimage.feature import hog

Dataset

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("aryashah2k/breast-ultrasound-images-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/aryashah2k/breast-ultrasound-images-dataset?dataset_version_number=1...


100%|██████████| 195M/195M [00:04<00:00, 43.6MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1


Loading the dataset

In [4]:
path = "/root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1/Dataset_BUSI_with_GT/"
categories = [['benign',438],['malignant',211],['normal',134]]

images_path_list= []
mask_path_list = []

all_data_path_list = []


for i in range(len(categories)) :

    folder_path = (f'{path}/{categories[i][0]}')

    for j in range(1,categories[i][1]):

        image_path = (f'{folder_path}/{categories[i][0]} ({j}).png')
        mask_path = (f'{folder_path}/{categories[i][0]} ({j})_mask.png')

        images_path_list.append(image_path)
        mask_path_list.append(mask_path)

    data_path_df = pd.DataFrame()

    data_path_df['Image-Path'] =   images_path_list
    data_path_df['Mask-Path'] = mask_path_list
    data_path_df['Label'] = categories[i][0]

    all_data_path_list.append(data_path_df)

    images_path_list = []
    mask_path_list = []

data_path_df = pd.concat(all_data_path_list,axis = 0)
data_path_df.reset_index(drop=True,inplace=True)

data_path_df

Unnamed: 0,Image-Path,Mask-Path,Label
0,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,benign
1,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,benign
2,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,benign
3,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,benign
4,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,benign
...,...,...,...
775,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,normal
776,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,normal
777,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,normal
778,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,normal


shuffling the dataset

In [5]:
benign_df = data_path_df.iloc[0:133,:]

malignant_df = data_path_df.iloc[437:646,:]

normal_df = data_path_df.iloc[647:780,:]

shuffled_benign_df = benign_df.sample(133)

shuffled_malignant_df = malignant_df.sample(133)

final_df = pd.concat([shuffled_benign_df,shuffled_malignant_df,normal_df],axis=0)

final_df.reset_index(drop=True,inplace=True)

final_df

map_dic = {'benign':0,'malignant':1,'normal':2}
final_df.replace(map_dic,inplace=True)

final_df['Label'].value_counts()

  final_df.replace(map_dic,inplace=True)


Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,133
1,133
2,133


In [6]:
final_df = final_df.sample(frac=1)

final_df.reset_index(inplace = True,drop = True)

final_df

Unnamed: 0,Image-Path,Mask-Path,Label
0,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,1
1,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,1
2,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,0
3,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,0
4,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,2
...,...,...,...
394,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,0
395,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,0
396,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,2
397,/root/.cache/kagglehub/datasets/aryashah2k/bre...,/root/.cache/kagglehub/datasets/aryashah2k/bre...,0


Reading the images and masks

In [7]:
def load_images_and_mask(image_path,mask_path,target_size = (400,400)) :

    image = cv2.imread(image_path)
    mask = cv2.imread(mask_path,cv2.IMREAD_GRAYSCALE)

    image = cv2.resize(image,target_size)
    mask = cv2.resize(mask,target_size)

    return image,mask

def load_images_NormalImage(image_path, target_size=(400, 400)):
    # Check if the image exists
    if not os.path.exists(image_path):
        print(f"Image path does not exist: {image_path}")
        return None

    image = cv2.imread(image_path)

    # Check if image is loaded successfully
    if image is None:
        print(f"Failed to load image: {image_path}")
        return None

    # Resize the image if loaded correctly
    image = cv2.resize(image, target_size)

    return image

Extract HOG Features

In [8]:
def hog_extractor(image,mask) :

    masked_image = cv2.bitwise_and(image,image,mask=mask)

    gray_image = cv2.cvtColor(masked_image,cv2.COLOR_BGR2GRAY)

    features,hot_image = hog(gray_image,pixels_per_cell=(16,16),cells_per_block=(2,2),visualize=True)

    return features

def hog_extractor_NormalImage(image) :

    gray_image = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)

    features,hot_image = hog(gray_image,pixels_per_cell=(16,16),cells_per_block=(2,2),visualize=True)

    return features

In [9]:
image_path = final_df.loc[0, 'Image-Path']
print(f"Checking if this image exists: {image_path}")

Checking if this image exists: /root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1/Dataset_BUSI_with_GT//malignant/malignant (86).png


In [10]:
!ls /root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1/Dataset_BUSI_with_GT/

benign	malignant  normal


In [11]:
from PIL import Image

# Try to open the image with Pillow to check if it works
try:
    pil_image = Image.open(image_path)
    pil_image.show()  # This will display the image
    print(f"Image loaded successfully using Pillow: {image_path}")
except Exception as e:
    print(f"Error opening image with Pillow: {e}")

Image loaded successfully using Pillow: /root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1/Dataset_BUSI_with_GT//malignant/malignant (86).png


In [12]:
X = []
Y = []

for rows in range(len(final_df)):
    image_path = final_df.loc[rows, 'Image-Path']

    # Check if the image is loaded properly
    print(f"Processing: {image_path}")

    # Load the normal image
    image = load_images_NormalImage(image_path=image_path)

    # Skip processing if the image is None
    if image is None:
        continue

    try:
        # Extract HOG features
        features = hog_extractor_NormalImage(image)
        if len(features) == 0:
            print(f"Skipping row {rows} because no features were extracted.")
            continue

        X.append(features)
        Y.append(final_df.loc[rows, 'Label'])

    except Exception as e:
        print(f"Error processing row {rows}: {e}")
        continue

print(f"Length of X after loop: {len(X)}")
print(f"Length of Y after loop: {len(Y)}")

if len(X) > 0 and len(Y) > 0:
    X = np.array(X)
    Y = np.array(Y)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
else:
    print("No valid data to split. Please check the data processing.")

Processing: /root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1/Dataset_BUSI_with_GT//malignant/malignant (86).png
Processing: /root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1/Dataset_BUSI_with_GT//malignant/malignant (59).png
Processing: /root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1/Dataset_BUSI_with_GT//benign/benign (128).png
Processing: /root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1/Dataset_BUSI_with_GT//benign/benign (8).png
Processing: /root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1/Dataset_BUSI_with_GT//normal/normal (35).png
Processing: /root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1/Dataset_BUSI_with_GT//malignant/malignant (156).png
Processing: /root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1/Dataset_BUSI_with_

Define A Function To Evalute

In [13]:
def evaluate(Y_prediction,name):

    report = classification_report(y_true=Y_test,y_pred=Y_prediction)

    print(report)

    print('\n---------------------------------------------------------------------------------------------------------------\n')

    conf_matrix = confusion_matrix(y_true=Y_test,y_pred=Y_prediction)

    plt.figure(figsize=(8,5))

    sns.heatmap(conf_matrix,annot=True,cmap='Blues')

    plt.xlabel('Predicted Label')

    plt.ylabel('True Label')

    plt.title(f'{name} Evaluation')

    plt.show()

Feature extraction with CNN

In [14]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

# Preprocessing function
def preprocess_image(image_path, target_size=(224, 224)):
    img = image.load_img(image_path, target_size=target_size)
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    return preprocess_input(img_array)

# Load VGG16 model pre-trained on ImageNet
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
feature_extractor = Model(inputs=base_model.input, outputs=base_model.output)

# Prepare X and Y arrays for features and labels
X = []
Y = []

#extract features
for index, row in final_df.iterrows():
    image_path = row['Image-Path']
    label = row['Label']

    processed_image = preprocess_image(image_path)
    features = feature_extractor.predict(processed_image)
    features_flat = features.flatten()

    X.append(features_flat)
    Y.append(label)


X = np.array(X)
Y = np.array(Y)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=100)
X_reduced = pca.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X_reduced, Y, test_size=0.2, random_state=42)


X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 897ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 784ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 986ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 549ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 552ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 578ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 540ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 557ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [15]:
print(f"Feature shape: {X.shape}, Labels shape: {Y.shape}")
print(f"Training set size: {X_train.shape}, Test set size: {X_test.shape}")

Feature shape: (399, 25088), Labels shape: (399,)
Training set size: (319, 100), Test set size: (80, 100)


ADAboost

In [17]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Initialize the AdaBoost classifier with a DecisionTreeClassifier as the base estimator
base_estimator = DecisionTreeClassifier(max_depth=1)  # Weak learner
ada_classifier = AdaBoostClassifier(
    estimator=base_estimator,  # Use 'estimator' instead of 'base_estimator'
    n_estimators=100,  # Number of boosting rounds
    learning_rate=1.0,
    random_state=42
)

# Train the classifier on the training data
ada_classifier.fit(X_train_pca, Y_train)

# Make predictions on the test data
Y_pred = ada_classifier.predict(X_test_pca)

# Evaluate the classifier's performance
print("Classification Report:")
print(classification_report(Y_test, Y_pred))

print("Accuracy:", accuracy_score(Y_test, Y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.44      0.52        25
           1       0.58      0.66      0.61        29
           2       0.67      0.77      0.71        26

    accuracy                           0.62        80
   macro avg       0.63      0.62      0.62        80
weighted avg       0.63      0.62      0.62        80

Accuracy: 0.625


using grid search for adaboost

In [16]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Grid Search for AdaBoost
def adaboost_with_grid_search(X_train, Y_train, X_test, Y_test):
    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [50, 100, 150],  # Number of boosting rounds
        'learning_rate': [0.1, 0.5, 1.0],  # Learning rate
        'estimator__max_depth': [1, 2, 3]  # Maximum depth of the decision tree base estimator
    }

    # Initialize the base estimator
    base_estimator = DecisionTreeClassifier(random_state=42)

    # Initialize AdaBoostClassifier
    ada_classifier = AdaBoostClassifier(
        estimator=base_estimator,
        random_state=42
    )

    # Perform GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(
        estimator=ada_classifier,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,  # 5-fold cross-validation
        n_jobs=-1  # Use all available CPU cores
    )

    print("Running Grid Search for AdaBoost...")
    grid_search.fit(X_train, Y_train.ravel())

    # Get the best model and parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"\nBest Parameters: {best_params}")
    print(f"Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

    # Evaluate on the test set
    test_accuracy = best_model.score(X_test, Y_test)
    print(f"\nTest Accuracy: {test_accuracy:.4f}")

    # Make predictions on the test set
    Y_pred = best_model.predict(X_test)

    # Print evaluation metrics
    print("\nClassification Report:")
    print(classification_report(Y_test, Y_pred))

    print("\nAccuracy:", accuracy_score(Y_test, Y_pred))

    return best_model

In [17]:
# Call the function to train and evaluate the model
best_ada_model = adaboost_with_grid_search(X_train_pca, Y_train, X_test_pca, Y_test)

Running Grid Search for AdaBoost...

Best Parameters: {'estimator__max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 150}
Cross-Validation Accuracy: 0.8023

Test Accuracy: 0.8500

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.83      0.83        24
           1       0.86      0.75      0.80        24
           2       0.86      0.94      0.90        32

    accuracy                           0.85        80
   macro avg       0.85      0.84      0.84        80
weighted avg       0.85      0.85      0.85        80


Accuracy: 0.85


save the model

In [18]:
import joblib

# Save the trained model
joblib.dump(best_ada_model, 'best_ada_model.pkl')

print("Model saved as 'best_ada_model.pkl'")

# Load the model later
loaded_model = joblib.load('best_ada_model.pkl')

Model saved as 'best_ada_model.pkl'


In [None]:
# Evaluate accuracy on the training set
train_accuracy = best_ada_model.score(X_train_pca, Y_train)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Evaluate accuracy on the testing set
test_accuracy = best_ada_model.score(X_test_pca, Y_test)
print(f"Testing Accuracy: {test_accuracy:.4f}")

In [19]:
pip install joblib



In [20]:
import joblib

# Save the trained model to a .joblib file
joblib.dump(best_ada_model, 'best_ada_model.joblib')

print("Model saved as 'best_ada_model.joblib'")

Model saved as 'best_ada_model.joblib'


Grid Search for AdaBoost with PCA Features

In [21]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Grid Search for AdaBoost with PCA Features
def adaboost_with_grid_search_pca(X_train_pca, Y_train, X_test_pca, Y_test):
    """
    Perform AdaBoost classification with Grid Search hyperparameter tuning using PCA features.
    """
    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [50, 100, 150],  # Number of boosting rounds
        'learning_rate': [0.1, 0.5, 1.0],  # Learning rate
        'estimator__max_depth': [1, 2, 3]  # Maximum depth of the decision tree base estimator
    }

    # Initialize the base estimator
    base_estimator = DecisionTreeClassifier(random_state=42)

    # Initialize AdaBoostClassifier
    ada_classifier = AdaBoostClassifier(
        estimator=base_estimator,
        random_state=42
    )

    # Perform GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(
        estimator=ada_classifier,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,  # 5-fold cross-validation
        n_jobs=-1  # Use all available CPU cores
    )

    print("Running Grid Search for AdaBoost...")
    grid_search.fit(X_train_pca, Y_train.ravel())

    # Get the best model and parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"\nBest Parameters: {best_params}")
    print(f"Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

    # Evaluate on the test set
    test_accuracy = best_model.score(X_test_pca, Y_test)
    print(f"\nTest Accuracy: {test_accuracy:.4f}")

    # Make predictions on the test set
    Y_pred = best_model.predict(X_test_pca)

    # Print evaluation metrics
    print("\nClassification Report:")
    print(classification_report(Y_test, Y_pred))

    print("\nAccuracy:", accuracy_score(Y_test, Y_pred))

    return best_model

In [22]:
# Call the function to train and evaluate the model
best_ada_model_pca = adaboost_with_grid_search_pca(X_train_pca, Y_train, X_test_pca, Y_test)

Running Grid Search for AdaBoost...

Best Parameters: {'estimator__max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 150}
Cross-Validation Accuracy: 0.8023

Test Accuracy: 0.8500

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.83      0.83        24
           1       0.86      0.75      0.80        24
           2       0.86      0.94      0.90        32

    accuracy                           0.85        80
   macro avg       0.85      0.84      0.84        80
weighted avg       0.85      0.85      0.85        80


Accuracy: 0.85


In [23]:
import h5py

# Save data to an .h5 file
with h5py.File('features_and_labels.h5', 'w') as h5file:
    # Save training and testing features
    h5file.create_dataset('X_train', data=X_train)
    h5file.create_dataset('X_test', data=X_test)
    # Save training and testing labels
    h5file.create_dataset('Y_train', data=Y_train)
    h5file.create_dataset('Y_test', data=Y_test)

print("Features and labels saved to 'features_and_labels.h5'")


Features and labels saved to 'features_and_labels.h5'


Randomized Search for AdaBoost with PCA Features

In [24]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Randomized Search for AdaBoost with PCA Features
def adaboost_with_random_search_pca(X_train_pca, Y_train, X_test_pca, Y_test):
    """
    Perform AdaBoost classification with Randomized Search hyperparameter tuning using PCA features.
    """
    # Define the parameter grid for hyperparameter tuning
    param_dist = {
        'n_estimators': [50, 100, 150, 200, 250],  # Number of boosting rounds
        'learning_rate': [0.01, 0.1, 0.5, 1.0, 1.5],  # Learning rate
        'estimator__max_depth': [1, 2, 3, 4]  # Maximum depth of the decision tree base estimator
    }

    # Initialize the base estimator
    base_estimator = DecisionTreeClassifier(random_state=42)

    # Initialize AdaBoostClassifier
    ada_classifier = AdaBoostClassifier(
        estimator=base_estimator,
        random_state=42
    )

    # Perform RandomizedSearchCV for hyperparameter tuning
    random_search = RandomizedSearchCV(
        estimator=ada_classifier,
        param_distributions=param_dist,
        n_iter=20,  # Number of parameter settings sampled
        scoring='accuracy',
        cv=5,  # 5-fold cross-validation
        random_state=42,
        n_jobs=-1  # Use all available CPU cores
    )

    print("Running Randomized Search for AdaBoost...")
    random_search.fit(X_train_pca, Y_train.ravel())

    # Get the best model and parameters
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    print(f"\nBest Parameters: {best_params}")
    print(f"Cross-Validation Accuracy: {random_search.best_score_:.4f}")

    # Evaluate on the test set
    test_accuracy = best_model.score(X_test_pca, Y_test)
    print(f"\nTest Accuracy: {test_accuracy:.4f}")

    # Make predictions on the test set
    Y_pred = best_model.predict(X_test_pca)

    # Print evaluation metrics
    print("\nClassification Report:")
    print(classification_report(Y_test, Y_pred))

    print("\nAccuracy:", accuracy_score(Y_test, Y_pred))

    return best_model

In [25]:
# Call the function to train and evaluate the model
best_ada_model_random = adaboost_with_random_search_pca(X_train_pca, Y_train, X_test_pca, Y_test)

Running Randomized Search for AdaBoost...

Best Parameters: {'n_estimators': 200, 'learning_rate': 0.1, 'estimator__max_depth': 4}
Cross-Validation Accuracy: 0.7805

Test Accuracy: 0.9250

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.88      0.91        24
           1       0.88      0.92      0.90        24
           2       0.94      0.97      0.95        32

    accuracy                           0.93        80
   macro avg       0.92      0.92      0.92        80
weighted avg       0.93      0.93      0.92        80


Accuracy: 0.925


save the model

In [26]:
import joblib

# Save the trained model
joblib.dump(best_ada_model_random, 'best_ada_model_random.pkl')

print("Model saved as 'best_ada_model_random.pkl'")

# Load the model later
loaded_model = joblib.load('best_ada_model_random.pkl')

Model saved as 'best_ada_model_random.pkl'


In [27]:
import joblib

# Save the trained model to a .joblib file
joblib.dump(best_ada_model_random, 'best_ada_model_random.joblib')

print("Model saved as 'best_ada_model_random.joblib'")

Model saved as 'best_ada_model_random.joblib'
