# Homework 5 RF Accuracy Improvement

This assignment is inspired by examples of Shan-Hung Wu from National Tsing Hua University.

Requirement: improve the accuracy per feature (test accuracy divided by the number of features used by the model) of the following code from 0.03 up to at least 0.45, while the accuracy itself should be more than 0.92.

Here are three hints:

1. You can improve the ratio (accuracy per feature) by picking out or "creating" several features.
2. Tune the hyperparameters.
3. The ratio can be improved from 0.03 up to 0.47.

In [2]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
import numpy as np

# Load the breast cancer dataset once and extract the data and labels
breast_cancer_data = load_breast_cancer()
X, y = breast_cancer_data.data, breast_cancer_data.target

# Split the raw data first (70% train, 30% test). Splitting before any
# fitting keeps the held-out rows from influencing preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Reduce the dimensionality to 2 components. PCA is fitted on the training
# rows only; the test rows are projected with that same fitted transform.
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train_raw)
X_test = pca.transform(X_test_raw)

# Define the Random Forest Classifier and its hyperparameter search space.
# A fixed random_state makes the reported scores reproducible between runs.
clf = RandomForestClassifier(random_state=0)
clf_params = {
    'n_estimators': list(range(1, 11)),
    'criterion': ["gini", "entropy"],
}

# Perform RandomizedSearchCV (10 sampled candidates, 5-fold CV) to find the
# best hyperparameters; error_score='raise' surfaces fit failures immediately
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=clf_params,
    n_iter=10,
    cv=5,
    error_score='raise',
    random_state=0,
)
random_search.fit(X_train, y_train)

# Print the best mean cross-validated accuracy and the hyperparameters
best_accuracy = random_search.best_score_
best_params = random_search.best_params_
print(f'Best Accuracy: {best_accuracy:.2f}')
print(f'Best Hyperparameters: {best_params}')

# Make predictions on the held-out test set using the best estimator
best_estimator = random_search.best_estimator_
y_pred = best_estimator.predict(X_test)

# Calculate and print the accuracy and accuracy per feature, where the
# feature count is the number of PCA components the model was trained on
test_accuracy = accuracy_score(y_test, y_pred)
accuracy_per_feature = test_accuracy / X_train.shape[1]

print(f'Test Accuracy: {test_accuracy:.2f}')
print(f'Accuracy per Feature: {accuracy_per_feature:.2f}')


Best Accuracy: 0.92
Best Hyperparameters: {'n_estimators': 8, 'criterion': 'entropy'}
Test Accuracy: 0.92
Accuracy per Feature: 0.46
