In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler


def PCA(X: np.ndarray, dim: int) -> np.ndarray:
    """Project ``X`` onto its ``dim`` leading principal components.

    The data is z-normalized per feature with ``StandardScaler``, the
    correlation matrix of the normalized data is eigendecomposed, and the
    normalized data is projected onto the eigenvectors with the largest
    eigenvalue magnitudes.

    Args:
        X: Two-dimensional data, one row per sample and one column per
            feature.
        dim: Number of principal components to keep; must satisfy
            ``1 <= dim <= n_features``.

    Returns:
        Array of shape ``(n_samples, dim)`` containing the z-normalized data
        expressed in the basis of the selected eigenvectors, ordered from the
        largest to the smallest eigenvalue magnitude.

    Raises:
        ValueError: If ``dim`` is not in ``1..n_features``.
    """
    # Z-normalize data
    Z = StandardScaler().fit_transform(X)
    n_samples, n_features = Z.shape

    if not 1 <= dim <= n_features:
        raise ValueError(
            "dim must be between 1 and the number of features "
            "(%d), got %r." % (n_features, dim)
        )

    # Estimate the correlation matrix
    R = np.dot(Z.T, Z) / n_samples

    # Calculate the eigen values, eigen vectors (R is symmetric, hence eigh)
    eigen_vals, eigen_vecs = np.linalg.eigh(R)

    # Rank the eigenvectors from high to low eigenvalue magnitude.  Sorting
    # indices (instead of (value, vector) tuples) avoids comparing the
    # eigenvector arrays themselves when two eigenvalues tie, which raises
    # "truth value of an array is ambiguous".  The stable sort keeps tied
    # eigenvectors in the order returned by eigh.
    order = np.argsort(-np.abs(eigen_vals), kind="stable")

    # Projection matrix: one column per retained eigenvector
    W = eigen_vecs[:, order[:dim]]

    return np.dot(Z, W)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from typing import Literal


def train_and_test(
    X,
    y,
    model_type: Literal["decision_tree", "random_forest"] = "decision_tree",
):
    """Fit a tree-based classifier on a 70/30 split and print its accuracy.

    Args:
        X: Feature matrix, one row per sample and one column per feature.
        y: Labels aligned with the rows of ``X``. A single-column 2-D input
            (for example a one-column DataFrame) is flattened to 1-D.
        model_type: ``"decision_tree"`` (default) or ``"random_forest"``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.

    Prints the test-set accuracy and that accuracy divided by the number of
    columns in ``X``. Returns ``None``.
    """
    # Build the requested classifier first so an unsupported model_type fails
    # before any data is split or fitted.
    if model_type == "decision_tree":
        cls = DecisionTreeClassifier(criterion="entropy", random_state=1)
    elif model_type == "random_forest":
        cls = RandomForestClassifier(
            criterion="entropy", n_estimators=200, random_state=1, n_jobs=2
        )
    else:
        raise ValueError("Invalid model_type.")

    # scikit-learn expects 1-D labels; a column vector makes fit() emit a
    # DataConversionWarning, so flatten the single-column case up front.
    y_arr = np.asarray(y)
    if y_arr.ndim == 2 and y_arr.shape[1] == 1:
        y = y_arr.ravel()

    # split X into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0
    )

    # Train the selected classifier and score it on the held-out split
    cls.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, cls.predict(X_test))

    print("Accuracy: %.2f" % accuracy)
    print("Accuracy per feature: %.2f" % (accuracy / np.shape(X)[1]))

In [3]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load the breast_cancer dataset once and build both frames from the same
# bunch (previously the dataset was loaded a second time just to get X and y).
init_data = load_breast_cancer()
X = pd.DataFrame(data=init_data["data"], columns=init_data["feature_names"])
# y is kept as a single-column DataFrame named "label"
y = pd.DataFrame(data=init_data["target"], columns=["label"])

In [4]:
# Random forest trained and evaluated on the original feature columns of X.
train_and_test(X, y, model_type="random_forest")

Accuracy: 0.98
Accuracy per feature: 0.03


  return fit_method(estimator, *args, **kwargs)


In [5]:
# Train and test the model on the first two principal components of X.
# NOTE(review): PCA() standardizes and decomposes all of X before
# train_and_test() splits it, so the held-out rows influence the projection.
# This run also uses a decision tree, whereas the run on the unreduced
# features used a random forest, so the two accuracies are not a like-for-like
# comparison -- confirm whether that is intended.
train_and_test(PCA(X, dim=2), y, model_type="decision_tree")

Accuracy: 0.88
Accuracy per feature: 0.44
