In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder

In [3]:
data = pd.read_excel("./Datasets/Dry_Bean_Dataset.xlsx")
# data.head(10)

In [4]:
def pca_preprocessing(data):
    centered_data = (data - data.mean()) / data.std()
    return centered_data

In [5]:
# do not include the label column
X = data.iloc[:, :-1]

y = data.iloc[:, -1]

# convert y to numerical values
ordiner = OrdinalEncoder()
y = ordiner.fit_transform(y.values.reshape(-1, 1)).ravel()

X_centered = pca_preprocessing(X)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_centered, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
import time


def my_PCA(X, dim=None):
    # compute the covariance matrix
    cov_matrix = np.cov(X, rowvar=False)        # with rowvar=True, it assumes X[features, samples]

    # find eigenvalues and eigenvectors
    eigenvals, eigenvecs = np.linalg.eig(cov_matrix)

    # sort eigenvalues and eigenvectors
    sorted_indices = np.argsort(eigenvals)[::-1]
    sorted_eigenvals = eigenvals[sorted_indices]
    sorted_eigenvecs = eigenvecs[:, sorted_indices]

    if dim is None:
        px.scatter(x=np.arange(1, len(sorted_eigenvals)+1), y=sorted_eigenvals, title="Eigenvalues", labels={"x": "Index", "y": "Eigenvalue"}, width=800, height=800).show()

        time.sleep(2)
        dim = int(float(input("Enter the number of dimensions to reduce to: ")))

    # select the top d eigenvectors
    selected_eigenvecs =  sorted_eigenvecs[:, :dim]

    # project into the d dimension
    X_pca = X @ selected_eigenvecs

    return X_pca, dim, selected_eigenvecs

In [8]:
X_pca, d, eigenvecs = my_PCA(X_train)
px.scatter(x=X_pca.iloc[:, 0], y=X_pca.iloc[:, 1], color=np.array(y_train), title=f"PCA with {d} dimensions", width=600, height=600).show()

In [9]:
X_pca, d, eigenvecs = my_PCA(X_train)
fig = px.scatter_3d(x=X_pca.iloc[:, 0], y=X_pca.iloc[:, 1], z=X_pca.iloc[:, 2], color=y_train, title=f"PCA with {d} dimensions", opacity=0.7, width=600, height=600)
fig.update_traces(marker=dict(size=2))

In [10]:
train_accuracies = []
test_accuracies = []

for d in range(1, 17):
    X_train_pca, dim, eigenvecs_matrix = PCA(X_train, dim=d)
    X_test_pca = X_test @ eigenvecs_matrix

    model = LogisticRegression()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)

    train_acc = model.score(X_train_pca, y_train)
    test_acc = model.score(X_test_pca, y_test)
    train_accuracies.append(train_acc)
    test_accuracies.append(test_acc)

    print(f"Dimensions: {d}, Train Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}")

NameError: name 'PCA' is not defined

In [12]:
# Swiss roll dataset
def swiss_roll(n):
    """
    Parameters:
    n: int
        Number of points to generate"""

    data = np.zeros((n,3))
    phi = np.random.uniform(low=1.5*np.pi, high=4.5*np.pi, size=n)
    psi = np.random.uniform(0,10,n)

    data[:,0]=phi*np.cos(phi) #x coordinate
    data[:,1]=phi*np.sin(phi) #y coordinate
    data[:,2]=psi #z coordinate
    return data

data = swiss_roll(10000)
data = pd.DataFrame(data, columns=["X1", "X2", "X3"])
data_prep = pca_preprocessing(data)
data_pca = my_PCA(data_prep, dim=2)

In [16]:
fig = px.scatter_3d(data, x="X1", y="X2", z="X3", title="Swiss Roll 3D", opacity=0.7, color_discrete_sequence=["orange"])
fig.update_traces(marker=dict(size=2))
fig.show()

In [15]:
fig_2 = px.scatter(x=data_pca[0].iloc[:, 0], y=data_pca[0].iloc[:, 1], title="Swiss Roll PCA 2D", labels={"x": "Principal Component 1", "y": "Principal Component 2"})
fig_2.update_traces(marker=dict(size=2))
fig_2.show()