In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from sklearn.manifold import TSNE


In [2]:
import numpy as np
class CustomPCA:
	"""Minimal PCA built on an eigendecomposition of the feature covariance matrix.

	Typical use:
		1. ``get_eigenvectors(data)`` to obtain the top ``num_components`` directions,
		2. ``get_coordinates(eigenvectors, data)`` to project the data onto them,
		3. optionally ``plot_data_loss(...)`` to compare one feature before/after
		   the round trip through the reduced space.

	The projection uses the input values as-is (no centering or scaling is
	applied by this class), so callers should pass already-preprocessed data.
	"""

	def __init__(self, num_components):
		self.num_components = num_components		# The number of reduced components
		# Filled in by get_eigenvectors(); None until then.
		self.explained_variance_ratio_ = None

	def get_explained_variance_ratio(self):
		"""Return the fraction of total variance captured by the kept components.

		This is a single number (sum of the kept eigenvalues divided by the sum
		of all eigenvalues), or None if get_eigenvectors() has not been called.
		"""
		return self.explained_variance_ratio_
	
	def get_eigenvectors(self, dataframe, plot_covariance=0):
		"""Compute the leading eigenvectors of the covariance matrix of ``dataframe``.

		Args:
			dataframe: 2-D array-like; rows represent samples, columns represent features.
			plot_covariance: when equal to 1, also draw a heatmap of the covariance matrix.

		Returns:
			Array of shape (n_features, num_components) whose columns are the
			eigenvectors ordered by decreasing eigenvalue.

		Side effects:
			Stores the combined explained-variance fraction of the kept
			components in ``self.explained_variance_ratio_``.
		"""
		# np.cov expects features along the rows, hence the argument passed here should be transposed.
		# With a single feature np.cov returns a 0-d value; atleast_2d keeps it a (1, 1)
		# matrix so that the heatmap and eigh below still work.
		cov_matrix = np.atleast_2d(np.cov(dataframe.T))
		if plot_covariance == 1:
			plt.figure(figsize=(12,10))
			sns.heatmap(cov_matrix, cmap='coolwarm', cbar=True)
			plt.title('Covariance Matrix Heatmap')
			plt.xlabel('Features')
			plt.ylabel('Features')
			plt.show()
		
		# Cov(i,j) > 0	: if i increases, j increases
		# Cov(i,j) < 0	: if i decreases, j increases
		# Cov(i,j) = 0	: No linear relationship

		# eigh is the solver for Hermitian (here: real symmetric) matrices such as a
		# covariance matrix; it returns eigenvalues in ascending order.
		eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
		# Reorder to descending eigenvalue and keep only the leading components.
		kept_indices = np.argsort(eigenvalues)[::-1][:self.num_components]

		# The total must be taken over ALL eigenvalues, before truncation.
		total_sum = np.sum(eigenvalues)
		self.explained_variance_ratio_ = np.sum(eigenvalues[kept_indices]) / total_sum

		return eigenvectors[:, kept_indices]

	def get_coordinates(self, eigenvectors, dataframe):
		"""Project ``dataframe`` onto ``eigenvectors``.

		Args:
			eigenvectors: array of shape (n_features, k), e.g. from get_eigenvectors().
			dataframe: array-like of shape (n_samples, n_features).

		Returns:
			pandas.DataFrame of shape (n_samples, k) with default integer labels.
		"""
		reduced_data = np.dot(dataframe, eigenvectors)
		return pd.DataFrame(reduced_data)
	
	def plot_data_loss(self, eigenvector, reduced_dataframe, original_dataframe, col):
		"""Plot one feature before and after the reduce/restore round trip.

		Args:
			eigenvector: array of shape (n_features, k) used for the projection.
			reduced_dataframe: projected data of shape (n_samples, k).
			original_dataframe: the data that was projected, shape (n_samples, n_features);
				may be a DataFrame or an ndarray.
			col: integer position of the feature (column) to compare.

		Side effects:
			Writes the figure to ``Loss_PCA_col_{col}.png`` and then displays it.
		"""
		# Map the reduced coordinates back into the original feature space.
		restored_data = np.dot(np.asarray(reduced_dataframe), eigenvector.T)
		# Select the column positionally so that an ndarray and a DataFrame behave
		# the same way (plain `arr[col]` on an ndarray would select a ROW).
		original_values = np.asarray(original_dataframe)[:, col]

		plt.figure(figsize=(10, 6))
		plt.plot(original_values, label="Original Data", color="blue", alpha=0.7)
		plt.plot(restored_data[:, col], label="Restored Data", color="red", linestyle='--')
		plt.xlabel("Sample index")
		plt.ylabel(f"Value of Column {col}")
		plt.legend()
		plt.title(f"Data Loss Visualization for Column {col}")
		# Save BEFORE show: once show() returns the current figure may already be
		# released, in which case savefig would write an empty image.
		plt.savefig(f"Loss_PCA_col_{col}.png")
		plt.show()



In [3]:
def plot_reduced_data(reduced_dataframe, config=2, title="Default_Title"):
    """Scatter-plot a reduced dataset in 1, 2 or 3 dimensions.

    Args:
        reduced_dataframe: object indexable by 0, 1 (and 2 for 3D) to obtain
            the per-component values, e.g. a DataFrame with integer column labels.
        config: dimensionality of the plot; 1, 2 or 3.
        title: figure title; the figure is also written to "<title>.png".

    Raises:
        ValueError: if ``config`` is not 1, 2 or 3.
    """
    # Reject unsupported dimensionalities before any figure is created.
    if config not in (1, 2, 3):
        raise ValueError("Config should be 1, 2, or 3 for 1D, 2D, or 3D plot respectively.")

    figure = plt.figure(figsize=(8, 6))

    if config == 3:
        axes = figure.add_subplot(111, projection='3d')
        axes.scatter(
            reduced_dataframe[0],
            reduced_dataframe[1],
            reduced_dataframe[2],
            alpha=0.6,
        )
        axes.set_xlabel("Component 1")
        axes.set_ylabel("Component 2")
        axes.set_zlabel("Component 3")
        axes.set_title(title)
    else:
        is_one_dimensional = config == 1
        x_values = reduced_dataframe[0]
        # A 1D embedding is drawn along the x axis with every point at y = 0.
        if is_one_dimensional:
            y_values = np.zeros(len(reduced_dataframe[0]))
            y_label = "Y Axis"
        else:
            y_values = reduced_dataframe[1]
            y_label = "Component 2"
        plt.scatter(x_values, y_values, alpha=0.6)
        plt.xlabel("Component 1")
        plt.ylabel(y_label)
        plt.title(title)

    # Persist the figure first, then display it.
    plt.savefig(f"{title}.png")
    plt.show()

In [4]:
# Read the space-separated samples; the file has no header row.
data = pd.read_csv("pca_data.txt", header=None, sep=" ")
# Standardization: learn the scaling parameters from the data, then apply them to it.
scaler = StandardScaler()
scaler.fit(data)
data_scaled = scaler.transform(data)

In [None]:
# PCA with two components on the standardized data.
# `pca` is reused by the next cell to report the explained variance.
pca = CustomPCA(num_components=2)
eigenvectors = pca.get_eigenvectors(dataframe=data_scaled)
# Getting the reduced coordinates
reduced_data = pca.get_coordinates(eigenvectors, data_scaled)
# Scatter the two components; plot_reduced_data also writes the figure to "PCA_2D.png".
plot_reduced_data(reduced_data, config=2, title="PCA_2D")

In [None]:
# A single number: the sum of the kept components' eigenvalues divided by the
# sum of all eigenvalues, i.e. the combined share of variance retained by `pca`.
ratio = pca.get_explained_variance_ratio()
print("Explained Variance Ratio: ",ratio)

In [7]:
# UMAP is stochastic; a fixed random_state makes the embedding reproducible
# across Restart-and-Run-All.
umap_model = umap.UMAP(n_components=2, random_state=42)
umap_data = umap_model.fit_transform(data_scaled)

In [None]:
# Inspect the embedding without dumping every row: report its shape,
# then show the first few points as the cell's output.
print(umap_data.shape)
umap_data[:5]

In [None]:
# Wrap the embedding in a DataFrame so its components can be selected as
# columns 0 and 1 by plot_reduced_data; the figure is also saved as "UMAP_2D.png".
umap_df = pd.DataFrame(umap_data)
plot_reduced_data(umap_df, config=2, title = "UMAP_2D")

In [10]:
# t-SNE Reduction (2 components)
# t-SNE is stochastic; a fixed random_state makes the embedding reproducible
# across Restart-and-Run-All.
tsne_model = TSNE(n_components=2, random_state=42)
tsne_data = tsne_model.fit_transform(data_scaled)

In [None]:
# Wrap the embedding in a DataFrame so its components can be selected as
# columns 0 and 1 by plot_reduced_data; the figure is also saved as "TSNE_2D.png".
tsne_df = pd.DataFrame(tsne_data)
plot_reduced_data(tsne_df, config=2, title = "TSNE_2D")

In [None]:
# --- 3D comparison: PCA vs. UMAP vs. t-SNE on the same standardized data ---
# NOTE: this cell rebinds pca / eigenvectors / reduced_data / umap_* / tsne_*,
# so the 2D results from earlier cells are no longer available afterwards.

# PCA (3 components)
pca = CustomPCA(num_components=3)
eigenvectors = pca.get_eigenvectors(dataframe=data_scaled)
# Getting the reduced coordinates
reduced_data = pca.get_coordinates(eigenvectors, data_scaled)
plot_reduced_data(reduced_data, config=3, title="PCA_3D")
ratio = pca.get_explained_variance_ratio()
print("Explained Variance Ratio: ",ratio)

# UMAP Reduction (3 components); seeded so the embedding is reproducible.
umap_model = umap.UMAP(n_components=3, random_state=42)
umap_data = umap_model.fit_transform(data_scaled)
umap_df = pd.DataFrame(umap_data)
plot_reduced_data(umap_df, config=3, title = "UMAP_3D")

# t-SNE Reduction (3 components); seeded so the embedding is reproducible.
tsne_model = TSNE(n_components=3, random_state=42)
tsne_data = tsne_model.fit_transform(data_scaled)
tsne_df = pd.DataFrame(tsne_data)
plot_reduced_data(tsne_df, config=3, title = "TSNE_3D")



In [None]:
# --- 1D comparison: PCA vs. UMAP vs. t-SNE on the same standardized data ---
# NOTE: this cell rebinds pca / eigenvectors / reduced_data / umap_* / tsne_*,
# replacing the results produced by earlier cells.

# PCA (1 component)
pca = CustomPCA(num_components=1)
eigenvectors = pca.get_eigenvectors(dataframe=data_scaled)
# Getting the reduced coordinates
reduced_data = pca.get_coordinates(eigenvectors, data_scaled)
plot_reduced_data(reduced_data, config=1, title="PCA_1D")
ratio = pca.get_explained_variance_ratio()
print("Explained Variance Ratio: ",ratio)

# UMAP Reduction (1 component); seeded so the embedding is reproducible.
umap_model = umap.UMAP(n_components=1, random_state=42)
umap_data = umap_model.fit_transform(data_scaled)
umap_df = pd.DataFrame(umap_data)
plot_reduced_data(umap_df, config=1, title = "UMAP_1D")

# t-SNE Reduction (1 component); seeded so the embedding is reproducible.
tsne_model = TSNE(n_components=1, random_state=42)
tsne_data = tsne_model.fit_transform(data_scaled)
tsne_df = pd.DataFrame(tsne_data)
plot_reduced_data(tsne_df, config=1, title = "TSNE_1D")