In [None]:
'''
ECSE 556 Homework 1
Dimensionality Reduction
Gian Favero
October 6th, 2023
'''

# Common imports
import numpy as np
import matplotlib.pyplot as plt
import time

# Read the CSV, discard its first row and first column (presumably the header
# line and the row labels -- confirm against the file), then swap the axes so
# that the CSV's columns become the rows of `data`.
data = np.genfromtxt('Data/gdsc_expr_postCB.csv', delimiter=',')[1:, 1:].T

# Standardise every column of `data` (comment out the last two lines to keep
# the unscaled values)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(data)
data = scaler.transform(data)

# Dimensionality Reduction

Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA

# Create the PCA model that keeps the first two principal components.
# random_state pins the result when scikit-learn picks a randomized SVD
# solver, so re-running the notebook reproduces the same embedding.
pca = PCA(n_components=2, random_state=42)

# Fit and transform the data, timing only the fit_transform call.
# perf_counter() is a monotonic, high-resolution clock; time.time() can jump
# when the system clock is adjusted and may be too coarse for a short run.
# Only the difference pca_end - pca_start is meaningful.
pca_start = time.perf_counter()
data_pca = pca.fit_transform(data)
pca_end = time.perf_counter()

UMAP

In [None]:
import umap

# Create the UMAP model with its default hyper-parameters (2 output dimensions).
# random_state is deliberately left unset here: a seeded UMAP runs
# single-threaded, which would inflate the run time measured below.
# NOTE(review): as a consequence this embedding differs from run to run.
u_map = umap.UMAP(n_components=2)

# UMAP reduction of the data, timing only the fit_transform call.
# perf_counter() is a monotonic, high-resolution clock (unlike time.time());
# only the difference umap_end - umap_start is meaningful.
umap_start = time.perf_counter()
data_umap = u_map.fit_transform(data)
umap_end = time.perf_counter()

t-SNE

In [None]:
from sklearn.manifold import TSNE

# Create the t-SNE model (2 output dimensions). t-SNE is stochastic, so the
# seed is fixed to make the embedding reproducible across notebook runs.
tsne = TSNE(n_components=2, random_state=42)

# Fit and transform the data, timing only the fit_transform call.
# perf_counter() is a monotonic, high-resolution clock (unlike time.time());
# only the difference tsne_end - tsne_start is meaningful.
tsne_start = time.perf_counter()
data_tsne = tsne.fit_transform(data)
tsne_end = time.perf_counter()

Comparison (Default Params)

In [None]:
# Side-by-side scatter plots of the three 2-D embeddings.
# Each entry: (embedding, x-axis label, y-axis label, panel title).
panels = [
    (data_pca, 'PC1', 'PC2', 'PCA Reduction'),
    (data_umap, 'UMAP1', 'UMAP2', 'UMAP Reduction'),
    (data_tsne, 't-SNE1', 't-SNE2', 't-SNE Reduction'),
]

plt.figure(figsize=(20, 20))
# The panels fill the first row of a 3x3 grid, left to right.
for slot, (embedding, x_label, y_label, heading) in enumerate(panels, start=1):
    plt.subplot(3, 3, slot)
    plt.scatter(embedding[:, 0], embedding[:, 1], s=1)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(heading)
plt.show()

# Run time comparison
import prettytable as pt

# Tabulate the elapsed seconds measured around each method's fit_transform call
t = pt.PrettyTable(['Method', 'Run Time (s)'])
t.title = 'Run Time Comparison'
for method, elapsed in (
    ('PCA', pca_end - pca_start),
    ('UMAP', umap_end - umap_start),
    ('TSNE', tsne_end - tsne_start),
):
    t.add_row([method, elapsed])
print(t)

UMAP Analysis

In [None]:
# UMAP hyper-parameter sweep.
# `umaps` collects the 2-D embeddings in a fixed order that the plotting cell
# relies on: indices 0-2 follow `neighbours`, indices 3-5 follow `min_dist`.
#
# Every model uses the same random_state so that differences between panels
# come from the hyper-parameter being varied rather than from a different
# random initialisation. (A seeded UMAP runs single-threaded; that is fine
# here because nothing in this cell is timed.)
# The sweep uses its own model variable so the default-parameter `u_map`
# fitted earlier in the notebook is not overwritten.
neighbours = [5, 20, 150]
min_dist = [0.1, 0.5, 0.9]
umaps = []

# Changing number of neighbours
for n in neighbours:
    sweep_model = umap.UMAP(n_components=2, n_neighbors=n, random_state=42)
    umaps.append(sweep_model.fit_transform(data))

# Changing min distance
for m in min_dist:
    sweep_model = umap.UMAP(n_components=2, min_dist=m, random_state=42)
    umaps.append(sweep_model.fit_transform(data))

In [None]:
# Plot the six UMAP sweep embeddings as two figures of three panels each:
# first the n_neighbors sweep (umaps[0:3]), then the min_dist sweep (umaps[3:6]).
# Each sweep entry: (offset into `umaps`, swept values, title suffix).
# NOTE(review): the axis limits are fixed so all panels share one scale; points
# of an embedding that fall outside these limits are not shown.
for offset, swept_values, suffix in (
    (0, neighbours, 'Neighbours'),
    (3, min_dist, 'Min Distance'),
):
    plt.figure(figsize=(20, 20))
    for j in range(3):
        k = offset + j
        embedding = umaps[k]
        # Panel position k + 1 in a 3x3 grid (first row for the neighbours
        # sweep, second row for the min-distance sweep).
        plt.subplot(3, 3, k + 1)
        plt.xlim(-10, 20)
        plt.ylim(-10, 25)
        plt.scatter(embedding[:, 0], embedding[:, 1], s=1)
        plt.xlabel('UMAP1')
        plt.ylabel('UMAP2')
        plt.title(f'UMAP Reduction with {swept_values[j]} {suffix}')
    plt.show()

t-SNE Analysis

In [None]:
# t-SNE perplexity sweep.
# `tsnes[i]` holds the 2-D embedding obtained with `perplexities[i]`.
#
# Every model uses the same random_state so that differences between the
# embeddings come from the perplexity rather than from a different random
# initialisation. The sweep uses its own model variable so the
# default-parameter `tsne` fitted earlier in the notebook is not overwritten.
perplexities = [5, 50, 150]
tsnes = []
for p in perplexities:
    sweep_model = TSNE(n_components=2, perplexity=p, random_state=42)
    tsnes.append(sweep_model.fit_transform(data))

In [None]:
# Plot the first three t-SNE sweep embeddings side by side (first row of a 3x3
# grid), one panel per perplexity value, all drawn with the same fixed axis limits.
plt.figure(figsize=(20, 20))
for slot, (embedding, perplexity) in enumerate(zip(tsnes[:3], perplexities[:3]), start=1):
    plt.subplot(3, 3, slot)
    plt.xlim(-80, 80)
    plt.ylim(-80, 65)
    plt.scatter(embedding[:, 0], embedding[:, 1], s=1)
    plt.xlabel('t-SNE1')
    plt.ylabel('t-SNE2')
    plt.title(f't-SNE Reduction with {perplexity} Perplexity')
plt.show()