# Comparison between different dimensional reduction techniques

In this notebook we compare different techniques for dimensional reduction by applying them to the numerical part of the data from the Kaggle anomaly detection challenge: https://www.kaggle.com/c/anomaly-detection-challenges

We will use three algorithms:
- Principal component analysis (PCA)
- t-distributed stochastic neighbour embedding (t-SNE)
- Uniform manifold approximation and projection (UMAP)

In [None]:
import sys

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Directory that holds the project-local helper module.
# NOTE(review): this is an absolute, machine-specific path; adjust it when the
# notebook is run outside the original environment.
MODULES_DIR = '/project/anomaly-detection/modules/'

# Only prepend the directory once, so re-running this cell does not keep
# growing sys.path with duplicate entries.
if MODULES_DIR not in sys.path:
    sys.path.insert(0, MODULES_DIR)

# Import the helpers by name instead of `import *`, so it is clear where each
# function used below comes from. If other cells rely on further helpers from
# this module, add them to this list.
from anomaly_detection_tools import (
    filter_numerical_only,
    plot2d,
    plot3d,
    read_training_data,
)

# Enable inline rendering of plotly figures in the notebook.
init_notebook_mode(connected=True)

## Data ingestion

In [None]:
# Load the training data through the project helper (brought in by the
# anomaly_detection_tools import above). NOTE(review): the source location and
# schema are decided inside the helper — confirm there.
data_df = read_training_data()

In [None]:
# Preview the first rows of the raw training data.
data_df.head()

In [None]:
# Restrict the frame with the project helper (numerical columns only, judging
# by its name — TODO confirm). The result must still contain the 'label'
# column, which the cells below read and drop explicitly.
numerical_data_df = filter_numerical_only(data_df)

In [None]:
# Preview the first rows of the filtered data.
numerical_data_df.head()

In [None]:
# Labels as a column vector of shape (n_samples, 1); this is the array handed
# to plot2d / plot3d alongside each embedding.
Y = np.array(numerical_data_df['label'])[:, np.newaxis]

## Dimensional reduction

### PCA

Reduction to 2 dimensions, without scaling.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# 2-component PCA. The seed is fixed so the projection is reproducible even
# when scikit-learn selects a randomized SVD solver for the input size.
pca2d = PCA(n_components=2, random_state=42)

In [None]:
# Fit the PCA on every column except the label and project onto 2 components.
X_pca2d = pca2d.fit_transform(numerical_data_df.drop(columns=['label']))

In [None]:
# Plot the unscaled 2-D PCA projection together with the labels
# (plot2d is a project helper; presumably it colours points by label — TODO confirm).
plot2d(X_pca2d, Y)

Rescale the features (zero mean, unit variance) before the reduction to 2 dimensions.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# StandardScaler with default settings: each feature is centred and scaled to
# unit variance.
scaler = StandardScaler()

In [None]:
# Standardize every column except the label; X_scaled is reused by all the
# "scaled" variants of PCA, t-SNE and UMAP below.
X_scaled = scaler.fit_transform(numerical_data_df.drop(columns=['label']))

In [None]:
# 2-component PCA on the standardized features. The seed is fixed so the
# projection is reproducible if a randomized SVD solver is selected.
pca2d_scaled = PCA(n_components=2, random_state=42)

X_scaled_pca2d = pca2d_scaled.fit_transform(X_scaled)

In [None]:
# Plot the 2-D PCA projection of the standardized features together with the labels.
plot2d(X_scaled_pca2d, Y)

Reduction to 3 dimensions, without scaling.

In [None]:
# 3-component PCA on the unscaled features (label column excluded). The seed
# is fixed so the projection is reproducible if a randomized SVD solver is
# selected.
pca3d = PCA(n_components=3, random_state=42)

X_pca3d = pca3d.fit_transform(numerical_data_df.drop(['label'], axis=1))

In [None]:
# Plot the unscaled 3-D PCA projection together with the labels
# (plot3d is a project helper; presumably it colours points by label — TODO confirm).
plot3d(X_pca3d, Y)

Dimensional reduction to 3 dimensions, with scaling.

In [None]:
# 3-component PCA on the standardized features. The seed is fixed so the
# projection is reproducible if a randomized SVD solver is selected.
pca3d_scaled = PCA(n_components=3, random_state=42)

X_scaled_pca3d = pca3d_scaled.fit_transform(X_scaled)

In [None]:
# Plot the 3-D PCA projection of the standardized features together with the labels.
plot3d(X_scaled_pca3d, Y)

### t-SNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
# 2-D t-SNE on the unscaled features (label column excluded).
# t-SNE is stochastic: without a fixed seed the embedding changes on every
# run, so the figure could not be reproduced or compared across methods.
tsne2d = TSNE(n_components=2, random_state=42)

X_tsne2d = tsne2d.fit_transform(numerical_data_df.drop(['label'], axis=1))

In [None]:
# Plot the unscaled 2-D t-SNE embedding together with the labels.
plot2d(X_tsne2d, Y)

In [None]:
# 2-D t-SNE on the standardized features. The seed is fixed because t-SNE is
# stochastic and would otherwise give a different embedding on every run.
tsne2d_scaled = TSNE(n_components=2, random_state=42)

X_scaled_tsne2d = tsne2d_scaled.fit_transform(X_scaled)

plot2d(X_scaled_tsne2d, Y)

In [None]:
# 3-D t-SNE on the unscaled features (label column excluded). The seed is
# fixed because t-SNE is stochastic and would otherwise give a different
# embedding on every run.
tsne3d = TSNE(n_components=3, random_state=42)

X_tsne3d = tsne3d.fit_transform(numerical_data_df.drop(['label'], axis=1))

plot3d(X_tsne3d, Y)

In [None]:
# 3-D t-SNE on the standardized features. The seed is fixed because t-SNE is
# stochastic and would otherwise give a different embedding on every run.
tsne3d_scaled = TSNE(n_components=3, random_state=42)

X_scaled_tsne3d = tsne3d_scaled.fit_transform(X_scaled)

plot3d(X_scaled_tsne3d, Y)

### UMAP

In [None]:
from umap import UMAP

In [None]:
# 2-D UMAP on the unscaled features (label column excluded).
# n_components=2 is UMAP's default; it is spelled out to mirror the 3-D cells.
# UMAP is stochastic, so the seed is fixed to make the embedding reproducible.
# NOTE(review): per the umap-learn docs a fixed seed may disable parallelism
# and slow the fit — confirm this is acceptable for the dataset size.
X_umap2d = UMAP(n_components=2, random_state=42).fit_transform(
    numerical_data_df.drop(['label'], axis=1)
)

plot2d(X_umap2d, Y)

In [None]:
# 2-D UMAP on the standardized features.
# n_components=2 is UMAP's default; it is spelled out to mirror the 3-D cells.
# UMAP is stochastic, so the seed is fixed to make the embedding reproducible.
X_scaled_umap2d = UMAP(n_components=2, random_state=42).fit_transform(X_scaled)

plot2d(X_scaled_umap2d, Y)

In [None]:
# 3-D UMAP on the unscaled features (label column excluded).
# UMAP is stochastic, so the seed is fixed to make the embedding reproducible.
X_umap3d = UMAP(n_components=3, random_state=42).fit_transform(
    numerical_data_df.drop(['label'], axis=1)
)

plot3d(X_umap3d, Y)

In [None]:
# 3-D UMAP on the standardized features.
# UMAP is stochastic, so the seed is fixed to make the embedding reproducible.
X_scaled_umap3d = UMAP(n_components=3, random_state=42).fit_transform(X_scaled)

plot3d(X_scaled_umap3d, Y)