# Dimensional reduction and clustering of data

In [None]:
import numpy as np
import pandas as pd
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import requests
from configparser import ConfigParser

# Initialise plotly's offline notebook mode before any figure is shown with iplot.
init_notebook_mode(connected=True)

## Data ingestion

The data is served through an API running on SherlockML: we send a GET request to it in order to retrieve the dataset.

The request must be authorized: the credentials are in a configuration file that is read by a parser, and are then included in the header of the request.

In [None]:
# Location of the INI file that holds the data API settings
# (section 'data_api': url, header_key, api_key).
credentials_path = '../.credentials/credentials.ini'

confpars = ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing credentials file
# fails here with a clear message, instead of surfacing later as a KeyError
# when the 'data_api' section is first accessed.
if not confpars.read(credentials_path):
    raise FileNotFoundError(
        'Could not read credentials file: ' + credentials_path
    )

In [None]:
endpoint = '/get_data'

# Authorization header: both the header name and the API key come from the
# 'data_api' section of the credentials file.
headers = {
    confpars['data_api']['header_key']: confpars['data_api']['api_key']
}

# Pass an explicit timeout: without one, requests waits indefinitely if the
# API never answers, which would hang the notebook.
response = requests.get(
    url=confpars['data_api']['url'] + endpoint,
    headers=headers,
    timeout=30,
)

print('Response status: '+str(response.status_code))

# Stop on a 4xx/5xx answer rather than trying to parse an error payload
# as if it were the dataset.
response.raise_for_status()

# Build the dataframe from the JSON payload and replace whatever index the
# payload produced with a fresh default one (the old index is discarded).
data_df = pd.DataFrame(response.json())
data_df = data_df.reset_index(drop=True)

In [None]:
# Preview the first rows of the retrieved dataset.
data_df.head()

## Data exploration

Plot the data and have a look.

In [None]:
# 3D scatter of the raw dataset: one small marker per row, using the
# 'x', 'y' and 'z' columns of data_df as coordinates.
trace = go.Scatter3d(
    x=data_df['x'],
    y=data_df['y'],
    z=data_df['z'],
    mode='markers',
    marker={'size': 2.5},
)
data = [trace]

# Title each axis of the 3D scene with the name of its coordinate.
layout = go.Layout(
    scene={
        axis + 'axis': {'title': axis}
        for axis in ('x', 'y', 'z')
    }
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

## Dimensional reduction

Import principal component analysis (PCA) from scikit-learn and instantiate a PCA model.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# PCA model configured to keep two components.
pca_dimred = PCA(n_components=2)

Recast the data as a numpy array.

In [None]:
# Convert the whole dataframe to a numpy array: every column of data_df is kept.
X = np.array(data_df)

Fit PCA to data and perform dimensional reduction.

In [None]:
# Fit the PCA model on the full dataset.
pca_dimred.fit(X)

In [None]:
# Apply the fitted PCA to the same data to obtain the reduced dataset.
X_red = pca_dimred.transform(X)

Plot the dimensionally reduced dataset.

In [None]:
# 2D scatter of the reduced dataset: first column of X_red on the
# horizontal axis, second column on the vertical axis.
trace = go.Scatter(
    x=X_red[:, 0],
    y=X_red[:, 1],
    mode='markers',
    marker={'size': 2.5},
)
data = [trace]

layout = go.Layout(
    xaxis={'title': 'x (reduced)'},
    yaxis={'title': 'y (reduced)'},
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

When the clusters are spread along one dimension much more than along another, it is often useful to rescale the data before clustering.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler with default settings; it is fitted on the reduced data in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the reduced data and overwrite X_red with the rescaled values.
X_red = scaler.fit_transform(X_red)

In [None]:
# Same 2D scatter as before, now drawn from the rescaled X_red.
trace = go.Scatter(
    x=X_red[:, 0],
    y=X_red[:, 1],
    mode='markers',
    marker={'size': 2.5},
)
data = [trace]

# Axis titles are kept identical to the previous plot.
axis_titles = {
    'xaxis': {'title': 'x (reduced)'},
    'yaxis': {'title': 'y (reduced)'},
}
layout = go.Layout(**axis_titles)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

## Clustering

Import KMeans from scikit-learn and instantiate a KMeans model.

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

In [None]:
# KMeans model looking for two clusters; random_state is fixed to a constant seed.
kmeans_clustering = KMeans(n_clusters=2, random_state=42)

Fit KMeans to the data and predict clusters.

In [None]:
# Fit KMeans on the rescaled, dimensionally reduced data.
kmeans_clustering.fit(X_red)

In [None]:
# Cluster label predicted for each row of X_red (the same data the model was fitted on).
clusters_pred = kmeans_clustering.predict(X_red)

Put the dimensionally reduced data and the predicted cluster labels in a pandas dataframe and plot.

In [None]:
# Gather the two reduced coordinates and the predicted label of every point
# into a single dataframe (columns: x_red, y_red, cluster).
clusters_df = pd.DataFrame(
    dict(
        x_red=X_red[:, 0],
        y_red=X_red[:, 1],
        cluster=clusters_pred,
    )
)

In [None]:
# One scatter trace per predicted cluster, each named after its label.
# groupby splits the dataframe in a single pass instead of building the same
# boolean mask twice per label; sort=False keeps the clusters in order of
# first appearance, which is the order the traces were previously added in.
data = [
    go.Scatter(
        x=points['x_red'],
        y=points['y_red'],
        mode='markers',
        marker={'size': 2.5},
        name='cluster '+str(label),
    )
    for label, points in clusters_df.groupby('cluster', sort=False)
]

layout = go.Layout(
    xaxis={'title': 'x (reduced)'},
    yaxis={'title': 'y (reduced)'},
)

fig = go.Figure(data=data, layout=layout)

iplot(fig)