In [None]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN, OPTICS, AgglomerativeClustering, KMeans
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import hdbscan

In [None]:
# Path of the pickle that is loaded into `data` by the pd.read_pickle(infile) cell.
infile = 'vecs/tess_ode.pkl'
# infile = 'gaia2d_vecs.pkl'

In [None]:
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle(infile)

In [None]:
# Keep only the entries whose value is a NumPy array so they can be stacked with np.stack.
# isinstance is the idiomatic type check (and also accepts ndarray subclasses).
data = {k: v for k, v in data.items() if isinstance(v, np.ndarray)}

In [None]:
# Sentinel string meaning "no cluster labels"; it is rebound to a label array
# if the `alg.fit_predict(vecs)` cell is run.
clusters = 'random'

In [None]:
# Stack the per-key arrays of `data` along a new leading axis, in dict order.
vecs = np.stack([vec for vec in data.values()])

In [None]:
# Show the dimensions of the stacked array.
vecs.shape

### Clustering (HDBSCAN; DBSCAN/OPTICS/Agglomerative/KMeans alternatives commented out)

In [None]:
# Pick the clustering algorithm applied to `vecs`; the commented lines are
# alternative configurations, only the uncommented assignment is active.
# alg = DBSCAN(min_samples=5, n_jobs=-1)
# alg = OPTICS(min_samples=10, n_jobs=-1, cluster_method='dbscan')
# alg = AgglomerativeClustering(n_clusters=10)
# alg = KMeans(n_clusters=20)
alg = hdbscan.HDBSCAN(min_cluster_size=10, leaf_size=10, min_samples=10)

In [None]:
# Fit the selected algorithm on `vecs`; this replaces the 'random' sentinel with the result.
clusters = alg.fit_predict(vecs)

In [None]:
# Distinct values present in `clusters`.
np.unique(clusters)

### TSNE

In [None]:
# Load a previously saved embedding.
# NOTE(review): `res` is overwritten if the `tsne.fit_transform(vecs)` cell is also run.
res = pd.read_pickle('tess_tsne.pkl')

In [None]:
# t-SNE is stochastic; fixing random_state makes the 2-D embedding (and every
# plot / coordinate window derived from it) reproducible across kernel restarts.
tsne = TSNE(n_components=2, perplexity=80, random_state=42)

In [None]:
# Embed `vecs` with the configured TSNE (n_components=2); replaces any `res` loaded from disk.
res = tsne.fit_transform(vecs)

In [None]:
# pd.to_pickle(res, 'tess_tsne.pkl')

### TSNE Post Clustering

In [None]:
# Load the saved embedding into `res2`; the following cells rescale its first two columns in place.
res2 = pd.read_pickle('tess_tsne.pkl')

In [None]:
# Show the dimensions of the loaded embedding.
res2.shape

In [None]:
# res2 = res2/res2.max(0)

In [None]:
# Pull out the two embedding coordinates (columns 0 and 1 of res2).
x1s, x2s = res2[:, 0], res2[:, 1]

In [None]:
# Min-max scale each coordinate: subtract the minimum, divide by the range (max - min).
x1s = (x1s - x1s.min()) / np.ptp(x1s)
x2s = (x2s - x2s.min()) / np.ptp(x2s)

In [None]:
# Store the rescaled coordinates back into columns 0 and 1 of res2 (in place).
res2[:, 0], res2[:, 1] = x1s, x2s

In [None]:
# Rebind `alg` for clustering the 2-D embedding; the commented lines are
# alternative configurations, only the uncommented assignment is active.
# alg = DBSCAN(min_samples=50, n_jobs=-1, algorithm='ball_tree')
# alg = OPTICS(min_samples=50, metric='minkowski', p=2, cluster_method='xi')
# alg = KMeans(n_clusters=10)
alg = hdbscan.HDBSCAN(min_cluster_size=10)

In [None]:
# Cluster the min-max normalised embedding prepared in this section (res2).
# The original passed `res`, so the normalisation above was never used and
# `clusters2` did not correspond to the res2 coordinates it is paired with.
clusters2 = alg.fit_predict(res2)

In [None]:
# Largest value in `clusters2`.
clusters2.max()

### Summary

In [None]:
# Directory prefix for the .npy files, and one path per key of `data` (in dict order).
indir = 'tess/16_17/z_normalized/'
filenames = [indir + stem + '.npy' for stem in data]

In [None]:
# Display the current value of `clusters` (the 'random' sentinel or the fitted labels).
clusters

In [None]:
# Build the summary frame: one row per file with its 2-D embedding coordinates
# and a cluster label.
#
# `clusters` is either the sentinel string 'random' (no clustering was run) or
# the label array returned by fit_predict. Comparing an array to a string with
# `==` is element-wise, so `if clusters == 'random'` raises "truth value of an
# array is ambiguous" once labels exist; check the type first instead.
#
# Alternative sources kept for reference:
# df = pd.DataFrame(zip(res2[:,0], res2[:,1], clusters2),
#                   columns = ['x1', 'x2', 'cluster'])
if isinstance(clusters, str) and clusters == 'random':
    print(clusters)
    df = pd.DataFrame(zip(filenames, res[:,0], res[:,1]),
                      columns = ['file','x1', 'x2'])
    df['cluster'] = 'random'
else:
    # Labels are available: attach them so `df` is always defined for the cells below.
    df = pd.DataFrame(zip(filenames, res[:,0], res[:,1], clusters),
                      columns = ['file', 'x1', 'x2', 'cluster'])

In [None]:
# Scatter every embedded point in red on a 15x10 figure at 600 dpi,
# hide the axes, and export the figure to 'tess_tsne.pdf'.
plt.figure(dpi=600, figsize=(15, 10))
# Coloured-by-cluster variant:
# sns.scatterplot(x='x1', y='x2', hue='cluster', data=df,
#                 palette=sns.color_palette("Paired", df.cluster.nunique()))
ax = sns.scatterplot(data=df, x='x1', y='x2', color='r', alpha=0.6, s=5)
ax.set_axis_off()
plt.savefig('tess_tsne.pdf')
# plt.savefig('tess_tsne.svg')

In [None]:
# Add/reset the flag column with 0 for every row.
df['has_cluster'] = 0

In [None]:
# Flag every row whose cluster label is not -1.
# A single .loc call on the frame itself replaces the chained
# `df['has_cluster'].loc[mask] = 1`, which may assign into a temporary object
# (SettingWithCopyWarning; silently a no-op under pandas copy-on-write).
df.loc[df['cluster'] != -1, 'has_cluster'] = 1

In [None]:
# Same red scatter of all points on a 15x10 figure, this time with the axes shown.
plt.figure(figsize=(15, 10))
# Coloured-by-cluster variant:
# sns.scatterplot(x='x1', y='x2', hue='cluster', data=df,
#                 palette=sns.color_palette("Paired", df.cluster.nunique()))
ax = sns.scatterplot(data=df, x='x1', y='x2', color='r', alpha=0.6, s=5)
ax.axis('on')
# plt.savefig('tess_tsne.pdf')

### Random plots

In [None]:
import random

In [None]:
def plot_random(save=True, plot_bounds=None):
    """Plot the series of one randomly chosen row of the notebook-level `df`.

    Parameters
    ----------
    save : bool, default True
        When true, write the figure to 'tess_tsne/<cluster>_x1_<x1>_x2_<x2>.svg'
        (coordinates rounded to 2 decimals). The directory is created if missing.
    plot_bounds : sequence of 4 numbers, optional
        (x1_min, x1_max, x2_min, x2_max). When given (and non-empty), the row is
        drawn only from points strictly inside this window; otherwise from all rows.

    Returns
    -------
    The axes object returned by `sns.lineplot`.

    Raises
    ------
    ValueError
        If no row of `df` lies inside `plot_bounds`.
    """
    import os  # local import: only needed for creating the output directory

    if not plot_bounds:
        candidates = df
    else:
        inside_x1 = (df['x1'] > plot_bounds[0]) & (df['x1'] < plot_bounds[1])
        inside_x2 = (df['x2'] > plot_bounds[2]) & (df['x2'] < plot_bounds[3])
        candidates = df.loc[inside_x1 & inside_x2]

    # Fail with an explicit message instead of pandas' generic sampling error.
    if candidates.empty:
        raise ValueError('no points available to sample for plot_bounds=%r' % (plot_bounds,))

    s = candidates.sample(n=1)
    file = s['file'].values[0]
    cluster = s['cluster'].values[0]
    x1 = np.round(s['x1'].values[0], 2)
    x2 = np.round(s['x2'].values[0], 2)
    outstr = str(cluster)+'_x1_'+str(x1)+'_x2_'+str(x2)

    # Element 1 of the stored array is the series that gets plotted.
    # Named `series` so the notebook-level `data` dict is not shadowed.
    series = np.load(file)[1]

    plt.figure(figsize=(5,5))
    ax = sns.lineplot(x=np.arange(len(series)), y=series, color='k', alpha=0.8)
    ax.set(yticks=[], xticks=[])
    if save:
        # savefig does not create missing directories.
        os.makedirs('tess_tsne', exist_ok=True)
        plt.savefig('tess_tsne/'+outstr+'.svg')
    return ax


In [None]:
# Window handed to plot_random, read there as (x1_min, x1_max, x2_min, x2_max) with strict inequalities.
# NOTE(review): these values presumably refer to un-normalised embedding coordinates — confirm.
plot_bounds = (-50,-40, -10, 0)

In [None]:
# Draw and save five independently sampled series from inside plot_bounds.
for _ in range(5):
    plot_random(save=True, plot_bounds=plot_bounds)

### TS Stats / TSNE Hue

In [None]:
# Placeholder columns for the per-series statistics filled in by the next cell.
# Initialised as floats: the statistics are floating point, and writing floats
# into integer columns is an incompatible-dtype assignment (deprecated in pandas).
df['dip'] = 0.0
df['maximums'] = 0.0
df['max_fluctuation'] = 0.0
df['variance'] = 0.0

In [None]:
# Compute simple per-series statistics for every file listed in df.
# Results are collected in lists and assigned as whole columns afterwards,
# which avoids per-cell df.at writes and their dtype pitfalls. The loaded
# array is named `series` so the notebook-level `data` dict is not clobbered
# (the `data.keys()` cell above must stay re-runnable).
dips, maxvals, maxflucs, variances = [], [], [], []
for path in tqdm(df['file'], total=len(df)):
    series = np.load(path)[1]  # element 1 of the stored array is the series analysed
    # features
    lowest = series.min()
    highest = series.max()
    dips.append(lowest)
    maxvals.append(highest)
    maxflucs.append(highest - lowest)
    variances.append(series.var())

df['dip'] = np.asarray(dips, dtype=float)
df['maximums'] = np.asarray(maxvals, dtype=float)
df['max_fluctuation'] = np.asarray(maxflucs, dtype=float)
df['variance'] = np.asarray(variances, dtype=float)

In [None]:
# plt.figure(figsize=(7,5))
# # sns.scatterplot(x='x1', y='x2', hue='cluster', data=df, 
# #                 palette=sns.color_palette("Paired", df.cluster.nunique()))
# ax=sns.scatterplot(x='x1', y='x2', data=df, s=5, alpha=1.0, hue='dip')
# ax.axis('off')

### Cluster plot

In [None]:
# For each cluster label, pick one file path at random; result maps label -> path.
samples = dict(
    df.groupby('cluster')['file'].apply(lambda paths: random.choice(paths.tolist()))
)

In [None]:
# Load the representative series (element 1 of each .npy file) for every cluster label.
d = {}
for cluster, file in samples.items():
    entry = df[df['file'] == file]
    # Coordinates rounded to whole numbers; `outstr` is only assembled here and
    # is not used by any later visible cell.
    x1 = np.round(entry['x1'].values[0], 0)
    x2 = np.round(entry['x2'].values[0], 0)
    outstr = '_'.join([str(cluster), 'x1', str(x1), 'x2', str(x2)])
    d[cluster] = np.load(file)[1]

In [None]:
# One panel per cluster, each showing that cluster's representative series from `d`.
#
# `d` is keyed by cluster label, not by position, so iterate over its keys
# instead of range(len(d)): the positional loop skipped cluster 0 and raised
# KeyError on the last index whenever the label -1 was present. Label -1 (the
# value treated as "no cluster" elsewhere in this notebook) is left out.
plot_labels = [label for label in d if label != -1]
if plot_labels:
    # squeeze=False keeps `axes` an array even when there is a single panel.
    f, axes = plt.subplots(len(plot_labels), 1, figsize=(4, 25),
                           sharey=True, squeeze=False)
    axes = axes.ravel()
    for ax, label in zip(axes, plot_labels):
        ax.set_xticks(ticks=[])
        ax.set_yticks(ticks=[])
        sns.lineplot(x=np.arange(len(d[label])), y=d[label], color='b', ax=ax)