In [None]:
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import make_blobs, make_circles
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.cluster import KMeans, AgglomerativeClustering
import numpy as np
from silhouette import make_figure, make_silhouette

In [None]:
# Synthetic benchmark: 500 points in 2000 dimensions drawn from five Gaussian blobs,
# the third of which is four times wider than the others.
n_clusters = 5
data, cluster_labels = make_blobs(
    n_samples=500,
    n_features=2000,
    centers=n_clusters,
    center_box=(-10.0, 10.0),
    cluster_std=[1, 1, 4, 1, 1],
    random_state=42,
)
# Shift the whole matrix up by the absolute value of its global minimum.
data = data + np.abs(data.min())

In [None]:
# Plot the synthetic blobs with their generating labels. make_figure comes from the
# local `silhouette` module; what exactly it draws is not visible here — confirm.
make_figure(data, cluster_labels)

In [None]:
# Distance metrics of interest.
# NOTE(review): the make_silhouette calls in the visible cells pass their own literal
# metric lists instead of this variable, so it is currently unused.
metrics = ['euclidean', 'correlation', 'cosine']

# Null model from data: Gaussian blobs drawn around the per-tissue medians with the per-tissue standard deviations

In [None]:
import pandas as pd
import cloudpickle as pickle

In [None]:
# Expression table; its first column becomes the row index and its columns are the
# sample ids (they are matched against SAMPID further down).
df = pd.read_csv("../topics/datasets/gtex10/mainTable.csv", index_col=0)
# Per-sample annotation table, indexed by sample id.
df_files = pd.read_csv("../topics/datasets/gtex10/files.dat", index_col="SAMPID")

In [None]:
# log2(x + 1) of the expression table with samples as rows, annotated with each
# sample's tissue (SMTS) and grouped by tissue.
# np.log2 is applied to the whole frame at once: DataFrame.applymap with a per-element
# lambda is deprecated since pandas 2.1 and runs a Python call per cell.
df_tissue_gb = np.log2(df.transpose() + 1).join(df_files.loc[:, "SMTS"]).groupby("SMTS")
# NOTE: despite its name, `means` holds the per-tissue *median* of every feature.
means = df_tissue_gb.median().values
# Per-tissue sample standard deviation of every feature (NaN for a tissue that has a
# single sample, since pandas uses ddof=1).
stds = df_tissue_gb.std().values

In [None]:
# One Gaussian blob per tissue, centred on that tissue's row of `means`, with as many
# synthetic samples in total as there are columns in `df`.
# NOTE(review): `stds` is 2-D (one row per tissue), so each blob gets a per-feature
# scale vector. make_blobs documents cluster_std as one float per cluster — confirm
# the row-wise broadcasting for the installed scikit-learn version.
n_clusters = len(means)
data, cluster_labels = make_blobs(
    n_samples=df.shape[1],
    n_features=means.shape[1],
    centers=means,
    cluster_std=stds,
    random_state=42,
)

In [None]:
# The synthetic matrix (samples x features) must be the transpose-shape of the real table.
assert data.shape == df.shape[::-1]

In [None]:
# Load a previously pickled collection of figures; it is passed to make_silhouette
# through its `figs` argument. `pickle` here is the cloudpickle module (import alias).
# NOTE(review): unpickling can execute arbitrary code — only load figs.pkl if it comes
# from a trusted run. The file is not written anywhere in the visible cells;
# presumably another notebook produces it — confirm.
with open("figs.pkl", "rb") as file:
    figs = pickle.load(file)

In [None]:
def orange_color():
    """Endless generator that produces the string "orange" on every step."""
    colour = "orange"
    while True:
        yield colour


# Single generator instance passed to make_silhouette as `color_iterator`
# (the variable name keeps the spelling used by the other cells).
orange_itarator = orange_color()

In [None]:
# Silhouette analysis of the Gaussian null model, tagged "SMTS+null" and drawn in
# orange onto the figures loaded from figs.pkl.
# Positional arguments: data matrix, run tag, tissue names (sorted groupby keys),
# per-sample labels, number of clusters (labels are 0-based, hence max + 1).
# Parameter meanings are inferred from the values passed — confirm against
# silhouette.make_silhouette.
make_silhouette(
    data,
    "SMTS+null",
    df_tissue_gb.count().index,
    cluster_labels,
    cluster_labels.max() + 1,
    metrics=["euclidean", "cosine", "correlation"],
    color_iterator=orange_itarator,
    figs=figs,
)

In [None]:
# Display the first figure of the collection.
figs[0]

# Tail space: uniform null model spanning (and slightly exceeding) the observed range of every feature

In [None]:
# Per-feature extremes of the log-transformed data: take the max/min inside every
# tissue first, then across tissues (axis 0).
maxs = df_tissue_gb.max().max(axis=0).to_numpy()
mins = df_tissue_gb.min().min(axis=0).to_numpy()

In [None]:
# Preview of one synthetic sample: one uniform draw per feature between that feature's
# observed minimum and maximum. A single broadcast call replaces the Python loop of
# scalar draws; the result is shown as an array, which numpy truncates when long.
np.random.uniform(mins, maxs)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import multiprocessing as mp

In [None]:
# 3-nearest-neighbour classifier trained on the log2(x + 1) expression of every sample
# (rows = samples). The transform is vectorized with np.log2 instead of
# DataFrame.applymap, which is deprecated since pandas 2.1 and slow.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(
    np.log2(df.transpose() + 1).values,
    # Integer tissue code per sample: position of its SMTS label in the sorted array
    # of unique labels, taken in the column order of `df`.
    np.unique(df_files.reindex(index=df.columns)["SMTS"], return_inverse=True)[1],
)

In [None]:
# Uniform "tail" null model: one synthetic sample per column of `df`, each feature
# drawn uniformly from [0.9 * min, 1.1 * max] of that feature.
# A single broadcast draw replaces n_samples x n_features scalar np.random.uniform
# calls, and the generator is seeded so the cell is reproducible.
# `data` is now a 2-D ndarray rather than a list of lists; np.array(data) and
# knn.predict(data) accept both.
data = np.random.default_rng(42).uniform(
    np.asarray(mins) * 0.9,
    np.asarray(maxs) * 1.1,
    size=(df.shape[1], len(mins)),
)

In [None]:
# Label every synthetic sample with the tissue code predicted by the fitted kNN classifier.
cluster_labels = knn.predict(data)

In [None]:
# Which tissue codes were actually predicted for the synthetic samples.
np.unique(cluster_labels)

In [None]:
# Sanity check on the dimensions of the synthetic matrix.
np.array(data).shape

In [None]:
# Silhouette analysis of the uniform samples labelled by kNN, tagged "SMTS+tail" and
# drawn in orange. No `figs` argument is passed here.
# NOTE(review): the cluster count is max predicted code + 1, which is smaller than the
# number of tissue names whenever the highest codes are never predicted — confirm that
# make_silhouette copes with that.
make_silhouette(
    np.array(data),
    "SMTS+tail",
    df_tissue_gb.count().index,
    cluster_labels,
    cluster_labels.max() + 1,
    metrics=["euclidean", "cosine", "correlation"],
    color_iterator=orange_itarator,
)

# Compare with MNIST

In [None]:
from tensorflow.keras.datasets import mnist

In [None]:
# Keep only the training split of MNIST; the test split is discarded.
# Note that this binds `_`, which shadows IPython's last-output variable.
(x_train, y_train), (_, _) = mnist.load_data()

In [None]:
# Keep the first 100 training images of each digit 0-9, in digit order, and flatten
# every image into one row (1000 rows in total).
x_train = np.concatenate(
    [x_train[y_train == digit][:100] for digit in range(10)]
).reshape((1000, -1))

# Matching labels. This must run after x_train is rebuilt, because the selection
# above still needs the full, original y_train.
y_train = np.concatenate(
    [y_train[y_train == digit][:100] for digit in range(10)]
)

In [None]:
# Three fresh figures (each created together with one Axes), replacing any `figs`
# defined earlier. Presumably one figure per metric passed to make_silhouette — confirm.
figs = [plt.subplots()[0] for _ in range(3)]

In [None]:
# Silhouette analysis of the real MNIST subset with its true digit labels, tagged
# "mnist" and drawn onto the freshly created figures.
make_silhouette(
    x_train,
    "mnist",
    np.unique(y_train),
    y_train,
    10,
    metrics=["euclidean", "cosine", "correlation"],
    figs=figs,
)

In [None]:
# One row per selected MNIST image (one column per pixel) plus its digit label.
df_mnist = pd.DataFrame(data=x_train).assign(label=y_train)
df_mnist_gb = df_mnist.groupby("label")
# Per-digit statistics of every pixel, exported as plain ndarrays like the GTEx ones.
# This matters: make_blobs iterates over cluster_std, and iterating a DataFrame yields
# its column labels (0, 1, 2, ...) instead of its rows, so a DataFrame of stds would
# silently turn into the scalar scales 0..9.
# NOTE: despite its name, `means` holds the per-digit *median* of every pixel.
means = df_mnist_gb.median().values
stds = df_mnist_gb.std().values

In [None]:
# Dimensions of the per-digit standard-deviation table.
stds.shape

In [None]:
# Gaussian null model for MNIST: one blob per digit, centred on that digit's row of
# `means`, with as many samples in total as the real subset.
# centers and cluster_std are converted to ndarrays first: make_blobs loops over
# cluster_std, and looping over a DataFrame yields its column labels rather than its
# rows, which would replace the per-pixel stds with the scalars 0, 1, 2, ...
data, cluster_labels = make_blobs(
    n_samples=x_train.shape[0],
    n_features=x_train.shape[1],
    centers=np.asarray(means),
    cluster_std=np.asarray(stds),
    random_state=42,
)

In [None]:
# Silhouette analysis of the MNIST Gaussian null model, tagged "mnist+null" and drawn
# in orange onto the same figures as the real MNIST run.
make_silhouette(
    data,
    "mnist+null",
    np.unique(y_train),
    cluster_labels,
    10,
    metrics=["euclidean", "cosine", "correlation"],
    color_iterator=orange_itarator,
    figs=figs,
)

In [None]:
# Display the second figure of the collection.
figs[1]