# https://scedar.readthedocs.io/en/latest/notebooks/mb3k-demo.html

In [None]:
# !pip install scedar

In [2]:
import sys
sys.path.append("..")

import matplotlib.pyplot as plt
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score,silhouette_score, calinski_harabasz_score
from collections import Counter
import glob2
import scedar as sce
import pandas as pd
import numpy as np
import h5py
import time
import os
# Switch matplotlib to interactive mode before showing figures.
plt.ion()
plt.show()
# IPython magics: load the autoreload extension and use mode 2 so edited
# modules are re-imported before each cell executes.
%load_ext autoreload
%autoreload 2


In [3]:
# Benchmark scedar on every HDF5 dataset of each category: build a cosine
# sample-distance matrix, embed it with t-SNE, cluster the embedding with
# MIRAC, score the clustering, and checkpoint the results to a pickle.
for category in ["balanced_data", "imbalanced_data", "real_data"]:

    path = ".."
    # Simulated categories each have their own sub-directory; the real
    # datasets all live in one directory.
    if category in ["balanced_data", "imbalanced_data"]:
        data_dir = f"{path}/R/simulated_data/{category}"
    else:
        data_dir = f"{path}/real_data"

    # Dataset name = file path without the directory prefix and ".h5" suffix.
    files = [f[len(f"{data_dir}/"):-3] for f in glob2.glob(f'{data_dir}/*.h5')]
    print(files)

    # Create the output directory up front so a missing folder is not
    # discovered only after the first (expensive) clustering run.
    out_dir = f"{path}/output/pickle_results/{category}"
    os.makedirs(out_dir, exist_ok=True)

    df = pd.DataFrame(columns=["dataset", "ARI", "NMI", "sil", "run", "time", "pred", "cal"])
    for dataset in files:
        print(f">>>>dataset {dataset}")

        # The context manager closes the HDF5 handle as soon as the arrays
        # have been copied into memory (the original never closed it).
        with h5py.File(f"{data_dir}/{dataset}.h5", "r") as data_mat:
            # Skip datasets whose first dimension exceeds 10000; checking the
            # shape on the HDF5 dataset avoids loading them at all.
            if data_mat['X'].shape[0] > 10000:
                continue
            Y = np.array(data_mat['Y'])
            X = np.array(data_mat['X'])

        # `np.int` was removed in NumPy 1.24; it was an alias of builtin `int`.
        X = np.ceil(X).astype(int)

        # Three repetitions per dataset, each seeded with its run index.
        for run in range(3):
            start = time.time()
            np.random.seed(run)
            sdm = sce.eda.SampleDistanceMatrix(X, metric='cosine', nprocs=25)

            # The return value was never used; as in the original, the
            # embedding is read back from `sdm._last_tsne`.
            # NOTE(review): presumably tsne() stores its result there -
            # confirm against the scedar API.
            sdm.tsne(perplexity=30, n_iter=3000, random_state=run)
            embedding = sdm._last_tsne

            mirac_res = sce.cluster.MIRAC(embedding, metric='euclidean',
                linkage='ward', min_cl_n=25,
                min_split_mdl_red_ratio=0.00,
                optimal_ordering=False,
                cl_mdl_scale_factor=0.80, verbose=False)

            pred = np.array(mirac_res.labs)
            # Timing covers distance matrix + t-SNE + MIRAC, not the scoring.
            elapsed = time.time() - start
            ARI = np.around(adjusted_rand_score(Y, pred), 5)
            NMI = np.around(normalized_mutual_info_score(Y, pred), 5)
            ss = silhouette_score(embedding, pred)
            cal = calinski_harabasz_score(embedding, pred)

            # Append one row per run and re-save after every run so partial
            # results survive an interrupted benchmark.
            df.loc[df.shape[0]] = [dataset, ARI, NMI, ss, run, elapsed, pred, cal]
            df.to_pickle(f"{out_dir}/{category}_scedar.pkl")

['data_1c8', 'data_-1c4', 'data_-1c8', 'data_0c4', 'data_0c8', 'data_0c16', 'data_1.5c4', 'data_1c4', 'data_1.5c8', 'data_1.5c16', 'data_-1c16', 'data_1c16']
>>>>dataset data_1c8
>>>>dataset data_-1c4
>>>>dataset data_-1c8
>>>>dataset data_0c4
>>>>dataset data_0c8
>>>>dataset data_0c16
>>>>dataset data_1.5c4
>>>>dataset data_1c4
>>>>dataset data_1.5c8
>>>>dataset data_1.5c16
>>>>dataset data_-1c16
>>>>dataset data_1c16
['data_1c8', 'data_-1c4', 'data_-1c8', 'data_0c4', 'data_0c8', 'data_0c16', 'data_1.5c4', 'data_1c4', 'data_1.5c8', 'data_1.5c16', 'data_-1c16', 'data_1c16']
>>>>dataset data_1c8
>>>>dataset data_-1c4
>>>>dataset data_-1c8
>>>>dataset data_0c4
>>>>dataset data_0c8
>>>>dataset data_0c16
>>>>dataset data_1.5c4
>>>>dataset data_1c4
>>>>dataset data_1.5c8
>>>>dataset data_1.5c16
>>>>dataset data_-1c16
>>>>dataset data_1c16
['Quake_Smart-seq2_Trachea', 'Quake_Smart-seq2_Diaphragm', 'Quake_10x_Spleen', 'Young', 'mouse_ES_cell', 'Adam', 'Quake_10x_Bladder', 'Quake_Smart-seq2_Lu

In [None]:
# Per-dataset mean of the scalar columns. `pred` holds one label array per
# row and cannot be averaged; pandas >= 2.0 no longer drops such columns
# silently. The frame was created empty (object dtype), so infer_objects()
# restores numeric dtypes before aggregating.
df.drop(columns="pred").infer_objects().groupby("dataset").mean()