In [None]:
# IPython shell escape: print the directory the kernel is running in.
!pwd

In [None]:
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import os
import pandas as pd
import seaborn as sns
import numpy
from tqdm import tqdm
np.random.seed(1)
from minisom import MiniSom
from sklearn.cluster import KMeans
import itertools
import gc
import networkx as nx
from collections import Counter
from matplotlib.gridspec import GridSpec
import matplotlib.cm as cm
from time import time
from sklearn import metrics


# import warnings
# warnings.filterwarnings("ignore")

In [None]:

from minisom import MiniSom
import bisect
from itertools import combinations

class ConsensusCluster:
    """
    Implementation of Consensus clustering, following the paper
    https://link.springer.com/content/pdf/10.1023%2FA%3A1023949509487.pdf

    Args:
      * cluster -> clustering class; it is instantiated as
                   ``cluster(n_clusters=k)`` and must offer ``fit_predict``
      * L -> smallest number of clusters to try
      * K -> exclusive upper bound on the number of clusters to try
             (``fit`` iterates over ``range(L, K)``, so K itself is not tried)
      * H -> number of resamplings for each cluster number
      * resample_proportion -> fraction of the rows drawn in each resampling

    Attributes (all ``None`` until ``fit`` has run):
      * Mk -> consensus matrices, one per tried k
              (shape = (K-L, data.shape[0], data.shape[0]));
              every consensus matrix is retained, as specified in the paper
      * Ak -> area under the consensus CDF for each tried k
              (see paper: section 3.3.1. Consensus distribution.)
      * deltaK -> changes in area under the CDF; entry j belongs to k = L + j
              (see paper: section 3.3.1. Consensus distribution.)
      * bestK -> number of clusters that was found to be best
    """

    def __init__(self, cluster, L, K, H, resample_proportion=0.5):
        assert 0 <= resample_proportion <= 1, "proportion has to be between 0 and 1"
        self.cluster_ = cluster
        self.resample_proportion_ = resample_proportion
        self.L_ = L
        self.K_ = K
        self.H_ = H
        self.Mk = None
        self.Ak = None
        self.deltaK = None
        self.bestK = None

    def _internal_resample(self, data, proportion):
        """
        Draws a random subset of rows without replacement.
        Args:
          * data -> (examples,attributes) format
          * proportion -> percentage to sample
        Returns:
          * (indices into data, the selected rows of data)
        """
        n_samples = data.shape[0]
        resampled_indices = np.random.choice(
            n_samples, size=int(n_samples*proportion), replace=False)
        return resampled_indices, data[resampled_indices, :]

    def fit(self, data, verbose=False):
        """
        Fits a consensus matrix for each number of clusters, then derives the
        areas under the consensus CDFs, their changes, and bestK.
        Args:
          * data -> (examples,attributes) format
          * verbose -> should print or not
        """
        n_samples = data.shape[0]
        Mk = np.zeros((self.K_-self.L_, n_samples, n_samples))
        # Is[i, j] counts how often items i and j were drawn together
        Is = np.zeros((n_samples,)*2)
        for k in range(self.L_, self.K_):  # for each number of clusters
            i_ = k-self.L_
            if verbose:
                print("At k = %d, aka. iteration = %d" % (k, i_))
            consensus = Mk[i_]  # view: in-place updates land in Mk
            for h in range(self.H_):  # resample H times
                if verbose:
                    print("\tAt resampling h = %d, (k = %d)" % (h, k))
                resampled_indices, resample_data = self._internal_resample(
                    data, self.resample_proportion_)
                Mh = np.asarray(
                    self.cluster_(n_clusters=k).fit_predict(resample_data))
                # Labels are positions within the resample, so they must be
                # mapped back through resampled_indices before touching the
                # (n_samples, n_samples) matrices. Both counters are updated
                # on the same symmetric index grid so numerator and
                # denominator always refer to the same pair.
                same_cluster = Mh[:, None] == Mh[None, :]
                rows = resampled_indices[:, None]
                cols = resampled_indices[None, :]
                consensus[rows, cols] += same_cluster
                Is[rows, cols] += 1
            # consensus = co-clustered count / co-sampled count; pairs that
            # were never drawn together keep a consensus of 0
            np.divide(consensus, Is, out=consensus, where=Is > 0)
            np.fill_diagonal(consensus, 1)  # always with self
            Is.fill(0)  # reset counter
        self.Mk = Mk
        # fits areas under the CDFs
        self.Ak = np.zeros(self.K_-self.L_)
        for i, m in enumerate(Mk):
            # Consensus values live in [0, 1]; fixing the range keeps the
            # areas comparable across k.
            hist, bins = np.histogram(m.ravel(), bins=10, range=(0, 1))
            # empirical CDF evaluated at each bin's right edge
            cdf = np.cumsum(hist) / m.size
            self.Ak[i] = np.sum(cdf * np.diff(bins))
        # fits differences between areas under CDFs
        # (for k == 2 the paper defines delta as the area itself)
        self.deltaK = np.array([(Ab-Aa)/Aa if i > 2 else Aa
                                for Ab, Aa, i in zip(self.Ak[1:], self.Ak[:-1], range(self.L_, self.K_-1))])
        self.bestK = int(np.argmax(self.deltaK)) + \
            self.L_ if self.deltaK.size > 0 else self.L_

    def predict(self):
        """
        Predicts on the consensus matrix, for best found cluster number.
        The clusterer is fitted on 1 - consensus matrix of bestK.
        """
        assert self.Mk is not None, "First run fit"
        return self.cluster_(n_clusters=self.bestK).fit_predict(
            1-self.Mk[self.bestK-self.L_])

    def predict_data(self, data):
        """
        Predicts on the data, for best found cluster number
        Args:
          * data -> (examples,attributes) format
        """
        assert self.Mk is not None, "First run fit"
        return self.cluster_(n_clusters=self.bestK).fit_predict(
            data)

    def set_bestK(self, k):
        """Overrides the cluster number used by predict / predict_data."""
        self.bestK = k

In [None]:
# Load the demographic spreadsheet into a DataFrame and display it.
dem_info = pd.read_excel(
    '/work/zg78/brain-flow-data/All_demographic_data_abridged dm.xlsx'
)
dem_info

In [None]:
# Collapse the four-valued 'Group' column into a binary label:
# 'dual' and 'coc' -> 1, 'mj' and 'non' -> 0.
# Any other value in 'Group' becomes NaN under Series.map.
dem_info['binary_druguse'] = dem_info['Group'].map(
    {
        'dual': 1,
        'coc': 1,
        'mj': 0,
        'non': 0,
    }
)
dem_info

In [None]:
# Class balance of the binary label: (distinct values, occurrences of each).
print(np.unique(dem_info['binary_druguse'].to_numpy(), return_counts=True))

In [None]:
def get_classes(som_, x_test, k, class_map=None):
    """
    Turn one event matrix into a vector of per-cluster event fractions.

    Each row of ``x_test`` is standardized with the column mean/std of
    ``x_test`` itself, sent to its winning SOM node via ``som_.winner``, and
    the node is translated to a cluster label through ``class_map``.

    Args:
      * som_ -> object exposing ``winner(row)``; the returned value is used
                to index ``class_map``
      * x_test -> (events, attributes) array
      * k -> number of clusters; length of the returned vector
      * class_map -> array of integer cluster labels indexed by SOM node.
                     When omitted, the notebook-level ``map_class`` is used,
                     which is how the function behaved before this
                     parameter existed.

    Returns:
      * array of length k; entry c is the fraction of rows assigned to
        cluster c
    """
    if class_map is None:
        class_map = map_class  # fall back to the notebook global

    col_std = np.std(x_test, axis=0)
    # A constant column would otherwise give 0/0 = NaN and poison every
    # distance the SOM computes; dividing by 1 leaves that column at 0.
    col_std = np.where(col_std == 0, 1.0, col_std)
    x_test = (x_test - np.mean(x_test, axis=0)) / col_std

    # from the location info get cluster info
    label_list = [class_map[som_.winner(row)] for row in x_test]

    label_counts = np.zeros(k)
    label, counts = np.unique(
        np.asarray(label_list, dtype=int), return_counts=True)
    label_counts[label] = counts

    return label_counts / x_test.shape[0]


npy_path = '/work/zg78/brain-flow-data/PBMC_viable_npy/'

cell_events_per_subset = 10000


def _subsample_events(path, n_events):
    """Load one .npy event matrix and draw up to n_events rows without replacement."""
    events = np.load(path)
    # Files with fewer than n_events rows contribute all of their rows
    # instead of making np.random.choice raise.
    n_take = min(n_events, events.shape[0])
    keep = np.random.choice(events.shape[0], n_take, replace=False)
    return events[keep]


def _featurize_subjects(files, som_, k):
    """Build (features, labels) arrays for files via get_classes and dem_info."""
    features, labels = [], []
    for f in tqdm(files):
        # The subject id is the part of the file name before the first space.
        subject = int(f.split(' ')[0])
        # Subject 1001 is looked up under PID 2001. The comparison must be
        # int-to-int: against the string '1001' it could never be true.
        if subject == 1001:
            subject = 2001
        x = np.load(npy_path+f)
        rst = get_classes(som_, x, k)
        y = dem_info.loc[dem_info['PID'] == subject, 'binary_druguse'].values
        if y.size == 0:
            raise IndexError(
                'no row in dem_info with PID {} (file {!r})'.format(subject, f))
        features.append(rst)
        labels.append(y[0])
    return np.asarray(features), np.asarray(labels)


# NOTE(review): cut_folder is not defined anywhere in the visible notebook.
# It is resolved here, before any training, so a missing definition fails
# immediately rather than after a whole fold has been computed.
out_dir = '/work/zg78/som_folds/{}'.format(cut_folder)
os.makedirs(out_dir, exist_ok=True)

# List the directory once and sort it: os.listdir returns entries in
# arbitrary order, which would make the fold membership depend on the
# filesystem even though KFold is seeded.
all_files = sorted(os.listdir(npy_path))

kf = KFold(n_splits=5, shuffle=True, random_state=1)
for fidx, splits in enumerate(kf.split(all_files)):

    # Not used in this cell; kept in case later cells rely on these names.
    scores_train, scores_val = [], []
    covars_train, covars_val = [], []

    train_idx, val_idx = splits
    train_f = [all_files[i] for i in train_idx]
    val_f = [all_files[i] for i in val_idx]

    # Pool a fixed-size random subsample of events from every training file
    # (single concatenate instead of growing the array file by file).
    x_train_sub = np.concatenate(
        [_subsample_events(npy_path+f, cell_events_per_subset) for f in train_f],
        axis=0)

    # Standardize the pooled training events column-wise.
    mean_ = np.mean(x_train_sub, axis=0)
    std_ = np.std(x_train_sub, axis=0)
    x_train_sub = (x_train_sub - mean_) / std_

    print(x_train_sub.shape)

    np.random.shuffle(x_train_sub)

    # Train a size x size SOM on the pooled events.
    size = 100
    som = MiniSom(size, size, x_train_sub.shape[1],
                  neighborhood_function='bubble', sigma=2, learning_rate=.8, topology='hexagonal',
                  random_seed=42)

    som.pca_weights_init(x_train_sub)
    som.train_random(x_train_sub, 10000, verbose=True)
    weights = som.get_weights()
    flatten_weights = weights.reshape(size*size, x_train_sub.shape[1])

    # Cluster the SOM nodes; map_class[i, j] is the cluster of node (i, j)
    # and is the global that get_classes reads by default.
    best_k = 14
    cluster_ = KMeans(n_clusters=best_k)
    print('K:', best_k)
    flatten_class = cluster_.fit_predict(flatten_weights)
    map_class = flatten_class.reshape(size, size)

    gc.collect()

    # One feature vector (per-cluster event fractions) and label per subject.
    xs_train, ys_train = _featurize_subjects(train_f, som, best_k)
    xs_val, ys_val = _featurize_subjects(val_f, som, best_k)

    print(xs_train.shape, ys_train.shape, xs_val.shape, ys_val.shape)

    np.save(os.path.join(out_dir, 'X_train_{}.npy'.format(fidx)), xs_train)
    np.save(os.path.join(out_dir, 'y_train_{}.npy'.format(fidx)), ys_train)
    np.save(os.path.join(out_dir, 'X_val_{}.npy'.format(fidx)), xs_val)
    np.save(os.path.join(out_dir, 'y_val_{}.npy'.format(fidx)), ys_val)