Original implementation of the Contrastive-sc method
(https://github.com/ciortanmadalina/contrastive-sc)

In [None]:
# !pip install scikit-learn -U  # Upgrading fixes "AttributeError: 'str' object has no attribute 'decode'" raised when fitting a Logistic Regression model

In [None]:
import sys
sys.path.append("..")
import argparse
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.cluster import KMeans
from sklearn import metrics

import torch
import torch.nn as nn
import copy
from tqdm.notebook import tqdm
import models
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import st_loss

import h5py
import scipy as sp
import scanpy.api as sc
from collections import Counter
import random
import utils

import pickle
import time
import train
import os
import glob2
import interpret_utils
# Turn on matplotlib interactive mode for the notebook session.
plt.ion()
plt.show()
# IPython magics: load the autoreload extension and (mode 2) reload all
# modules before executing code, so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Silence FutureWarning messages for the rest of the session.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# List the names of the simulated datasets (the ``.h5`` files without their
# directory prefix and extension) available for one data category.
path = "../"
# The other available category is "balanced_data".
category = "imbalanced_data"
data_dir = f"{path}R/simulated_data/{category}/"
files = [match[len(data_dir):-3] for match in glob2.glob(data_dir + "*.h5")]
# Last expression of the cell: displays the dataset names in the notebook.
files

In [None]:
# Gradient-based feature attribution for contrastive-sc on simulated data.
#
# For every dataset of every category this cell:
#   1. preprocesses the counts and trains the model with ``train.run``;
#   2. runs ``interpret_utils.de_analysis`` and then loads the
#      ``<dataset>_all.pkl`` results file (presumably written by that call --
#      confirm against ``interpret_utils``);
#   3. back-propagates the contrastive loss to the inputs of each predicted
#      cluster and ranks the features by saliency (mean absolute input
#      gradient) and by gradient x input;
#   4. stores the rankings, scores and timings back into the same pickle.

# Experiment configuration.
method = "contrastivesc"
run = 0  # used both as random seed and as output sub-folder
nb_features = -1
pval_cutoff = None
dropout = 0.9
lr = 0.4
layers = [200, 40, 60]
temperature = 0.07


def _rank_cluster_features(values, cluster, column, nb_features):
    """Rank the features of one cluster by ``values``, highest first.

    Args:
        values: 1-D array holding one attribution score per feature.
        cluster: Cluster label written to the ``cluster`` column.
        column: Name of the score column in the returned frame.
        nb_features: Upper bound of the slice applied to the ranking.

    Returns:
        A tuple ``(ranked, frame)``: the ranked feature indices as strings and
        a DataFrame with the columns ``x``, ``column`` and ``cluster``.
    """
    # NOTE(review): a negative ``nb_features`` (this cell uses -1) makes the
    # slice drop the lowest-ranked feature(s) instead of keeping everything;
    # confirm that this is intended before changing it.
    order = np.argsort(values)[::-1][:nb_features]
    ranked = order.astype(str)
    frame = pd.DataFrame({
        "x": ranked,
        column: values[order],
        "cluster": cluster,
    })
    return ranked, frame


for category in ["balanced_data", "imbalanced_data"]:
    data_dir = f"{path}R/simulated_data/{category}/"
    files = glob2.glob(f"{data_dir}*.h5")
    files = [f[len(data_dir):-3] for f in files]

    for dataset in files:
        print(f">>>>> Data {dataset}")
        # Keep the HDF5 file open only as long as it is needed: de_analysis
        # receives the handle, everything afterwards works on in-memory data.
        with h5py.File(f"{data_dir}{dataset}.h5", "r") as data_mat:
            print(f"Dropout {data_mat['dropout'][()][0]}")
            X = np.array(data_mat['X'])
            Y = np.array(data_mat['Y'])
            cluster_number = np.unique(Y).shape[0]
            nb_genes = 1500
            X, idx = train.preprocess(X, nb_genes=nb_genes, return_idx=True)

            # Seed every source of randomness so that a run is reproducible.
            torch.manual_seed(run)
            torch.cuda.manual_seed_all(run)
            np.random.seed(run)
            random.seed(run)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

            dresults, model = train.run(
                X,
                cluster_number,
                dataset,
                Y=Y,
                nb_epochs=300,
                lr=lr,
                temperature=temperature,
                dropout=dropout,
                layers=layers,
                save_to=f"{path}output/{category}/{run}/",
                save_pred=True)
            clusters = dresults["kmeans_pred"]
            # The raw matrix is read again from the file on purpose: it is not
            # visible here whether train.preprocess modifies its input.
            interpret_utils.de_analysis([X, np.array(data_mat['X'])],
                                        ["proc_", "full_"],
                                        data_mat,
                                        idx,
                                        method,
                                        dataset,
                                        category,
                                        clusters,
                                        nb_features=nb_features,
                                        run=run,
                                        pval_cutoff=pval_cutoff)

        folder = f"../output/interpretability/{category}/{method}"
        write_to = f"{folder}/{dataset}"

        # NOTE: pickle must only be used on files produced by this pipeline;
        # never load a pickle coming from an untrusted source.
        with open(f"{write_to}_all.pkl", 'rb') as f:
            results = pickle.load(f)

        start = time.time()
        device = train.get_device()
        criterion_rep = st_loss.SupConLoss(temperature=temperature)
        # The optimizer is only used to reset the parameter gradients; no
        # optimisation step is taken in this cell.
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

        results["features"]["saliency"] = []
        results["features"]["grad_x_input"] = []
        saliency_frames = []
        grad_x_input_frames = []

        model.train()
        grad_time = 0  # time spent on grad x input only
        for c in np.sort(np.unique(results["meta"]["clusters"])):
            ii = np.where(results["meta"]["clusters"] == c)[0]
            # Two copies of the cluster's cells form the pair of views fed to
            # the contrastive loss; gradients are tracked on both inputs.
            input1 = torch.FloatTensor(X[ii]).to(device)
            input2 = torch.FloatTensor(X[ii]).to(device)
            input1.requires_grad = True
            input2.requires_grad = True
            anchors_output = model(input1, dropout=0)
            neighbors_output = model(input2, dropout=0)

            features = torch.cat(
                [anchors_output.unsqueeze(1),
                 neighbors_output.unsqueeze(1)],
                dim=1)
            total_loss = criterion_rep(features)

            optimizer.zero_grad()
            total_loss.backward()

            # Average of the absolute input gradients of the two views.
            g1 = np.mean(np.array([np.abs(input1.grad.cpu().numpy()),
                                   np.abs(input2.grad.cpu().numpy())]),
                         axis=0)

            t1 = time.time()
            grad_x_input = (g1 * X[ii]).mean(0)
            grad_time += time.time() - t1

            saliency = np.abs(g1.mean(0))
            s_r, s = _rank_cluster_features(saliency, c, "saliency",
                                            nb_features)
            results["features"]["saliency"].append(s_r)
            saliency_frames.append(s)

            t1 = time.time()
            gi_r, gi = _rank_cluster_features(grad_x_input, c, "grad_x_input",
                                              nb_features)
            results["features"]["grad_x_input"].append(gi_r)
            grad_x_input_frames.append(gi)
            grad_time += time.time() - t1
        end = time.time()

        # Saliency time excludes the extra work that only grad x input needs.
        results["time"]["saliency"] = end - start - grad_time
        results["time"]["grad_x_input"] = end - start

        # Combine the per-cluster frames so that the stored scores (and the
        # per-cluster ranks) cover every cluster, not just the last one.
        saliency_scores = pd.concat(saliency_frames, ignore_index=True)
        grad_x_input_scores = pd.concat(grad_x_input_frames,
                                        ignore_index=True)
        saliency_scores["rank"] = saliency_scores.groupby(
            "cluster")["saliency"].rank(method="dense", ascending=False)
        grad_x_input_scores["rank"] = grad_x_input_scores.groupby(
            "cluster")["grad_x_input"].rank(method="dense", ascending=False)

        results["scores"]["saliency"] = saliency_scores
        results["scores"]["grad_x_input"] = grad_x_input_scores
        with open(f"{write_to}_all.pkl", 'wb') as f:
            pickle.dump(results, f, pickle.HIGHEST_PROTOCOL)
