In [1]:
"""
This file is a combination of data.ipynb and clustering.ipynb,
modified a bit to run all pairs of clusters
"""

import sys
sys.path.insert(0, '/gpfs/software/Anaconda3/lib/python3.6/site-packages')
import os

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

from distance import SquaredL2, L2
from neighborhood import neighbor_graph, laplacian
from correspondence import Correspondence
from stiefel import *
from kmedoids import *
import itertools

import torch
import torch.nn as nn
import torch.nn.functional as F
torch.set_default_tensor_type('torch.DoubleTensor')

In [2]:
"""Defines the neural network"""
class Net(nn.Module):
    """Fully connected network: two sigmoid hidden layers and a linear output.

    Args:
        D_in: number of features in each input row.
        H1: width of the first hidden layer.
        H2: width of the second hidden layer.
        D_out: number of features in each output row.
    """

    def __init__(self, D_in, H1, H2, D_out):
        super().__init__()
        # Layers are built in input-to-output order; with a fixed seed this
        # order determines which random draws initialise which layer.
        self.linear1 = nn.Linear(D_in, H1)
        self.linear2 = nn.Linear(H1, H2)
        self.linear3 = nn.Linear(H2, D_out)

    def forward(self, x):
        """Return the network output for the batch ``x`` (no output activation)."""
        hidden = torch.sigmoid(self.linear1(x))
        hidden = torch.sigmoid(self.linear2(hidden))
        return self.linear3(hidden)

In [14]:
def train_and_project(x1, x2, hidden1=1024, hidden2=512, out_dim=3,
                      n_neighbors=5, lr=0.00001, n_iter=200, log_every=20):
    """Jointly train two networks and return the Stiefel-projected embedding.

    One ``Net`` per data set maps its rows to ``out_dim`` dimensions. The two
    outputs are stacked, projected onto the Stiefel manifold through an SVD
    (``U @ V^T``), and both networks are trained with Adam to minimise
    ``trace(Y^T L Y)``, where ``L`` is the normalised Laplacian of a joint
    graph assembled from the two k-nearest-neighbour graphs and an identity
    correspondence between row ``i`` of ``x1`` and row ``i`` of ``x2``.

    Args:
        x1, x2: pandas DataFrames (only ``.shape`` and ``.values`` are used)
            with the same number of rows. Values are wrapped with
            ``torch.from_numpy`` without conversion, so their dtype must match
            the dtype of the model parameters.
        hidden1, hidden2: widths of the two hidden layers of each network.
        out_dim: dimensionality of the embedding.
        n_neighbors: ``k`` passed to ``neighbor_graph`` for each data set.
        lr: Adam learning rate.
        n_iter: number of optimisation steps; must be at least 1.
        log_every: print the loss and the Riemannian gradient norm every this
            many steps; ``0`` or ``None`` disables logging.

    Returns:
        numpy array of shape ``(2 * N, out_dim)`` holding the projected
        outputs, rows of ``x1`` first and rows of ``x2`` second. It is the
        projection computed in the forward pass of the final iteration, i.e.
        before the last optimiser step is applied.

    Raises:
        ValueError: if ``x1`` and ``x2`` differ in row count or ``n_iter < 1``.
    """
    n_rows = x1.shape[0]
    if x2.shape[0] != n_rows:
        raise ValueError(
            "x1 and x2 must have the same number of rows, got %d and %d"
            % (n_rows, x2.shape[0]))
    if n_iter < 1:
        raise ValueError("n_iter must be at least 1, got %r" % (n_iter,))

    model1 = Net(x1.shape[1], hidden1, hidden2, out_dim)
    model2 = Net(x2.shape[1], hidden1, hidden2, out_dim)

    x1_np = x1.values
    x2_np = x2.values

    x1 = torch.from_numpy(x1_np)
    x2 = torch.from_numpy(x2_np)

    adj1 = neighbor_graph(x1_np, k=n_neighbors)
    adj2 = neighbor_graph(x2_np, k=n_neighbors)

    # Identity correspondence: row i of x1 is paired with row i of x2.
    corr_mat = Correspondence(matrix=np.eye(n_rows)).matrix()

    # NOTE(review): the correspondence matrix sits on the diagonal blocks and
    # the neighbour graphs on the off-diagonal blocks. Manifold-alignment
    # formulations usually use [[adj1, corr], [corr.T, adj2]]. The layout is
    # kept as-is to preserve existing results -- confirm it is intended.
    w = np.block([[corr_mat, adj1],
                  [adj2, corr_mat]])

    # The degree matrix returned alongside the Laplacian is not needed here.
    L_np, _ = laplacian(w, normed=True, return_diag=True)
    L = torch.from_numpy(L_np)

    params = list(model1.parameters()) + list(model2.parameters())
    optimizer = torch.optim.Adam(params, lr=lr)

    for t in range(n_iter):
        # Forward pass through both networks, stacked row-wise.
        y1_pred = model1(x1)
        y2_pred = model2(x2)
        outputs = torch.cat((y1_pred, y2_pred), 0)

        # Project the output onto the Stiefel manifold.
        u, _, v = torch.svd(outputs, some=True)
        proj_outputs = u @ v.t()

        loss = torch.trace(proj_outputs.t() @ L @ proj_outputs)

        # Euclidean gradient of the loss w.r.t. the projected outputs only.
        # This avoids a full backward pass into the network parameters whose
        # result would be discarded; the graph is retained because it is
        # traversed again below.
        (egrad,) = torch.autograd.grad(loss, proj_outputs, retain_graph=True)

        # Project the Euclidean gradient onto the tangent space of the Stiefel
        # manifold to obtain the Riemannian gradient. It is used purely as a
        # value, so no autograd graph is recorded for it.
        with torch.no_grad():
            rgrad = proj_stiefel(proj_outputs, egrad)

        if log_every and (t + 1) % log_every == 0:
            print(t, loss.item())
            print(torch.norm(rgrad).item())

        # Backpropagate the Riemannian gradient from proj_outputs into the
        # network parameters and take an optimiser step.
        optimizer.zero_grad()
        proj_outputs.backward(rgrad)
        optimizer.step()

    return proj_outputs.detach().numpy()
    

In [23]:
# Seed every RNG in play so repeated runs give the same embeddings.
torch.manual_seed(123)
torch.backends.cudnn.deterministic = True
np.random.seed(123)

df = pd.read_csv("data/DER-22_Single_cell_expression_raw_UMI.tsv", sep='\t')
xl = pd.read_excel("data/DER-21_Single_cell_markergenes_UMI.xlsx")

# Full set of codes (20 choose 2 = 190 pairs), kept for reference:
# all_code = ['Ex1', 'Ex2', 'Ex3', 'Ex4', 'Ex5', 'Ex6', 'Ex8', 'Ex9',
#             'In1', 'In3', 'In4', 'In6', 'In7', 'In8',
#             'Endo', 'Per', 'Astro', 'Oligo', 'OPC', 'Microglia']

# Reduced set used for this run (6 choose 2 = 15 pairs).
all_code = ['Ex1', 'Ex2',
            'In1', 'In3',
            'Endo', 'Per']

# Number of k-medoids clusters computed on each joint embedding.
n_clusters = 10

os.makedirs('result', exist_ok=True)

# Rows kept for every pair: those whose index label appears in the
# 'All Clusters' column of the marker sheet. The mask does not depend on the
# pair, so it is computed once instead of twice per iteration.
marker_rows = df.index.isin(xl['All Clusters'])

# combinations() yields each unordered pair once, in the same order as the
# nested "j > i" loops it replaces.
for code_i, code_j in itertools.combinations(all_code, 2):
    print("Working on %s and %s" % (code_i, code_j))

    # Columns are selected by substring match on the code.
    i_cols = [col for col in df.columns if code_i in col]
    j_cols = [col for col in df.columns if code_j in col]

    i_data = df[i_cols].loc[marker_rows]
    j_data = df[j_cols].loc[marker_rows]

    i_log = np.log2(i_data + 1)
    j_log = np.log2(j_data + 1)

    # Create the output directory before the expensive training so that a
    # filesystem problem surfaces immediately.
    out_dir = './result/%s_%s/' % (code_i, code_j)
    os.makedirs(out_dir, exist_ok=True)

    # Project the data
    proj_outputs_np = train_and_project(i_log, j_log)

    # Save the projected data
    np.save(os.path.join(out_dir, 'proj_outputs_np.npy'), proj_outputs_np)

    # Clustering
    D = pairwise_distances(proj_outputs_np, metric='euclidean')
    M, C = kMedoids(D, n_clusters)
    C_label = np.zeros(proj_outputs_np.shape[0])
    for label in C:
        for point_idx in C[label]:
            C_label[point_idx] = label

    df_result = pd.DataFrame(proj_outputs_np)
    df_result = df_result.add_prefix('Val')

    # Tag each embedded row with the data set it came from. The repeat counts
    # must be integers (the previous shape[0]/2 produced floats, which numpy
    # refuses to cast); the first len(i_data) rows of the embedding come from
    # code_i and the remaining len(j_data) rows from code_j.
    X_or_Y = np.repeat(np.array([code_i, code_j]),
                       [len(i_data), len(j_data)], axis=0)

    df_ = pd.DataFrame({'module': C_label, 'data': X_or_Y})
    df_result = pd.concat([df_, df_result], axis=1)
    df_result.index = list(itertools.chain(i_data.index, j_data.index))

    df_result.to_csv(os.path.join(out_dir, "cluster_result.csv"))
        

Working on Ex1 and Ex2
19 2.145151110688942
tensor(1.7994, grad_fn=<NormBackward0>)
39 2.12073739383044
tensor(1.7776, grad_fn=<NormBackward0>)
59 2.1021572925253196
tensor(1.7632, grad_fn=<NormBackward0>)
79 2.0694547263835594
tensor(1.7809, grad_fn=<NormBackward0>)
99 0.7310180231797161
tensor(2.0735, grad_fn=<NormBackward0>)
119 0.45147390241411844
tensor(1.8301, grad_fn=<NormBackward0>)
139 0.39797896233800545
tensor(1.7725, grad_fn=<NormBackward0>)
159 0.3715377932907794
tensor(1.7390, grad_fn=<NormBackward0>)
179 0.3515392478972639
tensor(1.7158, grad_fn=<NormBackward0>)
199 0.3350248531290141
tensor(1.6931, grad_fn=<NormBackward0>)
Working on Ex1 and In1
19 2.280258572132821
tensor(2.0799, grad_fn=<NormBackward0>)
39 2.2208039593114997
tensor(2.0599, grad_fn=<NormBackward0>)
59 1.387271005821826
tensor(2.0974, grad_fn=<NormBackward0>)
79 1.0757994114744895
tensor(1.8858, grad_fn=<NormBackward0>)
99 0.8384946167461905
tensor(1.8588, grad_fn=<NormBackward0>)
119 0.7120986120173617

159 1.0074743230554692
tensor(1.9811, grad_fn=<NormBackward0>)
179 0.9164596008080413
tensor(1.9536, grad_fn=<NormBackward0>)
199 0.7794194236758571
tensor(1.8938, grad_fn=<NormBackward0>)
Working on In3 and Per
19 2.2793870382393906
tensor(3.7695, grad_fn=<NormBackward0>)
39 2.157831466173227
tensor(3.7248, grad_fn=<NormBackward0>)
59 1.9318998861215553
tensor(3.0894, grad_fn=<NormBackward0>)
79 1.4192935817379333
tensor(2.9497, grad_fn=<NormBackward0>)
99 1.0391966721576582
tensor(3.0362, grad_fn=<NormBackward0>)
119 0.9172978997743522
tensor(3.1581, grad_fn=<NormBackward0>)
139 0.8844425551317834
tensor(3.1410, grad_fn=<NormBackward0>)
159 0.8668259145250367
tensor(3.1357, grad_fn=<NormBackward0>)
179 0.8489618130285642
tensor(3.1287, grad_fn=<NormBackward0>)
199 0.8292700672407056
tensor(3.1266, grad_fn=<NormBackward0>)
Working on Endo and Per
19 2.2405776047147077
tensor(4.3120, grad_fn=<NormBackward0>)
39 2.13333065070713
tensor(4.3718, grad_fn=<NormBackward0>)
59 2.0911680702536