In [29]:
import numpy as np
import seaborn as sns
import torch

import umap
import matplotlib.pyplot as plt
import pandas as pd
from community import community_louvain
from torch_geometric.utils import k_hop_subgraph,to_networkx,from_networkx
import matplotlib

import utils
import plots
from model_AE import reduction_AE
from model_GAT import Encoder,SenGAE,train_GAT
from model_Sencell import Sencell

import logging
import os
import argparse
from tqdm import tqdm

parser = argparse.ArgumentParser(description='Main program for sencells')

parser.add_argument('--output_dir', type=str, default='./outputs', help='')
parser.add_argument('--exp_name', type=str, default='', help='')

args = parser.parse_args(args=[])

args.exp_name='s5'

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

logging.basicConfig(format='%(asctime)s.%(msecs)03d [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
                    datefmt='# %Y-%m-%d %H:%M:%S')

logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger()

# Part 1: load and process data
# cell_cluster_arr在画umap的时候用
adata,cluster_cell_ls,cell_cluster_arr,celltype_names=utils.load_data()
# plots.umapPlot(adata.obsm['X_umap'],clusters=cell_cluster_arr,labels=celltype_names)

new_data,markers_index,\
sen_gene_ls,nonsen_gene_ls,gene_names=utils.process_data(adata,cluster_cell_ls,cell_cluster_arr)

print(f'cell num: {new_data.shape[0]}, gene num: {new_data.shape[1]}')

gene_cell=new_data.X.toarray().T
cell_gene=gene_cell.T
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('device: ',device)

cluster 数量： 21
celltype names: ['Macrophages', 'T cell lineage', 'Unknown', 'B cell lineage', 'Innate lymphoid cell NK', 'AT2', 'Monocytes', 'Multiciliated lineage', 'Dendritic cells', 'EC capillary', 'Mast cells', 'Fibroblasts', 'Secretory', 'EC venous', 'Lymphatic EC mature', 'AT1', 'Basal', 'EC arterial', 'Myofibroblasts', 'None', 'Submucosal Secretory']
-----------------------  ----
Macrophages              6941
T cell lineage            749
Unknown                   618
B cell lineage            374
Innate lymphoid cell NK   327
AT2                       294
Monocytes                 228
Multiciliated lineage     194
Dendritic cells           177
EC capillary              138
Mast cells                100
Fibroblasts                93
Secretory                  86
EC venous                  74
Lymphatic EC mature        68
AT1                        27
Basal                      26
EC arterial                20
Myofibroblasts             17
None                        6
Submucosal

In [2]:
%%time
def get_simi(i,my_dict):
    for j in range(i+1,cell_gene.shape[0]):
        u1=cell_gene[i]
        u2=cell_gene[j]
        # u1,u2必须是numpy.array，not tensor
        nz_u1 = u1.nonzero()[0]
        nz_u2 = u2.nonzero()[0]
        nz_inter = set(nz_u1) & set(nz_u2)
        nz_union = set(nz_u1) | set(nz_u2)
        if len(nz_inter) == 0:
            simi_score = 1 / (len(nz_union) + len(u1))
        elif len(nz_inter) == len(nz_union):
            simi_score = (len(nz_union) + len(u1) - 1) / (len(nz_union) + len(u1))
        else:
            simi_score = len(nz_inter) / len(nz_union)
        my_dict[(i,j)]=simi_score


def eucliDistance(v1,v2):
    # 计算欧氏距离
    return F.pairwise_distance(v1.view(1,-1),v2.view(1,-1),p=2)

def loss_exp(v1,v2):
    return torch.exp(-0.1*eucliDistance(v1,v2))


sim1_ls=[]
cell_gene=gene_cell.T    
results_matrix=np.zeros((cell_gene.shape[0],cell_gene.shape[0])) 


from multiprocessing import Pool,Manager
import os, time, random


print('Parent process %s.' % os.getpid())
p = Pool()
manager = Manager()
my_dict = manager.dict()
for i in tqdm(range(cell_gene.shape[0])):
    p.apply_async(get_simi, args=(i,my_dict,))

print('Waiting for all subprocesses done...')
p.close()
p.join()
print('All subprocesses done.')

Parent process 156409.


100%|██████████| 10558/10558 [00:00<00:00, 35942.79it/s]


Waiting for all subprocesses done...
All subprocesses done.
CPU times: user 3.27 s, sys: 1.38 s, total: 4.64 s
Wall time: 55min 10s


In [4]:
torch.save(my_dict,"./my_dict")

In [5]:
my_dict[(0,1)]

0.07983193277310924

In [6]:
a=torch.load("./my_dict")

In [8]:
a[(0,100)]

0.03289473684210526

In [9]:
from utils import save_objs

save_objs(my_dict,"./my_dict.pl")

obj saved ./my_dict.pl


In [None]:
# 17:17

In [10]:
for i in tqdm(range(cell_gene.shape[0])):
    for j in range(i+1,cell_gene.shape[0]):
        results_matrix[i][j]=my_dict[(i,j)]

100%|██████████| 10558/10558 [31:46<00:00,  5.54it/s] 


In [12]:
torch.save(results_matrix,"./results_matrix")

In [44]:
bool_arr=torch.tensor(1*(cell_gene==0)).to(device)

In [19]:
sim1=torch.tensor(results_matrix).to(device)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1797,  0.0000,  0.0000,  ..., -0.0000,  0.0000,  0.0000]],
       device='cuda:0', grad_fn=<MulBackward0>)

In [None]:
%%time
import torch
from torch import nn, optim
from torch.nn import functional as F
import torch.utils.data as Data
import numpy as np


class AE(nn.Module):
    def __init__(self, dim, emb_dim=128):
        super(AE, self).__init__()
        self.dim = dim
        self.fc1 = nn.Linear(dim, 512)
        self.fc2 = nn.Linear(512, emb_dim)
        self.fc3 = nn.Linear(emb_dim, 512)
        self.fc4 = nn.Linear(512, dim)

    def encode(self, x):
        h1 = F.relu(self.fc1(x))
        return F.relu(self.fc2(h1))

    def decode(self, z):
        h3 = F.relu(self.fc3(z))
        return torch.relu(self.fc4(h3))

    def forward(self, x):
        z = self.encode(x.view(-1, self.dim))
        return self.decode(z), z
    
    
feature=torch.tensor(gene_cell.T)
feature=feature.to(device)
model = AE(dim=feature.shape[1]).to(device)
ba=feature.shape[0]
loader = Data.DataLoader(feature, ba)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def loss_expmatrix(encoded):
    return torch.exp(-0.1*torch.cdist(encoded,encoded))


def get_loss1(batch_x, decoded,bool_arr):
    return torch.abs((batch_x-decoded)*bool_arr).sum()


loss_func = nn.MSELoss()
EPOCH_AE = 2000
for epoch in range(EPOCH_AE):
    embeddings = []
    # loss_ls=[]
    for _, batch_x in enumerate(loader):
        decoded, encoded = model(batch_x)
#         if epoch %2  ==0:
#             loss1 = get_loss1(batch_x, decoded,bool_arr)
#         else:
        loss1=loss_func(batch_x,decoded)
        sim2=loss_expmatrix(encoded)
        loss2=10*(sim1*(torch.triu(sim2-sim1).abs())).mean()
        print(loss1.item(),loss2.item())
        loss=loss1+loss2
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        embeddings.append(encoded)
    #     loss_ls.append(loss.item())
    # scheduler.step(np.mean(loss_ls))
    print('Epoch :', epoch, '|', 'train_loss:%.12f' % loss.data)
#     break

0.1708962321281433 0.7118785437114148
Epoch : 0 | train_loss:0.882774775840
0.16793963313102722 0.6656433636494935
Epoch : 1 | train_loss:0.833582996781
0.16152916848659515 0.6165209743689954
Epoch : 2 | train_loss:0.778050142856
0.15216512978076935 0.562801331543929
Epoch : 3 | train_loss:0.714966461325
0.14431104063987732 0.5039958455393712
Epoch : 4 | train_loss:0.648306886179
0.14084160327911377 0.44097283295416656
Epoch : 5 | train_loss:0.581814436233
0.13875281810760498 0.37817224352909107
Epoch : 6 | train_loss:0.516925061637
0.134443461894989 0.31930900843718407
Epoch : 7 | train_loss:0.453752470332
0.12806689739227295 0.26544836164683966
Epoch : 8 | train_loss:0.393515259039
0.1207534670829773 0.2179095267681499
Epoch : 9 | train_loss:0.338662993851
0.1148814782500267 0.18014597524503698
Epoch : 10 | train_loss:0.295027453495
0.11288569122552872 0.15642693121387685
Epoch : 11 | train_loss:0.269312622439
0.11277997493743896 0.14518054705959774
Epoch : 12 | train_loss:0.25796052

0.07736320793628693 0.039615282272960795
Epoch : 106 | train_loss:0.116978490209
0.07688687741756439 0.03924686288109235
Epoch : 107 | train_loss:0.116133740299
0.07651743292808533 0.03889311460374947
Epoch : 108 | train_loss:0.115410547532
0.07629422098398209 0.03854412771926128
Epoch : 109 | train_loss:0.114838348703
0.07620344310998917 0.03818856487811884
Epoch : 110 | train_loss:0.114392007988
0.07614196836948395 0.03782662188251436
Epoch : 111 | train_loss:0.113968590252
0.0760202631354332 0.037467106710646826
Epoch : 112 | train_loss:0.113487369846
0.07583703845739365 0.03711723574441723
Epoch : 113 | train_loss:0.112954274202
0.07560402899980545 0.03678309770097983
Epoch : 114 | train_loss:0.112387126701
0.07539516687393188 0.03646492724205141
Epoch : 115 | train_loss:0.111860094116
0.07521449774503708 0.03615604650510518
Epoch : 116 | train_loss:0.111370544250
0.07508029043674469 0.035849357490869295
Epoch : 117 | train_loss:0.110929647928
0.07496192306280136 0.0355393088957620

0.06441622227430344 0.02218196764963922
Epoch : 208 | train_loss:0.086598189924
0.06430891156196594 0.022118827961322536
Epoch : 209 | train_loss:0.086427739523
0.06421475112438202 0.02205673146922105
Epoch : 210 | train_loss:0.086271482594
0.06412819027900696 0.021996517376459246
Epoch : 211 | train_loss:0.086124707655
0.0640370100736618 0.021933661954285262
Epoch : 212 | train_loss:0.085970672028
0.06393861025571823 0.021881214305419984
Epoch : 213 | train_loss:0.085819824561
0.06386168301105499 0.021815057111945372
Epoch : 214 | train_loss:0.085676740123
0.06377695500850677 0.02176256948515139
Epoch : 215 | train_loss:0.085539524494
0.06372802704572678 0.02169919351831129
Epoch : 216 | train_loss:0.085427220564
0.06368138641119003 0.021660830466178446
Epoch : 217 | train_loss:0.085342216877
0.06367575377225876 0.021608980946291698
Epoch : 218 | train_loss:0.085284734719
0.06363469362258911 0.02162384504555651
Epoch : 219 | train_loss:0.085258538668
0.063595250248909 0.02154138535156

In [None]:
cell_embed=torch.cat(embeddings)
cell_embed.shape

In [None]:
import numpy as np
from sklearn.manifold import TSNE

def umapPlot(embedding,clusters=None,reduce=False,labels=None):
    # if tensor: embedding should be .cpu().detach()
    # clusters: Nxt
    # t里面存的是行的index
    if reduce:
        reducer = umap.UMAP()
#         embedding = TSNE(n_components=2, learning_rate='auto',
#                   init='random', perplexity=3).fit_transform(embedding)
        embedding = reducer.fit_transform(embedding)
    
    plt.figure(figsize=(6,6),dpi=300)
    if clusters is None:
        plt.scatter(embedding[:,0],embedding[:,1],alpha=0.5,s=5)
    else:
        for cluster,label in zip(clusters,labels):
            plt.scatter(embedding[cluster,0],embedding[cluster,1],alpha=0.5,s=5,label=label)
        plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
         
            
            
umapPlot(cell_embed.cpu().detach().numpy(),reduce=True,
         clusters=cluster_cell_ls,labels=celltype_names)

In [62]:
a=cell_embed.cpu().detach().numpy()

In [66]:
a

array([[0.2833134 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00663263, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.1639814 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0848366 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [67]:
embedding = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=3).fit_transform(torch.tensor(a))

UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('<U4'), dtype('float32')) -> None

In [17]:
torch.tensor([[1,2],[3,4]])*torch.tensor([[1,2],[3,4]])

tensor([[ 1,  4],
        [ 9, 16]])

In [3]:
%%time
import torch
from torch import nn, optim
from torch.nn import functional as F
import torch.utils.data as Data
import numpy as np


class AE(nn.Module):
    def __init__(self, dim, emb_dim=128):
        super(AE, self).__init__()
        self.dim = dim
        self.fc1 = nn.Linear(dim, 512)
        self.fc2 = nn.Linear(512, emb_dim)
        self.fc3 = nn.Linear(emb_dim, 512)
        self.fc4 = nn.Linear(512, dim)

    def encode(self, x):
        h1 = F.relu(self.fc1(x))
        return F.relu(self.fc2(h1))

    def decode(self, z):
        h3 = F.relu(self.fc3(z))
        return torch.relu(self.fc4(h3))

    def forward(self, x):
        z = self.encode(x.view(-1, self.dim))
        return self.decode(z), z
    
    
feature=torch.tensor(gene_cell.T)
feature=feature.to(device)
model = AE(dim=feature.shape[1]).to(device)
ba=5000
loader = Data.DataLoader(feature, ba)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

loss_func = nn.MSELoss()
EPOCH_AE = 2000
for epoch in range(EPOCH_AE):
    embeddings = []
    # loss_ls=[]
    for _, batch_x in enumerate(loader)	:
        decoded, encoded = model(batch_x)
        loss1 = loss_func(batch_x, decoded)
        loss2 = 0
        for i in range(batch_x.shape[0]):
            for j in range(i+1,batch_x.shape[0]):
                v1=batch_x[i]
                v2=batch_x[j]
                h1=encoded[i]
                h2=encoded[j]
                sim1=get_simi(np.array(v1.cpu()),np.array(v2.cpu()))
                sim2=loss_exp(h1,h2)
                loss2+=sim1*(sim2-sim1).abs()
        print(loss1,loss2)
        loss=loss1+loss2
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        embeddings.append(encoded)
    #     loss_ls.append(loss.item())
    # scheduler.step(np.mean(loss_ls))
    print('Epoch :', epoch, '|', 'train_loss:%.12f' % loss.data)
    break

KeyboardInterrupt: 

25000000