In [1]:
import numpy as np
import seaborn as sns
import torch

import umap
import matplotlib.pyplot as plt
import pandas as pd
from community import community_louvain
from torch_geometric.utils import k_hop_subgraph,to_networkx,from_networkx
import matplotlib

import utils
import plots
from model_AE import reduction_AE
from model_GAT import Encoder,SenGAE,train_GAT
from model_Sencell import Sencell

import logging
import os
import argparse
from tqdm import tqdm

parser = argparse.ArgumentParser(description='Main program for sencells')

parser.add_argument('--output_dir', type=str, default='./outputs', help='')
parser.add_argument('--exp_name', type=str, default='', help='')

args = parser.parse_args(args=[])

args.exp_name='s5'

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

logging.basicConfig(format='%(asctime)s.%(msecs)03d [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
                    datefmt='# %Y-%m-%d %H:%M:%S')

logging.getLogger().setLevel(logging.DEBUG)
logger = logging.getLogger()

# Part 1: load and process data
# cell_cluster_arr在画umap的时候用
adata,cluster_cell_ls,cell_cluster_arr,celltype_names=utils.load_data()
# plots.umapPlot(adata.obsm['X_umap'],clusters=cell_cluster_arr,labels=celltype_names)

new_data,markers_index,\
sen_gene_ls,nonsen_gene_ls,gene_names=utils.process_data(adata,cluster_cell_ls,cell_cluster_arr)

print(f'cell num: {new_data.shape[0]}, gene num: {new_data.shape[1]}')

gene_cell=new_data.X.toarray().T
cell_gene=gene_cell.T
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('device: ',device)

# 2022-12-02 16:12:43.094 [DEBUG] [attrs.py:77] Creating converter from 3 to 5


cluster 数量： 21
celltype names: ['Macrophages', 'T cell lineage', 'Unknown', 'B cell lineage', 'Innate lymphoid cell NK', 'AT2', 'Monocytes', 'Multiciliated lineage', 'Dendritic cells', 'EC capillary', 'Mast cells', 'Fibroblasts', 'Secretory', 'EC venous', 'Lymphatic EC mature', 'AT1', 'Basal', 'EC arterial', 'Myofibroblasts', 'None', 'Submucosal Secretory']
-----------------------  ----
Macrophages              6941
T cell lineage            749
Unknown                   618
B cell lineage            374
Innate lymphoid cell NK   327
AT2                       294
Monocytes                 228
Multiciliated lineage     194
Dendritic cells           177
EC capillary              138
Mast cells                100
Fibroblasts                93
Secretory                  86
EC venous                  74
Lymphatic EC mature        68
AT1                        27
Basal                      26
EC arterial                20
Myofibroblasts             17
None                        6
Submucosal

In [2]:
def get_simi(u1, u2):
    # u1,u2必须是numpy.array，not tensor
    nz_u1 = u1.nonzero()[0]
    nz_u2 = u2.nonzero()[0]
    nz_inter = np.array(list(set(nz_u1) & set(nz_u2)))
    nz_union = np.array(list(set(nz_u1) | set(nz_u2)))
    if len(nz_inter) == 0:
        simi_score = 1 / (len(nz_union) + len(u1))
    elif len(nz_inter) == len(nz_union):
        simi_score = (len(nz_union) + len(u1) - 1) / (len(nz_union) + len(u1))
    else:
        simi_score = len(nz_inter) / len(nz_union)
    return float(simi_score)

def eucliDistance(v1,v2):
    # 计算欧氏距离
    return F.pairwise_distance(v1.view(1,-1),v2.view(1,-1),p=2)

def loss_exp(v1,v2):
    return torch.exp(-0.1*eucliDistance(v1,v2))

# sim1_ls=[]
# cell_gene=gene_cell.T
# for i in tqdm(range(cell_gene.shape[0])):
#     for j in range(i+1,cell_gene.shape[0]):
#         v1=cell_gene[i]
#         v2=cell_gene[j]
#         sim1=get_simi(v1,v2)
#         sim1_ls.append(sim1)

In [None]:
# torch.save(sim1_ls,"./outputs/sim1_ls")

In [16]:
a = torch.tensor([[1.,  2], [1., 1]])
# b = torch.tensor([[1., 0], [0.,  0],[-1,-1]])
torch.cdist(a, a, p=2)

tensor([[0., 1.],
        [1., 0.]])

In [17]:
%%time
import torch
from torch import nn, optim
from torch.nn import functional as F
import torch.utils.data as Data
import numpy as np


class AE(nn.Module):
    def __init__(self, dim, emb_dim=128):
        super(AE, self).__init__()
        self.dim = dim
        self.fc1 = nn.Linear(dim, 512)
        self.fc2 = nn.Linear(512, emb_dim)
        self.fc3 = nn.Linear(emb_dim, 512)
        self.fc4 = nn.Linear(512, dim)

    def encode(self, x):
        h1 = F.relu(self.fc1(x))
        return F.relu(self.fc2(h1))

    def decode(self, z):
        h3 = F.relu(self.fc3(z))
        return torch.relu(self.fc4(h3))

    def forward(self, x):
        z = self.encode(x.view(-1, self.dim))
        return self.decode(z), z
    
    
feature=torch.tensor(gene_cell.T)
feature=feature.to(device)
model = AE(dim=feature.shape[1]).to(device)
ba=feature.shape[0]
loader = Data.DataLoader(feature, ba)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

loss_func = nn.MSELoss()
EPOCH_AE = 2000
for epoch in range(EPOCH_AE):
    embeddings = []
    # loss_ls=[]
    for _, batch_x in enumerate(loader)	:
        decoded, encoded = model(batch_x)
        res=torch.cdist(encoded,encoded,p=2)
        break
        loss1 = loss_func(batch_x, decoded)
        loss2 = 0
        for i in range(batch_x.shape[0]):
            for j in range(i+1,batch_x.shape[0]):
                v1=batch_x[i]
                v2=batch_x[j]
                h1=encoded[i]
                h2=encoded[j]
                sim1=get_simi(np.array(v1.cpu()),np.array(v2.cpu()))
                sim2=loss_exp(h1,h2)
                loss2+=sim1*(sim2-sim1).abs()
        print(loss1,loss2)
        loss=loss1+loss2
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        embeddings.append(encoded)
    #     loss_ls.append(loss.item())
    # scheduler.step(np.mean(loss_ls))
#     print('Epoch :', epoch, '|', 'train_loss:%.12f' % loss.data)
    break

CPU times: user 102 ms, sys: 24.7 ms, total: 126 ms
Wall time: 125 ms


In [22]:
res

tensor([[5.9802e-04, 7.2308e-01, 7.1889e-01,  ..., 7.0415e-01, 7.8821e-01,
         8.0227e-01],
        [7.2308e-01, 0.0000e+00, 4.6027e-01,  ..., 4.9548e-01, 6.5668e-01,
         5.9445e-01],
        [7.1889e-01, 4.6027e-01, 0.0000e+00,  ..., 4.7793e-01, 7.2780e-01,
         6.1345e-01],
        ...,
        [7.0415e-01, 4.9548e-01, 4.7793e-01,  ..., 0.0000e+00, 7.4396e-01,
         6.8867e-01],
        [7.8821e-01, 6.5668e-01, 7.2780e-01,  ..., 7.4396e-01, 3.4527e-04,
         7.8473e-01],
        [8.0227e-01, 5.9445e-01, 6.1345e-01,  ..., 6.8867e-01, 7.8473e-01,
         0.0000e+00]], device='cuda:0', grad_fn=<ViewBackward>)

In [3]:
%%time
import torch
from torch import nn, optim
from torch.nn import functional as F
import torch.utils.data as Data
import numpy as np


class AE(nn.Module):
    def __init__(self, dim, emb_dim=128):
        super(AE, self).__init__()
        self.dim = dim
        self.fc1 = nn.Linear(dim, 512)
        self.fc2 = nn.Linear(512, emb_dim)
        self.fc3 = nn.Linear(emb_dim, 512)
        self.fc4 = nn.Linear(512, dim)

    def encode(self, x):
        h1 = F.relu(self.fc1(x))
        return F.relu(self.fc2(h1))

    def decode(self, z):
        h3 = F.relu(self.fc3(z))
        return torch.relu(self.fc4(h3))

    def forward(self, x):
        z = self.encode(x.view(-1, self.dim))
        return self.decode(z), z
    
    
feature=torch.tensor(gene_cell.T)
feature=feature.to(device)
model = AE(dim=feature.shape[1]).to(device)
ba=5000
loader = Data.DataLoader(feature, ba)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

loss_func = nn.MSELoss()
EPOCH_AE = 2000
for epoch in range(EPOCH_AE):
    embeddings = []
    # loss_ls=[]
    for _, batch_x in enumerate(loader)	:
        decoded, encoded = model(batch_x)
        loss1 = loss_func(batch_x, decoded)
        loss2 = 0
        for i in range(batch_x.shape[0]):
            for j in range(i+1,batch_x.shape[0]):
                v1=batch_x[i]
                v2=batch_x[j]
                h1=encoded[i]
                h2=encoded[j]
                sim1=get_simi(np.array(v1.cpu()),np.array(v2.cpu()))
                sim2=loss_exp(h1,h2)
                loss2+=sim1*(sim2-sim1).abs()
        print(loss1,loss2)
        loss=loss1+loss2
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        embeddings.append(encoded)
    #     loss_ls.append(loss.item())
    # scheduler.step(np.mean(loss_ls))
    print('Epoch :', epoch, '|', 'train_loss:%.12f' % loss.data)
    break

KeyboardInterrupt: 

25000000