In [1]:
import numpy as np
import seaborn as sns
import torch

import umap
import matplotlib.pyplot as plt
import pandas as pd
from community import community_louvain
from torch_geometric.utils import k_hop_subgraph,to_networkx,from_networkx
import matplotlib

import utils
import plots
from model_AE import reduction_AE
from model_GAT import Encoder,SenGAE,train_GAT
from model_Sencell import Sencell

import logging
import os
import argparse


parser = argparse.ArgumentParser(description='Main program for sencells')

parser.add_argument('--output_dir', type=str, default='./outputs', help='')
parser.add_argument('--exp_name', type=str, default='', help='')

args = parser.parse_args(args=[])

args.exp_name='disease'

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

logging.basicConfig(format='%(asctime)s.%(msecs)03d [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
                    datefmt='# %Y-%m-%d %H:%M:%S')

logging.getLogger().setLevel(logging.DEBUG)
logger = logging.getLogger()

# Part 1: load and process data
# cell_cluster_arr在画umap的时候用
adata,cluster_cell_ls,cell_cluster_arr,celltype_names=utils.load_data_disease()
# plots.umapPlot(adata.obsm['X_umap'],clusters=cell_cluster_arr,labels=celltype_names)

new_data,markers_index,\
sen_gene_ls,nonsen_gene_ls,gene_names=utils.process_data(adata,cluster_cell_ls,cell_cluster_arr)

print(f'cell num: {new_data.shape[0]}, gene num: {new_data.shape[1]}')

gene_cell=new_data.X.toarray().T
graph_nx=utils.build_graph_nx(gene_cell,cell_cluster_arr,sen_gene_ls,nonsen_gene_ls,gene_names)
logger.info("Part 1, data loading and processing end!")

# Part 2: generate init embedding
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('device:',device)
args.device=device

retrain=True
if retrain:
    gene_embed,cell_embed=reduction_AE(gene_cell,device)
    print(gene_embed.shape,cell_embed.shape)
    torch.save(gene_embed,os.path.join(args.output_dir,f'{args.exp_name}_gene.emb'))
    torch.save(cell_embed,os.path.join(args.output_dir,f'{args.exp_name}_cell.emb'))
else:
    gene_embed=torch.load(os.path.join(args.output_dir,f'{args.exp_name}_gene.emb'))
    cell_embed=torch.load(os.path.join(args.output_dir,f'{args.exp_name}_cell.emb'))

graph_nx=utils.add_nx_embedding(graph_nx,gene_embed,cell_embed)
graph_pyg=utils.build_graph_pyg(gene_cell,gene_embed,cell_embed)
logger.info("Part 2, AE end!")

# Part 3: train GAT
# graph_pyg=graph_pyg.to('cpu')
args.gene_num=gene_cell.shape[0]
args.cell_num=gene_cell.shape[1]



GAT_model=train_GAT(graph_nx,graph_pyg,args,retrain=True,resampling=True)
logger.info("Part 3, training GAT end!")



from sampling import identify_sencell_marker_graph
from sampling import sub_sampling_by_random
from model_Sencell import cell_optim,update_cell_embeddings,old_cell_optim

from sampling import identify_sengene_then_sencell


all_gene_ls=[]

list_sencell_cover=[]
list_sengene_cover=[]

def get_sencell_cover(old_sencell_dict,sencell_dict):
    set1=set(list(old_sencell_dict.keys()))
    set2=set(list(sencell_dict.keys()))
    set3=set1.intersection(set2)
    print('sencell cover:',len(set3)/len(set2))
    
    return len(set3)/len(set2)

def get_sengene_cover(old_sengene_ls,sengene_ls):
    set1=set(old_sengene_ls)
    set2=set(sengene_ls)
    set3=set1.intersection(set2)
    print('sengene cover:',len(set3)/len(set2))
    
    return len(set3)/len(set2)


# 2022-11-30 14:16:13.975 [DEBUG] [attrs.py:77] Creating converter from 3 to 5


cluster 数量： 14
celltype names: ['0', '1', '2', '3', '5', 'Epithelial cells', '7', '8', '10', 'Pericytes', 'Fibroblasts-Pericytes', '14', '9', 'Endothelial cells']
---------------------  -----
0                      10712
1                       7273
2                       5435
3                       3923
5                       2695
Epithelial cells        2385
7                       1630
8                       1607
10                       900
Pericytes                571
Fibroblasts-Pericytes    466
14                       361
9                        263
Endothelial cells        186
---------------------  -----
各marker list所包含的gene数：
  Markers1    Markers2    Markers3    Markers4
----------  ----------  ----------  ----------
       126          78         145          84
total marker genes:  380


  self.data[key] = value


highly_genes num:  2000
After highly genes dropped duplicate:  1924
Total gene num: 2286
cell num: 38407, gene num: 2286
The number of edges: 3688551


# 2022-11-30 14:16:35.860 [INFO] [4270138861.py:53] Part 1, data loading and processing end!


device: cuda:0
Epoch : 1999 | train_loss:0.040717933327
Epoch : 1999 | train_loss:0.073554396629
torch.Size([2286, 128]) torch.Size([38407, 128])
the number of edges: 3688551
edge index:  torch.Size([2, 3688551])
node feature:  torch.Size([40693, 128])
Pyg graph: Data(x=[40693, 128], edge_index=[2, 7377102], y=[40693])


# 2022-11-30 14:23:17.302 [INFO] [4270138861.py:72] Part 2, AE end!


graph.is_directed(): False
Start sampling ...


100%|██████████| 50/50 [07:52<00:00,  9.46s/it]


sampling end, time:  472.9620280265808
Epoch:  0
subgraph loss:  34.307864379882815
subgraph loss:  34.356353759765625
subgraph loss:  34.48566665649414
subgraph loss:  34.2494400024414
subgraph loss:  34.33081817626953
subgraph loss:  34.309766387939455
subgraph loss:  34.39461669921875
subgraph loss:  34.333100128173825
subgraph loss:  34.39757461547852
subgraph loss:  34.42898406982422
subgraph loss:  34.33153076171875
subgraph loss:  34.425802612304686
subgraph loss:  34.372962188720706
subgraph loss:  34.33596496582031
subgraph loss:  34.355980682373044
subgraph loss:  34.35541687011719
subgraph loss:  34.32461776733398
subgraph loss:  34.357496643066405
subgraph loss:  34.18871154785156
subgraph loss:  34.331060791015624
subgraph loss:  34.35059127807617
subgraph loss:  34.375727844238284
subgraph loss:  34.31633758544922
subgraph loss:  34.213270568847655
subgraph loss:  34.37082443237305
subgraph loss:  34.32482757568359
subgraph loss:  34.30204391479492
subgraph loss:  34.3115

subgraph loss:  1.378463363647461
subgraph loss:  1.367323350906372
subgraph loss:  1.3905138731002809
subgraph loss:  1.382067608833313
subgraph loss:  1.373087430000305
subgraph loss:  1.3879197835922241
subgraph loss:  1.338252353668213
subgraph loss:  1.3627898693084717
subgraph loss:  1.373057985305786
subgraph loss:  1.3546236515045167
subgraph loss:  1.3365100383758546
subgraph loss:  1.3598469257354737
subgraph loss:  1.355629801750183
subgraph loss:  1.3388076782226563
subgraph loss:  1.3266125917434692
subgraph loss:  1.340764594078064
subgraph loss:  1.3253390789031982
subgraph loss:  1.310771608352661
subgraph loss:  1.3291292905807495
EPOCH loss 1.4533852458000185
Epoch:  5
subgraph loss:  1.3264060497283936
subgraph loss:  1.3305948734283448
subgraph loss:  1.3362071990966797
subgraph loss:  1.3249119758605956
subgraph loss:  1.2857951164245605
subgraph loss:  1.3212431430816651
subgraph loss:  1.312535810470581
subgraph loss:  1.3244885206222534
subgraph loss:  1.3229169

subgraph loss:  1.1070281505584716
subgraph loss:  1.091644024848938
subgraph loss:  1.0939806938171386
subgraph loss:  1.1008066415786744
subgraph loss:  1.0808791875839234
subgraph loss:  1.1055509567260742
subgraph loss:  1.0933452606201173
subgraph loss:  1.1167636394500733
subgraph loss:  1.088701605796814
subgraph loss:  1.0985451221466065
subgraph loss:  1.0918460845947267
subgraph loss:  1.1069285869598389
subgraph loss:  1.099402618408203
subgraph loss:  1.1061376094818116
subgraph loss:  1.0928312301635743
subgraph loss:  1.0770437240600585
subgraph loss:  1.097299337387085
subgraph loss:  1.099859929084778
subgraph loss:  1.0882968187332154
subgraph loss:  1.1055302143096923
subgraph loss:  1.0727711915969849
subgraph loss:  1.1004894018173217
subgraph loss:  1.1143057346343994
subgraph loss:  1.133693814277649
subgraph loss:  1.1321044445037842
subgraph loss:  1.0862814903259277
subgraph loss:  1.1065406799316406
subgraph loss:  1.1180493593215943
subgraph loss:  1.10975975

subgraph loss:  1.0610580921173096
subgraph loss:  1.0464670658111572
subgraph loss:  1.0561529397964478
subgraph loss:  1.0500933170318603
subgraph loss:  1.054315161705017
subgraph loss:  1.0651443958282472
EPOCH loss 1.059359021663666
Epoch:  14
subgraph loss:  1.0550976037979125
subgraph loss:  1.066539716720581
subgraph loss:  1.0654324054718018
subgraph loss:  1.0613760709762574
subgraph loss:  1.0271422863006592
subgraph loss:  1.0628525018692017
subgraph loss:  1.0452768564224244
subgraph loss:  1.0552117586135865
subgraph loss:  1.0489041566848756
subgraph loss:  1.0525462865829467
subgraph loss:  1.0512150049209594
subgraph loss:  1.0334940433502198
subgraph loss:  1.0595643281936646
subgraph loss:  1.049836564064026
subgraph loss:  1.0533127069473267
subgraph loss:  1.0568892002105712
subgraph loss:  1.0483633995056152
subgraph loss:  1.0746011972427367
subgraph loss:  1.048819375038147
subgraph loss:  1.0678162813186645
subgraph loss:  1.0422603607177734
subgraph loss:  1.0

subgraph loss:  1.0166308641433717
subgraph loss:  1.005155611038208
subgraph loss:  1.003989827632904
subgraph loss:  1.0000364661216736
subgraph loss:  1.014210081100464
subgraph loss:  1.021518063545227
subgraph loss:  1.005981183052063
subgraph loss:  1.016547179222107
subgraph loss:  0.9907563924789429
subgraph loss:  1.0206701755523682
subgraph loss:  1.0048142313957213
subgraph loss:  1.0119422197341919
subgraph loss:  1.025962519645691
subgraph loss:  0.9928576946258545
subgraph loss:  1.015222930908203
subgraph loss:  1.0268104076385498
subgraph loss:  1.0259939432144165
subgraph loss:  1.0115337133407594
subgraph loss:  1.020262360572815
subgraph loss:  1.024524712562561
subgraph loss:  1.0112165927886962
subgraph loss:  0.9935127854347229
subgraph loss:  1.0147458791732789
subgraph loss:  1.0065769672393798
subgraph loss:  1.0096088171005249
subgraph loss:  1.010554575920105
EPOCH loss 1.0116805973052978
Epoch:  19
subgraph loss:  1.0032443284988404
subgraph loss:  1.0142941

subgraph loss:  0.9678805112838745
subgraph loss:  0.9882104873657227
subgraph loss:  0.9800740838050842
subgraph loss:  0.9882476210594178
subgraph loss:  0.9822487473487854
subgraph loss:  0.9840553164482116
subgraph loss:  0.9818844556808471
subgraph loss:  0.9626545548439026
subgraph loss:  0.9969021916389466
subgraph loss:  0.98099285364151
subgraph loss:  0.9751089096069336
subgraph loss:  0.9905609965324402
subgraph loss:  0.9715672612190247
subgraph loss:  0.9891563296318054
subgraph loss:  0.9796489000320434
subgraph loss:  1.0003417491912843
subgraph loss:  0.972683560848236
subgraph loss:  0.9808354258537293
subgraph loss:  0.999365758895874
subgraph loss:  0.9936172962188721
subgraph loss:  0.9907487869262696
subgraph loss:  0.9779042482376099
subgraph loss:  0.98634033203125
subgraph loss:  0.9745186805725098
subgraph loss:  0.9895226001739502
subgraph loss:  0.9969387412071228
subgraph loss:  0.9816813349723816
subgraph loss:  0.9915712356567383
subgraph loss:  0.96587805

subgraph loss:  0.9869994997978211
subgraph loss:  0.9950382828712463
subgraph loss:  0.9642711162567139
subgraph loss:  0.9919893383979798
subgraph loss:  0.9927555918693542
subgraph loss:  0.987962293624878
subgraph loss:  0.9796348333358764
subgraph loss:  0.9832915425300598
subgraph loss:  0.9902644157409668
subgraph loss:  0.9807487964630127
subgraph loss:  0.9669571399688721
subgraph loss:  0.9840837597846985
subgraph loss:  0.9743767023086548
subgraph loss:  0.9813285946846009
subgraph loss:  0.9844171285629273
EPOCH loss 0.9797723982334137
Epoch:  28
subgraph loss:  0.9779579281806946
subgraph loss:  0.990868091583252
subgraph loss:  0.9897830367088318
subgraph loss:  0.9931286096572876
subgraph loss:  0.9582703828811645
subgraph loss:  0.9817494511604309
subgraph loss:  0.971600079536438
subgraph loss:  0.9739600896835328
subgraph loss:  0.9747853875160217
subgraph loss:  0.9763310432434082
subgraph loss:  0.9794126629829407
subgraph loss:  0.9550966501235962
subgraph loss:  0

# 2022-11-30 15:44:34.754 [INFO] [4270138861.py:82] Part 3, training GAT end!


subgraph loss:  0.9806500196456909
EPOCH loss 0.9774478840827945


In [None]:
cellmodel=Sencell().to(device)
optimizer = torch.optim.Adam(cellmodel.parameters(), lr=0.001, 
                                weight_decay=1e-3)
all_marker_index=sen_gene_ls
    
iteration_results=[]
for iteration in range(5):
    logger.info(f"iteration: {iteration}")
    sampled_graph,sencell_dict,nonsencell_dict,cell_clusters,big_graph_index_dict=sub_sampling_by_random(graph_nx,
                                                            sen_gene_ls,
                                                            nonsen_gene_ls,
                                                            GAT_model,
                                                            args,
                                                            all_marker_index,
                                                            n_gene=len(all_marker_index),                                                        
                                                            gene_rate=0.3,cell_rate=0.5,
                                                            debug=False)
    old_sengene_indexs=all_marker_index
    for epoch in range(10):
        logger.info(f"epoch: {epoch}")
        old_sencell_dict=sencell_dict
        cellmodel,sencell_dict,nonsencell_dict=cell_optim(cellmodel,optimizer,
                                                          sencell_dict,nonsencell_dict,args,
                                                         train=True)
        sampled_graph=update_cell_embeddings(sampled_graph,sencell_dict,nonsencell_dict)
        sencell_dict,nonsencell_dict, \
        sen_gene_indexs,nonsen_gene_indexs=identify_sengene_then_sencell(sampled_graph,GAT_model,
                                                                      sencell_dict,nonsencell_dict,
                                                                      cell_clusters,
                                                                      big_graph_index_dict,
                                                                      len(all_marker_index),args)

        get_sencell_cover(old_sencell_dict,sencell_dict)
        get_sengene_cover(old_sengene_indexs,sen_gene_indexs)
        old_sengene_indexs=sen_gene_indexs
    iteration_results.append([sen_gene_indexs,sencell_dict])

# 2022-11-30 16:11:22.899 [INFO] [2770481884.py:5] iteration: 0


Start sampling subgraph randomly ...
    Sengene num: 362, Nonsengen num: 362
subgraph total node num: (39131,)
After sampling, gene num:  tensor(724)


# 2022-11-30 16:28:03.701 [INFO] [2770481884.py:17] epoch: 0


obj saved ./outputs/disease_cell_score_dict
    Sencell num: 200, Nonsencell num: 2000
43.1505126953125
41.624977111816406
39.33974075317383
35.76642990112305
30.387348175048828
27.28223419189453
30.835697174072266
31.54355239868164
29.1556453704834
26.820158004760742
26.36113739013672
26.94892120361328
27.219985961914062
27.0041561126709
26.36723518371582
25.47367286682129
24.53369903564453
24.171098709106445
24.420310974121094
24.23015022277832
rechoice sengene num: 362 rechoice nonsengene num: 362


# 2022-11-30 16:44:01.479 [INFO] [2770481884.py:17] epoch: 1


obj saved ./outputs/disease_cell_score_dict
    Sencell num: 200, Nonsencell num: 2000
sencell cover: 0.91
sengene cover: 0.7292817679558011
26.17998695373535
25.592561721801758
24.621034622192383
23.41092872619629
22.537019729614258
22.447372436523438
22.739042282104492
23.049320220947266
23.023664474487305
22.648277282714844
22.18230438232422
21.874549865722656
21.700515747070312
21.556041717529297
21.40627670288086
21.233484268188477
21.0069637298584
20.827110290527344
20.75449562072754
20.749006271362305
rechoice sengene num: 362 rechoice nonsengene num: 362


# 2022-11-30 16:59:59.336 [INFO] [2770481884.py:17] epoch: 2


obj saved ./outputs/disease_cell_score_dict
    Sencell num: 200, Nonsencell num: 2000
sencell cover: 0.985
sengene cover: 0.9558011049723757
20.737197875976562
20.407264709472656
20.118803024291992
19.9378719329834
19.865114212036133
19.860815048217773
19.853660583496094
rechoice sengene num: 362 rechoice nonsengene num: 362


# 2022-11-30 17:15:55.872 [INFO] [2770481884.py:17] epoch: 3


obj saved ./outputs/disease_cell_score_dict
    Sencell num: 200, Nonsencell num: 2000
sencell cover: 1.0
sengene cover: 0.9972375690607734
18.966453552246094
18.934476852416992
18.897491455078125
18.876102447509766
18.860889434814453
18.831260681152344
18.77983856201172
18.717609405517578
18.666086196899414
18.642414093017578
18.63123321533203
18.612367630004883
18.579910278320312
18.54352569580078
18.51455307006836
18.4945125579834
18.475921630859375
18.453474044799805
18.428394317626953
18.404205322265625
rechoice sengene num: 362 rechoice nonsengene num: 362


# 2022-11-30 17:32:03.505 [INFO] [2770481884.py:17] epoch: 4


obj saved ./outputs/disease_cell_score_dict
    Sencell num: 200, Nonsencell num: 2000
sencell cover: 1.0
sengene cover: 1.0
18.376129150390625
18.352331161499023
18.324722290039062
18.295866012573242
18.26979637145996
18.2475528717041
18.22568130493164
18.200603485107422
18.17513084411621
18.152219772338867
18.130882263183594
18.10991668701172
18.08833885192871
18.066604614257812
18.045061111450195
18.023195266723633
18.00090217590332
17.978458404541016
17.957120895385742
17.93602752685547
rechoice sengene num: 362 rechoice nonsengene num: 362


# 2022-11-30 17:48:04.866 [INFO] [2770481884.py:17] epoch: 5


obj saved ./outputs/disease_cell_score_dict
    Sencell num: 200, Nonsencell num: 2000
sencell cover: 1.0
sengene cover: 1.0
17.913007736206055
17.8916072845459
17.870254516601562
17.849214553833008
17.828462600708008
17.807823181152344
17.78719711303711
17.766508102416992
17.745725631713867
17.724884033203125
17.70404052734375
17.683183670043945
17.662303924560547
17.641393661499023
17.620502471923828
17.599607467651367
17.578720092773438
17.55780029296875
17.536855697631836
17.515869140625
rechoice sengene num: 362 rechoice nonsengene num: 362


# 2022-11-30 18:04:06.840 [INFO] [2770481884.py:17] epoch: 6


obj saved ./outputs/disease_cell_score_dict
    Sencell num: 200, Nonsencell num: 2000
sencell cover: 1.0
sengene cover: 1.0
17.494522094726562
17.47349739074707
17.452816009521484
17.43219566345215
17.411754608154297
17.391319274902344
17.370929718017578
17.350727081298828
17.330780029296875
17.310930252075195
17.29124641418457
17.271865844726562
17.252460479736328
17.233022689819336
17.213666915893555
17.194334030151367
17.17501449584961
17.155681610107422
17.136293411254883
17.11682891845703
rechoice sengene num: 362 rechoice nonsengene num: 362


# 2022-11-30 18:20:07.396 [INFO] [2770481884.py:17] epoch: 7


obj saved ./outputs/disease_cell_score_dict
    Sencell num: 200, Nonsencell num: 2000
sencell cover: 1.0
sengene cover: 1.0
17.097034454345703
17.077392578125
17.057716369628906
17.038047790527344
17.01841926574707
16.99876594543457
16.978981018066406
16.959041595458984
16.939228057861328
16.919246673583984
16.89925765991211
16.879228591918945
16.858985900878906
16.838525772094727
16.817996978759766
16.797407150268555
16.776662826538086
16.7557373046875
16.734914779663086
16.71408462524414
rechoice sengene num: 362 rechoice nonsengene num: 362


# 2022-11-30 18:36:06.517 [INFO] [2770481884.py:17] epoch: 8


obj saved ./outputs/disease_cell_score_dict
    Sencell num: 200, Nonsencell num: 2000
sencell cover: 1.0
sengene cover: 1.0
16.69295883178711
16.67179298400879
16.650365829467773
16.628704071044922
16.60688018798828
16.584707260131836
16.562288284301758
16.53949737548828
16.516193389892578
16.492416381835938
16.46795654296875
16.442724227905273
16.416654586791992
16.389404296875
16.360671997070312
16.330242156982422
16.29783821105957
16.26301383972168
16.225399017333984
16.18424415588379
rechoice sengene num: 362 rechoice nonsengene num: 362


# 2022-11-30 18:52:04.781 [INFO] [2770481884.py:17] epoch: 9


obj saved ./outputs/disease_cell_score_dict
    Sencell num: 200, Nonsencell num: 2000
sencell cover: 1.0
sengene cover: 1.0
16.137889862060547
16.086145401000977
16.02753448486328
15.964775085449219
15.897684097290039
15.832527160644531
15.783247947692871
15.760805130004883
15.769433975219727
15.774261474609375
15.74560832977295
15.695846557617188
15.656020164489746
15.636523246765137
15.62187671661377
15.604048728942871
15.585883140563965
15.566476821899414
15.547176361083984
15.525290489196777
rechoice sengene num: 362 rechoice nonsengene num: 362


# 2022-11-30 19:08:03.778 [INFO] [2770481884.py:5] iteration: 1


obj saved ./outputs/disease_cell_score_dict
    Sencell num: 200, Nonsencell num: 2000
sencell cover: 1.0
sengene cover: 1.0
Start sampling subgraph randomly ...
    Sengene num: 362, Nonsengen num: 362
subgraph total node num: (39131,)
After sampling, gene num:  tensor(724)
obj saved ./outputs/disease_cell_score_dict


# 2022-11-30 19:24:20.362 [INFO] [2770481884.py:17] epoch: 0


    Sencell num: 200, Nonsencell num: 2000
27.113658905029297
19.39166259765625
15.038947105407715
16.824562072753906
17.33770751953125
16.931110382080078
18.038036346435547
18.276811599731445
17.07700538635254
15.965641021728516
15.946925163269043
16.148305892944336
16.10000228881836
16.132640838623047
15.8193941116333
15.308293342590332
15.078901290893555
15.052093505859375
15.024344444274902
14.919920921325684
rechoice sengene num: 362 rechoice nonsengene num: 362


# 2022-11-30 19:39:39.168 [INFO] [2770481884.py:17] epoch: 1


obj saved ./outputs/disease_cell_score_dict
    Sencell num: 200, Nonsencell num: 2000
sencell cover: 0.905
sengene cover: 0.7403314917127072
19.26766586303711
18.834596633911133
17.464595794677734
15.940375328063965
15.482205390930176
16.330942153930664
17.01392364501953
16.684162139892578
15.786877632141113
15.158141136169434
15.233431816101074
15.781523704528809
16.03028106689453
15.677838325500488
15.09222412109375
14.814044952392578
15.00277042388916
15.30971908569336
15.335526466369629
15.053603172302246
rechoice sengene num: 362 rechoice nonsengene num: 362
obj saved ./outputs/disease_cell_score_dict


# 2022-11-30 19:55:00.031 [INFO] [2770481884.py:17] epoch: 2


    Sencell num: 200, Nonsencell num: 2000
sencell cover: 0.995
sengene cover: 0.9640883977900553
14.79797649383545
14.861987113952637
14.982522964477539
14.98729133605957
14.878374099731445
14.762664794921875
14.708982467651367
14.725641250610352
14.764449119567871
14.758496284484863
14.693153381347656
14.617863655090332
14.586393356323242
14.600530624389648
14.621828079223633
14.607587814331055
14.557353973388672
14.507540702819824
14.484506607055664
14.48620319366455
rechoice sengene num: 362 rechoice nonsengene num: 362


In [7]:
torch.save([sencell_dict,sen_gene_indexs],'./outputs/disease_outputs.data')

# 2022-11-28 13:35:49.829 [INFO] [4290767024.py:5] iteration: 0


Start sampling subgraph randomly ...
    Sengene num: 343, Nonsengen num: 343
subgraph total node num: (8288,)
After sampling, gene num:  tensor(686)


# 2022-11-28 13:38:45.396 [INFO] [4290767024.py:18] epoch: 0


[0.00122944 0.00172688 0.00234938 ... 0.39748853 0.54444087 0.57576734]
    Sencell num: 196, Nonsencell num: 1960
24.877323150634766
18.285886764526367
18.5355167388916
17.720273971557617
16.566776275634766
17.011201858520508
17.641918182373047
17.057353973388672
16.591041564941406
16.47360610961914
16.787492752075195
16.910785675048828
16.50572395324707
16.403051376342773
16.437009811401367
16.507965087890625
16.56315803527832
16.55409812927246
16.479291915893555
16.388391494750977
rechoice sengene num: 343 rechoice nonsengene num: 343
[0.00390377 0.00416368 0.0044079  ... 0.08458243 0.08706584 0.08825421]
    Sencell num: 0, Nonsencell num: 7602


ZeroDivisionError: division by zero

In [4]:
import scanpy as sp
path="/users/PCON0022/haocheng/Basu_lab/rmarkdown/combined_g8.h5ad"
adata=sp.read_h5ad(path)
adata=adata[adata.obs['disease']!='Healthy']

In [5]:
adata

View of AnnData object with n_obs × n_vars = 38407 × 24543
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.ribo', 'percent.mito', 'count.mad.higher', 'integrated_snn_res.0.2', 'seurat_clusters', 'cell_type_seurat', 'disease', 'location'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable'
    obsm: 'X_pca', 'X_umap'

In [4]:
graph_nx.nodes[10956]

KeyError: 10956

In [3]:
import networkx as nx


In [4]:
g_index,c_index = np.nonzero(gene_cell)
print('The number of edges:',len(g_index))
# 加上偏移量作为cell的节点标号
gene_num = gene_cell.shape[0]
c_index += gene_num
edge_index=torch.tensor([g_index, c_index], dtype=torch.long)

# step 2: build nx graph, add attributes
graph_nx=nx.Graph(edge_index.T.tolist())

# 再加一个属性，是每个节点在大图上的index
for i in range(gene_num):
    graph_nx.nodes[i]['type']='g'
    graph_nx.nodes[i]['index']=i
    graph_nx.nodes[i]['name']=gene_names[i]
    graph_nx.nodes[i]['is_sen']= i in sen_gene_ls

The number of edges: 3177602


KeyError: 5

In [9]:
graph_nx.nodes[5]

KeyError: 5

In [13]:
sum(edge_index[0]==5)

tensor(0)

In [14]:
adata

View of AnnData object with n_obs × n_vars = 7602 × 24543
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.ribo', 'percent.mito', 'count.mad.higher', 'integrated_snn_res.0.2', 'seurat_clusters', 'cell_type_seurat', 'disease', 'location'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable'
    obsm: 'X_pca', 'X_umap'

In [25]:
list(adata.var.index).index('FGG')

5110

In [28]:
adata.var.iloc[5110,:]

vst.mean                      0.044274
vst.variance                  0.695201
vst.variance.expected         0.058568
vst.variance.standardized    11.399175
vst.variable                      True
Name: FGG, dtype: object

In [32]:
max(adata.X.toarray()[:,5110])

0.0

In [19]:
list(adata.var[adata.var['vst.variable']==True].index)

['HES4',
 'ISG15',
 'TTLL10',
 'TNFRSF4',
 'MXRA8',
 'CFAP74',
 'ERRFI1',
 'FHAD1',
 'EPHA2',
 'IFFO2',
 'NBL1',
 'PLA2G2A',
 'PLA2G2D',
 'UBXN10',
 'CAMK2N1',
 'ALPL',
 'HSPG2',
 'WNT4',
 'C1QA',
 'C1QC',
 'C1QB',
 'ID3',
 'CLIC4',
 'STMN1',
 'CD52',
 'SFN',
 'SYTL1',
 'FCN3',
 'CD164L2',
 'IFI6',
 'FABP3',
 'SERINC2',
 'DCDC2B',
 'GJA4',
 'CLSPN',
 'TEKT2',
 'EVA1B',
 'DNALI1',
 'CDCA8',
 'MACF1',
 'MFSD2A',
 'EDN2',
 'CCDC30',
 'SLC2A1',
 'FAM183A',
 'CFAP57',
 'CDC20',
 'PTPRF',
 'KIF2C',
 'TCTEX1D4',
 'CCDC17',
 'PIK3R3',
 'TSPAN1',
 'CYP4B1',
 'RAB3B',
 'PODN',
 'LDLRAD1',
 'DHCR24',
 'FYB2',
 'TACSTD2',
 'C1orf87',
 'NFIA',
 'NFIA-AS2',
 'TCTEX1D1',
 'WDR78',
 'C1orf141',
 'GADD45A',
 'GNG12',
 'DEPDC1',
 'ERICH3',
 'AK5',
 'NEXN',
 'DNAJB4',
 'AC103591.3',
 'IFI44L',
 'ADGRL4',
 'DNASE2B',
 'WDR63',
 'DDAH1',
 'CCN1',
 'CLCA2',
 'GBP1',
 'GBP6',
 'LRRC8B',
 'GCLM',
 'ARHGAP29',
 'F3',
 'CNN3',
 'PALMD',
 'VCAM1',
 'C1orf194',
 'KIAA1324',
 'GSTM3',
 'CHI3L2',
 'PIFO',
 'RHOC',


In [18]:
for line in gene_cell:
    print(max(line))

2.1897926
2.1651661
2.7920196
1.5468937
1.1931738
0.0
2.357961
0.0
2.7372148
3.1248958
2.9193091
2.1014135
1.3594567
0.0
1.5821656
1.1162518
2.330452
2.822734
0.0
2.6776004
0.9731857
3.2701945
0.6520417
2.8407748
1.6788143
0.8222758
3.6340375
0.0
2.1570113
3.6965876
0.84676486
3.6703413
2.2040915
2.966108
3.1278186
4.4532127
2.3830287
5.2903624
0.97565055
0.0
2.0996513
1.095555
4.2244964
2.7202685
2.2531729
2.002366
7.7091126
1.6924223
3.5590467
0.0
2.9470956
0.0
4.3975067
1.8707706
1.6348754
1.1475731
0.9430844
0.0
1.9216229
3.596243
0.8845393
1.2267766
3.316125
0.87594736
3.5458758
2.7437682
2.559324
3.3743048
1.9273283
1.7303318
2.848401
3.8516963
2.3379884
0.0
2.529509
5.550671
3.1601174
2.8811638
0.0
3.4783754
0.0
1.9385238
3.360945
0.0
2.4900556
0.0
1.3366392
2.5709705
1.7380133
2.7242525
3.2835107
4.2497034
2.5608974
2.7242525
1.5098591
2.822734
3.1898735
1.0127478
1.1493376
0.87570214
4.01055
2.9135242
4.248985
2.0507746
4.8487616
2.5881233
2.609829
2.782014
0.0
2.740126
1.2529

2.12723
2.7749355
2.960883
2.0587845
3.4720051
0.0
1.5084871
2.2122989
3.236711
2.7749355
2.7651205
2.6852047
0.80126154
2.6174762
0.0
0.5999886
2.9926975
2.140949
1.2806811
2.2799077
2.7920196
3.04834
5.190573
1.325674
1.9740293
0.57803583
2.0965147
0.0
3.0560234
1.4821767
2.7924304
0.0
0.0
1.6192158
3.920186
5.013773
2.9801915
2.7019117
5.396119
1.613565
1.8889557
2.479637
1.9986255
2.9201388
2.5754306
0.0
1.265588
1.3347801
0.0
1.9735719
2.8610513
1.5363592
0.0
1.5743808
3.6809661
0.0
2.8811638
2.3348932
5.4435563
3.4857092
0.0
3.1408665
2.6832974
2.1152053
0.7328875
1.2303656
1.6442469
0.0
2.9696083
2.2691858
3.4176784
0.83551204
3.631375
2.889145
2.2787268
0.9106284
2.3933623
0.0
0.0
3.4757519
1.0759374
2.6852047
2.8811638
2.3144243
2.8324606
0.0
1.096218
1.0058771
0.0
2.7202685
2.977829
2.960883
1.1623127
3.579154
0.9775598
0.8581871
3.3799326
0.0
0.0
3.082643
1.8990681
2.9419792
2.5821044
2.9053237
1.1959825
2.8811638
3.053455
3.3666685
2.1156862
3.579154
2.7019117
3.4262772
3.8

3.8489087
3.893389
3.3205955
2.2592943
3.804969
3.2923365
3.511206
1.291769
3.5458758
3.023808
2.597497
4.624764
2.8811638
1.9480561
2.7749355
2.4809742
4.0211377
3.2234952
3.193967
3.6083915
2.4340293
2.3614337
4.8984356
3.570724
1.7139107
0.97926944
2.0965147
3.3109012
2.3323696
2.4015396
2.9470956
2.8811638
1.496539
3.3062584
2.9801915
2.9488075
2.2759783
2.0613234
2.8407748
3.4364097
3.6236517
2.9820664
2.7417293
2.528447
2.0965147
1.3275043
2.822734
3.4364097
2.9135242
3.4843953
3.2818797
3.126168
1.2837865
3.5458758
2.9419792
2.7417293
1.6513617
3.0127053
2.977829
2.31574
1.1612568
2.3459196
2.9424045
1.4203894
2.7905834
2.142939
3.0725694
2.3374248
2.8811638
2.0688393
2.8098314
2.7651205
4.1810265
1.1455952
2.9419792
2.7019117
2.944363
3.1563425
1.7257631
6.6910405
4.3462095
3.1227474
2.528447
3.2778804
2.5675378
3.3436532
2.7417293
1.3617914
1.503572
1.9853241
3.8345158
2.9926975
2.9053237
2.0773444
1.3811601
4.059952
1.7876049
2.8591888
3.6809661
2.4340293
1.948314
2.7651205
3