In [1]:
import copy
import pickle
import random
import warnings

import anndata as ad
import numpy as np
import torch
from sklearn.model_selection import train_test_split

from data.data_process import data_process
from model.deconv_model_domain_param import MBdeconv
from model.utils import *
from model.stage2 import *

# Seed every random number generator the pipeline may draw from so that
# repeated "Restart & Run All" executions are comparable.
seed = 2021
random.seed(seed)        # Python stdlib RNG (was previously left unseeded)
np.random.seed(seed)     # NumPy global RNG
torch.manual_seed(seed)  # PyTorch RNG

# When running on a GPU, the settings below additionally help keep results
# consistent between runs: force deterministic cuDNN kernels and turn off
# the cuDNN auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# NOTE(review): this hides *all* warnings for the rest of the session,
# including deprecation and AnnData view-modification warnings.
warnings.filterwarnings("ignore")

In [2]:
# Cell types kept for training and evaluation (matched against obs['CellType']).
type_list = ['Luminal_Macrophages', 'Type 2 alveolar', 'Fibroblasts', 'Dendritic cells']
# Cell types pulled out of the test set separately as "noise" cells.
noise = ['Neutrophils']

# Input .h5ad files, given relative to the notebook's working directory.
train_data_file = 'data/lung_rna/296C_train.h5ad'
test_data_file = 'data/lung_rna/302C_test.h5ad'

# Read both files, train first and test second.
train_data, test_data = (
    ad.read_h5ad(h5ad_path) for h5ad_path in (train_data_file, test_data_file)
)

In [3]:
# Split the "noise" cells off the test set. This has to happen before
# `test_data` is narrowed to `type_list` below, otherwise they are gone.
if noise:
    # Boolean indexing returns an AnnData *view*; `.copy()` materialises it so
    # the in-place index reset below acts on an independent object.
    data_h5ad_noise = test_data[test_data.obs['CellType'].isin(noise)].copy()
    data_h5ad_noise.obs.reset_index(drop=True, inplace=True)
else:
    # Keep the name bound so the print below and later cells do not fail with
    # a NameError when no noise types are configured.
    # NOTE(review): confirm that data_process.fit accepts None for this input.
    data_h5ad_noise = None

# Keep only the cells whose annotated type is one of the selected types.
train_data = train_data[train_data.obs['CellType'].isin(type_list)].copy()
train_data.obs.reset_index(drop=True, inplace=True)
test_data = test_data[test_data.obs['CellType'].isin(type_list)].copy()
test_data.obs.reset_index(drop=True, inplace=True)

print('selected cells:', train_data)
print('noise cells:', data_h5ad_noise)

selected cells: View of AnnData object with n_obs × n_vars = 3601 × 3346
    obs: 'Sample', 'Donor', 'Source', 'Location', 'CellType', 'BroadCellType'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'
noise cells: View of AnnData object with n_obs × n_vars = 293 × 3346
    obs: 'Sample', 'Donor', 'Source', 'Location', 'CellType', 'BroadCellType'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'


In [4]:
# Configure the data-processing helper for the selected cell types.
# NOTE(review): the keyword meanings are defined in data.data_process and are
# not visible here; the sample counts match the 4000 / 1000 rows reported
# for the generated train / test sets further down.
dp = data_process(
    type_list,
    train_sample_num=4000,
    tissue_name='lung_rna',
    test_sample_num=1000,
    sample_size=20,
    num_artificial_cells=20,
)

In [5]:
# Run the data-processing step on the filtered train/test cells plus the
# noise cells; it prints a completion message when done.
# NOTE(review): the next cell reads a pickle from data/lung_rna/, which is
# presumably written by this call — confirm in data.data_process.
dp.fit(train_data, test_data, data_h5ad_noise)

The data processing is complete


In [6]:
# The pickle holds three objects dumped one after another; read them back in
# the same order: train, test, test-with-noise.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# open pickles produced by a trusted run of this pipeline.
with open(f'data/lung_rna/lung_rna{len(type_list)}cell.pkl', 'rb') as f:
    train, test, test_with_noise = (pickle.load(f) for _ in range(3))

In [7]:
# Unpack the objects loaded from the pickle.
train_x_sim, train_with_noise_1, train_with_noise_2, train_y = train
test_x_sim, test_y = test

# Wrap the train / test samples for the DANN stage (passed to model_da.train).
# The Dataset / DataLoader objects are intentionally not built here: nothing
# consumes them before the later cell that converts the samples to lists and
# rebuilds them, so creating them at this point was duplicated work.
source_data = data2h5ad(train_x_sim, train_y, type_list)
target_data = data2h5ad(test_x_sim, test_y, type_list)

AnnData object with n_obs × n_vars = 4000 × 3346
    obs: 'Luminal_Macrophages', 'Type 2 alveolar', 'Fibroblasts', 'Dendritic cells'
    uns: 'cell_types'
AnnData object with n_obs × n_vars = 1000 × 3346
    obs: 'Luminal_Macrophages', 'Type 2 alveolar', 'Fibroblasts', 'Dendritic cells'
    uns: 'cell_types'


In [8]:
# Hyper-parameters handed positionally to MBdeconv further down.
# NOTE(review): their exact meaning lives in model.deconv_model_domain_param.
num_cell_type = len(type_list)    # one output per selected cell type
num_MB = 3346                     # equals n_vars reported for the AnnData objects above
feat_map_w, feat_map_h = 256, 10
epoches = 42
Alpha = Beta = 1

In [9]:
# Build the DANN model whose encoder weights are later copied into MBdeconv.
# NOTE(review): the three positional arguments are not named here; check
# DANN's signature before changing them.
model_da = DANN(30, 50, 0.0001)
# Train on the source (train) and target (test) data. The call prints
# pred_loss / disc_loss / disc_loss_DA progress lines and returns three values.
pred_loss, disc_loss, disc_loss_DA = model_da.train(source_data, target_data) 

pred_loss=0.020076, disc_loss=1.386874, disc_loss_DA=1.387275
pred_loss=0.018123, disc_loss=1.387182, disc_loss_DA=1.386503
pred_loss=0.011915, disc_loss=1.387460, disc_loss_DA=1.386920
pred_loss=0.007926, disc_loss=1.387383, disc_loss_DA=1.387199
pred_loss=0.006495, disc_loss=1.387894, disc_loss_DA=1.386406
pred_loss=0.004142, disc_loss=1.387118, disc_loss_DA=1.386634
pred_loss=0.002711, disc_loss=1.387646, disc_loss_DA=1.385677
pred_loss=0.002168, disc_loss=1.387485, disc_loss_DA=1.386234
pred_loss=0.001936, disc_loss=1.387499, disc_loss_DA=1.385830
pred_loss=0.001788, disc_loss=1.387363, disc_loss_DA=1.385986
pred_loss=0.001733, disc_loss=1.387865, disc_loss_DA=1.385571
pred_loss=0.001607, disc_loss=1.387741, disc_loss_DA=1.385954
pred_loss=0.001404, disc_loss=1.386991, disc_loss_DA=1.385639
pred_loss=0.001455, disc_loss=1.386983, disc_loss_DA=1.385803
pred_loss=0.001359, disc_loss=1.387689, disc_loss_DA=1.385966
pred_loss=0.001267, disc_loss=1.388261, disc_loss_DA=1.385423
pred_los

In [10]:
def _to_nested_lists(samples):
    """Return `samples` with each element converted to plain Python lists.

    Elements that expose `.tolist()` (e.g. NumPy arrays) are converted;
    elements without it (already-converted lists) are passed through as-is,
    so applying this function twice gives the same result as applying it once.
    """
    return [s.tolist() if hasattr(s, "tolist") else s for s in samples]


# Convert every sample to plain lists before building the MBdeconv datasets.
# The original names are rebound for compatibility with later cells; because
# the helper is idempotent, re-running this cell no longer raises
# AttributeError on the already-converted lists.
train_x_sim = _to_nested_lists(train_x_sim)
train_with_noise_1 = _to_nested_lists(train_with_noise_1)
train_with_noise_2 = _to_nested_lists(train_with_noise_2)
test_x_sim = _to_nested_lists(test_x_sim)

# (Re)build the datasets and loaders from the list-based samples; only the
# training loader is shuffled.
train_dataset = TrainCustomDataset(train_x_sim, train_with_noise_1, train_with_noise_2, train_y)
test_dataset = TestCustomDataset(test_x_sim, test_y)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [11]:
# Instantiate the deconvolution model from the hyper-parameters defined above
# and the train / test data loaders.
model = MBdeconv(num_MB, feat_map_w, feat_map_h, num_cell_type, epoches, Alpha, Beta, train_dataloader, test_dataloader)

In [12]:
# Choose the device defensively, using the same expression as the evaluation
# cell, instead of hard-coding 'cuda'.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The model exposes its own GPU flag and device handle; move it only when
# it reports a GPU as available.
if model.gpu_available:
    model = model.to(model.gpu)

# Initialise the model's encoder from the DANN encoder trained earlier. The
# deepcopy gives the state dict its own tensors, independent of model_da.
encoder_params = copy.deepcopy(model_da.encoder_da.state_dict())
model.encoder.load_state_dict(encoder_params)

# Train; per-epoch progress is printed and three loss lists are returned.
# NOTE(review): the meaning of the positional True is defined in
# MBdeconv.train_model and is not visible here.
loss1_list, loss2_list, nce_loss_list = model.train_model(True)

[3.79s] ep 0, loss 4.1820
[85.10s] ep 20, loss 3.1961
[168.29s] ep 40, loss 2.9847


In [13]:
# Fresh MBdeconv instance with the same configuration as the trained model;
# its weights are loaded from a saved checkpoint in the next cell.
model_test = MBdeconv(num_MB, feat_map_w, feat_map_h, num_cell_type, epoches, Alpha, Beta, train_dataloader, test_dataloader)

In [14]:
# Resolve the device first so the checkpoint can be remapped onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# map_location lets a checkpoint that was saved from a GPU load on a CPU-only
# machine; without it torch.load tries to restore tensors to their original
# device and fails when CUDA is unavailable.
model_test.load_state_dict(torch.load('save_models/3346/last.pt', map_location=device))
model_test.to(device)
model_test.eval()  # switch to inference mode before predicting

# Evaluate on the test loader; predict returns the three metrics shown below.
CCC, RMSE, Corr = predict(test_dataloader, type_list, model_test, 'lung_rna', True)

In [15]:
# Display the three evaluation metrics returned by predict() as a tuple.
CCC, RMSE, Corr 

(0.9736062102404472, 0.031155115298747583, 0.9759987603911596)