In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
def Pseudocell_analysis_pipeline(DGE_tab_data, phenotype_tab_data, pseudocell_size=100, discard_t=0.2):
    """
    Merge single cells into per-cell-type pseudocells and average their expression.

    Within each cell type, cells are taken in annotation order and cut into
    consecutive chunks of ``pseudocell_size`` cells; every chunk becomes one
    pseudocell named ``<Celltype>.<chunk number>``. Expression is scaled per
    cell to a total of 1e6 before averaging.

    @inputs
    DGE_tab_data: CSV (path or file-like) holding a genes * cells matrix,
                  first column = gene names, header row = cell IDs
    phenotype_tab_data: CSV (path or file-like) whose first two columns are
                  CellID and Celltype; rows MUST be in the same order as the
                  cells of the DGE matrix (cells are matched by position,
                  not by ID). Any further columns are ignored.
    pseudocell_size: size of pseudocell                                 DEFAULT: 100
    discard_t: pseudocells with fewer than discard_t * pseudocell_size
               cells are discarded                                      DEFAULT: 0.2

    @returns
    data_mean: pseudocells * genes DataFrame of mean scaled expression,
               indexed by 'pseudo.id' (genes that are zero in every cell
               are dropped)
    new_anno: DataFrame indexed by 'pseudo.id' with one column 'ctanno'
              holding the cell type of each retained pseudocell

    @raises
    ValueError: if the annotation table has fewer than two columns, or its
                number of rows differs from the number of cells
    """

    expr = pd.read_csv(DGE_tab_data, sep=',', index_col=0, header=0).T  # cells * genes
    anno = pd.read_csv(phenotype_tab_data, sep=',', index_col=None, header=0)

    if anno.shape[1] < 2:
        raise ValueError(
            "phenotype table needs at least two columns (CellID, Celltype), got %d"
            % anno.shape[1]
        )
    if len(anno) != len(expr):
        raise ValueError(
            "phenotype table has %d rows but the DGE matrix has %d cells"
            % (len(anno), len(expr))
        )

    # Only the first two columns are used; do not require a fixed table width.
    anno = anno.iloc[:, :2].copy()
    anno.columns = ["CellID", "Celltype"]

    # Drop genes that are zero everywhere, then scale every cell to sum to 1e6.
    expr = expr.loc[:, ~(expr == 0).all(axis=0)]
    expr = expr.div(expr.sum(axis=1), axis=0) * 1e6

    # Running position of each cell inside its own cell type (0, 1, 2, ...),
    # turned into a chunk number by integer division.
    rank_in_type = anno.groupby("Celltype", sort=False).cumcount()
    chunk_no = np.floor_divide(rank_in_type, pseudocell_size).astype(str)
    # astype(str) lets numeric cell-type labels work as well as string ones.
    anno["pseudo.id"] = anno["Celltype"].astype(str) + "." + chunk_no

    # True for pseudocells that contain enough cells to be kept.
    keep = anno.groupby("pseudo.id")["CellID"].count() >= discard_t * pseudocell_size

    # Group by an array key (matched by position) instead of inserting a
    # 'pseudo.id' column, which could overwrite a gene with that name.
    data_mean = expr.groupby(anno["pseudo.id"].to_numpy()).mean().rename_axis("pseudo.id")
    data_mean = data_mean.loc[keep]

    new_anno = anno.groupby("pseudo.id")[["Celltype"]].max().loc[keep]
    new_anno.columns = ["ctanno"]

    return data_mean, new_anno

In [3]:
# Folder that is scanned for per-cluster sub-directories containing the
# "*dge.csv" and "*anno.csv" input files.
root = "/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/"

In [4]:
# Names of every entry directly under root (files and directories alike);
# entries that are not directories are skipped when the inputs are collected.
dir_list = os.listdir(root)

In [5]:
# Accumulators for the full paths of the annotation and DGE CSV files.
# Re-run this cell before re-running the collection loop, otherwise the
# paths are appended a second time.
anno_list = []
dge_list = []

In [6]:
# Walk every sub-directory of root and sort its files into the DGE and
# annotation path lists according to their filename suffix.
for tissue in dir_list:
    file_path = os.path.join(root, tissue)
    if not os.path.isdir(file_path):
        # Plain files sitting directly in root are not inputs.
        continue
    for file_name in os.listdir(file_path):
        full_path = os.path.join(file_path, file_name)
        if file_name.endswith("dge.csv"):
            dge_list.append(full_path)
        elif file_name.endswith("anno.csv"):
            anno_list.append(full_path)

In [7]:
# Plain string sort (so e.g. cluster10 comes right after cluster1). The export
# loop pairs this list with dge_list by position, so both lists have to be
# sorted the same way.
anno_list = sorted(anno_list)
# Show the sorted paths.
anno_list

['/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster0/cluster0_anno.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster1/cluster1_anno.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster10/cluster10_anno.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster100/cluster100_anno.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster101/cluster101_anno.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster102/cluster102_anno.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster103/cluster103_anno.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster104/cluster104_anno.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster105/cluster105_anno.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster106/cluster106_anno.csv',
 '/media/ggj/ggjlab/RData/agin

In [8]:
# Plain string sort (so e.g. cluster10 comes right after cluster1). The export
# loop pairs this list with anno_list by position, so both lists have to be
# sorted the same way.
dge_list = sorted(dge_list)
# Show the sorted paths.
dge_list

['/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster0/cluster0_dge.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster1/cluster1_dge.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster10/cluster10_dge.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster100/cluster100_dge.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster101/cluster101_dge.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster102/cluster102_dge.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster103/cluster103_dge.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster104/cluster104_dge.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster105/cluster105_dge.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA2/all/pseudocell_cluster/cluster106/cluster106_dge.csv',
 '/media/ggj/ggjlab/RData/agingatlas/MCA

In [9]:
# NOTE(review): this runs before the working directory is changed, so the
# folder is created relative to wherever the kernel was started, and none of
# the visible cells write into ../pseudo_result — confirm it is still needed.
os.makedirs("../pseudo_result", exist_ok=True)

In [10]:
# Change the working directory so that the relative "./pseudocell_cluster/..."
# output paths used by the export loop resolve under this folder.
os.chdir("/media/ggj/ggjlab/RData/agingatlas/MCA2/all/")

In [11]:
# Build pseudocells for every cluster and write the averaged expression and
# the pseudocell annotation next to each other in ./pseudocell_cluster/.
#
# dge_list and anno_list are sorted independently and paired by position, so a
# cluster directory that is missing one of its two files would shift every
# following pair. Check the pairing up front, before any output is written.
if len(dge_list) != len(anno_list):
    raise ValueError(
        "found %d DGE files but %d annotation files" % (len(dge_list), len(anno_list))
    )
mismatched_pairs = [
    (dge_file, anno_file)
    for dge_file, anno_file in zip(dge_list, anno_list)
    if os.path.dirname(dge_file) != os.path.dirname(anno_file)
]
if mismatched_pairs:
    raise ValueError(
        "DGE/annotation files are not from the same directory: %r" % (mismatched_pairs[0],)
    )

for dge, anno in zip(dge_list, anno_list):
    # Output files are named after the part of the DGE filename before the first "_".
    tissuename = os.path.basename(dge).split("_")[0]
    out_dge_path = "./pseudocell_cluster/"+tissuename+"_dge_new.csv"
    out_anno_path = "./pseudocell_cluster/"+tissuename+"_anno_new.csv"
    data_mean, new_anno = Pseudocell_analysis_pipeline(dge,
                                                       anno,
                                                       pseudocell_size=100,
                                                       discard_t=0.2)
    data_mean.to_csv(out_dge_path)
    new_anno.to_csv(out_anno_path)
