# Goal

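Here, we subset the AnnData object to significant DEGs (adjusted p-value < 0.01) called at different gene expression filter thresholds (fraction of cells with > 0 counts: 0.05, 0.1 and 0.2), z-score normalize each subset against control cells, and save both the single-cell and the pseudobulk (mean per ligand and replicate) AnnData objects.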

In [None]:
import pandas as pd
import numpy as np
import scanpy as sc
import pandas as pd
import sys
sys.path.insert(0, '/data1/rudenska/EYW/git_projects/SIG13/functions')
import perturbseq as ps

# Z-score processing 0.05 filter genes

In [2]:
# Full single-cell object; later cells in this section modify it in place.
adata = sc.read_h5ad("/data1/rudenska/EYW/SIG13/scanpy_outs/SIG13_doublets_DSB7.h5ad")
# glmGamPoi single-term log-fold-change table for the 0.05 expression filter
# (per the file name); the `adj_pval` and `name` columns are used below.
singleLfc = pd.read_csv("/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/glmGamPoi/glmGamPoi_single_term/glmGamPoi_singleTerm_lfc_0.05filter.csv")

In [3]:
## DEGs: unique gene names with at least one row at adj_pval < 0.01
is_significant = singleLfc['adj_pval'] < 0.01
genes_selected = singleLfc.loc[is_significant, 'name'].unique().tolist()

In [4]:
## use the 'norm' layer as X, then remove the three named layers
## (presumably to reduce memory; note this cell cannot be re-run without reloading adata)
adata.X = adata.layers['norm'].copy()
for layer_key in ('norm', 'log1p_norm', 'counts'):
    del adata.layers[layer_key]

In [5]:
## restrict to the selected DEGs and materialize an independent copy
## (despite the `_hvg` suffix, these are the DEGs in `genes_selected`)
adata_hvg = adata[:, genes_selected].copy()

In [6]:
## z-score against control cells, grouped by replicate + lane
# Build the grouping key "<replicate>_<lane>" as a string column.
replicate_str = adata_hvg.obs['replicate'].astype(str)
lane_str = adata_hvg.obs['lane'].astype(str)
adata_hvg.obs['replicate_lane'] = replicate_str + '_' + lane_str

# Project helper (perturbseq); its internals are not visible here. It is given
# the control-cell query and the grouping column, and the result is expected to
# carry a 'zscore' layer, which is read on the next lines.
adata_hvg = ps.normalize_to_control_adata_multithread(
    adata_hvg,
    control_cells_query='ligand_call_DSB7 == "linker_linker"',
    groupby_column='replicate_lane',
)

# Make the z-scores the main matrix and drop the now-redundant layer.
adata_hvg.X = adata_hvg.layers['zscore'].copy()
del adata_hvg.layers['zscore']

Normalizing groups: 100%|██████████| 4/4 [01:13<00:00, 18.40s/it]


In [7]:
## save the single-cell object for the 0.05 cutoff
adata_hvg.write_h5ad("/data1/rudenska/EYW/SIG13/scanpy_outs/SIG13_doublets_DSB7_zscore_degs0.05cutoff.h5ad")

In [8]:
# pseudobulk: mean of X within each ligand x replicate group
group_keys = ['ligand_call_DSB7', 'replicate']
adata_pb = sc.get.aggregate(adata_hvg, by=group_keys, func='mean')

# the aggregated values are stored in layers['mean']; move them into X
adata_pb.X = adata_pb.layers['mean'].copy()
del adata_pb.layers['mean']

# combined "<ligand>_<replicate>" label for each pseudobulk observation
pb_ligand = adata_pb.obs['ligand_call_DSB7'].astype(str)
pb_replicate = adata_pb.obs['replicate'].astype(str)
adata_pb.obs['ligand_replicate'] = pb_ligand + '_' + pb_replicate

adata_pb.write_h5ad("/data1/rudenska/EYW/SIG13/scanpy_outs/SIG13_doublets_DSB7_zscore_degs0.05cutoff_pb.h5ad")

# Z-score processing 0.1 filter genes

In [9]:
# Re-read the full single-cell object from disk: a later cell in this section
# deletes layers from `adata`, so it has to start from the stored file.
adata = sc.read_h5ad("/data1/rudenska/EYW/SIG13/scanpy_outs/SIG13_doublets_DSB7.h5ad")
# glmGamPoi single-term log-fold-change table for the 0.1 expression filter
# (per the file name); the `adj_pval` and `name` columns are used below.
singleLfc = pd.read_csv("/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/glmGamPoi/glmGamPoi_single_term/glmGamPoi_singleTerm_lfc_0.1filter.csv")

In [10]:
## DEGs: unique gene names with at least one row at adj_pval < 0.01
is_significant = singleLfc['adj_pval'] < 0.01
genes_selected = singleLfc.loc[is_significant, 'name'].unique().tolist()

In [11]:
## use the 'norm' layer as X, then remove the three named layers
## (presumably to reduce memory; note this cell cannot be re-run without reloading adata)
adata.X = adata.layers['norm'].copy()
for layer_key in ('norm', 'log1p_norm', 'counts'):
    del adata.layers[layer_key]

In [12]:
## restrict to the selected DEGs and materialize an independent copy
## (despite the `_hvg` suffix, these are the DEGs in `genes_selected`)
adata_hvg = adata[:, genes_selected].copy()

In [13]:
## z-score against control cells, grouped by replicate + lane
# Build the grouping key "<replicate>_<lane>" as a string column.
replicate_str = adata_hvg.obs['replicate'].astype(str)
lane_str = adata_hvg.obs['lane'].astype(str)
adata_hvg.obs['replicate_lane'] = replicate_str + '_' + lane_str

# Project helper (perturbseq); its internals are not visible here. It is given
# the control-cell query and the grouping column, and the result is expected to
# carry a 'zscore' layer, which is read on the next lines.
adata_hvg = ps.normalize_to_control_adata_multithread(
    adata_hvg,
    control_cells_query='ligand_call_DSB7 == "linker_linker"',
    groupby_column='replicate_lane',
)

# Make the z-scores the main matrix and drop the now-redundant layer.
adata_hvg.X = adata_hvg.layers['zscore'].copy()
del adata_hvg.layers['zscore']

Normalizing groups: 100%|██████████| 4/4 [01:02<00:00, 15.63s/it]


In [14]:
## save the single-cell object for the 0.1 cutoff
adata_hvg.write_h5ad("/data1/rudenska/EYW/SIG13/scanpy_outs/SIG13_doublets_DSB7_zscore_degs0.1cutoff.h5ad")

In [15]:
# pseudobulk: mean of X within each ligand x replicate group
group_keys = ['ligand_call_DSB7', 'replicate']
adata_pb = sc.get.aggregate(adata_hvg, by=group_keys, func='mean')

# the aggregated values are stored in layers['mean']; move them into X
adata_pb.X = adata_pb.layers['mean'].copy()
del adata_pb.layers['mean']

# combined "<ligand>_<replicate>" label for each pseudobulk observation
pb_ligand = adata_pb.obs['ligand_call_DSB7'].astype(str)
pb_replicate = adata_pb.obs['replicate'].astype(str)
adata_pb.obs['ligand_replicate'] = pb_ligand + '_' + pb_replicate

adata_pb.write_h5ad("/data1/rudenska/EYW/SIG13/scanpy_outs/SIG13_doublets_DSB7_zscore_degs0.1cutoff_pb.h5ad")

# Z-score processing 0.2 filter genes

In [16]:
# Re-read the full single-cell object from disk: a later cell in this section
# deletes layers from `adata`, so it has to start from the stored file.
adata = sc.read_h5ad("/data1/rudenska/EYW/SIG13/scanpy_outs/SIG13_doublets_DSB7.h5ad")
# glmGamPoi single-term log-fold-change table for the 0.2 expression filter
# (per the file name); the `adj_pval` and `name` columns are used below.
singleLfc = pd.read_csv("/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/glmGamPoi/glmGamPoi_single_term/glmGamPoi_singleTerm_lfc_0.2filter.csv")

In [17]:
## DEGs: unique gene names with at least one row at adj_pval < 0.01
is_significant = singleLfc['adj_pval'] < 0.01
genes_selected = singleLfc.loc[is_significant, 'name'].unique().tolist()

In [18]:
## use the 'norm' layer as X, then remove the three named layers
## (presumably to reduce memory; note this cell cannot be re-run without reloading adata)
adata.X = adata.layers['norm'].copy()
for layer_key in ('norm', 'log1p_norm', 'counts'):
    del adata.layers[layer_key]

In [19]:
## restrict to the selected DEGs and materialize an independent copy
## (despite the `_hvg` suffix, these are the DEGs in `genes_selected`)
adata_hvg = adata[:, genes_selected].copy()

In [20]:
## z-score against control cells, grouped by replicate + lane
# Build the grouping key "<replicate>_<lane>" as a string column.
replicate_str = adata_hvg.obs['replicate'].astype(str)
lane_str = adata_hvg.obs['lane'].astype(str)
adata_hvg.obs['replicate_lane'] = replicate_str + '_' + lane_str

# Project helper (perturbseq); its internals are not visible here. It is given
# the control-cell query and the grouping column, and the result is expected to
# carry a 'zscore' layer, which is read on the next lines.
adata_hvg = ps.normalize_to_control_adata_multithread(
    adata_hvg,
    control_cells_query='ligand_call_DSB7 == "linker_linker"',
    groupby_column='replicate_lane',
)

# Make the z-scores the main matrix and drop the now-redundant layer.
adata_hvg.X = adata_hvg.layers['zscore'].copy()
del adata_hvg.layers['zscore']

Normalizing groups: 100%|██████████| 4/4 [09:08<00:00, 137.09s/it]


In [21]:
## save the single-cell object for the 0.2 cutoff
adata_hvg.write_h5ad("/data1/rudenska/EYW/SIG13/scanpy_outs/SIG13_doublets_DSB7_zscore_degs0.2cutoff.h5ad")

In [22]:
# pseudobulk: mean of X within each ligand x replicate group
group_keys = ['ligand_call_DSB7', 'replicate']
adata_pb = sc.get.aggregate(adata_hvg, by=group_keys, func='mean')

# the aggregated values are stored in layers['mean']; move them into X
adata_pb.X = adata_pb.layers['mean'].copy()
del adata_pb.layers['mean']

# combined "<ligand>_<replicate>" label for each pseudobulk observation
pb_ligand = adata_pb.obs['ligand_call_DSB7'].astype(str)
pb_replicate = adata_pb.obs['replicate'].astype(str)
adata_pb.obs['ligand_replicate'] = pb_ligand + '_' + pb_replicate

adata_pb.write_h5ad("/data1/rudenska/EYW/SIG13/scanpy_outs/SIG13_doublets_DSB7_zscore_degs0.2cutoff_pb.h5ad")