# Option 1

In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import json
import os
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
from tqdm import tqdm

## Load Authors H5AD Data

In [3]:
# Root directory holding the Replogle 2022 inputs used by this notebook;
# the listing below shows which files are available there.
replogle_dir = Path('/data2/czbenchmarks/replogle2022')
! ls {replogle_dir}

．
deg_jg
K562
k562_ctrl_for_batch_integration.h5ad
K562_essential_normalized_singlecell_01.h5ad
K562_essential_raw_singlecell_01.h5ad
K562_gwps_normalized_singlecell_01.h5ad
K562_gwps_raw_singlecell_01.h5ad
raw_cr4
raw_filt_h5ad
raw_h5ad_from_cr4
README.md


In [4]:
# Read the raw K562 "essential" single-cell AnnData, then rename columns so
# that obs carries `condition` (from gene_id) and `condition_name` (from gene),
# and var carries `gene` (from gene_name).
adata = sc.read_h5ad(f"{replogle_dir}/K562_essential_raw_singlecell_01.h5ad")
adata.obs = adata.obs.rename(
    columns={
        'gene_id': 'condition',
        'gene': 'condition_name',
    }
)
adata.var = adata.var.rename(
    columns={
        'gene_name': 'gene',
    }
)
print(adata.shape)

(310385, 8563)


In [5]:
# Preview the first two rows of obs, transposed so each obs column is a row.
adata.obs.head(n=2).T

cell_barcode,AAACCCAAGAAATCCA-27,AAACCCAAGAACTTCC-31
gem_group,27,31
condition_name,NAF1,BUB1
condition,ENSG00000145414,ENSG00000169679
transcript,P1P2,P1P2
gene_transcript,5449_NAF1_P1P2_ENSG00000145414,935_BUB1_P1P2_ENSG00000169679
sgID_AB,NAF1_+_164087918.23-P1P2|NAF1_-_164087674.23-P1P2,BUB1_-_111435363.23-P1P2|BUB1_-_111435372.23-P1P2
mitopercent,0.112083,0.179895
UMI_count,11438.0,5342.0
z_gemgroup_UMI,0.013047,-1.522247
core_scale_factor,0.813253,0.844107


In [6]:
# Preview the first two rows of var (index is gene_id, `gene` holds the renamed gene_name column).
adata.var.head(n=2)

Unnamed: 0_level_0,gene,chr,start,end,class,strand,length,in_matrix,mean,std,cv,fano
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000237491,LINC01409,chr1,778747,810065,gene_version10,+,31318,True,0.137594,0.380048,2.762105,1.049733
ENSG00000228794,LINC01128,chr1,825138,868202,gene_version9,+,43064,True,0.25672,0.520162,2.026184,1.053944


## Gene Name to Ensembl ID Mapper
This will be used to map the condition from gene name to ensembl_id in the auxiliary data.

In [7]:
map_gene_name_to_ensembl_id_pkl = 'map_gene_name_to_ensembl_id.pkl'

# Build the lookup from the per-cell metadata: keep each distinct
# (condition_name, condition) pair once, index it by the gene name, and expose
# the Ensembl ID under the column name `ensembl_id`.
condition_pairs = adata.obs[['condition_name', 'condition']].drop_duplicates()
obs_mapper = (
    condition_pairs
    .set_index('condition_name')
    .rename(columns={'condition': 'ensembl_id'})
)
# The mapping has to be one-to-one: as many rows as distinct Ensembl IDs and
# as many rows as distinct gene names.
assert len(obs_mapper) == obs_mapper['ensembl_id'].nunique() == obs_mapper.index.nunique()

# Alternative built from adata.var, kept for reference (not used, see below):
# var_mapper = adata.var[['gene_name']].reset_index().rename(columns={'gene_name':'gene', 'gene_id':'ensembl_id'}).set_index('gene')
# assert var_mapper.shape[0] == var_mapper['ensembl_id'].nunique() == var_mapper.index.nunique()

map_gene_name_to_ensembl_id = obs_mapper # var_mapper is not fully unique
# map_gene_name_to_ensembl_id.to_pickle(map_gene_name_to_ensembl_id_pkl)

In [8]:
# Preview the gene-name -> Ensembl ID lookup (index: condition_name, column: ensembl_id).
map_gene_name_to_ensembl_id.head()

Unnamed: 0_level_0,ensembl_id
condition_name,Unnamed: 1_level_1
NAF1,ENSG00000145414
BUB1,ENSG00000169679
UBL5,ENSG00000198258
C9orf16,ENSG00000171159
TIMM9,ENSG00000100575


## Background Matched Controls

In [25]:
control_cells_path = replogle_dir / "deg_jg/ReplogleEssentialsCr4_GEM_libsizeMatched_NonTargetingCellIdsPerTarget.json"

# Parse the JSON file into a dict; its keys are looked up as gene names further down.
control_cells_ids = json.loads(control_cells_path.read_text())

# Drop the 'non-targeting' entry; pop() returns its value, which the
# notebook displays as this cell's output.
control_cells_ids.pop('non-targeting')

[]

In [26]:
# Re-key the matched control cells from gene name to Ensembl ID.
#
# Inputs (defined in earlier cells):
#   control_cells_ids            dict: gene name -> collection of control cell ids
#   map_gene_name_to_ensembl_id  DataFrame indexed by gene name, column `ensembl_id`
#   adata                        AnnData whose obs index holds the cell ids and
#                                whose obs['condition'] holds Ensembl IDs
# Outputs:
#   control_cells_ensembl_ids    dict: Ensembl ID -> list of control cell ids
#   skipped_conditions           Ensembl IDs that do not occur in adata.obs['condition']
#
# A gene name missing from the mapper raises KeyError (unchanged behaviour).

# Sets give constant-time membership tests inside the loop.
index_set = set(adata.obs.index.values)
condition_set = set(adata.obs['condition'].unique())

# Map gene names to ensembl IDs
skipped_conditions = []
control_cells_ensembl_ids = {}
for key, cell_ids in control_cells_ids.items():
    ensembl_id = map_gene_name_to_ensembl_id.loc[key].values[0]

    if len(cell_ids) == 0:
        print(f"No control cells for {key}")
        continue

    # Array-like entries must be one-dimensional. This used to be the first
    # arm of an if/elif/else chain, so a non-list entry was validated and then
    # silently dropped; it is now a plain guard and processing continues.
    if not isinstance(cell_ids, list):
        assert cell_ids.ndim == 1

    if ensembl_id not in condition_set:
        skipped_conditions.append(ensembl_id)
        continue

    # Every control cell id must be a real cell in adata.
    assert all(x in index_set for x in cell_ids)
    control_cells_ensembl_ids[ensembl_id] = list(cell_ids)

print(f"Skipped {len(skipped_conditions)} conditions: {skipped_conditions}")
print(list(control_cells_ensembl_ids.keys())[:10])
# print(list(control_cells_ensembl_ids['ENSG00000094914'])[:10])

Skipped 0 conditions: []
['ENSG00000094914', 'ENSG00000127837', 'ENSG00000090861', 'ENSG00000124608', 'ENSG00000149313', 'ENSG00000275700', 'ENSG00000131269', 'ENSG00000135776', 'ENSG00000164163', 'ENSG00000204574']


## Differential Expression Data

In [37]:
# Load differential-expression results per test type from a table on disk and
# attach the gene name (`condition_name`) for each row's `condition`.
#
# NOTE(review): the next cell reassigns both `de_base_path` and `de_results`,
# so when the notebook is run top to bottom whatever this cell builds is
# discarded. Confirm whether this cell is still needed.
# NOTE(review): the path ends in `.arrow` but is read with `pd.read_parquet`;
# confirm the on-disk format.
de_base_path = "/data2/compare_de_results_{de_type}.arrow"

de_results = {}
for de_type in ['wilcoxon']:
    de_data_path = de_base_path.format(de_type=de_type)
    de_data = pd.read_parquet(de_data_path)

    # Report which columns contain at least one null value.
    col_has_nulls = de_data.isnull().any()
    col_has_nulls = col_has_nulls[col_has_nulls].index.values
    print(f'The following dataframe columns contain nulls: {col_has_nulls}')

    # No column name mapping required for Wilcoxon

    # Left-join against the mapper re-indexed by `ensembl_id`, so each row's
    # `condition` value picks up the matching `condition_name`. The row count
    # must not change.
    orig_size = len(de_data)
    de_data = pd.merge(de_data, 
                       map_gene_name_to_ensembl_id.reset_index().set_index('ensembl_id'), 
                       left_on='condition', 
                       right_index=True, 
                       how='left')
    assert len(de_data) == orig_size
    # NOTE(review): `condition` is the left-hand join key, so this assert does
    # not show that the lookup succeeded; the merged-in column is
    # `condition_name`. Confirm which column was meant.
    assert not de_data['condition'].isnull().any()
    
    # de_data.index = de_data.index.astype(str)
    de_results[de_type] = de_data

The following dataframe columns contain nulls: []


In [37]:
# old path "deg_jg/ReplogleEssentialsCr4_GEM_SigsOnly_wilcoxon-removeAvgZerosFalse-filtMinCells10.txt"

# Load the per-test DE result CSVs, harmonise their column names, and attach
# the Ensembl ID of each perturbation as `condition`.
de_base_path = replogle_dir / "K562" / "zero_shot_benchmark"
de_results = {}

# Per test type: raw CSV column name -> harmonised name. Only the listed
# columns are kept, in this order.
column_mappers = {
    'wilcoxon': {
        'names': 'gene_id',
        'target_gene': 'condition_name',
        'scores': 'score',
        'logfoldchanges': 'logfoldchange',
        'pvals': 'pval',
        'pvals_adj': 'pval_adj',
    },
    't_test': {
        'gene': 'gene_id',
        'condition': 'condition_name',
        'score': 'score',
        'logfoldchange': 'logfoldchange',
        'pval': 'pval',
        'pval_adj': 'pval_adj',
        'smd': 'standardized_mean_diff',
        'group': 'group',
    },
}
# Per test type: harmonised columns that are exempt from the no-null check.
nullable_columns = {
    'wilcoxon': [],
    't_test': ['logfoldchange'],
}

for de_type in ['wilcoxon', 't_test']:
    print(de_type)
    de_data_path = de_base_path / de_type / "de_results.csv"
    de_data = pd.read_csv(de_data_path)

    # Report which raw columns contain at least one null value.
    null_flags = de_data.isnull().any()
    col_has_nulls = null_flags[null_flags].index.values
    print(f'The following dataframe columns contain nulls: {col_has_nulls}')

    # Rename, keep only the mapped columns, and require them to be null-free
    # apart from the exempt ones.
    col_mapper = column_mappers[de_type]
    kept_columns = list(col_mapper.values())
    de_data = de_data.rename(columns=col_mapper)[kept_columns]
    assert not de_data.drop(columns=nullable_columns[de_type]).isnull().any().any()

    # Left-join on the gene name to bring in the Ensembl ID; the join must not
    # change the row count and every row must receive an ID.
    orig_size = len(de_data)
    de_data = de_data.merge(
        map_gene_name_to_ensembl_id,
        left_on='condition_name',
        right_index=True,
        how='left',
    )
    de_data = de_data.rename(columns={'ensembl_id': 'condition'})
    assert len(de_data) == orig_size
    assert not de_data['condition'].isnull().any()

    # de_data.index = de_data.index.astype(str)
    de_results[de_type] = de_data

wilcoxon
The following dataframe columns contain nulls: ['gene_name' 'gene_length' 'log_genelen']
t_test
The following dataframe columns contain nulls: ['logfoldchange']


In [38]:
# Preview the harmonised Wilcoxon DE table.
de_results['wilcoxon'].head()

Unnamed: 0,gene_id,condition_name,score,logfoldchange,pval,pval_adj,condition
0,ENSG00000245910,SLU7,10.678079,1.04975,1.289127e-26,1.4359590000000001e-22,ENSG00000164609
1,ENSG00000177410,SLU7,9.709518,1.018977,2.746308e-22,1.529556e-18,ENSG00000164609
2,ENSG00000255717,SLU7,9.608023,1.691616,7.395531e-22,2.745961e-18,ENSG00000164609
3,ENSG00000149806,SLU7,9.155279,0.675583,5.421733e-20,1.207854e-16,ENSG00000164609
4,ENSG00000161970,SLU7,9.070821,0.609069,1.181242e-19,2.192975e-16,ENSG00000164609


In [17]:
# Preview the harmonised t-test DE table.
de_results['t_test'].head()

Unnamed: 0,gene_id,condition_name,score,logfoldchange,pval,pval_adj,standardized_mean_diff,group,condition
0,ENSG00000113161,NAF1,3.641039,,0.000357,0.998205,0.5311,NAF1,ENSG00000145414
1,ENSG00000215251,NAF1,3.35528,,0.000997,0.998205,0.489418,NAF1,ENSG00000145414
2,ENSG00000178694,NAF1,3.352882,,0.001035,0.998205,0.489068,NAF1,ENSG00000145414
3,ENSG00000131148,NAF1,3.333118,,0.001052,0.998205,0.486185,NAF1,ENSG00000145414
4,ENSG00000196418,NAF1,3.263372,,0.001313,0.998205,0.476012,NAF1,ENSG00000145414


In [18]:
# Same obs preview as earlier in the notebook: first two rows, transposed.
adata.obs.head(n=2).T

cell_barcode,AAACCCAAGAAATCCA-27,AAACCCAAGAACTTCC-31
gem_group,27,31
condition_name,NAF1,BUB1
condition,ENSG00000145414,ENSG00000169679
transcript,P1P2,P1P2
gene_transcript,5449_NAF1_P1P2_ENSG00000145414,935_BUB1_P1P2_ENSG00000169679
sgID_AB,NAF1_+_164087918.23-P1P2|NAF1_-_164087674.23-P1P2,BUB1_-_111435363.23-P1P2|BUB1_-_111435372.23-P1P2
mitopercent,0.112083,0.179895
UMI_count,11438.0,5342.0
z_gemgroup_UMI,0.013047,-1.522247
core_scale_factor,0.813253,0.844107


In [19]:
# Preview the first two rows of var, transposed so each var column is a row.
adata.var.head(n=2).T

gene_id,ENSG00000237491,ENSG00000228794
gene,LINC01409,LINC01128
chr,chr1,chr1
start,778747,825138
end,810065,868202
class,gene_version10,gene_version9
strand,+,+
length,31318,43064
in_matrix,True,True
mean,0.137594,0.25672
std,0.380048,0.520162


## Update and Write H5AD Ground Truth

In [42]:
# Attach the control-cell mapping and both DE tables to adata.uns, then write
# the combined AnnData to disk.
output_h5ad_name = '/data2/czbenchmarks/replogle_k562_essential_perturbpredict_de_results_control_cells.h5ad'

# (progress message, adata.uns key, value) in the order they are stored.
uns_entries = [
    ('Adding control cell ids', 'control_cells_ids', control_cells_ensembl_ids),
    ('Adding de results wilcoxon', 'de_results_wilcoxon', de_results['wilcoxon']),
    ('Adding de results t_test', 'de_results_t_test', de_results['t_test']),
]
for message, uns_key, payload in uns_entries:
    print(message)
    adata.uns[uns_key] = payload

print('Saving h5ad file')
sc.write(output_h5ad_name, adata)
print('Finished')

Adding control cell ids
Adding de results wilcoxon
Adding de results t_test
Saving h5ad file
Finished


In [44]:
# Print the MD5 checksum of the file that was just written.
! md5sum {output_h5ad_name}

838a10967f13fcddec6062c9b78d3a19  /data2/czbenchmarks/replogle_k562_essential_perturbpredict_de_results_control_cells.h5ad


In [45]:
# Copy the written h5ad into the local cz-benchmarks dataset cache.
print('Copying to cache')
output_file_basename = os.path.basename(output_h5ad_name)

# Resolve the cache under the current user's home directory instead of a
# hard-coded /home/<user> path, and create it if it does not exist yet.
cache_dir = Path.home() / '.cz-benchmarks' / 'datasets'
cache_dir.mkdir(parents=True, exist_ok=True)

# shutil.copy raises on failure, whereas a failing `! cp` only printed an
# error and the cell still reported 'Finished'.
shutil.copy(output_h5ad_name, cache_dir / output_file_basename)

print('Finished')

Copying to cache
Finished


In [46]:
# Upload the written h5ad to the `swift-bio-foundation-s3` rclone remote.
print('Copying to s3')
output_file_basename = os.path.basename(output_h5ad_name)
# NOTE(review): `rclone copy` treats its destination as a directory, so this
# presumably stores the object under <basename>/<basename> rather than at
# <basename>; confirm the resulting key, and whether `rclone copyto` was intended.
! rclone copy {output_h5ad_name}  swift-bio-foundation-s3:/scgym-files/{output_file_basename} --verbose
print('Finished')

Copying to s3
2025/08/21 04:20:11 NOTICE: s3: s3 provider "" not known - please set correctly
2025/08/21 04:21:11 INFO  : 
Transferred:   	    2.361 GiB / 4.609 GiB, 51%, 46.329 MiB/s, ETA 49s
Transferred:            0 / 1, 0%
Elapsed time:        59.9s
Transferring:
 * replogle_k562_essentia…lts_control_cells.h5ad: 51% /4.609Gi, 47.565Mi/s, 48s

2025/08/21 04:22:08 INFO  : replogle_k562_essential_perturbpredict_de_results_control_cells.h5ad: Multi-thread Copied (replaced existing)
2025/08/21 04:22:08 INFO  : 
Transferred:   	    4.609 GiB / 4.609 GiB, 100%, 36.083 MiB/s, ETA 0s
Transferred:            1 / 1, 100%
Elapsed time:      1m56.1s

Finished


In [47]:
# Sanity check: AnnData shape, number of conditions with control cells, and the shapes of both DE tables.
adata.shape, len(adata.uns['control_cells_ids'].keys()), adata.uns['de_results_wilcoxon'].shape, adata.uns['de_results_t_test'].shape

((310385, 8563), 2058, (20793007, 7), (16334637, 9))

In [15]:
# Same sanity check as the cell directly above (this copy carries an older execution count).
adata.shape, len(adata.uns['control_cells_ids'].keys()), adata.uns['de_results_wilcoxon'].shape, adata.uns['de_results_t_test'].shape

((310385, 8563), 2058, (20793007, 7), (16334637, 9))