# Select targets for donor robustness analysis

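Load the per-target DE summary statistics, keep the targets that pass minimum thresholds on the number of DE genes and perturbed cells, and draw a random subsample of those targets separately for each condition. The selected target names are written to one text file per condition.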

In [11]:
import pandas as pd
import numpy as np
import os

In [12]:
# Root folder of the CD4i_final dataset. The trailing slash is kept on purpose:
# results_dir below is built from it by plain string concatenation.
datadir = '/mnt/oak/users/emma/data/GWT/CD4i_final/'
# Folder that receives the files written by this notebook.
results_dir = datadir + 'donor_robustness_analysis/'
# exist_ok=True makes this cell safe to re-run and removes the
# check-then-create race of os.path.exists() followed by os.mkdir();
# makedirs also creates any missing intermediate directories.
os.makedirs(results_dir, exist_ok=True)

In [13]:
# Load DE summary statistics (one row per target/condition entry).
# os.path.join is used instead of an f-string with a hard-coded '/' because
# datadir already ends with a separator; joining avoids the doubled '//' and
# matches how output paths are built elsewhere in this notebook.
de_summary_path = os.path.join(
    datadir, 'DE_results_all_confounders', 'DE_summary_stats_per_target.csv'
)
de_counts = pd.read_csv(de_summary_path, index_col=0)
# Preview the first rows to sanity-check the columns that were read.
de_counts.head()

Unnamed: 0,target_contrast,target_name,condition,n_cells_target,n_up_genes,n_down_genes,n_total_de_genes,ontarget_effect_size,ontarget_significant,baseMean,offtarget_flag,n_total_genes_category,ontarget_effect_category
0,ENSG00000012963,UBR7,Stim8hr,491.0,0,2,2,-12.952742,True,43.169196,True,2-10 DE genes,on-target KD
1,ENSG00000017260,ATP2C1,Stim8hr,469.0,0,1,1,-16.307246,True,102.399025,False,1 DE gene,on-target KD
2,ENSG00000067606,PRKCZ,Stim8hr,427.0,1,1,2,-1.658755,False,0.965897,False,2-10 DE genes,no on-target KD
3,ENSG00000092929,UNC13D,Stim8hr,830.0,0,2,2,-19.259466,True,60.904483,False,2-10 DE genes,on-target KD
4,ENSG00000100504,PYGL,Stim8hr,414.0,1,0,1,,False,,False,1 DE gene,no on-target KD


In [14]:
# Quality thresholds: an entry is kept only when it has at least this many
# DE genes and at least this many perturbed cells.
min_de_genes = 30
min_cells = 75

# Express each criterion as its own boolean mask, then require both.
has_enough_de_genes = de_counts['n_total_de_genes'].ge(min_de_genes)
has_enough_cells = de_counts['n_cells_target'].ge(min_cells)

# .copy() detaches the result from de_counts so later edits do not touch the
# original table.
high_quality_targets = de_counts.loc[has_enough_de_genes & has_enough_cells].copy()

# Summarise how many entries / distinct targets passed, overall and per condition.
n_entries = len(high_quality_targets)
n_unique_targets = high_quality_targets['target_name'].nunique()
entries_per_condition = high_quality_targets['condition'].value_counts()

print(f"Targets with >= {min_de_genes} DE genes and >= {min_cells} cells:")
print(f"Total entries: {n_entries}")
print(f"Unique targets: {n_unique_targets}")
print(f"By condition: {entries_per_condition}")

Targets with >= 30 DE genes and >= 75 cells:
Total entries: 4696
Unique targets: 2505
By condition: condition
Stim8hr     1627
Stim48hr    1539
Rest        1530
Name: count, dtype: int64


In [16]:
# Seed NumPy's global RNG so the random draws below are repeatable.
np.random.seed(42)
n_targets_per_condition = 50  # Upper bound on the number of targets drawn per condition

selected_targets_by_condition = {}

# Conditions are visited in order of first appearance in the filtered table.
# All draws share the global RNG, so this order is part of what makes the
# selection reproducible.
all_conditions = high_quality_targets['condition'].unique()
for condition in all_conditions:
    in_condition = high_quality_targets['condition'] == condition
    condition_targets = high_quality_targets.loc[in_condition, 'target_name'].unique()

    # Never request more targets than the condition actually has.
    n_to_select = min(n_targets_per_condition, len(condition_targets))
    # Sample without replacement so each target is picked at most once.
    selected = np.random.choice(condition_targets, size=n_to_select, replace=False)
    selected_targets_by_condition[condition] = selected

    print(f"\n{condition}: Selected {len(selected)} targets from {len(condition_targets)} available")
    for target in sorted(selected):
        print(f"  {target}")

# Write one text file per condition: alphabetically sorted, one target name per line.
for condition, targets in selected_targets_by_condition.items():
    output_file = os.path.join(results_dir, f'selected_targets_robustness_{condition}.txt')
    with open(output_file, 'w') as f:
        f.writelines(f"{target}\n" for target in sorted(targets))


Stim8hr: Selected 50 targets from 1627 available
  ABCA1
  ANKZF1
  APPL2
  BCAT2
  BHLHE40
  BRD8
  C2CD2L
  C9orf64
  CNBP
  CPSF6
  CRIPT
  CTSZ
  EPB41L3
  EXOC1
  FAM69A
  FEM1A
  FITM2
  GATAD1
  GPT2
  HDAC3
  IL2RB
  INO80
  KPNA1
  LAMTOR2
  LAMTOR4
  MALT1
  MPG
  MRPS21
  NIPSNAP3A
  NUDT5
  OSTF1
  PAXIP1
  PPP1R11
  PTK6
  RAP2A
  RC3H1
  SMARCC1
  SND1
  SP2
  ST14
  SYK
  TADA2B
  TBL1X
  TMEM214
  TTF1
  UBE2V2
  USO1
  USP10
  ZNF131
  ZNF574

Stim48hr: Selected 50 targets from 1539 available
  ABHD17A
  ANP32B
  AP2A1
  ATF7IP
  ATG101
  BLOC1S2
  BORCS8-MEF2B
  BRD1
  CCR6
  CDC25B
  CDCA3
  CHIC2
  CNOT11
  CRLS1
  CTDSPL2
  DMXL1
  EMC6
  EP300
  FAM204A
  FDX1
  FGD3
  GIPC3
  GOLGA8A
  HDHD3
  HPS1
  ICOS
  KAT6A
  KIF22
  L3MBTL2
  MCM8
  MEF2D
  MRM2
  MRPS25
  MTPAP
  PAK2
  PCBP1
  PPP1R15B
  PSD
  RBM27
  RMDN3
  SCO1
  SYVN1
  TIMP1
  TMEM30A
  TNPO1
  TNRC6A
  TSC2
  WIZ
  XPR1
  ZNF574

Rest: Selected 50 targets from 1530 available
  APEX1
  AVPR2
  CALR