### Correct the gene names and gene IDs in the pseudobulk, DE results, and summary statistics

In [12]:
import anndata as ad
import pandas as pd

In [13]:
# Load the three inputs that get corrected below: the merged DE results, the
# DE pseudobulk AnnData, and the per-target DE summary table.
experiment_name = 'CD4i_final'
datadir = '/Users/rzhu/Gladstone Dropbox/Ronghui Zhu/GRNPerturbSeq/3_expts/processed_data/CD4i_final'

de_results_path = datadir + f'/DE_results_all_confounders/{experiment_name}.merged_DE_results.h5ad'
pseudobulk_path = datadir + '/CD4i_final_merged.DE_pseudobulk.h5ad'
summary_stats_path = datadir + '/DE_results_all_confounders/DE_summary_stats_per_target.csv'

adata_de = ad.read_h5ad(de_results_path)
adata_pb = ad.read_h5ad(pseudobulk_path)
# The first CSV column is used as the row index.
de_summary_stats = pd.read_csv(summary_stats_path, index_col=0)

In [None]:
# Build the reference lookup tables from the sgRNA annotation. Both tables
# hold the unique (gene name, gene_id) pairs and are indexed by gene_id while
# keeping gene_id as a regular column:
#   oldname_geneid -> gene name as given by the sgRNA
#   newname_geneid -> corrected gene name
name_df = pd.read_parquet('../1_preprocess/sgRNA_annotation/genome/GWCD4CRISPRi_targeting_sgRNA_list_updated.parquet')

oldname_geneid = (
    name_df[['target_gene_name_from_sgRNA', 'gene_id']]
    .drop_duplicates()
    .set_index('gene_id', drop=False)
)
newname_geneid = (
    name_df[['corrected_target_gene_name', 'gene_id']]
    .drop_duplicates()
    .set_index('gene_id', drop=False)
)

In [12]:
# Some DE target ids are absent from the reference annotation. For the gene
# names attached to those ids, look the gene id up in the reference by the
# original (sgRNA-derived) gene name and store it in a new column,
# `target_contrast_corrected`.
wrongid_list = list(set(adata_de.obs['target_contrast']).difference(name_df['gene_id']))
de_has_wrong_id = adata_de.obs['target_contrast'].isin(wrongid_list)
de_oldname_with_wrongid = adata_de[de_has_wrong_id].obs['target_contrast_gene_name'].unique()

# Reference rows for the affected names, re-indexed by gene name for lookup.
de_name_in_reference = oldname_geneid['target_gene_name_from_sgRNA'].isin(de_oldname_with_wrongid)
de_oldname_to_geneid = oldname_geneid[de_name_in_reference].set_index('target_gene_name_from_sgRNA')

# Start from a plain-string copy of the ids, patch it, then cast to category.
adata_de.obs['target_contrast_corrected'] = adata_de.obs['target_contrast'].astype(str)
for oldname in de_oldname_with_wrongid:
    # Every row carrying this gene name receives the reference gene id.
    de_rows_with_name = adata_de.obs['target_contrast_gene_name'] == oldname
    adata_de.obs.loc[de_rows_with_name, 'target_contrast_corrected'] = de_oldname_to_geneid.loc[oldname, 'gene_id']
adata_de.obs['target_contrast_corrected'] = adata_de.obs['target_contrast_corrected'].astype('category')

In [15]:
# Apply the same gene-id correction to the per-target summary statistics:
# ids missing from the reference are replaced, via the original gene name,
# in a new `target_contrast_corrected` column.
wrongid_list = list(set(de_summary_stats['target_contrast']).difference(name_df['gene_id']))
summary_has_wrong_id = de_summary_stats['target_contrast'].isin(wrongid_list)
de_summary_oldname_with_wrongid = de_summary_stats.loc[summary_has_wrong_id, 'target_name'].unique()

# Reference rows for the affected names, re-indexed by gene name for lookup.
summary_name_in_reference = oldname_geneid['target_gene_name_from_sgRNA'].isin(de_summary_oldname_with_wrongid)
de_summary_oldname_to_geneid = oldname_geneid[summary_name_in_reference].set_index('target_gene_name_from_sgRNA')

de_summary_stats['target_contrast_corrected'] = de_summary_stats['target_contrast'].copy()
for oldname in de_summary_oldname_with_wrongid:
    # Every row carrying this gene name receives the reference gene id.
    summary_rows_with_name = de_summary_stats['target_name'] == oldname
    de_summary_stats.loc[summary_rows_with_name, 'target_contrast_corrected'] = de_summary_oldname_to_geneid.loc[oldname, 'gene_id']

In [4]:
# Apply the same gene-id correction to the pseudobulk object: perturbed gene
# ids missing from the reference are replaced, via the original gene name, in
# a new `perturbed_gene_id_corrected` column.
#
# 'NTC' is not a reference gene id but must not be "corrected", so it is
# excluded from the wrong-id list. It is excluded with a set difference rather
# than list.remove(), which raises ValueError when 'NTC' is not among the
# unmatched ids.
# NOTE(review): 'NTC' is presumably the non-targeting control label - confirm.
wrongid_list = list(set(adata_pb.obs.perturbed_gene_id) - set(name_df.gene_id) - {'NTC'})
pb_oldname_with_wrongid = adata_pb[adata_pb.obs.perturbed_gene_id.isin(wrongid_list)].obs.perturbed_gene_name.unique()

# Reference rows for the affected names, re-indexed by gene name for lookup.
pb_oldname_to_geneid = oldname_geneid[oldname_geneid.target_gene_name_from_sgRNA.isin(pb_oldname_with_wrongid)].copy()
pb_oldname_to_geneid = pb_oldname_to_geneid.set_index('target_gene_name_from_sgRNA')

# Start from a plain-string copy of the ids, patch it, then cast to category.
adata_pb.obs['perturbed_gene_id_corrected'] = adata_pb.obs['perturbed_gene_id'].astype(str)
for oldname in pb_oldname_with_wrongid:
    # Every row carrying this gene name receives the reference gene id.
    adata_pb.obs.loc[adata_pb.obs.perturbed_gene_name==oldname, 'perturbed_gene_id_corrected']=pb_oldname_to_geneid.loc[oldname, 'gene_id']
adata_pb.obs['perturbed_gene_id_corrected'] = adata_pb.obs['perturbed_gene_id_corrected'].astype('category')

In [16]:
# Check cases where one gene id is associated with more than one gene name
names_per_gene_id = oldname_geneid['gene_id'].value_counts()
has_multiple_names = names_per_gene_id > 1
# The boolean mask is indexed by gene_id, matching oldname_geneid's index.
duplicated_cases = oldname_geneid.loc[has_multiple_names]
duplicated_cases

Unnamed: 0_level_0,target_gene_name_from_sgRNA,gene_id
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000137265,IRF4,ENSG00000137265
ENSG00000105887,MTPN,ENSG00000105887
ENSG00000196535,MYO18A,ENSG00000196535
ENSG00000137265,MUM1,ENSG00000137265
ENSG00000105887,LUZP6,ENSG00000105887
ENSG00000196535,TIAF1,ENSG00000196535


#### LUZP6 and MTPN map to the same gene ID in the current analysis. After checking the sgRNA positions in a genome browser, LUZP6-1 and LUZP6-2 target the 3' end of MTPN, and the DE summary statistics show no on-target knockdown (KD), so LUZP6 is removed from the DE results, the summary statistics, and the pseudobulk

In [17]:
# Drop every entry whose original target gene name is LUZP6 from the DE
# results, the summary table and the pseudobulk; each result is an
# independent copy rather than a view/slice of the unfiltered object.
excluded_target = 'LUZP6'
adata_de = adata_de[adata_de.obs['target_contrast_gene_name'] != excluded_target].copy()
de_summary_stats = de_summary_stats[de_summary_stats['target_name'] != excluded_target].copy()
adata_pb = adata_pb[adata_pb.obs['perturbed_gene_name'] != excluded_target].copy()

In [18]:
# Rename the DE observations to "<corrected gene id>_<culture condition>" and
# add an `obs_names` column to the summary table, built from its own
# target_contrast_corrected and condition columns in the same format.
de_target_ids = adata_de.obs['target_contrast_corrected'].astype(str).values
de_conditions = adata_de.obs['culture_condition'].astype(str).values
adata_de.obs_names = de_target_ids + '_' + de_conditions

de_summary_stats['obs_names'] = de_summary_stats['target_contrast_corrected'] + '_' + de_summary_stats['condition']

In [16]:
# Add the corrected gene name to the DE results by looking up each corrected
# gene id in the gene_id-indexed reference table, then save the result.
de_corrected_names = newname_geneid.loc[adata_de.obs['target_contrast_corrected'], 'corrected_target_gene_name']
# Assign positionally (.values) so the reference index is not used for alignment.
adata_de.obs['target_contrast_gene_name_corrected'] = de_corrected_names.values
adata_de.obs['target_contrast_gene_name_corrected'] = adata_de.obs['target_contrast_gene_name_corrected'].astype('category')

de_corrected_path = datadir + f'/DE_results_all_confounders/{experiment_name}.merged_DE_results_corrected.h5ad'
adata_de.write_h5ad(de_corrected_path)

# Add the corrected gene name to the summary table by looking up each
# corrected gene id in the gene_id-indexed reference table, then save it.
summary_corrected_names = newname_geneid.loc[de_summary_stats['target_contrast_corrected'], 'corrected_target_gene_name']
# Assign positionally (.values) so the reference index is not used for alignment.
de_summary_stats['target_name_corrected'] = summary_corrected_names.values

summary_corrected_path = datadir + '/DE_results_all_confounders/DE_summary_stats_per_target_corrected.csv'
de_summary_stats.to_csv(summary_corrected_path)

# Add the corrected gene name to the pseudobulk object and save it.
# The pseudobulk also contains 'NTC', which has no entry in the reference
# table, so extend a copy of the table with an NTC -> NTC row first.
newname_geneid_withNTC = newname_geneid.copy()
newname_geneid_withNTC.loc['NTC'] = ['NTC', 'NTC']

# Look names up by the *corrected* gene id. The uncorrected `perturbed_gene_id`
# may hold ids that are absent from the reference index, which makes .loc
# raise KeyError; the DE and summary tables are likewise keyed on their
# corrected id columns.
adata_pb.obs['perturbed_gene_name_corrected'] = newname_geneid_withNTC.loc[adata_pb.obs.perturbed_gene_id_corrected, 'corrected_target_gene_name'].values
adata_pb.obs['perturbed_gene_name_corrected'] = adata_pb.obs['perturbed_gene_name_corrected'].astype('category')
adata_pb.write_h5ad(datadir + '/CD4i_final_merged.DE_pseudobulk_corrected.h5ad')