# Curate sgRNA library metadata

In [52]:
import pandas as pd
import numpy as np
import genomic_features as gf

## -- Read Replogle 2022 library gene names + IDs to settle multimapping -- ##

# Only the (gene, gene_id) pairs are used downstream, so parse just those two
# columns instead of the whole obs table.
replogle_library_genes = pd.read_csv(
    '../../../../data/wesvae-data/K562_gwps_raw_singlecell_01.obs.csv',
    usecols=['gene', 'gene_id'],
)[['gene', 'gene_id']].drop_duplicates()

## -- Get deprecated HGNC gene names -- ##
# Download the HGNC complete set (carries alias and previous symbols per gene).
# NOTE(review): this path lives under `out_of_date_hgnc`, so it may be a frozen
# snapshot rather than the current release -- confirm this is the intended file.
url = "https://ftp.ebi.ac.uk/pub/databases/genenames/out_of_date_hgnc/tsv/hgnc_complete_set.txt"
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which otherwise leaves mixed-type columns.
hgnc_complete = pd.read_csv(url, sep='\t', low_memory=False)

## -- Get Ensembl gene reference -- ##

ensdb = gf.ensembl.annotation(species="Hsapiens", version="110") # matching reference in cellranger v9.0.0
genes = ensdb.genes()
# Filter biotypes
keep_biotype = ['protein_coding', 'lncRNA', 'IG_V_pseudogene',
       'TR_V_pseudogene', 'IG_V_gene', 'snRNA',
       'miRNA', 'snoRNA', 'rRNA_pseudogene', 'rRNA',
       'TR_V_gene', 'Mt_tRNA', 'Mt_rRNA', 'IG_C_gene', 'IG_J_gene',
       'TR_J_gene', 'TR_C_gene', 'TR_J_pseudogene', 'IG_D_gene',
       'ribozyme', 'IG_C_pseudogene', 'TR_D_gene', 
       'IG_J_pseudogene', 'scRNA', 'scaRNA',
       'sRNA', 'IG_pseudogene']
filt_genes = genes[genes['gene_biotype'].isin(keep_biotype)]
# Filter chromosomes: autosomes 1-22 plus the sex chromosomes.
# NOTE(review): 'MT' is not in this list, so the Mt_tRNA / Mt_rRNA biotypes kept
# above are presumably removed here -- confirm that is intended.
keep_chromosomes = [str(x) for x in range(1, 23)] + ['X', 'Y']
# .copy() so the column assignment below writes to an independent frame rather
# than to a slice of `genes`.
filt_genes = filt_genes[filt_genes['seq_name'].isin(keep_chromosomes)].copy()
# Use one source: take the first token after the last '[' in the description
# and drop records whose token is 'Source:NCBI'.
filt_genes['source'] = filt_genes['description'].str.split('[').str[-1].str.split(' ').str[0]
filt_genes = filt_genes[filt_genes['source'] != 'Source:NCBI'].copy()

## -- Annotate gene IDs for targeted genes -- ##

# Load the raw sgRNA library metadata (first CSV column becomes the index)
sgrna_library_metadata = pd.read_csv('../../metadata/LRZ002RHSgRNA.csv', index_col=0)

# Undo the Excel date auto-formatting that mangled the JUN guide names
excel_mangled_names = {'1-Jun': 'JUN-1', '2-Jun': 'JUN-2'}
for mangled_name, restored_name in excel_mangled_names.items():
    sgrna_library_metadata['name'] = np.where(
        sgrna_library_metadata['name'] == mangled_name,
        restored_name,
        sgrna_library_metadata['name'],
    )

# The perturbed gene is the guide name minus its last '-'-separated token
guide_name_tokens = sgrna_library_metadata['name'].str.split('-')
sgrna_library_metadata['perturbed_gene_name'] = guide_name_tokens.str[0:-1].str.join('-')

sgrna_library_metadata = sgrna_library_metadata.rename(columns={'name': 'sgrna_id'})

  hgnc_complete = pd.read_csv(url, sep='\t')


In [53]:
# Left-join the Ensembl annotations onto the guides via the gene symbol
ensembl_annotation_cols = ['perturbed_gene_name', 'gene_id', 'gene_biotype', 'description', 'source']
sgrna_library_metadata = pd.merge(
    sgrna_library_metadata,
    filt_genes.rename(columns={'gene_name': 'perturbed_gene_name'})[ensembl_annotation_cols],
    how='left',
)

## -- Missing gene IDs -- ##
# Targeting guides (i.e. not NTC / ProbeNTC) that still have no gene_id
lacks_id = sgrna_library_metadata['gene_id'].isna()
is_control = sgrna_library_metadata['perturbed_gene_name'].isin(['NTC', 'ProbeNTC'])
missing_gene_id = sgrna_library_metadata.loc[lacks_id & ~is_control, 'perturbed_gene_name'].unique()
print(len(missing_gene_id))

# Settle missing with ID in Replogle2022
replogle_hits = replogle_library_genes.loc[replogle_library_genes['gene'].isin(missing_gene_id)]
missing_replogle2022 = replogle_hits.set_index('gene')['gene_id'].to_dict()
found_in_replogle = sgrna_library_metadata['perturbed_gene_name'].isin(list(missing_replogle2022))
sgrna_library_metadata.loc[found_in_replogle, 'gene_id'] = (
    sgrna_library_metadata.loc[found_in_replogle, 'perturbed_gene_name']
    .map(missing_replogle2022)
    .to_numpy()
)

# Recount what is still unresolved after the Replogle lookup
lacks_id = sgrna_library_metadata['gene_id'].isna()
is_control = sgrna_library_metadata['perturbed_gene_name'].isin(['NTC', 'ProbeNTC'])
missing_gene_id = sgrna_library_metadata.loc[lacks_id & ~is_control, 'perturbed_gene_name'].unique()
print(len(missing_gene_id))


325
152


In [54]:
# Search gene names in renamed genes from HGNC
# Columns retained from the HGNC hits, in output order
keep_cols = [
    'perturbed_gene_name',
    'hgnc_id',
    'symbol',
    'alias_symbol',
    'alias_name',
    'prev_symbol',
    'prev_name',
    'entrez_id',
    'ensembl_gene_id',
    'vega_id',
    'ucsc_id',
]

def check_aliases(alias_str, target_genes):
    """Tell whether any '|'-separated entry of `alias_str` is in `target_genes`.

    Parameters
    ----------
    alias_str : str or missing value
        Pipe-delimited symbols; a missing value (per `pd.isna`) yields False.
    target_genes : container
        Anything supporting the `in` operator.

    Returns
    -------
    bool
    """
    if pd.isna(alias_str):
        return False
    for candidate in str(alias_str).split('|'):
        if candidate in target_genes:
            return True
    return False

def find_matching_query_gene(row, missing_gene_id):
    """Return the queried gene name that an HGNC record matches, if any.

    Lookup order: the record's 'symbol', then the '|'-separated entries of
    'alias_symbol', then those of 'prev_symbol'. The first hit wins.

    Parameters
    ----------
    row : mapping
        Must provide 'symbol', 'alias_symbol' and 'prev_symbol'.
    missing_gene_id : container
        Queried gene names; anything supporting the `in` operator.

    Returns
    -------
    The matching name, or None when nothing matches.
    """
    current_symbol = row['symbol']
    if current_symbol in missing_gene_id:
        return current_symbol

    # Aliases are consulted before previous symbols
    for field in ('alias_symbol', 'prev_symbol'):
        raw_value = row[field]
        if pd.isna(raw_value):
            continue
        for candidate in str(raw_value).split('|'):
            if candidate in missing_gene_id:
                return candidate

    return None

# Flag HGNC records whose current, alias, or previous symbol is one of the
# gene names that are still unresolved.
symbol_mask = hgnc_complete['symbol'].isin(missing_gene_id)
alias_mask = hgnc_complete['alias_symbol'].apply(lambda x: check_aliases(x, missing_gene_id))
prev_mask = hgnc_complete['prev_symbol'].apply(lambda x: check_aliases(x, missing_gene_id))

# Combine masks and filter
matching_genes = hgnc_complete[symbol_mask | alias_mask | prev_mask].copy()

# Record which queried name each HGNC record was matched through. A plain list
# is built row by row so this also works when there are no hits at all.
matching_genes['perturbed_gene_name'] = [
    find_matching_query_gene(hgnc_row, missing_gene_id)
    for _, hgnc_row in matching_genes.iterrows()
]

# Select final columns
matching_genes = matching_genes[keep_cols]

print(f"Found {len(matching_genes)} matching genes")

# Only hits that actually carry an Ensembl ID can resolve a gene. Dropping the
# others (and exact repeats) before merging avoids adding duplicate guide rows
# with an empty ID, which would later be overwritten by the gene-name fallback
# even when another hit for the same gene supplied a valid ID.
hgnc_gene_ids = (
    matching_genes[['perturbed_gene_name', 'ensembl_gene_id']]
    .dropna()
    .drop_duplicates()
)
sgrna_library_metadata = sgrna_library_metadata.merge(
    hgnc_gene_ids,
    on='perturbed_gene_name',
    how='left'
)

# Fill gene_id from the HGNC-derived Ensembl ID only where it is still null,
# then drop the temporary column.
sgrna_library_metadata['gene_id'] = sgrna_library_metadata['gene_id'].fillna(
    sgrna_library_metadata['ensembl_gene_id']
)
sgrna_library_metadata = sgrna_library_metadata.drop(columns='ensembl_gene_id')

# Targeting guides (not NTC / ProbeNTC) that remain without an ID
still_missing = sgrna_library_metadata['gene_id'].isna() & (
    ~sgrna_library_metadata['perturbed_gene_name'].isin(['NTC', 'ProbeNTC'])
)
missing_gene_id = sgrna_library_metadata.loc[still_missing, 'perturbed_gene_name'].unique()
print(len(missing_gene_id))
# Just use gene name for highly deprecated genes; only rows that truly lack an
# ID are touched.
sgrna_library_metadata['gene_id'] = np.where(
    still_missing,
    sgrna_library_metadata['perturbed_gene_name'],
    sgrna_library_metadata['gene_id'],
)

Found 149 matching genes
7


In [55]:
# Check double mappings: gene names paired with more than one distinct gene_id
mappings = sgrna_library_metadata[['perturbed_gene_name', 'gene_id']].drop_duplicates()
n_mapped_ids = mappings.groupby('perturbed_gene_name').size()
multimapping_genes = n_mapped_ids.index[n_mapped_ids > 1].tolist()
print(multimapping_genes)

# Settle multimappers with ID in Replogle2022
multimappers_replogle2022 = replogle_library_genes[replogle_library_genes['gene'].isin(multimapping_genes)].set_index('gene').to_dict()['gene_id']
# Compare each row against the Replogle ID of *its own* gene (not against the
# pooled set of all Replogle IDs).
replogle_id = sgrna_library_metadata['perturbed_gene_name'].map(multimappers_replogle2022)
agrees_with_replogle = sgrna_library_metadata['gene_id'] == replogle_id
# A gene is only settled when at least one of its candidate rows carries the
# Replogle ID; otherwise all of its rows are kept so no sgRNA is dropped.
settleable_genes = set(sgrna_library_metadata.loc[agrees_with_replogle, 'perturbed_gene_name'])
conflicting_rows = (
    sgrna_library_metadata['perturbed_gene_name'].isin(settleable_genes)
    & ~agrees_with_replogle
)
sgrna_library_metadata = sgrna_library_metadata[~conflicting_rows]

mappings = sgrna_library_metadata[['perturbed_gene_name', 'gene_id']].drop_duplicates()
n_mapped_ids = mappings.groupby('perturbed_gene_name').size()
multimapping_genes = n_mapped_ids.index[n_mapped_ids > 1].tolist()
print(f'Remaining multi-mapping genes: {multimapping_genes}')

# Keep first entry for remaining multimapping genes.
# NOTE: GroupBy.first() takes the first non-null value per column, sorts the
# result by sgrna_id, and reset_index() replaces the previous index.
sgrna_library_metadata = sgrna_library_metadata.groupby('sgrna_id').first().reset_index()

['AKAP17A', 'ASMTL', 'CD99', 'CRLF2', 'CSF2RA', 'GTPBP6', 'IL3RA', 'IL9R', 'MUM1', 'P2RY8', 'PLCXD1', 'POLR2J3', 'PPP2R3B', 'SEPT2', 'SHOX', 'SLC25A6', 'VAMP7', 'ZBED1']
Remaining multi-mapping genes: ['CRLF2', 'CSF2RA', 'IL3RA', 'IL9R', 'MUM1', 'P2RY8', 'SEPT2']


Test output

In [56]:
# Re-read the raw sgRNA library metadata to compare against the curated table
raw_sgrna_library_metadata = pd.read_csv('../../metadata/LRZ002RHSgRNA.csv', index_col=0)

# Distinct (gene name, gene ID) pairs for targeting guides; controls carry no ID
is_control = sgrna_library_metadata['perturbed_gene_name'].isin(['NTC', 'ProbeNTC'])
gene_mappings = sgrna_library_metadata.loc[~is_control, ['perturbed_gene_name', 'gene_id']].drop_duplicates()

# Missing IDs are NaN in pandas, so test with isna() rather than `is None`
assert not gene_mappings['gene_id'].isna().any(), 'Missing IDs'
# After de-duplicating the pairs, a repeated gene name means it maps to >1 ID
assert not gene_mappings['perturbed_gene_name'].duplicated().any(), 'Duplicate ID'

raw_sgrnas = raw_sgrna_library_metadata['name'].unique()
post_sgrna = sgrna_library_metadata['sgrna_id'].unique()
assert len(raw_sgrnas) == len(post_sgrna), 'Lost some sgRNA'
# Each guide must appear on exactly one row of the curated table
assert sgrna_library_metadata['sgrna_id'].is_unique, 'Duplicate sgRNA rows'

In [60]:
# Save: strip the helper/annotation columns, expose the ID under its final
# name, and write the curated table to disk
sgrna_library_metadata = (
    sgrna_library_metadata
    .drop(columns=['description', 'source', 'gene_biotype', 'read', 'pattern', 'feature_type'])
    .rename(columns={"gene_id": 'perturbed_gene_id'})
)
sgrna_library_metadata.to_csv('../../metadata/sgRNA_library_curated.csv')