In [1]:
import json
from pathlib import Path

import pandas as pd

# Variables

In [2]:
# Input files: transcript FASTA and the gene-set collection (JSON).
json_geneset_fname = './data/c2.cp.kegg_medicus.v2023.2.Hs.json'
fasta_file_path = './data/genome/gencode.v38.transcripts.fa'

# Systematic names of the gene sets to extract from the JSON collection.
target_gene_sets = [
    "M47362", "M47365", "M47412", "M47419", "M47422",
    "M47425", "M47450", "M47452", "M47478", "M47501",
]

# Destination of the tab-separated table: gene set / target gene / FASTA index.
out_indexed_gset = './output/gene_sets_with_indices.tsv'


# Extract target genes with their matching index in the FASTA file

In [3]:
# Load the gene-set collection: a JSON object mapping gene-set name -> info dict.
# JSON text is UTF-8, so do not depend on the platform's default encoding.
with open(json_geneset_fname, 'r', encoding='utf-8') as json_file:
    gene_sets = json.load(json_file)

# Collect (systematic name, gene symbol) pairs for the requested gene sets,
# in the order they appear in the JSON file.
target_ids = set(target_gene_sets)
gene_set_data = [
    (gene_set_info["systematicName"], gene)
    for gene_set_info in gene_sets.values()
    if gene_set_info["systematicName"] in target_ids
    for gene in gene_set_info["geneSymbols"]
]

# Report requested gene sets that the JSON file does not contain; otherwise a
# mistyped or outdated ID would be dropped without any notice.
available_ids = {gene_set_info["systematicName"] for gene_set_info in gene_sets.values()}
missing_gene_sets = [set_id for set_id in target_gene_sets if set_id not in available_ids]
if missing_gene_sets:
    print(f"Target gene sets not found in JSON file: {', '.join(missing_gene_sets)}")

# Map each gene name to the 1-based position of the first FASTA record
# (header line starting with '>') that carries it.
# The gene name is read from the 6th '|'-separated header field; this assumes
# GENCODE-style transcript headers -- TODO confirm before using another FASTA source.
gene_index_map = {}
with open(fasta_file_path, 'r') as fasta:
    header_lines = (line for line in fasta if line.startswith('>'))
    for record_index, header in enumerate(header_lines, start=1):
        gene_name = header.strip().split('|')[5]
        # setdefault keeps the index of the first record seen for this gene
        gene_index_map.setdefault(gene_name, record_index)

# Attach the FASTA index to every (gene set, gene) pair that has one
output_data = [
    (gene_set_name, gene, gene_index_map[gene])
    for gene_set_name, gene in gene_set_data
    if gene in gene_index_map
]

# Genes without a FASTA record, each listed once even when several target
# gene sets share the gene (dict.fromkeys preserves first-seen order).
genes_not_found = list(dict.fromkeys(
    gene for _set_id, gene in gene_set_data if gene not in gene_index_map
))

# Names of genes not found in the FASTA file
if genes_not_found:
    _go = ", ".join(genes_not_found)
    print(f"Genes not found in FASTA file:{_go}")
else:
    print('All genes were found in the reference Fasta file.')

# Output table: gene set - target gene - index in FASTA
output_df = pd.DataFrame(output_data, columns=['gene_set', 'target_gene', 'index'])
output_df

All genes were found in the reference Fasta file.


Unnamed: 0,gene_set,target_gene,index
0,M47362,ARAF,230520
1,M47362,BRAF,94630
2,M47362,CCND1,130954
3,M47362,EGF,58870
4,M47362,EGFR,88559
...,...,...,...
110,M47501,PIK3CA,50464
111,M47501,PIK3CB,48219
112,M47501,PIK3CD,1490
113,M47501,RPS6KB1,192243


In [4]:
# Sanity check: smallest and largest FASTA index assigned to a target gene
output_df.loc[:, 'index'].agg(['min', 'max'])

min      1490
max    235809
Name: index, dtype: int64

### Filter out shared genes

TODO: find another way to handle genes shared between gene sets

In [5]:
# Keep only the first occurrence of each target gene; any later row that
# repeats an already-seen gene (i.e. a gene shared with another set) is dropped.
is_repeated_gene = output_df.duplicated(subset='target_gene', keep='first')
df_filtered = output_df[~is_repeated_gene]

### Save the resulting table

In [6]:

# Create the output directory first: to_csv does not create missing parent
# directories, so the export would fail on a fresh checkout without ./output/.
Path(out_indexed_gset).parent.mkdir(parents=True, exist_ok=True)

# Tab-separated export; the DataFrame's row index is not written to the file.
df_filtered.to_csv(out_indexed_gset, sep='\t', index=False)