In [18]:
import pandas as pd
from tqdm import tqdm

def read_excel_sheets_to_dict(file_path):
    sheets_dict = pd.read_excel(file_path, sheet_name=None)
    return sheets_dict

dataframes_dict = read_excel_sheets_to_dict('../data/raw/260115_impm_signatures.xlsx')

In [19]:
import mygene

def ensembl_to_entrez_symbol(ensembl_id):
    mg = mygene.MyGeneInfo()
    result = mg.query(ensembl_id, scopes='ensembl.gene', fields='symbol', species='human')
    if result and 'hits' in result and result['hits']:
        return result['hits'][0].get('symbol')
    return None

all_ensembl_ids = set()
for df in dataframes_dict.values():
    all_ensembl_ids.update(df['ensembl_id'].dropna().unique())


all_ensembl_ids = list(all_ensembl_ids)

ensembl_to_gene = {}
for ensembl_id in tqdm(all_ensembl_ids, desc="Mapping Ensembl IDs to Gene Symbols"):
    ensembl_to_gene[ensembl_id] = ensembl_to_entrez_symbol(ensembl_id)

Mapping Ensembl IDs to Gene Symbols:   0%|          | 0/104 [00:00<?, ?it/s]Input sequence provided is already in string format. No operation performed
Mapping Ensembl IDs to Gene Symbols:   1%|          | 1/104 [00:00<01:32,  1.11it/s]Input sequence provided is already in string format. No operation performed
Mapping Ensembl IDs to Gene Symbols:   2%|▏         | 2/104 [00:01<01:29,  1.14it/s]Input sequence provided is already in string format. No operation performed
Mapping Ensembl IDs to Gene Symbols:   3%|▎         | 3/104 [00:02<01:32,  1.10it/s]Input sequence provided is already in string format. No operation performed
Mapping Ensembl IDs to Gene Symbols:   4%|▍         | 4/104 [00:03<01:37,  1.02it/s]Input sequence provided is already in string format. No operation performed
Mapping Ensembl IDs to Gene Symbols:   5%|▍         | 5/104 [00:04<01:40,  1.01s/it]Input sequence provided is already in string format. No operation performed
Mapping Ensembl IDs to Gene Symbols:   6%|▌     

In [23]:
for sheet_name, df in dataframes_dict.items():
    columns = df.columns.tolist()
    df['gene_symbol'] = df['ensembl_id'].map(ensembl_to_gene)
    df = df[['ensembl_id', 'gene_symbol'] + [col for col in columns if col != 'ensembl_id']]
    df.to_csv(f"../data/processed/00_signatures/{sheet_name}.csv", index=False)