In [None]:
import pandas as pd

## Gene expression

In [None]:
# Build the gene expression feature table.
# Step 1: read the tab-separated expression file and key every row by its
# "display_label", discarding the annotation columns that are not needed.
gene_exp = (
    pd.read_csv(
        "gene_expression/beataml_waves1to4_norm_exp_dbgap.txt",
        sep="\t",
        index_col=0,
    )
    .reset_index()
    .drop(columns=["stable_id", "description", "biotype"])
    .set_index("display_label")
)
# Step 2: transpose so the former column labels become rows, then expose them
# as a regular column named "dbgap_rnaseq_sample" (presumably sample ids --
# verify against the input file).
gene_exp_processed = (
    gene_exp.transpose()
    .reset_index()
    .rename(columns={"index": "dbgap_rnaseq_sample"})
)
# Preview the top-left corner only; the full frame is too wide to display.
gene_exp_processed.iloc[:5, :5]

In [None]:
# Load the "sample_map" sheet of the sample mapping workbook and preview it.
mapping = pd.read_excel(
    "beataml_waves1to4_sample_mapping.xlsx",
    sheet_name="sample_map",
)
mapping.head()

In [None]:
# Attach the subject id to every expression row via the RNA-seq sample id.
sample_to_subject = mapping[["dbgap_subject_id", "dbgap_rnaseq_sample"]]
gene_exp_processed = (
    gene_exp_processed
    # Drop a subject-id column left over from a previous run of this cell so
    # that re-running does not create "dbgap_subject_id_x"/"_y" duplicates
    # (which would make the value_counts() below raise a KeyError).
    .drop(columns="dbgap_subject_id", errors="ignore")
    # Left join: keep every expression row, even those without a mapping entry.
    .merge(sample_to_subject, on="dbgap_rnaseq_sample", how="left")
)
# Number of expression rows per subject.
gene_exp_processed["dbgap_subject_id"].value_counts()

In [None]:
# Small excerpt for quick inspection: index rows by (subject, sample) and keep
# only the first ten remaining columns.
sample_index_cols = ["dbgap_subject_id", "dbgap_rnaseq_sample"]
gene_exp_processed_small = gene_exp_processed.set_index(sample_index_cols).iloc[:, :10]

In [None]:
# Create the cell line names file.
# NOTE(review): `data` and `df_map` are not defined in any cell shown here;
# they must already exist in the kernel for this cell to run.
response_data = (
    data.merge(df_map, left_on="inhibitor", right_on="drug_name_beataml", how="left")
    # The values are copied unchanged from "ic50" -- no log transform is applied
    # here. NOTE(review): confirm "ic50" is already on the scale "LN_IC50" implies.
    .assign(LN_IC50=lambda frame: frame["ic50"])
    .drop(columns=["ic50", "drug_name_beataml", "status", "paper_inclusion"])
    # The subject id (as a string) serves as the cell line name.
    .assign(cell_line_name=lambda frame: frame["dbgap_subject_id"].astype(str))
)

# One row per distinct cell line name. `.assign` builds a new frame, so adding
# the constant columns does not write into a slice of `response_data` (which
# would trigger pandas' SettingWithCopyWarning).
cell_line_names = response_data.drop_duplicates("cell_line_name")[["cell_line_name"]].assign(
    Cellosaurus_id=pd.NA,
    tissue="Blood",
)
cell_line_names.to_csv("cell_line_names.csv", index=False)

In [None]:
# Create the drug names file.
# NOTE(review): `df_map` is not defined in any cell shown here; it must
# already exist in the kernel for this cell to run.
# Take an explicit copy so the column assignment below modifies an independent
# frame instead of a slice of `df_map` (avoids SettingWithCopyWarning).
drug_names = df_map[["drug_name_beataml", "drug_name", "pubchem_id"]].copy()

# Treat empty and single-space ids as missing ...
pubchem_ids = drug_names["pubchem_id"].replace("", pd.NA).replace(" ", pd.NA)
# ... and fall back to the drug name wherever no PubChem id is available.
drug_names["pubchem_id"] = pubchem_ids.fillna(drug_names["drug_name"])

drug_names.to_csv("drug_names.csv", index=False)

In [None]:
#create fingerprints
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

# Input table for the fingerprint computation; the columns used further down
# are "pubchem_id" and "canonical_smiles".
drug_smiles = pd.read_csv("drug_smiles.csv")

# Work on an independent copy so the loaded table itself stays untouched.
df = drug_smiles.copy(deep=True)

# Directory that receives one CSV per fingerprint size; created if missing.
out_dir = "drug_fingerprints"
os.makedirs(out_dir, exist_ok=True)

def morgan_bits(smiles, n_bits, radius=2):
    """Return the Morgan fingerprint of a SMILES string as a 1-D numpy array.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule. Any non-string value (for
        example ``None`` or the float NaN that pandas uses for an empty CSV
        field) is treated as an invalid molecule.
    n_bits : int
        Length of the fingerprint bit vector.
    radius : int, default 2
        Radius passed to the Morgan fingerprint routine.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``n_bits`` holding the fingerprint bits, or an
        all-NaN float array of the same length when ``smiles`` is missing or
        cannot be parsed, so every result has the same shape.
    """
    # Missing SMILES arrive as NaN/None; RDKit rejects non-string input, so
    # short-circuit to the same placeholder used for unparsable strings.
    if not isinstance(smiles, str):
        return np.full(n_bits, np.nan)
    m = Chem.MolFromSmiles(smiles)
    if m is None:
        return np.full(n_bits, np.nan)  # keep shape even if invalid
    # NOTE(review): recent RDKit releases appear to flag this function as
    # deprecated in favour of rdFingerprintGenerator -- confirm against the
    # installed version before migrating.
    fp = AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=n_bits)
    arr = np.zeros((n_bits,), dtype=int)
    # Fills `arr` in place from the RDKit bit vector.
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

# Fingerprint lengths to export; one CSV file is written per length.
sizes = [64, 128, 256, 512, 1024, 2048]

for n in sizes:
    print(f"computing {n}-bit fingerprints")
    # One fingerprint vector per drug, keyed by its pubchem_id.
    fp_matrix = {
        pid: morgan_bits(smi, n)
        for pid, smi in zip(df["pubchem_id"], df["canonical_smiles"])
    }
    # Columns are the pubchem_ids, rows are the bit positions 0..n-1.
    fp_df = pd.DataFrame(fp_matrix)
    fp_df.to_csv(f"{out_dir}/pubchem_id_to_demorgan_{n}_map.csv", index=False)


In [None]:
# Rename genes in the gene expression file to the corrected symbols.
ge = pd.read_csv("gene_expression.csv", index_col=0)
genes = list(ge.columns)
# Mapping from the symbol found in the file to the symbol it is replaced with.
new_gene_names = {
    "AARS": "AARS1",
    "EPRS": "EPRS1",
    "FAM57A": "TLCD3A",
    "FAM69A": "DIPK1A",
    "H2AFV": "H2AZ2",
    "HIST1H2BK": "H2BC12",
    "HIST2H2BE": "H2BC21",
    "KIAA0100": "BLTP2",
    "KIAA0355": "GARRE1",
    "NARFL": "CIAO3",
    "PAPD7": "TENT4A",
    "SKIV2L": "SKIC2",
    "TSTA3": "GFUS",
    "WDR61": "SKIC8",
    "WRB": "GET1",
}
# Columns without an entry in the mapping keep their current name.
new_genes = [new_gene_names.get(g, g) for g in genes]
ge.columns = new_genes
landmark_genes = pd.read_csv("gene_lists/landmark_genes.csv", index_col=0)
# Sanity check: landmark symbols that are still absent after renaming. A bare
# expression in the middle of a cell is never displayed, so keep the result in
# a variable and print it explicitly.
missing_landmark_genes = sorted(set(landmark_genes["Symbol"]) - set(new_genes), key=str)
print(f"{len(missing_landmark_genes)} landmark genes missing after renaming: {missing_landmark_genes}")
# Save again, overwriting the input file, with a named index column.
ge.index.name = "cell_line_name"
ge.to_csv("gene_expression.csv", index=True)

In [None]:
# correct pubchem ids in BeatAML2.csv
import pandas as pd

d = pd.read_csv("BeatAML2.csv")
# Ids are cast to strings and a trailing ".0" is stripped. The pattern is
# anchored to the end of the string so that a ".0" occurring elsewhere in an
# id (for example inside a drug name used as id) is left intact.
# NOTE(review): astype(str) also turns missing ids into the literal "nan".
d["pubchem_id"] = d["pubchem_id"].astype(str).str.replace(r"\.0$", "", regex=True)

# For these two drugs the drug name itself is used as the identifier.
d.loc[d["drug_name"] == "MEK1/2 Inhibitor", "pubchem_id"] = "MEK1/2 Inhibitor"
d.loc[d["drug_name"] == "JQ12", "pubchem_id"] = "JQ12"

# Overwrite the input file with the corrected ids.
d.to_csv("BeatAML2.csv", index=False)
