# Build a harmonized dataset for model benchmarking studies.

For the first paper, it is okay to use the minimal dataset. For the next paper,
I should create a larger dataset just for training my model and for interpretability; for comparisons, I will use only the subset that is shared across them.

GO BACK TO JUST USING THE NORMALIZATION LAYER SINCE THIS CAN BE INTEGRATED INTO THE MODEL

Harmonize DrugCell

1. consider using the DepMap dataset instead for DeepCDR compatibility
2. get lists of drug targets and SMILES strings


1. reorganize - datasets folder (current inputs folder)
   1. inputs folder - subfolder for each model with their inputs - each model has subfolder for each dataset with inputs
2. refactor screendl model
   1. ScreenDLModelWrapper, build_screendl_model, ScreenDL

In [None]:
from __future__ import annotations

import pandas as pd
import numpy as np

from pathlib import Path

In [None]:
# Input locations, given relative to the notebook's working directory.
gdsc_folder = Path("../../../data") / "processed" / "GDSC"
depmap_folder = Path("../../../data") / "raw" / "DepMap"
genelist_folder = Path("../../../data") / "genelists"

In [None]:
# Folder that the harmonized tables produced by this notebook are written to.
dataset_folder = Path("../../../data/datasets/GDSCv2DepMap")
# parents=True also creates any missing intermediate folders (for example when
# "data/datasets" does not exist yet); exist_ok=True keeps re-running this
# cell harmless once the folder is there.
dataset_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the drug annotation table and the fitted dose-response table.
drug_info = pd.read_csv(gdsc_folder / "DrugAnnotations.csv")
screen_data = pd.read_csv(gdsc_folder / "ScreenFittedDoseResponse.csv")

# Keep only rows whose "dataset" column equals "GDSC2", renumbering from 0.
screen_data = screen_data.loc[screen_data["dataset"].eq("GDSC2")].reset_index(
    drop=True
)
screen_data.head()

In [None]:
# Read the DepMap model annotation table (Model.csv) and preview it.
model_info = pd.read_csv(depmap_folder.joinpath("Model.csv"))
model_info.head()

In [None]:
# Build a dict from DepMap model ID to Sanger model ID, using only the
# annotation rows where both IDs are present.
depmap_to_sanger_id = (
    model_info[["ModelID", "SangerModelID"]]
    .dropna()
    .set_index("ModelID")["SangerModelID"]
    .to_dict()
)

In [None]:
# Read the DepMap expression matrix; the first CSV column becomes the row index.
exp_data = pd.read_csv(
    depmap_folder / "OmicsExpressionProteinCodingGenesTPMLogp1.csv", index_col=0
)

# Drop rows whose DepMap model ID has no Sanger counterpart, then relabel the
# remaining rows with their Sanger model ID.
exp_data = exp_data.loc[exp_data.index.isin(depmap_to_sanger_id)]
exp_data.index = exp_data.index.map(depmap_to_sanger_id)

# Trim every column label to the text before its first space, then put the
# columns and the rows in sorted order for convenience.
exp_data.columns = [str(label).partition(" ")[0] for label in exp_data.columns]
exp_data = exp_data[sorted(exp_data.columns)]
exp_data = exp_data.sort_index()

exp_data.head()

In [None]:
# Read the DepMap somatic mutation table.
mut_data = pd.read_csv(depmap_folder / "OmicsSomaticMutations.csv", low_memory=False)

# Attach the Sanger model ID for each row's DepMap model ID and discard rows
# for which no Sanger ID is known.
mut_data = mut_data.assign(
    SangerModelID=mut_data["ModelID"].map(depmap_to_sanger_id)
).dropna(subset=["SangerModelID"])

mut_data.head()

In [None]:
# Read the DepMap gene-level copy number matrix, indexed by its first column.
cn_data = pd.read_csv(depmap_folder.joinpath("OmicsCNGene.csv"), index_col=0)

# Restrict to models that have a Sanger ID and use that ID as the row label.
has_sanger_id = cn_data.index.isin(depmap_to_sanger_id)
cn_data = cn_data.loc[has_sanger_id]
cn_data.index = cn_data.index.map(depmap_to_sanger_id)

# Keep only the part of each column label before the first space, then sort
# columns and rows for convenience.
cn_data.columns = [str(label).partition(" ")[0] for label in cn_data.columns]
cn_data = cn_data[sorted(cn_data.columns)]
cn_data = cn_data.sort_index()

cn_data.head()

In [None]:
# Read the DepMap methylation matrix, indexed by its first column.
methyl_data = pd.read_csv(
    depmap_folder / "Methylation_(1kb_upstream_TSS).csv", index_col=0
)

# Restrict to models that have a Sanger ID and use that ID as the row label.
methyl_data = methyl_data.loc[methyl_data.index.isin(depmap_to_sanger_id)]
methyl_data.index = methyl_data.index.map(depmap_to_sanger_id)

# Keep only the part of each column label before the first underscore.
# NOTE(review): different columns may end up sharing a label after trimming;
# confirm that is acceptable for downstream use.
methyl_data.columns = [str(label).partition("_")[0] for label in methyl_data.columns]

methyl_data.head()

## Harmonize Data Modalities

In [None]:
# Collect the set of cell line IDs available in each data source.
mut_cell_ids = set(mut_data["SangerModelID"].unique().tolist())
exp_cell_ids = set(exp_data.index)
cn_cell_ids = set(cn_data.index)
methyl_cell_ids = set(methyl_data.index)
screen_cell_ids = set(screen_data["model_id"].unique().tolist())

# Cell lines present in the mutation, expression, copy number and screening data.
# NOTE(review): methyl_cell_ids is built above but is not part of this
# intersection; confirm whether methylation coverage should be required.
common_cell_ids = mut_cell_ids & exp_cell_ids & cn_cell_ids & screen_cell_ids

In [None]:
# Harmonize the screening data: keep the response columns for the shared cell
# lines only.
screen_data_common = screen_data.loc[
    screen_data["model_id"].isin(common_cell_ids),
    ["model_id", "canonical_smiles", "auc", "ln_ic50"],
]

# Order by cell line, then by drug SMILES, before switching to the generic
# cell_id / drug_id column names and renumbering the rows from 0.
screen_data_common = screen_data_common.sort_values(["model_id", "canonical_smiles"])
screen_data_common = screen_data_common.rename(
    columns={"model_id": "cell_id", "canonical_smiles": "drug_id"}
)
screen_data_common = screen_data_common.reset_index(drop=True)

In [None]:
# Give every row a running integer id (0, 1, 2, ...) in its current order.
screen_data_common["id"] = range(len(screen_data_common))

# Write the labels without the DataFrame index.
screen_data_common.to_csv(dataset_folder / "ScreenDoseResponseLabels.csv", index=False)

In [None]:
# Harmonize the omics data: restrict each table to the shared cell lines.

# Mutations are stored one row per record, so filter on the ID column.
mut_data_common = mut_data.loc[
    mut_data["SangerModelID"].isin(common_cell_ids)
].reset_index(drop=True)

# Expression and copy number are indexed by cell line ID: select the shared
# rows by label and sort them by ID.
# NOTE(review): if a cell line ID occurs more than once in an index, .loc
# returns every matching row; confirm the indexes are unique.
exp_data_common, cn_data_common = (
    table.loc[list(common_cell_ids)].sort_index() for table in (exp_data, cn_data)
)

In [None]:
# Write the filtered mutation table without the DataFrame index.
mut_data_common.to_csv(
    dataset_folder.joinpath("OmicsSomaticMutationsHarmonizedMAF.csv"), index=False
)

In [None]:
# Name the row index "model_id" so it is written as a labelled first column.
exp_data_common = exp_data_common.rename_axis("model_id", axis="index")
exp_data_common.to_csv(dataset_folder / "OmicsExpressionProteinCodingGenesTPMLogp1.csv")

In [None]:
# Name the row index "model_id" so it is written as a labelled first column.
cn_data_common = cn_data_common.rename_axis("model_id", axis="index")
cn_data_common.to_csv(dataset_folder / "OmicsCNGeneCNRatioLogp1.csv")

In [None]:
# Annotation columns to export, mapped to their output (snake_case) names.
col_mapping = {
    "SangerModelID": "model_id",
    "OncotreeCode": "oncotree_code",
    "OncotreePrimaryDisease": "oncotree_primary_disease",
    "OncotreeLineage": "oncotree_lineage",
}

# Take those columns for the shared cell lines, keep the first annotation row
# per Sanger model ID, and apply the output names.
model_info_common = (
    model_info.loc[
        model_info["SangerModelID"].isin(common_cell_ids), list(col_mapping)
    ]
    .drop_duplicates(subset="SangerModelID")
    .rename(columns=col_mapping)
)

model_info_common.to_csv(
    dataset_folder.joinpath("CellLineOncotreeAnnotations.csv"), index=False
)

In [None]:
# Distinct drug identifiers (SMILES strings) left in the harmonized screen data.
common_smiles = screen_data_common["drug_id"].unique()

# Keep the annotation rows for those drugs and write them without the index.
drug_info_common = drug_info.loc[drug_info["canonical_smiles"].isin(common_smiles)]
drug_info_common.to_csv(dataset_folder.joinpath("DrugAnnotations.csv"), index=False)