# Prepare Cell/Tumor Input Features

In [None]:
from __future__ import annotations

import pickle

import pandas as pd
import networkx as nx
import numpy as np
import typing as t

from pathlib import Path

from cdrpy.util.io import read_list
from cdrpy.util.io import read_gmt

In [None]:
# Locations (relative to this notebook) of the source datasets, the generated
# model inputs, and the gene lists used to select features.
dataset_folder, input_folder, genelist_folder = map(
    Path,
    (
        "../../../data/datasets/GDSCv2DepMap",
        "../../../data/inputs/GDSCv2DepMap",
        "../../../data/genelists",
    ),
)

## Generate labels

In [None]:
# Dose-response table for the screen. The disabled exports below show how the
# label files were derived from it: the same id columns plus either `ln_ic50`
# or `auc`, renamed to `label`.
screen_labels_path = dataset_folder / "ScreenDoseResponseLabels.csv"
screen_data = pd.read_csv(screen_labels_path)

# screen_data[["id", "cell_id", "drug_id", "ln_ic50"]].rename(
#     columns={"ln_ic50": "label"}
# ).to_csv(input_folder / "LabelsLogIC50.csv", index=False)

# screen_data[["id", "cell_id", "drug_id", "auc"]].rename(
#     columns={"auc": "label"}
# ).to_csv(input_folder / "LabelsAUC.csv", index=False)

In [None]:
# Gene lists used further down to restrict the omics matrices.

# mini cancer genome (symbols taken from the HGNC_SYMBOL column)
mcg_table = pd.read_csv(genelist_folder / "VermaatMiniCancerGenome.csv")
mcg_genes = set(mcg_table["HGNC_SYMBOL"].to_list())

# Cancer Gene Census (symbols taken from the "Gene Symbol" column)
cgc_table = pd.read_csv(genelist_folder / "CancerGeneCensusFeb062023.csv")
cgc_genes = set(cgc_table["Gene Symbol"].to_list())

# LINCS genes, keeping only rows whose feature space is "landmark"
lincs_table = pd.read_csv("../../../data/raw/LINCS/geneinfo_beta.txt", sep="\t")
is_landmark = lincs_table["feature_space"] == "landmark"
lincs_genes = set(lincs_table.loc[is_landmark, "gene_symbol"])

# union of the members of every gene set in the MSigDB hallmark collection
hallmark_sets = read_gmt("../../../data/raw/MSigDB/h.all.v2023.1.Hs.symbols.gmt")
hallmark_genes = {
    gene for members in hallmark_sets.values() for gene in members
}

In [None]:
# Gene list shipped with the raw DualGCN data, de-duplicated into a set.
dualgcn_folder = Path("../../../data/raw/DualGCN")
dualgcn_genes = set(read_list(dualgcn_folder / "gene_list.txt"))

### Load omics data

In [None]:
# Gene expression matrix; the first CSV column is used as the row index and
# the remaining columns are treated as genes further down.
exp_matrix_path = (
    dataset_folder / "OmicsExpressionProteinCodingGenesTPMLogp1.csv"
)
exp_matrix = pd.read_csv(exp_matrix_path, index_col=0)
exp_matrix.head()

In [None]:
# Copy number matrix; the first CSV column is used as the row index and the
# remaining columns are treated as genes further down.
cnv_matrix_path = dataset_folder / "OmicsCNGeneCNRatioLogp1.csv"
cnv_matrix = pd.read_csv(cnv_matrix_path, index_col=0)

cnv_matrix.head()

In [None]:
# Somatic mutation table in long format.
mut_maf = pd.read_csv(
    dataset_folder / "OmicsSomaticMutationsHarmonizedMAF.csv",
    low_memory=False,
)

# Drop any "chr" text so chromosome labels become e.g. "1" or "X".
mut_maf["Chrom"] = [
    str(chrom).replace("chr", "") for chrom in mut_maf["Chrom"]
]

# Per-mutation identifier of the form "<HugoSymbol>:<Chrom>:<Pos>".
id_parts = mut_maf[["HugoSymbol", "Chrom", "Pos"]].astype(str)
mut_maf["id"] = id_parts.agg(":".join, axis=1)

mut_maf.head()

In [None]:
# Genes available in each omics layer; intersected with the gene lists below.
exp_genes = set(exp_matrix.columns.tolist())
cnv_genes = set(cnv_matrix.columns.tolist())
mut_genes = set(mut_maf["HugoSymbol"].unique().tolist())

## Generate HiDRA features

In [None]:
# Output folder for the HiDRA inputs. `parents=True` also creates
# `input_folder` itself, so a fresh checkout without the inputs directory
# does not fail with FileNotFoundError.
hidra_input_folder = input_folder / "HiDRA"
hidra_input_folder.mkdir(parents=True, exist_ok=True)

In [None]:
import csv

geneset_path = Path("../../../pkg/HiDRA/Training/geneset.gmt")

# Each tab-separated record is read as: field 0 = set name, field 1 = ignored,
# fields 2+ = member genes. Only members present in the expression matrix are
# kept, and sets with no such member are dropped.
#   geneset      -> union of all retained member genes
#   geneset_dict -> set name -> alphabetically sorted retained members
geneset = set()
geneset_dict = {}
with open(geneset_path, "r") as gmt_file:
    for record in csv.reader(gmt_file, delimiter="\t"):
        members = exp_genes.intersection(record[2:])
        if not members:
            continue
        geneset |= members
        geneset_dict[record[0]] = sorted(members)

with open(hidra_input_folder / "MetadataGeneSetDict.pickle", "wb") as fh:
    pickle.dump(geneset_dict, fh)

In [None]:
# Expression restricted to the HiDRA gene-set members, columns in sorted
# order; the gene count is embedded in the output file name.
hidra_gene_order = sorted(geneset)
hidra_exp_matrix = exp_matrix[hidra_gene_order]
n_genes = len(hidra_gene_order)

hidra_exp_path = (
    hidra_input_folder / f"FeatureCellToExpression{n_genes}HiDRAGenes.csv"
)
hidra_exp_matrix.to_csv(hidra_exp_path)

### Generate ScreenDL features

In [None]:
# Output folder for the ScreenDL inputs. `parents=True` also creates
# `input_folder` itself, so a fresh checkout without the inputs directory
# does not fail with FileNotFoundError.
screendl_input_folder = input_folder / "ScreenDL"
screendl_input_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# NOTE: move to a separate ontolearn file for feature generation
from sklearn.preprocessing import OneHotEncoder

# Model metadata, limited to cell lines with expression data and one row per
# SangerModelID.
temp = pd.read_csv("../../../data/raw/DepMap/Model.csv")
temp = temp[temp["SangerModelID"].isin(exp_matrix.index)].drop_duplicates(
    subset="SangerModelID"
)

counts = temp["OncotreeLineage"].value_counts()
# categories = list(counts[counts >=5].index)

# One-hot encode the lineage label of every retained cell line.
enc = OneHotEncoder(sparse_output=False)
x = enc.fit_transform(temp[["OncotreeLineage"]])

# `enc.categories_` is a list with one array per encoded input column. Passing
# the list itself as `columns` makes pandas build a one-level MultiIndex, which
# `to_csv` writes with an extra header row. Use the single array so the file
# gets a plain one-line header.
ct_ohe = pd.DataFrame(
    x, index=temp["SangerModelID"], columns=enc.categories_[0]
)

# Align rows with the expression matrix; this raises KeyError if a cell line
# in `exp_matrix` has no entry in Model.csv.
ct_ohe = ct_ohe.loc[exp_matrix.index]

ct_ohe.to_csv(screendl_input_folder / "FeatureCellToOncotreeLineageOHE.csv")

In [None]:
# Expression restricted to mini-cancer-genome genes that were measured,
# columns in sorted order. The export itself is currently disabled.
screendl_genes = sorted(mcg_genes & exp_genes)
screendl_exp_matrix = exp_matrix[screendl_genes]
n_genes = len(screendl_genes)

# screendl_exp_matrix.to_csv(
#     screendl_input_folder
#     / f"FeatureCellToExpression{n_genes}MCGGenesTPMLogp1.csv"
# )

In [None]:
# Expression restricted to the union of the mini-cancer-genome and LINCS
# landmark genes that were measured, columns in sorted order. The export
# itself is currently disabled.
screendl_genes_lincs = sorted((mcg_genes | lincs_genes) & exp_genes)
screendl_exp_matrix = exp_matrix[screendl_genes_lincs]
n_genes = len(screendl_genes_lincs)

# screendl_exp_matrix.to_csv(
#     screendl_input_folder
#     / f"FeatureCellToExpression{n_genes}MCGAndLINCSGenesTPMLogp1.csv"
# )

In [None]:
# Expression restricted to hallmark genes that were measured, columns in
# sorted order; the gene count is embedded in the output file name.
screendl_genes_hallmark = sorted(hallmark_genes & exp_genes)
screendl_exp_matrix = exp_matrix[screendl_genes_hallmark]
n_genes = len(screendl_genes_hallmark)

hallmark_exp_path = (
    screendl_input_folder
    / f"FeatureCellToExpression{n_genes}HallmarkGenesTPMLogp1.csv"
)
screendl_exp_matrix.to_csv(hallmark_exp_path)

In [None]:
# Mutation records that fall in Cancer Gene Census genes. The result is an
# independent copy, so the flag columns added to it later do not touch
# `mut_maf`.
genes = cgc_genes & mut_genes
in_cgc = mut_maf["HugoSymbol"].isin(genes)
mut_maf_cgc = mut_maf.loc[in_cgc].copy()
mut_maf_cgc.head()

In [None]:
# mut_maf_cgc_likely_driver = mut_maf_cgc[
#     (mut_maf_cgc["LikelyDriver"] == True) | (mut_maf_cgc["Driver"] == True)
# ]
# (
#     mut_maf_cgc_likely_driver.groupby(["SangerModelID", "HugoSymbol"])
#     .size()
#     .unstack()
#     .fillna(0)
#     .clip(upper=1)
#     .astype(int)
# ).sum(axis=1).mean()

In [None]:
# mut_maf_cgc_del = mut_maf_cgc[mut_maf_cgc["LoF"] == True]
# mut_maf_cgc_del = mut_maf_cgc[
#     (mut_maf_cgc["CCLEDeleterious"] == True) | (mut_maf_cgc["LoF"] == True)
# ]
# mut_mat = (
#     mut_maf_cgc_del.groupby(["SangerModelID", "HugoSymbol"])
#     .size()
#     .unstack()
#     .fillna(0)
#     .clip(upper=1)
#     .astype(int)
# )

# n_genes = mut_mat.shape[1]

# mut_mat.to_csv(
#     screendl_input_folder
#     / f"FeatureCellToSomaticMutationsGeneEncoded{n_genes}CGCGenes.csv"
# )

In [None]:
# An alternative encoding would give every gene two binary columns, one for
#   loss-of-function and one for driver mutations, and concatenate them
#   (so twice as many columns)

In [None]:
# Display the columns available in the mutation table.
mut_maf.columns

In [None]:
# FIXME: DRIVERS MAY NOT BE CGC GENES - I should do the driver selection
#   independently of the CGC gene list

# Combined flags, reused by later cells. NOTE(review): `+` acts as an
# element-wise OR only if the four source columns are boolean - confirm.
mut_maf_cgc["LoFFlag"] = mut_maf_cgc["CCLEDeleterious"] + mut_maf_cgc["LoF"]
mut_maf_cgc["DriverFlag"] = mut_maf_cgc["Driver"] + mut_maf_cgc["LikelyDriver"]

# Cell line x gene matrix: 1 = driver, -1 = loss of function, 0 = neither.
genes = sorted(mut_maf_cgc["HugoSymbol"].unique())
samples = exp_matrix.index
n_genes = len(genes)
n_samples = len(samples)
mut_matrix = pd.DataFrame(
    np.zeros((n_samples, n_genes), dtype=int), index=samples, columns=genes
)

# When a cell line has several records for one gene, the last record carrying
# a flag decides the value (driver wins within a single record).
known_samples = set(samples)
for row in mut_maf_cgc.itertuples():
    if row.SangerModelID not in known_samples:
        # `.loc` assignment with an unknown label would append a NaN-filled
        # row, and NaN != 0 would then inflate the counts below.
        continue
    if row.DriverFlag is True:
        mut_matrix.loc[row.SangerModelID, row.HugoSymbol] = 1
    elif row.LoFFlag is True:
        mut_matrix.loc[row.SangerModelID, row.HugoSymbol] = -1

counts = (mut_matrix != 0).sum()
# restrict to genes mutated in at least 5 cell lines
keep_genes = counts[counts >= 5].index
mut_matrix = mut_matrix[sorted(keep_genes)]

# The number of retained genes is embedded in the output file name.
n_genes = mut_matrix.shape[1]
mut_matrix.to_csv(
    screendl_input_folder
    / f"FeatureCellToSomaticMutationsGeneEncoded{n_genes}CGCGenes.csv"
)

In [None]:
# Binary cell line x gene matrix of driver mutations (1 = at least one record
# with DriverFlag set), with one row per cell line in `samples`.
mut_maf_cgc_driver = mut_maf_cgc[mut_maf_cgc["DriverFlag"] == True]
genes = sorted(mut_maf_cgc_driver["HugoSymbol"].unique())
n_genes = len(genes)

# Count records per (cell line, gene), cap at 1, then align to the expression
# samples. `reindex` adds all-zero rows for samples without driver mutations
# and drops cell lines that are not in `samples`; a per-row `.loc` assignment
# would silently append such cell lines as extra rows.
mut_matrix_driver = (
    mut_maf_cgc_driver.groupby(["SangerModelID", "HugoSymbol"])
    .size()
    .unstack(fill_value=0)
    .clip(upper=1)
    .astype(int)
    .reindex(index=samples, columns=genes, fill_value=0)
)

mut_matrix_driver.columns = [f"{c}_driver" for c in mut_matrix_driver.columns]

In [None]:
# Binary cell line x gene matrix of loss-of-function mutations (1 = at least
# one record with LoFFlag set), with one row per cell line in `samples`.
mut_maf_cgc_lof = mut_maf_cgc[mut_maf_cgc["LoFFlag"] == True]
genes = sorted(mut_maf_cgc_lof["HugoSymbol"].unique())
n_genes = len(genes)

# Count records per (cell line, gene), cap at 1, then align to the expression
# samples. `reindex` adds all-zero rows for samples without such mutations and
# drops cell lines that are not in `samples`; a per-row `.loc` assignment
# would silently append such cell lines as extra rows.
mut_matrix_lof = (
    mut_maf_cgc_lof.groupby(["SangerModelID", "HugoSymbol"])
    .size()
    .unstack(fill_value=0)
    .clip(upper=1)
    .astype(int)
    .reindex(index=samples, columns=genes, fill_value=0)
)

mut_matrix_lof.columns = [f"{c}_lof" for c in mut_matrix_lof.columns]

In [None]:
# Type-encoded mutation features: the driver and loss-of-function columns
# side by side, keeping only features set in at least 5 cell lines.
mut_matrix_concat = pd.concat([mut_matrix_driver, mut_matrix_lof], axis=1)
counts = mut_matrix_concat.sum()
keep_cols = counts[counts >= 5].index

mut_matrix_concat = mut_matrix_concat[keep_cols]

# Write the concatenated matrix. Previously the gene-encoded `mut_matrix` was
# written here, so the file contents did not match its name or feature count.
n_features = mut_matrix_concat.shape[1]
mut_matrix_concat.to_csv(
    screendl_input_folder
    / f"FeatureCellToSomaticMutationsTypeEncoded{n_features}CGCFeatures.csv"
)

In [None]:
# Copy number restricted to Cancer Gene Census genes that were measured,
# columns in sorted order.
screendl_genes = sorted(cgc_genes & cnv_genes)

screendl_cnv_matrix = cnv_matrix[screendl_genes]

n_genes = len(screendl_genes)

# The ".csv" suffix was missing, unlike every other feature export here.
screendl_cnv_matrix.to_csv(
    screendl_input_folder
    / f"FeatureCellToCopyNumber{n_genes}CGCGenesCNRatioLogp1.csv"
)

### Generate DeepCDR features

In [None]:
# Output folder for the DeepCDR inputs. `parents=True` also creates
# `input_folder` itself, so a fresh checkout without the inputs directory
# does not fail with FileNotFoundError.
deepcdr_input_folder = input_folder / "DeepCDR"
deepcdr_input_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# DeepCDR expression features.
# The gene list published with DeepCDR (disabled below) is replaced by the
# Cancer Gene Census genes.

# deepcdr_exp_path = "https://raw.githubusercontent.com/kimmo1019/DeepCDR/master/data/CCLE/genomic_expression_561celllines_697genes_demap_features.csv"
# deepcdr_exp_genes = pd.read_csv(
#     deepcdr_exp_path, index_col=0, nrows=1
# ).columns.to_list()
# deepcdr_exp_genes = set(deepcdr_exp_genes)
deepcdr_exp_genes = cgc_genes

# keep only genes that were measured, in sorted order
deepcdr_exp_genes_filtered = sorted(deepcdr_exp_genes & exp_genes)
n_genes = len(deepcdr_exp_genes_filtered)
print(n_genes)

deepcdr_exp_matrix = exp_matrix[deepcdr_exp_genes_filtered]

# deepcdr_exp_matrix.to_csv(
#     deepcdr_input_folder
#     / f"FeatureCellToExpression{n_genes}CGCGenesTPMLogp1.csv"
# )

In [None]:
# DeepCDR mutation features: a binary cell line x mutation-position matrix
# with columns ordered by chromosome and then position.
# The gene list published with DeepCDR (disabled below) is replaced by the
# Cancer Gene Census genes.

# deepcdr_mut_path = "https://raw.githubusercontent.com/kimmo1019/DeepCDR/master/data/CCLE/genomic_mutation_34673_demap_features.csv"
# deepcdr_mut_genes = (
#     pd.read_csv(deepcdr_mut_path, index_col=0, nrows=2)
#     .columns.map(lambda c: c.split(".")[0])
#     .unique()
#     .tolist()
# )
# deepcdr_mut_genes = set(deepcdr_mut_genes)
deepcdr_mut_genes = cgc_genes
deepcdr_mut_genes_filtered = deepcdr_mut_genes & mut_genes
print(len(deepcdr_mut_genes_filtered))

in_gene_list = mut_maf["HugoSymbol"].isin(deepcdr_mut_genes)
deepcdr_mut_maf = mut_maf[in_gene_list]

# lazy check to make sure we didn't lose any cell lines
old_cells = mut_maf["SangerModelID"].unique().tolist()
new_cells = deepcdr_mut_maf["SangerModelID"].unique().tolist()
assert set(old_cells) == set(new_cells)

# 1 where a cell line has at least one record with that mutation id
deepcdr_mut_matrix = (
    deepcdr_mut_maf.groupby(["SangerModelID", "id"])
    .size()
    .unstack(fill_value=0)
    .clip(upper=1)
    .astype(int)
)
n_genes = len(deepcdr_mut_genes_filtered)

# Split each "<gene>:<chrom>:<pos>" column id into its parts for sorting.
position_ids = deepcdr_mut_matrix.columns.to_list()
id_fields = [str(pid).split(":") for pid in position_ids]
positions = pd.DataFrame(
    {
        "pos": position_ids,
        "gene": [fields[0] for fields in id_fields],
        "chr": [fields[1] for fields in id_fields],
        "start": [int(fields[2]) for fields in id_fields],
    }
)

# Numeric chromosome rank: 1-22, then X and Y; any other label maps to NaN.
mapping = {**{str(i): i for i in range(1, 23)}, "X": 23, "Y": 24}
positions["chr"] = positions["chr"].map(mapping)

sorted_positions = positions.sort_values(["chr", "start"])["pos"].to_list()

deepcdr_mut_matrix = deepcdr_mut_matrix[sorted_positions]

# deepcdr_mut_matrix.to_csv(
#     deepcdr_input_folder
#     / f"FeatureCellToSomaticMutationsPositionEncoded{n_genes}CGCGenesAll.csv"
# )

In [None]:
# Display the position-encoded mutation matrix for inspection.
deepcdr_mut_matrix

### Generate DualGCN features

In [None]:
# Output folder for the DualGCN inputs. `parents=True` also creates
# `input_folder` itself, so a fresh checkout without the inputs directory
# does not fail with FileNotFoundError.
dualgcn_input_folder = input_folder / "DualGCN"
dualgcn_input_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# PPI edge list: the first two tab-separated columns of a header-less file,
# named gene_1 / gene_2.
dualgcn_ppi = pd.read_csv(
    dualgcn_folder / "PPI_network.txt",
    sep="\t",
    header=None,
    usecols=[0, 1],
    names=["gene_1", "gene_2"],
)

# Genes that occur in BOTH edge columns.
# NOTE(review): a gene listed on only one side of its edges is excluded here;
#   confirm the intersection (rather than the union) is intended.
first_column_genes = set(dualgcn_ppi["gene_1"].to_list())
second_column_genes = set(dualgcn_ppi["gene_2"].to_list())
dualgcn_ppi_genes = first_column_genes & second_column_genes

dualgcn_ppi.head(n=2)

In [None]:
# Genes usable for DualGCN: on the DualGCN list, in the PPI gene set, and
# present in both the expression and copy number matrices (sorted order).
dualgcn_genes_filtered = sorted(
    dualgcn_genes & dualgcn_ppi_genes & exp_genes & cnv_genes
)
n_genes = len(dualgcn_genes_filtered)

dualgcn_exp_matrix = exp_matrix[dualgcn_genes_filtered]
dualgcn_cnv_matrix = cnv_matrix[dualgcn_genes_filtered]

# keep only edges whose two endpoints both survived the filter
source_kept = dualgcn_ppi["gene_1"].isin(dualgcn_genes_filtered)
target_kept = dualgcn_ppi["gene_2"].isin(dualgcn_genes_filtered)
dualgcn_ppi = dualgcn_ppi[source_kept & target_kept]

# dualgcn_exp_matrix.to_csv(
#     dualgcn_input_folder
#     / f"FeatureCellToExpression{n_genes}DualGCNGenesTPMLogp1.csv"
# )

# dualgcn_cnv_matrix.to_csv(
#     dualgcn_input_folder
#     / f"FeatureCellToCopyNumber{n_genes}DualGCNGenesCNRatioLogp1.csv"
# )

# dualgcn_ppi.to_csv(
#     dualgcn_input_folder / "MetadataPPIEdgeList.csv",
#     index=False,
# )

### Generate mutation features

In [None]:
# Somatic mutation table in long format, read again from disk into its own
# variable for the generic mutation features below.
mut_data_maf = pd.read_csv(
    dataset_folder / "OmicsSomaticMutationsHarmonizedMAF.csv",
    low_memory=False,
)

# Drop any "chr" text so chromosome labels become e.g. "1" or "X".
mut_data_maf["Chrom"] = [
    str(chrom).replace("chr", "") for chrom in mut_data_maf["Chrom"]
]

# Per-mutation identifier of the form "<HugoSymbol>:<Chrom>:<Pos>".
maf_id_parts = mut_data_maf[["HugoSymbol", "Chrom", "Pos"]].astype(str)
mut_data_maf["id"] = maf_id_parts.agg(":".join, axis=1)

mut_data_maf.head()

In [None]:
# positional encoding of mutations for convolutional models
is_cgc_record = mut_data_maf["HugoSymbol"].isin(cgc_genes)
mut_data_cgc_genes = mut_data_maf[is_cgc_record]

# lazy check to make sure we didn't lose any cell lines
old_cell_ids = mut_data_maf["SangerModelID"].unique().tolist()
new_cell_ids = mut_data_cgc_genes["SangerModelID"].unique().tolist()
assert set(old_cell_ids) == set(new_cell_ids)

# 1 where a cell line has at least one record with that mutation id
# (this rebinds `mut_matrix`)
per_position_counts = mut_data_cgc_genes.groupby(["SangerModelID", "id"]).size()
mut_matrix = (
    per_position_counts.unstack(fill_value=0).clip(upper=1).astype(int)
)

n_genes = mut_data_cgc_genes["HugoSymbol"].nunique()

# mut_matrix.to_csv(
#     input_folder
#     / f"CellToSomaticMutationsPositionEncoded{n_genes}CGCGenesAll.csv"
# )

In [None]:
# all mutations encoded by gene: 1 where a cell line has at least one record
# in that gene, 0 otherwise
per_gene_counts = mut_data_maf.groupby(["SangerModelID", "HugoSymbol"]).size()
mut_matrix_all = per_gene_counts.unstack(fill_value=0).clip(upper=1).astype(int)

mut_matrix_all.head()

In [None]:
# gene encoded matrix CGC genes: keep the CGC columns that exist in the
# all-gene matrix and order them alphabetically
mut_matrix_all_cgc = mut_matrix_all.filter(
    items=cgc_genes, axis=1
).sort_index(axis=1)

n_genes = len(mut_matrix_all_cgc.columns)

# mut_matrix_all_cgc.to_csv(
#     input_folder / f"CellToSomaticMutationsGeneEncoded{n_genes}CGCGenesAll.csv"
# )

In [None]:
# Gene-encoded matrix built only from records flagged CCLEDeleterious:
# 1 where a cell line has at least one such record in the gene.
deleterious_records = mut_data_maf[mut_data_maf["CCLEDeleterious"] == True]
deleterious_counts = deleterious_records.groupby(
    ["SangerModelID", "HugoSymbol"]
).size()
mut_matrix_del = (
    deleterious_counts.unstack(fill_value=0).clip(upper=1).astype(int)
)

mut_matrix_del

In [None]:
# mut_data_cgc_genes[
#     (mut_data_cgc_genes["CCLEDeleterious"] == True)
#     | (mut_data_cgc_genes["HessDriver"] == True)
# ].shape