# Prepare Cell/Tumor Input Features

In [1]:
from __future__ import annotations

import pickle

import pandas as pd
import networkx as nx
import numpy as np
import typing as t

from pathlib import Path

from cdrpy.util.io import read_list

In [2]:
# Directory layout, relative to this notebook's working directory.
# Source tables that the cells below read from:
dataset_folder = Path("../../../data/datasets/GDSCv2DepMap")
# Destination root for the per-model feature folders created below:
input_folder = Path("../../../data/inputs/GDSCv2DepMap")
# Gene-list CSVs used to select feature columns:
genelist_folder = Path("../../../data/genelists")

## Generate labels

In [3]:
# Dose-response screen table. The two label exports below are kept disabled;
# each would write the id/cell_id/drug_id columns plus one response column
# renamed to "label".
screen_labels_path = dataset_folder / "ScreenDoseResponseLabels.csv"
screen_data = pd.read_csv(screen_labels_path)

# screen_data[["id", "cell_id", "drug_id", "ln_ic50"]].rename(
#     columns={"ln_ic50": "label"}
# ).to_csv(input_folder / "LabelsLogIC50.csv", index=False)

# screen_data[["id", "cell_id", "drug_id", "auc"]].rename(
#     columns={"auc": "label"}
# ).to_csv(input_folder / "LabelsAUC.csv", index=False)

In [4]:
# load gene lists
# Each list is read into its own table first so that the `*_genes` names are
# only ever bound to a set of gene symbols.
mcg_table = pd.read_csv(genelist_folder / "VermaatMiniCancerGenome.csv")
mcg_genes = set(mcg_table["HGNC_SYMBOL"].tolist())

cgc_table = pd.read_csv(genelist_folder / "CancerGeneCensusFeb062023.csv")
cgc_genes = set(cgc_table["Gene Symbol"].tolist())

In [5]:
# Gene list shipped with the raw DualGCN data, held as a set for the
# intersections performed further down.
dualgcn_folder = Path("../../../data/raw/DualGCN")
dualgcn_genes = set(read_list(dualgcn_folder / "gene_list.txt"))

### Load omics data

In [6]:
# load gene expression data
# The first CSV column becomes the row index; the remaining columns are
# gene symbols (see the preview below).
exp_matrix_path = (
    dataset_folder / "OmicsExpressionProteinCodingGenesTPMLogp1.csv"
)
exp_matrix = pd.read_csv(exp_matrix_path, index_col=0)
exp_matrix.head()

Unnamed: 0_level_0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIDM00046,3.636915,0.042644,0.321928,0.084064,0.028569,3.049631,0.028569,5.273889,4.166715,0.201634,...,3.958843,6.162693,0.773996,1.731183,4.032982,0.443607,2.684819,6.743623,4.218781,3.65306
SIDM00078,5.130107,0.014355,0.15056,3.759156,0.056584,5.973382,0.0,5.499527,4.775051,0.790772,...,3.709291,5.216843,1.443607,3.090853,4.078097,1.422233,2.333424,6.576976,3.486714,2.691534
SIDM00079,5.152995,0.028569,0.111031,0.097611,0.0,4.328406,0.042644,5.972693,3.885574,0.111031,...,4.898208,6.947082,1.090853,2.157044,4.101818,2.659925,3.275007,6.530757,3.179511,3.477677
SIDM00080,4.175525,0.042644,6.417009,0.124328,0.0,0.555816,0.097611,5.590961,3.532317,0.0,...,4.373648,6.251151,0.757023,1.803227,3.493135,0.014355,2.495695,5.28244,3.845992,4.080658
SIDM00081,4.82273,0.0,6.120601,0.028569,0.042644,2.778209,0.176323,4.913129,3.283922,0.056584,...,4.051372,6.105175,0.895303,1.922198,4.047015,0.963474,2.803227,5.291309,3.544733,4.852498


In [7]:
# load copy number data
# Same layout as the expression matrix: first CSV column is the row index,
# remaining columns are gene symbols.
cnv_matrix_path = dataset_folder / "OmicsCNGeneCNRatioLogp1.csv"
cnv_matrix = pd.read_csv(cnv_matrix_path, index_col=0)

cnv_matrix.head()

Unnamed: 0_level_0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIDM00046,1.291901,1.291901,0.832412,1.177027,1.177027,1.177027,1.177027,0.960124,0.946083,1.220375,...,0.873702,0.832412,0.716176,0.716176,1.1416,0.960124,0.960124,1.565393,1.180616,0.960124
SIDM00078,1.764619,1.764619,0.901516,0.661356,0.661356,0.661356,0.661356,0.975263,1.093212,1.221159,...,0.992034,0.901516,0.597052,0.597052,1.221159,0.975263,0.975263,0.97417,0.90028,0.649486
SIDM00079,1.005852,1.005852,1.064016,0.630855,0.630855,0.630855,0.630855,1.015066,1.01761,1.059105,...,1.017607,1.064016,1.034069,1.034069,1.032002,1.026443,1.026443,0.626533,0.041083,0.653868
SIDM00080,0.806532,0.806532,0.836938,1.044198,1.044198,1.044198,1.044198,0.913477,1.136541,1.118773,...,1.022343,0.901621,0.749775,0.749775,1.118773,0.913477,0.913477,1.372159,0.990136,0.997781
SIDM00081,1.206992,1.206992,0.801166,0.85875,0.85875,0.85875,0.85875,1.159133,1.325867,1.377363,...,0.858489,0.801166,0.562601,0.562601,1.377363,1.159133,1.159133,1.205385,1.018584,1.210574


In [8]:
# load mutation data
mut_maf_path = dataset_folder / "OmicsSomaticMutationsHarmonizedMAF.csv"
mut_maf = pd.read_csv(mut_maf_path, low_memory=False)

# Normalise chromosome labels by removing every literal "chr" substring
# (e.g. "chr1" -> "1"); values are stringified first.
mut_maf["Chrom"] = (
    mut_maf["Chrom"].map(str).str.replace("chr", "", regex=False)
)

# Per-row key of the form "<HugoSymbol>:<Chrom>:<Pos>".
id_parts = mut_maf[["HugoSymbol", "Chrom", "Pos"]].astype(str)
mut_maf["id"] = id_parts.agg(":".join, axis=1)

mut_maf.head()

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,SangerModelID,id
0,1,999959,C,T,0.556,11,14,0/1,,SNP,...,,,,,,,ACH-000943,57801.0,SIDM01090,HES4:1:999959
1,1,1046480,G,GC,0.467,25,21,0/1,,INS,...,,,,,,,ACH-000943,375790.0,SIDM01090,AGRN:1:1046480
2,1,1048197,A,G,0.571,21,27,0/1,,SNP,...,,,,,,,ACH-000943,375790.0,SIDM01090,AGRN:1:1048197
3,1,1196686,C,A,0.453,28,23,0/1,,SNP,...,,,,,,,ACH-000943,254173.0,SIDM01090,TTLL10:1:1196686
4,1,1313091,C,T,0.577,22,29,0/1,,SNP,...,,,,,,,ACH-000943,54973.0,SIDM01090,INTS11:1:1313091


In [9]:
# Gene symbols available in each omics table, as sets for intersection with
# the gene lists loaded above.
exp_genes = set(exp_matrix.columns.tolist())
cnv_genes = set(cnv_matrix.columns.tolist())
mut_genes = set(mut_maf["HugoSymbol"].unique().tolist())

### Generate ScreenDL features

In [10]:
# Output directory for the ScreenDL feature files.
# parents=True: create `input_folder` (and any missing ancestors) as well, so
# a fresh checkout without the inputs directory does not raise
# FileNotFoundError. exist_ok=True keeps re-running the cell harmless.
screendl_input_folder = input_folder / "ScreenDL"
screendl_input_folder.mkdir(parents=True, exist_ok=True)

In [11]:
# ScreenDL expression features: genes from the mini cancer genome list that
# are present as expression columns, in sorted order.
screendl_genes = sorted(mcg_genes.intersection(exp_genes))

screendl_exp_matrix = exp_matrix.loc[:, screendl_genes]

# gene count is embedded in the (currently disabled) output file name
n_genes = len(screendl_genes)

# screendl_exp_matrix.to_csv(
#     screendl_input_folder
#     / f"FeatureCellToExpression{n_genes}MCGGenesTPMLogp1.csv"
# )

### Generate DeepCDR features

In [12]:
# Output directory for the DeepCDR feature files.
# parents=True: create `input_folder` (and any missing ancestors) as well, so
# a fresh checkout without the inputs directory does not raise
# FileNotFoundError. exist_ok=True keeps re-running the cell harmless.
deepcdr_input_folder = input_folder / "DeepCDR"
deepcdr_input_folder.mkdir(parents=True, exist_ok=True)

In [13]:
# generate expression features

# The original DeepCDR gene list lookup is kept for reference but disabled;
# the Cancer Gene Census set is used in its place.
# deepcdr_exp_path = "https://raw.githubusercontent.com/kimmo1019/DeepCDR/master/data/CCLE/genomic_expression_561celllines_697genes_demap_features.csv"
# deepcdr_exp_genes = pd.read_csv(
#     deepcdr_exp_path, index_col=0, nrows=1
# ).columns.to_list()
# deepcdr_exp_genes = set(deepcdr_exp_genes)
deepcdr_exp_genes = cgc_genes

# keep only genes that exist as expression columns, in sorted order
deepcdr_exp_genes_filtered = sorted(
    deepcdr_exp_genes.intersection(exp_genes)
)
n_genes = len(deepcdr_exp_genes_filtered)
print(n_genes)

deepcdr_exp_matrix = exp_matrix.loc[:, deepcdr_exp_genes_filtered]

# deepcdr_exp_matrix.to_csv(
#     deepcdr_input_folder
#     / f"FeatureCellToExpression{n_genes}CGCGenesTPMLogp1.csv"
# )

717


In [26]:
# generate mutation features
#
# Builds a binary (cell line x mutation id) matrix restricted to Cancer Gene
# Census genes, orders its columns by chromosome and position, and writes it
# to the DeepCDR input folder.
# NOTE(review): this cell's execution count (26) is out of sequence with its
# neighbours, and unlike the other exports in this notebook the to_csv call
# here is live -- confirm both are intended.

# deepcdr_mut_path = "https://raw.githubusercontent.com/kimmo1019/DeepCDR/master/data/CCLE/genomic_mutation_34673_demap_features.csv"
# deepcdr_mut_genes = (
#     pd.read_csv(deepcdr_mut_path, index_col=0, nrows=2)
#     .columns.map(lambda c: c.split(".")[0])
#     .unique()
#     .tolist()
# )
# deepcdr_mut_genes = set(deepcdr_mut_genes)
deepcdr_mut_genes = cgc_genes
deepcdr_mut_genes_filtered = deepcdr_mut_genes.intersection(mut_genes)
n_genes = len(deepcdr_mut_genes_filtered)
print(n_genes)

deepcdr_mut_maf = mut_maf.loc[mut_maf["HugoSymbol"].isin(deepcdr_mut_genes)]

# lazy check to make sure we didn't lose any cell lines
old_cells = mut_maf["SangerModelID"].unique().tolist()
new_cells = deepcdr_mut_maf["SangerModelID"].unique().tolist()
assert set(old_cells) == set(new_cells)

# Count rows per (cell line, mutation id), pivot ids into columns, then
# binarise: absent pairs become 0 and repeated pairs are capped at 1.
deepcdr_mut_counts = deepcdr_mut_maf.groupby(["SangerModelID", "id"]).size()
deepcdr_mut_matrix = (
    deepcdr_mut_counts.unstack().fillna(0).clip(upper=1).astype(int)
)

# Split each "<gene>:<chr>:<start>" column label back into its fields.
positions = pd.DataFrame({"pos": deepcdr_mut_matrix.columns.to_list()})
id_fields = positions["pos"].map(str).str.split(":")
positions["gene"] = id_fields.str[0]
positions["chr"] = id_fields.str[1]
positions["start"] = id_fields.str[2].map(int)

# Numeric chromosome rank: "1".."22" -> 1..22, "X" -> 23, "Y" -> 24.
# Labels missing from this mapping become NaN, which sort_values places last
# by default.
mapping = {**{str(i): i for i in range(1, 23)}, "X": 23, "Y": 24}
positions["chr"] = positions["chr"].map(mapping)

sorted_positions = positions.sort_values(by=["chr", "start"])["pos"].to_list()

deepcdr_mut_matrix = deepcdr_mut_matrix.loc[:, sorted_positions]

deepcdr_mut_matrix.to_csv(
    deepcdr_input_folder
    / f"FeatureCellToSomaticMutationsPositionEncoded{n_genes}CGCGenesAll.csv"
)

716


### Generate DualGCN features

In [None]:
# Output directory for the DualGCN feature files.
# parents=True: create `input_folder` (and any missing ancestors) as well, so
# a fresh checkout without the inputs directory does not raise
# FileNotFoundError. exist_ok=True keeps re-running the cell harmless.
dualgcn_input_folder = input_folder / "DualGCN"
dualgcn_input_folder.mkdir(parents=True, exist_ok=True)

In [13]:
# PPI edge list: header-less, tab-separated file; only the first two columns
# are read and they are named gene_1 / gene_2.
dualgcn_ppi = pd.read_csv(
    dualgcn_folder / "PPI_network.txt",
    sep="\t",
    header=None,
    usecols=[0, 1],
    names=["gene_1", "gene_2"],
)

# Genes that occur in BOTH endpoint columns.
# NOTE(review): a gene that only ever appears in one column is excluded here;
# confirm that intersection (rather than union) is intended.
ppi_gene_1_set = set(dualgcn_ppi["gene_1"].tolist())
ppi_gene_2_set = set(dualgcn_ppi["gene_2"].tolist())
dualgcn_ppi_genes = ppi_gene_1_set.intersection(ppi_gene_2_set)

dualgcn_ppi.head(n=2)

Unnamed: 0,gene_1,gene_2
0,HOXA11,CDKN2A
1,HOXA11,JUN


In [None]:
# Genes shared by the DualGCN gene list, the PPI gene set, and both omics
# matrices, in sorted order.
dualgcn_genes_filtered = sorted(
    dualgcn_genes.intersection(dualgcn_ppi_genes, exp_genes, cnv_genes)
)
n_genes = len(dualgcn_genes_filtered)

dualgcn_exp_matrix = exp_matrix.loc[:, dualgcn_genes_filtered]
dualgcn_cnv_matrix = cnv_matrix.loc[:, dualgcn_genes_filtered]

# keep only edges whose two endpoints both survived the filter
edge_endpoints_kept = (
    dualgcn_ppi[["gene_1", "gene_2"]].isin(dualgcn_genes_filtered).all(axis=1)
)
dualgcn_ppi = dualgcn_ppi[edge_endpoints_kept]

# dualgcn_exp_matrix.to_csv(
#     dualgcn_input_folder
#     / f"FeatureCellToExpression{n_genes}DualGCNGenesTPMLogp1.csv"
# )

# dualgcn_cnv_matrix.to_csv(
#     dualgcn_input_folder
#     / f"FeatureCellToCopyNumber{n_genes}DualGCNGenesCNRatioLogp1.csv"
# )

# dualgcn_ppi.to_csv(
#     dualgcn_input_folder / "MetadataPPIEdgeList.csv",
#     index=False,
# )

### Generate mutation features

In [None]:
# load mutation data
# NOTE(review): this reads the same file and applies the same two
# transformations as the earlier cell that builds `mut_maf`.
mut_data_maf_path = dataset_folder / "OmicsSomaticMutationsHarmonizedMAF.csv"
mut_data_maf = pd.read_csv(mut_data_maf_path, low_memory=False)

# Normalise chromosome labels by removing every literal "chr" substring;
# values are stringified first.
mut_data_maf["Chrom"] = (
    mut_data_maf["Chrom"].map(str).str.replace("chr", "", regex=False)
)

# Per-row key of the form "<HugoSymbol>:<Chrom>:<Pos>".
mut_id_parts = mut_data_maf[["HugoSymbol", "Chrom", "Pos"]].astype(str)
mut_data_maf["id"] = mut_id_parts.agg(":".join, axis=1)

mut_data_maf.head()

In [None]:
# positional encoding of mutations for convolutional models
# Rows restricted to Cancer Gene Census genes.
mut_data_cgc_genes = mut_data_maf.loc[
    mut_data_maf["HugoSymbol"].isin(cgc_genes)
]

# lazy check to make sure we didn't lose any cell lines
old_cell_ids = mut_data_maf["SangerModelID"].unique().tolist()
new_cell_ids = mut_data_cgc_genes["SangerModelID"].unique().tolist()
assert set(old_cell_ids) == set(new_cell_ids)

# Count rows per (cell line, mutation id), pivot ids into columns, then
# binarise: absent pairs become 0 and repeated pairs are capped at 1.
mut_id_counts = mut_data_cgc_genes.groupby(["SangerModelID", "id"]).size()
mut_matrix = mut_id_counts.unstack().fillna(0).clip(upper=1).astype(int)

# number of distinct gene symbols left after the CGC filter
n_genes = mut_data_cgc_genes["HugoSymbol"].nunique()

# mut_matrix.to_csv(
#     input_folder
#     / f"CellToSomaticMutationsPositionEncoded{n_genes}CGCGenesAll.csv"
# )

In [None]:
# all mutations encoded by gene
# Binary (cell line x gene) matrix: 1 when the cell line has at least one
# mutation row for that gene symbol, else 0.
mut_gene_counts = mut_data_maf.groupby(["SangerModelID", "HugoSymbol"]).size()
mut_matrix_all = (
    mut_gene_counts.unstack().fillna(0).clip(upper=1).astype(int)
)

mut_matrix_all.head()

In [None]:
# gene encoded matrix CGC genes
# Keep the CGC gene columns that exist in the matrix, then order the columns
# by gene symbol.
mut_matrix_all_cgc = mut_matrix_all.filter(
    items=cgc_genes, axis="columns"
).sort_index(axis="columns")

# column count is embedded in the (currently disabled) output file name
n_genes = len(mut_matrix_all_cgc.columns)

# mut_matrix_all_cgc.to_csv(
#     input_folder / f"CellToSomaticMutationsGeneEncoded{n_genes}CGCGenesAll.csv"
# )

In [None]:
# Binary (cell line x gene) matrix built only from rows whose CCLEDeleterious
# value equals True.
deleterious_rows = mut_data_maf.loc[mut_data_maf["CCLEDeleterious"].eq(True)]
deleterious_gene_counts = deleterious_rows.groupby(
    ["SangerModelID", "HugoSymbol"]
).size()
mut_matrix_del = (
    deleterious_gene_counts.unstack().fillna(0).clip(upper=1).astype(int)
)

mut_matrix_del

In [None]:
# mut_data_cgc_genes[
#     (mut_data_cgc_genes["CCLEDeleterious"] == True)
#     | (mut_data_cgc_genes["HessDriver"] == True)
# ].shape