# Prepare Cell/Tumor Input Features

In [None]:
from __future__ import annotations

import pickle

import pandas as pd
import networkx as nx
import numpy as np
import typing as t

from pathlib import Path

from cdrpy.util.io import read_list

In [None]:
# Locations used throughout the notebook, all relative to the working
# directory the notebook is executed from.
dataset_folder = Path("../../../data", "datasets", "GDSCv2DepMap")  # inputs read
input_folder = Path("../../../data", "inputs", "GDSCv2DepMap")  # outputs written
genelist_folder = Path("../../../data", "genelists")  # curated gene lists

## Generate labels

In [None]:
screen_data = pd.read_csv(dataset_folder / "ScreenDoseResponseLabels.csv")

# Write one label file per response metric. Every file keeps the same three
# key columns and exposes the chosen metric under the common name "label".
for _response_column, _file_name in (
    ("ln_ic50", "LabelsLogIC50.csv"),
    ("auc", "LabelsAUC.csv"),
):
    _label_frame = screen_data[["id", "cell_id", "drug_id", _response_column]]
    _label_frame = _label_frame.rename(columns={_response_column: "label"})
    _label_frame.to_csv(input_folder / _file_name, index=False)

In [None]:
# Load the curated gene lists as sets of gene symbols.
# Each CSV is read in full and only the symbol column is kept.
mcg_genes = set(
    pd.read_csv(genelist_folder.joinpath("VermaatMiniCancerGenome.csv"))[
        "HGNC_SYMBOL"
    ].tolist()
)

cgc_genes = set(
    pd.read_csv(genelist_folder.joinpath("CancerGeneCensusFeb062023.csv"))[
        "Gene Symbol"
    ].tolist()
)

In [None]:
# Raw DualGCN resources; the gene list is kept as a set for fast
# membership tests and set intersections.
# NOTE(review): presumably `read_list` returns one entry per line of the
# file - confirm against cdrpy.util.io.
dualgcn_folder = Path("../../../data", "raw", "DualGCN")
dualgcn_genes = set(read_list(dualgcn_folder.joinpath("gene_list.txt")))

### Load omics data

In [None]:
# Gene expression matrix; the first CSV column becomes the row index.
exp_matrix = pd.read_csv(
    dataset_folder.joinpath("OmicsExpressionProteinCodingGenesTPMLogp1.csv"),
    index_col=0,
)
# Preview the first rows as the cell output.
exp_matrix.head()

In [None]:
# Copy number matrix; the first CSV column becomes the row index.
cnv_matrix = pd.read_csv(
    dataset_folder.joinpath("OmicsCNGeneCNRatioLogp1.csv"),
    index_col=0,
)
# Preview the first rows as the cell output.
cnv_matrix.head()

In [None]:
# Column labels of each omics matrix, as sets, for later intersections.
exp_genes, cnv_genes = (
    set(frame.columns) for frame in (exp_matrix, cnv_matrix)
)

### Generate DualGCN features

In [None]:
# Output directory for all DualGCN-specific feature files.
dualgcn_input_folder = input_folder / "DualGCN"
# parents=True: also create `input_folder` (and any missing ancestors) so the
# cell does not fail with FileNotFoundError when the inputs tree has not been
# created yet; exist_ok=True keeps re-running the cell harmless.
dualgcn_input_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the PPI edge list: tab-separated, no header row, and only the first
# two columns are kept (named gene_1 / gene_2).
dualgcn_ppi = pd.read_csv(
    dualgcn_folder.joinpath("PPI_network.txt"),
    header=None,
    names=["gene_1", "gene_2"],
    usecols=[0, 1],
    sep="\t",
)

# Genes that occur in BOTH edge columns.
# NOTE(review): this is an intersection, so a gene that only ever appears on
# one side of an edge is excluded; if "any gene in the network" was intended
# this should be a union - confirm.
dualgcn_ppi_genes = set(dualgcn_ppi["gene_1"].to_list()) & set(
    dualgcn_ppi["gene_2"].to_list()
)

# Preview the first two edges as the cell output.
dualgcn_ppi.head(n=2)

In [None]:
# Keep only genes available in every source (DualGCN gene list, PPI network,
# expression and copy number), in sorted order so the column order of the
# written matrices is deterministic.
dualgcn_genes_filtered = sorted(
    dualgcn_genes & dualgcn_ppi_genes & exp_genes & cnv_genes
)

# Restrict both omics matrices to the shared genes.
dualgcn_exp_matrix = exp_matrix.loc[:, dualgcn_genes_filtered]
dualgcn_cnv_matrix = cnv_matrix.loc[:, dualgcn_genes_filtered]

# Keep an edge only when both of its endpoints survived the filtering.
dualgcn_ppi = dualgcn_ppi[
    dualgcn_ppi[["gene_1", "gene_2"]].isin(dualgcn_genes_filtered).all(axis=1)
]

# The gene count is embedded in the feature file names.
n_genes = len(dualgcn_genes_filtered)

dualgcn_exp_matrix.to_csv(
    dualgcn_input_folder.joinpath(
        f"FeatureCellToExpression{n_genes}DualGCNGenesTPMLogp1.csv"
    )
)

dualgcn_cnv_matrix.to_csv(
    dualgcn_input_folder.joinpath(
        f"FeatureCellToCopyNumber{n_genes}DualGCNGenesCNRatioLogp1.csv"
    )
)

# The edge list is written without the positional row index.
dualgcn_ppi.to_csv(
    dualgcn_input_folder.joinpath("MetadataPPIEdgeList.csv"),
    index=False,
)

### Generate mutation features

In [None]:
# Load the somatic mutation table (MAF-style, one row per mutation call).
# low_memory=False makes pandas read the file in one pass instead of chunks.
mut_data_maf = pd.read_csv(
    dataset_folder.joinpath("OmicsSomaticMutationsHarmonizedMAF.csv"),
    low_memory=False,
)

# Normalise chromosome labels: cast to str and remove every occurrence of
# "chr" (e.g. "chr1" -> "1").
mut_data_maf["Chrom"] = mut_data_maf["Chrom"].apply(
    lambda chrom: str(chrom).replace("chr", "")
)

# Build a per-mutation identifier of the form "<HugoSymbol>:<Chrom>:<Pos>".
mut_data_maf["id"] = (
    mut_data_maf.loc[:, ["HugoSymbol", "Chrom", "Pos"]]
    .astype(str)
    .agg(":".join, axis=1)
)

# Preview the first rows as the cell output.
mut_data_maf.head()

In [None]:
# Position-encoded mutation matrix (for convolutional models): one row per
# cell line, one column per mutation id, restricted to CGC genes.
mut_data_cgc_genes = mut_data_maf.loc[
    mut_data_maf["HugoSymbol"].isin(cgc_genes)
]

# Sanity check: restricting to CGC genes must not drop any cell line, i.e.
# the two id collections contain exactly the same values.
old_cell_ids = mut_data_maf["SangerModelID"].unique().tolist()
new_cell_ids = mut_data_cgc_genes["SangerModelID"].unique().tolist()
assert not set(old_cell_ids).symmetric_difference(new_cell_ids)

# Count the rows per (cell line, mutation id) pair ...
mut_matrix = mut_data_cgc_genes.groupby(["SangerModelID", "id"]).size()
# ... pivot the mutation ids into columns, treating absent pairs as 0 ...
mut_matrix = mut_matrix.unstack().fillna(0)
# ... and binarise: any count >= 1 becomes 1.
mut_matrix = mut_matrix.clip(upper=1).astype(int)

# Number of distinct genes (not mutation ids) contributing to the matrix;
# it is embedded in the output file name.
n_genes = mut_data_cgc_genes["HugoSymbol"].nunique(dropna=True)

mut_matrix.to_csv(
    input_folder.joinpath(
        f"CellToSomaticMutationsPositionEncoded{n_genes}CGCGenesAll.csv"
    )
)

In [None]:
# Gene-encoded mutation matrix over all genes: one row per cell line, one
# column per gene symbol, 1 if the cell line has at least one mutation row
# for that gene and 0 otherwise.
mut_matrix_all = mut_data_maf.groupby(["SangerModelID", "HugoSymbol"]).size()
# Pivot the gene symbols into columns; missing pairs become 0.
mut_matrix_all = mut_matrix_all.unstack().fillna(0)
# Collapse counts to a 0/1 indicator.
mut_matrix_all = mut_matrix_all.clip(upper=1).astype(int)

# Preview the first rows as the cell output.
mut_matrix_all.head()

In [None]:
# Gene-encoded matrix restricted to CGC genes. `filter` keeps only the
# requested labels that actually exist as columns; sorting afterwards makes
# the column order independent of the (unordered) gene set.
mut_matrix_all_cgc = mut_matrix_all.filter(
    items=cgc_genes, axis=1
).sort_index(axis=1)

# Number of CGC genes that made it into the matrix, used in the file name.
n_genes = len(mut_matrix_all_cgc.columns)

mut_matrix_all_cgc.to_csv(
    input_folder.joinpath(
        f"CellToSomaticMutationsGeneEncoded{n_genes}CGCGenesAll.csv"
    )
)

In [None]:
# Gene-encoded 0/1 matrix built only from mutation rows whose
# CCLEDeleterious value compares equal to True.
mut_matrix_del = (
    mut_data_maf.loc[mut_data_maf["CCLEDeleterious"].eq(True)]
    .groupby(["SangerModelID", "HugoSymbol"])
    .size()
)
# Pivot gene symbols into columns; missing (cell line, gene) pairs become 0.
mut_matrix_del = mut_matrix_del.unstack().fillna(0)
# Collapse counts to a 0/1 indicator.
mut_matrix_del = mut_matrix_del.clip(upper=1).astype(int)

# Show the matrix as the cell output.
mut_matrix_del

In [None]:
# mut_data_cgc_genes[
#     (mut_data_cgc_genes["CCLEDeleterious"] == True)
#     | (mut_data_cgc_genes["HessDriver"] == True)
# ].shape