# Prepare inputs for cell line data

## DO NOT CHANGE THIS CODE:
## Create a folder under preprocessing for each dataset that is generated.
## Replicate or adjust these files within each of those folders.

In [None]:
from __future__ import annotations

import polars as pl
import pandas as pd
import numpy as np

from pathlib import Path

from rdkit import Chem
from rdkit.Chem import AllChem as AllChem

In [None]:
# Input locations, all relative paths under a shared data root.
_data_root = Path("../../data")
gdsc_folder = _data_root / "processed" / "GDSC"
cmp_folder = _data_root / "processed" / "CellModelPassports"
genelist_folder = _data_root / "genelists"

In [None]:
# Destination for every CSV written by this notebook.
output_folder = Path("../../data/inputs/GDSCv1v2")
# parents=True so a fresh checkout without "data/inputs" does not fail with
# FileNotFoundError; exist_ok=True keeps re-runs of the cell harmless.
output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the two GDSC CSV files used by this notebook.
drug_info, screen_data = (
    pl.read_csv(gdsc_folder / csv_name)
    for csv_name in ("DrugAnnotations.csv", "ScreenFittedDoseResponse.csv")
)

In [None]:
# Read the three Cell Model Passports CSV files: model annotations, driver
# mutations and protein-coding gene expression.
model_info, mut_data, exp_data = (
    pl.read_csv(cmp_folder / csv_name)
    for csv_name in (
        "ModelAnnotations.csv",
        "OmicsDriverMutations.csv",
        "OmicsExpressionProteinCodingGenesFPKMLogp1.csv",
    )
)

In [None]:
# Gene symbols from the mini cancer genome list, as a set for fast
# intersection with the expression columns.
mcg_table = pl.read_csv(genelist_folder / "VermaatMiniCancerGenome.csv")
mcg_genes = set(mcg_table.get_column("HGNC_SYMBOL").to_list())

In [None]:
# Gene symbols from the MutPanning significant-gene list, as a set.
mp_genes = set(
    pl.read_csv(genelist_folder / "MutPanningSignificantGenes.csv")
    .get_column("Gene")
    .to_list()
)

In [None]:
# Collect the model ids present in each table ...
exp_model_ids, mut_model_ids, screen_model_ids = (
    set(table["model_id"].to_list())
    for table in (exp_data, mut_data, screen_data)
)

# ... and keep only the models that appear in all three.
common_model_ids = exp_model_ids & mut_model_ids & screen_model_ids

In [None]:
# Row filter: keep only models contained in common_model_ids.
filter_expr = pl.col("model_id").is_in(common_model_ids)

# Omics tables: apply the filter and order rows by model_id.
exp_data_harm, mut_data_harm = (
    omics_table.filter(filter_expr).sort("model_id")
    for omics_table in (exp_data, mut_data)
)

# Screen table: same filter, ordered by (model, drug) and reduced to the
# identifier and response columns.
screen_columns = ["model_id", "canonical_smiles", "ln_ic50", "auc"]
screen_data_harm = (
    screen_data.filter(filter_expr)
    .sort(["model_id", "canonical_smiles"])
    .select(screen_columns)
)

In [None]:
# Restrict the expression table to genes on the mini cancer genome list.
# NOTE(review): columns[1:] assumes the first column is the model_id key
# and every remaining column is a gene — confirm against the input file.
exp_genes = set(exp_data_harm.columns[1:])
exp_mcg_genes = sorted(exp_genes & mcg_genes)  # sorted for a stable column order
exp_data_harm_mcg_genes = exp_data_harm.select(["model_id"] + exp_mcg_genes)
exp_data_harm_mcg_genes.head()

In [None]:
# Restrict the mutation table to genes on the MutPanning list.
# NOTE(review): columns[1:] assumes the first column is the model_id key
# and every remaining column is a gene — confirm against the input file.
mut_genes = set(mut_data_harm.columns[1:])
mut_mp_genes = sorted(mut_genes.intersection(mp_genes))  # stable column order
mut_data_harm_mp_genes = mut_data_harm.select(["model_id"] + mut_mp_genes)
mut_data_harm_mp_genes.head()

In [None]:
# Drop genes whose column sum over the retained models is below the
# threshold.
# NOTE(review): presumably each gene column is a 0/1 mutation indicator, so
# the sum is the number of mutated models — confirm against the input file.
min_muts_per_gene = 5

# One column sum per gene, reshaped to long form: (gene_symbol, count).
mut_counts = (
    mut_data_harm_mp_genes.select(mut_mp_genes)
    .sum()
    .melt(variable_name="gene_symbol", value_name="count")
)

mut_counts_filt = mut_counts.filter(pl.col("count") >= min_muts_per_gene)
keep_mut_genes = sorted(mut_counts_filt.get_column("gene_symbol").to_list())

mut_data_harm_mp_genes = mut_data_harm_mp_genes.select(
    ["model_id"] + keep_mut_genes
)
mut_data_harm_mp_genes.head()

In [None]:
# Number of expression gene columns (everything except the model_id key).
len(exp_data_harm_mcg_genes.drop("model_id").columns)

In [None]:
# Write the two omics feature tables.
# NOTE(review): the gene counts in the file names (186, 1771) are hard-coded
# and not derived from the tables here — confirm they match the data.
mut_features_path = (
    output_folder / "FeatureOmicsDriverMutations186MutPanningGenes.csv"
)
exp_features_path = (
    output_folder
    / "FeatureOmicsExpression1771MiniCancerGenomeGenesFPKMLogp1.csv"
)

mut_data_harm_mp_genes.write_csv(mut_features_path)
exp_data_harm_mcg_genes.write_csv(exp_features_path)

In [None]:
# Build one Morgan fingerprint (radius 2, 1024 bits) per unique SMILES string
# in the harmonised screen table; the SMILES string doubles as the drug id.
n_fp_bits = 1024
smiles_strs = screen_data_harm["canonical_smiles"].unique().sort()
fingerprints = []
for smiles in smiles_strs:
    # A null entry cannot be parsed at all, and MolFromSmiles returns None
    # for an unparsable string. Fail with the offending value instead of an
    # opaque error raised inside the fingerprint call.
    mol = None if smiles is None else Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_fp_bits)
    fingerprints.append([smiles, *fp])

# Each inner list is one drug, i.e. one row. State the orientation explicitly
# so the result does not depend on polars' version-specific inference for
# nested lists.
fingerprints = pl.DataFrame(fingerprints, orient="row")
fingerprints.columns = ["drug_id", *map(str, range(n_fp_bits))]
fingerprints.head()

In [None]:
# Write the per-drug fingerprint table.
fingerprint_csv = output_folder / "FeatureChemMorganFingerprint1024Bit.csv"
fingerprints.write_csv(fingerprint_csv)

In [None]:
# Prepend a running row number named "id" to the screen table.
# Re-running this cell on its own fails because "id" already exists.
# NOTE(review): newer polars releases deprecate with_row_count in favour of
# with_row_index — confirm against the pinned polars version.
screen_data_harm = screen_data_harm.with_row_count(name="id")
screen_data_harm.head()

In [None]:
# Write the label table using the generic cell_id / drug_id column names;
# screen_data_harm itself keeps its original column names.
label_columns = {"model_id": "cell_id", "canonical_smiles": "drug_id"}
label_table = screen_data_harm.rename(label_columns)
label_table.write_csv(output_folder / "LabelDoseResponse.csv")

In [None]:
# Model annotations for the retained models: keep the four descriptive
# columns, order rows by model_id and write them out.
meta_columns = ["model_id", "tissue", "cancer_type", "cancer_type_detail"]
is_common_model = pl.col("model_id").is_in(common_model_ids)

model_info_harm = (
    model_info.filter(is_common_model).select(meta_columns).sort("model_id")
)

model_info_harm.write_csv(output_folder / "MetaModelAnnotations.csv")