# Clean and preprocess Cell Model Passports data

In [None]:
from __future__ import annotations

import polars as pl
import pandas as pd
import numpy as np

from pathlib import Path

In [None]:
# Locations of the raw Cell Model Passports downloads and of the processed
# outputs, both relative to the current working directory.
cmp_data_folder = Path("../../data/raw/CellModelPassports")
cmp_output_folder = Path("../../data/processed/CellModelPassports")

# parents=True also creates any missing intermediate directories, so the cell
# no longer raises FileNotFoundError when "../../data/processed" does not
# exist yet; exist_ok=True keeps re-runs harmless.
cmp_output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the model list and keep only the annotation columns used downstream.
cmp_model_list = pl.read_csv(cmp_data_folder.joinpath("model_list_20230608.csv"))

# Pick the columns under their source names first, then lower-case the two
# upper-case identifier columns so that every output column is snake_case.
cmp_model_meta = cmp_model_list.select(
    [
        "model_id",
        "model_name",
        "tissue",
        "cancer_type",
        "cancer_type_detail",
        "cancer_type_ncit_id",
        "CCLE_ID",
        "BROAD_ID",
    ]
).rename({"CCLE_ID": "ccle_id", "BROAD_ID": "broad_id"})
cmp_model_meta.head()

In [None]:
# cmp_model_meta.write_csv(cmp_output_folder / "ModelAnnotations.csv")

In [None]:
# Load the gene identifier table. The "entrez_id" column is forced to a string
# dtype instead of letting the reader infer a type for it.
cmp_gene_identifiers = pl.read_csv(
    cmp_data_folder.joinpath("gene_identifiers_20191101.csv"),
    dtypes={"entrez_id": pl.Utf8},
)
cmp_gene_identifiers.head()

In [None]:
# Load the long-format mutation table and preview its first two rows.
cmp_wes_maf = pl.read_csv(cmp_data_folder.joinpath("mutations_all_20230202.csv"))
cmp_wes_maf.head(2)

## Clean the WES data

In [None]:
# Reshape the long mutation table into a wide matrix with one row per model and
# one column per gene symbol. Each cell holds the maximum integer-cast
# "cancer_driver" value seen for that (model, gene) pair.
cmp_wes_matrix = cmp_wes_maf.with_columns(
    pl.col("cancer_driver").cast(pl.Int64)
)
cmp_wes_matrix = cmp_wes_matrix.pivot(
    values="cancer_driver",
    index="model_id",
    columns="gene_symbol",
    aggregate_function="max",
)
# Pairs that never occur in the mutation table come out of the pivot as null.
cmp_wes_matrix = cmp_wes_matrix.fill_null(0)

# Put the gene columns in sorted order, keeping "model_id" as the first column.
mut_genes = sorted(cmp_wes_matrix.columns[1:])
cmp_wes_matrix = cmp_wes_matrix.select(["model_id"] + mut_genes)

cmp_wes_matrix.head(2)

In [None]:
# Save the model-by-gene driver mutation matrix to the processed folder.
cmp_wes_matrix.write_csv(cmp_output_folder.joinpath("OmicsDriverMutations.csv"))

## Clean the RNA-seq data

In [None]:
# Read the FPKM table. The first file row supplies the column labels and the
# four rows after it are skipped.
cmp_rnaseq_fpkm_matrix = pd.read_csv(
    cmp_data_folder.joinpath("rnaseq_fpkm_20220624.csv"),
    skiprows=[1, 2, 3, 4],
)
# Drop the column labelled "model_id" and give the unlabelled second column
# (auto-named "Unnamed: 1" by pandas) the name "gene_symbol".
cmp_rnaseq_fpkm_matrix = cmp_rnaseq_fpkm_matrix.drop(columns="model_id")
cmp_rnaseq_fpkm_matrix = cmp_rnaseq_fpkm_matrix.rename(
    columns={"Unnamed: 1": "gene_symbol"}
)

cmp_rnaseq_fpkm_matrix.head()

In [None]:
# Split the table into its gene labels and the remaining value columns.
gene_symbols = cmp_rnaseq_fpkm_matrix["gene_symbol"]
model_ids = cmp_rnaseq_fpkm_matrix.columns[1:]

# Flip the value columns so each former column becomes a row, label the new
# columns with the gene symbols, and clear the "gene_symbol" name that the
# column axis picks up from the Series.
cmp_rnaseq_fpkm_matrix_t = cmp_rnaseq_fpkm_matrix.loc[:, model_ids].T
cmp_rnaseq_fpkm_matrix_t.columns = gene_symbols
cmp_rnaseq_fpkm_matrix_t = cmp_rnaseq_fpkm_matrix_t.rename_axis(
    columns={"gene_symbol": None}
)

# log2(x + 1) transform of every value.
cmp_rnaseq_log2fpkm_matrix_t: pd.DataFrame = np.log2(
    1 + cmp_rnaseq_fpkm_matrix_t
)

# Move the row labels into a regular column called "model_id".
cmp_rnaseq_log2fpkm_matrix_t = cmp_rnaseq_log2fpkm_matrix_t.reset_index()
cmp_rnaseq_log2fpkm_matrix_t = cmp_rnaseq_log2fpkm_matrix_t.rename(
    columns={"index": "model_id"}
)

cmp_rnaseq_log2fpkm_matrix_t.head()

In [None]:
# Save the log2(FPKM + 1) matrix; the positional row index is not written.
cmp_rnaseq_log2fpkm_matrix_t.to_csv(
    cmp_output_folder.joinpath("OmicsExpressionProteinCodingGenesFPKMLogp1.csv"),
    index=False,
)

In [None]:
# Read the TPM table. The first file row supplies the column labels and the
# four rows after it are skipped.
cmp_rnaseq_tpm_matrix = pd.read_csv(
    cmp_data_folder.joinpath("rnaseq_tpm_20220624.csv"),
    skiprows=[1, 2, 3, 4],
)
# Drop the column labelled "model_id" and give the unlabelled second column
# (auto-named "Unnamed: 1" by pandas) the name "gene_symbol".
cmp_rnaseq_tpm_matrix = cmp_rnaseq_tpm_matrix.drop(columns="model_id")
cmp_rnaseq_tpm_matrix = cmp_rnaseq_tpm_matrix.rename(
    columns={"Unnamed: 1": "gene_symbol"}
)

cmp_rnaseq_tpm_matrix.head()

In [None]:
# Split the table into its gene labels and the remaining value columns.
gene_symbols = cmp_rnaseq_tpm_matrix["gene_symbol"]
model_ids = cmp_rnaseq_tpm_matrix.columns[1:]

# Flip the value columns so each former column becomes a row, label the new
# columns with the gene symbols, and clear the "gene_symbol" name that the
# column axis picks up from the Series.
cmp_rnaseq_tpm_matrix_t = cmp_rnaseq_tpm_matrix.loc[:, model_ids].T
cmp_rnaseq_tpm_matrix_t.columns = gene_symbols
cmp_rnaseq_tpm_matrix_t = cmp_rnaseq_tpm_matrix_t.rename_axis(
    columns={"gene_symbol": None}
)

# log2(x + 1) transform of every value.
cmp_rnaseq_log2tpm_matrix_t: pd.DataFrame = np.log2(
    1 + cmp_rnaseq_tpm_matrix_t
)

# Move the row labels into a regular column called "model_id".
cmp_rnaseq_log2tpm_matrix_t = cmp_rnaseq_log2tpm_matrix_t.reset_index()
cmp_rnaseq_log2tpm_matrix_t = cmp_rnaseq_log2tpm_matrix_t.rename(
    columns={"index": "model_id"}
)

cmp_rnaseq_log2tpm_matrix_t.head()

In [None]:
# Save the log2(TPM + 1) matrix; the positional row index is not written.
cmp_rnaseq_log2tpm_matrix_t.to_csv(
    cmp_output_folder.joinpath("OmicsExpressionProteinCodingGenesTPMLogp1.csv"),
    index=False,
)

## Copy number data

In [None]:
# Read the gene-level total copy number table. File rows 0, 2 and 3 are
# skipped, so the second file row supplies the column labels, and the first
# column is used as the row index.
cmp_cnv_matrix = pd.read_csv(
    cmp_data_folder.joinpath(
        "WES_pureCN_CNV_genes_total_copy_number_20221213.csv"
    ),
    index_col=0,
    skiprows=[0, 2, 3],
)
cmp_cnv_matrix.head()

In [None]:
# Display the column labels of the copy number table for inspection.
cmp_cnv_matrix.columns