In [1]:
import pandas as pd
from pathlib import Path

Define the datasets and the data directory used to build the per-omic intersection lists

In [2]:
# Dataset names; each one is used as a sub-directory name below `path_to_data`.
all_datasets = [
    'BeatAML2',
    'CCLE',
    'CTRPv1',
    'CTRPv2',
    'GDSC1',
    'GDSC2',
    'PDX_Bruna',
    'TOYv1',
    'TOYv2',
]
# Relative path of the directory that holds one sub-directory per dataset.
path_to_data = '../../drevalpy/data'

In [3]:
def get_gene_intersection_omic(dataset_list: list, data_path: str, omic: str) -> set:
    """
    Return the feature (column) names that one omic shares across several datasets.

    For every dataset, ``<data_path>/<dataset>/<omic>.csv`` is read with
    ``cell_line_name`` as index. A ``cellosaurus_id`` column, if present, is
    discarded. The remaining column names are intersected over all datasets.

    The caller's ``dataset_list`` is left untouched, so the same list object can
    safely be reused for several calls (or the calling cell re-run).

    :param dataset_list: names of the dataset sub-directories below ``data_path``; must not be empty
    :param data_path: directory containing one sub-directory per dataset
    :param omic: file stem of the omic CSV, e.g. ``'gene_expression'``
    :returns: set of column names present in the omic file of every listed dataset
    :raises IndexError: if ``dataset_list`` is empty
    """

    def _read_feature_names(dataset: str) -> set:
        """Read one dataset's omic file and return its feature column names."""
        print(f'Processing {dataset}...')
        omic_path = Path(data_path, dataset, f'{omic}.csv')
        omic_df = pd.read_csv(omic_path, index_col='cell_line_name')
        # cellosaurus_id is an identifier column, not a feature; it is absent in some files
        return set(omic_df.drop(columns=['cellosaurus_id'], errors='ignore').columns)

    # Work on a copy: calling pop() on the argument itself would shrink the caller's list.
    remaining = list(dataset_list)
    # The last entry is processed first (as before), so the progress log order is unchanged.
    gene_set = _read_feature_names(remaining.pop())
    for ds in remaining:
        gene_set &= _read_feature_names(ds)
    return gene_set

In [4]:
# Copy-number features shared by all datasets that provide GISTIC copy-number data.
gene_set_cnv = get_gene_intersection_omic(
    dataset_list=['CCLE', 'CTRPv1', 'CTRPv2', 'GDSC1', 'GDSC2', 'PDX_Bruna', 'TOYv1', 'TOYv2'],
    data_path=path_to_data,
    omic='copy_number_variation_gistic'
)
# Sort before writing: the iteration order of a set of strings differs between
# interpreter runs, which would reshuffle the CSV rows on every execution.
pd.DataFrame({'Symbol': sorted(gene_set_cnv)}).to_csv('copy_number_variation_gistic_intersection.csv')

Processing TOYv2...
Processing CCLE...
Processing CTRPv1...
Processing CTRPv2...
Processing GDSC1...
Processing GDSC2...
Processing PDX_Bruna...
Processing TOYv1...


In [5]:
# Gene-expression features shared by every dataset.
gene_set_gex = get_gene_intersection_omic(
    # pass a copy so the shared `all_datasets` list can never be altered by the call
    dataset_list=list(all_datasets),
    data_path=path_to_data,
    omic='gene_expression'
)
# Sort before writing: the iteration order of a set of strings differs between
# interpreter runs, which would reshuffle the CSV rows on every execution.
pd.DataFrame({'Symbol': sorted(gene_set_gex)}).to_csv('gene_expression_intersection.csv')

Processing TOYv2...
Processing BeatAML2...
Processing CCLE...
Processing CTRPv1...
Processing CTRPv2...
Processing GDSC1...
Processing GDSC2...
Processing PDX_Bruna...
Processing TOYv1...


In [6]:
# Methylation features shared by the datasets listed below.
gene_set_met = get_gene_intersection_omic(
    dataset_list=['CCLE', 'CTRPv1', 'CTRPv2', 'GDSC1', 'GDSC2', 'TOYv1', 'TOYv2'],
    data_path=path_to_data,
    omic='methylation'
)
# Sort before writing: the iteration order of a set of strings differs between
# interpreter runs, which would reshuffle the CSV rows on every execution.
pd.DataFrame({'Symbol': sorted(gene_set_met)}).to_csv('methylation_intersection.csv')

Processing TOYv2...
Processing CCLE...
Processing CTRPv1...
Processing CTRPv2...
Processing GDSC1...
Processing GDSC2...
Processing TOYv1...


In [7]:
# Mutation features shared by the datasets listed below.
gene_set_mut = get_gene_intersection_omic(
    dataset_list=['CCLE', 'CTRPv1', 'CTRPv2', 'GDSC1', 'GDSC2', 'TOYv1', 'TOYv2'],
    data_path=path_to_data,
    omic='mutations'
)
# Sort before writing: the iteration order of a set of strings differs between
# interpreter runs, which would reshuffle the CSV rows on every execution.
pd.DataFrame({'Symbol': sorted(gene_set_mut)}).to_csv('mutations_intersection.csv')

Processing TOYv2...
Processing CCLE...
Processing CTRPv1...
Processing CTRPv2...
Processing GDSC1...
Processing GDSC2...
Processing TOYv1...


In [8]:
# same for all for now; could change at a later point in time to include the other dataset
# NOTE(review): this call previously used omic='mutations', so the file written below
# contained the mutation intersection under a proteomics name. It now reads the
# proteomics files; this assumes TOYv1 and TOYv2 ship a proteomics.csv like CCLE — confirm.
gene_set_prot = get_gene_intersection_omic(
    dataset_list=['CCLE', 'TOYv1', 'TOYv2'],
    data_path=path_to_data,
    omic='proteomics'
)
# Sort before writing: the iteration order of a set of strings differs between
# interpreter runs, which would reshuffle the CSV rows on every execution.
pd.DataFrame({'Symbol': sorted(gene_set_prot)}).to_csv('proteomics_intersection.csv')

Processing TOYv2...
Processing CCLE...
Processing TOYv1...


## Reduced gene lists

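**TODO:** document how the source gene lists loaded below (`landmark_genes.csv`, `drug_target_genes_all_drugs.csv`, `gene_list_paccmann_network_prop.csv`) were obtained.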

In [10]:
# Load the three curated gene lists from CSV files in the current working directory.
landmark_genes, drug_targets, paccman_list = (
    pd.read_csv(gene_list_file)
    for gene_list_file in (
        "landmark_genes.csv",
        "drug_target_genes_all_drugs.csv",
        "gene_list_paccmann_network_prop.csv",
    )
)

In [11]:
# proteomics versions: restrict each curated gene list to the columns of the CCLE proteomics file
proteomics = pd.read_csv(
    # build the path from `path_to_data` instead of repeating the directory literal
    Path(path_to_data, 'CCLE', 'proteomics.csv'),
    index_col=['cellosaurus_id', 'cell_line_name']
)
# build the column set once and reuse it for all three intersections
proteomics_genes = set(proteomics.columns)
landmark_proteomics = set(landmark_genes['Symbol']).intersection(proteomics_genes)
drug_targets_proteomics = set(drug_targets['Symbol']).intersection(proteomics_genes)
paccman_list_proteomics = set(paccman_list['Symbol']).intersection(proteomics_genes)

In [12]:
# Write the proteomics-restricted gene lists. Symbols are sorted because the iteration
# order of a set of strings differs between interpreter runs, which would otherwise
# reshuffle the CSV rows on every execution.
pd.DataFrame({'Symbol': sorted(landmark_proteomics)}).to_csv('landmark_genes_proteomics.csv')
pd.DataFrame({'Symbol': sorted(drug_targets_proteomics)}).to_csv('drug_target_genes_all_drugs_proteomics.csv')
pd.DataFrame({'Symbol': sorted(paccman_list_proteomics)}).to_csv('gene_list_paccmann_network_prop_proteomics.csv')

In [13]:
# Names present in the CNV, gene-expression and mutation intersections
# as well as among the proteomics columns.
all_genes_intersected = set.intersection(
    gene_set_cnv,
    gene_set_gex,
    gene_set_mut,
    set(proteomics.columns),
)

In [14]:
# Keep only those symbols of each curated list that are in `all_genes_intersected`.
landmark_reduced = set(landmark_genes['Symbol']) & all_genes_intersected
drug_targets_reduced = set(drug_targets['Symbol']) & all_genes_intersected
paccman_list_reduced = set(paccman_list['Symbol']) & all_genes_intersected

In [15]:
# Write the reduced gene lists. Symbols are sorted because the iteration order of a
# set of strings differs between interpreter runs, which would otherwise reshuffle
# the CSV rows on every execution.
pd.DataFrame({'Symbol': sorted(landmark_reduced)}).to_csv('landmark_genes_reduced.csv')
pd.DataFrame({'Symbol': sorted(drug_targets_reduced)}).to_csv('drug_target_genes_reduced.csv')
pd.DataFrame({'Symbol': sorted(paccman_list_reduced)}).to_csv('gene_list_paccmann_network_prop_reduced.csv')