# Create a project

In [1]:
import pandas as pd
import pickle
import json

## Define the parameters of your project

In [2]:
# Project description: a name, the directory holding the input files, and one
# entry per dataset listing the file names belonging to that dataset.
project = dict(
    name='LysOnc',
    # Keep the trailing slash: paths are built by plain string concatenation
    # (data_dir + file name) in the cells below.
    data_dir='data/',
    datasets={
        'TCGA-BRCA': dict(
            data='expression_data_tcga_brca_TCGA-BRCA_log_fpkm_1226_samples_55_genes.csv',
            clinical='clinical_TCGA-BRCA.csv',
        ),
    },
)

## Define clinical groups

In [3]:
# Each group is a list of {column: accepted values} filters. get_query() keeps
# a sample only if it satisfies every filter of the group (logical AND), while
# the values listed for one column are alternatives (logical OR).

# Clinical column names / value lists that are long or used by several groups.
tumour_stage_col = 'ajcc_tumor_pathologic_pt_shared_stage_pathologic_categories'
node_stage_col = 'ajcc_nodes_pathologic_pn_shared_stage_pathologic_m'
node_negative = ['N0', 'N0 (i-)', 'N0 (i+)']

groups = {
    'TCGA-BRCA': {
        # Non tumour (NT) breast
        'NT': [{'tissue_status': ['normal']}],
        'All-tumours': [{'tissue_status': ['tumoral']}],
        # Tumoral samples split by their `pam50` value.
        'Luminal-A': [
            {'tissue_status': ['tumoral']},
            {'pam50': ['luminal-A']},
        ],
        'Luminal-B': [
            {'tissue_status': ['tumoral']},
            {'pam50': ['luminal-B']},
        ],
        'HER2-enriched': [
            {'tissue_status': ['tumoral']},
            {'pam50': ['HER2-enriched']},
        ],
        'Basal-like': [
            {'tissue_status': ['tumoral']},
            {'pam50': ['basal-like']},
        ],
        # Staging-based groups; note they carry no `tissue_status` filter.
        'T1N0': [
            {tumour_stage_col: ['T1', 'T1a', 'T1b', 'T1c']},
            {node_stage_col: list(node_negative)},  # copy: no list shared between groups
        ],
        'N0': [{node_stage_col: list(node_negative)}],
        'M1': [{'diagnoses_1_ajcc_pathologic_m': ['M1']}],
        'Claudin-low': [
            {'tissue_status': ['tumoral']},
            {'claudin_low': [1]},
        ],
    },
}

## Define a function to execute queries on clinical data

In [4]:
def get_query(clinical, list_filters):
    """Build a boolean row mask selecting the samples that satisfy every filter.

    Parameters
    ----------
    clinical : pandas.DataFrame
        Clinical table; every column named in a filter must exist in it.
    list_filters : list of dict
        Each dict maps a column name to the list of accepted values. A row
        passes one column condition if its value is in that list (``isin``);
        all column conditions, across every dict, are AND-ed together.

    Returns
    -------
    pandas.Series
        Boolean mask indexed like ``clinical``, suitable for
        ``clinical.loc[mask]``. An empty ``list_filters`` selects every row.

    Raises
    ------
    KeyError
        If a filter names a column that is absent from ``clinical``.
    """
    # Start from an all-True Series instead of the bare Python ``True``: with
    # no filters a scalar would be returned, which is not a usable row mask
    # (``clinical.loc[True]`` raises instead of selecting all rows).
    query_and = pd.Series(True, index=clinical.index)
    for filter_element in list_filters:
        for colname, colvalues in filter_element.items():
            query_and = query_and & clinical[colname].isin(colvalues)
    return query_and

## Identify the samples belonging to each group and store them inside the project variable 

In [5]:
dataset_name = 'TCGA-BRCA'

# Work on the dataset's own entry of the project dict; the sample lists are
# stored in place under its 'groups' key (reset to empty on every run).
dataset = project['datasets'][dataset_name]
dataset['groups'] = {}

# The clinical table is a ';'-separated file whose first column becomes the
# index; those index labels are what gets stored as group members.
clinical_path = project['data_dir'] + dataset['clinical']
clinical = pd.read_csv(clinical_path, sep=';', index_col=0)

for group_name, list_filters in groups[dataset_name].items():
    query = get_query(clinical, list_filters)
    matching_rows = clinical.loc[query]
    group_samples = list(matching_rows.index)
    dataset['groups'][group_name] = group_samples
    # One summary line per group: dataset, group name, number of samples.
    print(dataset_name, group_name, len(group_samples))

TCGA-BRCA NT 113
TCGA-BRCA All-tumours 1113
TCGA-BRCA Luminal-A 547
TCGA-BRCA Luminal-B 202
TCGA-BRCA HER2-enriched 82
TCGA-BRCA Basal-like 193
TCGA-BRCA T1N0 170
TCGA-BRCA N0 509
TCGA-BRCA M1 22
TCGA-BRCA Claudin-low 33


## Save the project

In [6]:
# Serialise the whole project dict, as populated by the cells above, to a
# binary pickle named after the project inside the data directory.
pickle_file = f"{project['data_dir']}{project['name']}.pickle"
with open(pickle_file, 'bw') as pickle_handle:
    pickle.dump(project, pickle_handle)
# Show where the file was written.
print(pickle_file)

data/LysOnc.pickle


In [7]:
# Also export the project dict as indented, ASCII-only JSON next to the pickle.
# Only json_file is bound here, so pickle_file keeps pointing at the .pickle
# file written by the previous cell instead of being overwritten with the
# .json path.
json_file = f"{project['data_dir']}{project['name']}.json"
with open(json_file, 'w', encoding='utf-8') as f:
    json.dump(project, f, ensure_ascii=True, indent=4)
# Show where the file was written.
print(json_file)

data/LysOnc.json
