## RNASeq workflow for PDX-HTS paper


This notebook contains all the analyses...

In [9]:
# testing
from common_imports import *

In [10]:
# Display the configured path of the preprocessed PDX Affymetrix expression CSV
# (presumably defined by the star import from common_imports -- confirm).
PREPROC_PDX_AFFY_EXP_FILE

'/pdx-hts/Notebooks/data/preprocessed/exp/pdx_affy_exp.csv'

In [11]:
#pdx_affy = pd.read_csv(PREPROC_PDX_AFFY_EXP_FILE, index_col=0)
#pdx_affy

## User input

In [1]:
# Requires GenePattern Notebook: pip install genepattern-notebook
import gp
import genepattern

# Username and password removed for security reasons.
# Register a session against the public GenePattern server with empty
# credentials and display the returned object (rendered as a login widget).
genepattern.display(genepattern.session.register("https://genepattern.broadinstitute.org/gp", "", ""))

GPAuthWidget()

Select parameters before running the rest of the notebook.

<div class="alert alert-info">
<h3 style="margin-top: 0;"> Instructions <i class="fa fa-info-circle"></i></h3>
Select parameters before running the rest of the notebook.
</div>

In [2]:
from companion_script import *
# # Select case
# case_id = 'case17'
# # This patient directory should match the directory name on DNANexus.
# patient_dir = '18-10716_tumor-normal'
# is_medullo = True # set False if it is another kind of brain tumor
# Reload imported modules automatically before each cell is executed, so edits
# to the helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline
import readline # required for rpy2 extension
# Load the rpy2 IPython extension (R magics).
%load_ext rpy2.ipython


def rmagic_warning(
    message,
    category = rpy2.rinterface.RRuntimeWarning,
    filename = '',
    lineno = -1,
    file=None,
    line=None):
    """Print only the text of a warning.

    The parameter list mirrors that of ``warnings.showwarning``; every
    argument other than ``message`` is accepted but ignored.

    NOTE(review): presumably meant to be installed as ``warnings.showwarning``
    so that R runtime warnings are shown compactly -- no such assignment is
    visible in this part of the notebook; confirm.
    """
    print(message)
# Keep a reference to the current warnings.showwarning handler.
default_showwarning = warnings.showwarning


@genepattern.build_ui(parameters={
    "output_var": {
        "default": "setup",
        "hide": False,
    },
    "case_id": {"type": "text",
                "description": "The name of the case, e.g., 'PDX1'",
                "default": "PDX1"},
    "patient_dir": {"type": "text",
                    "description": 'For DNA Nexus downloads only. The name of the "patient" directory, e.g. "18-10716_tumor-normal" (quotes are required)',
                    "default": "PDX1_dir"},
    "dna_nexus_bool": {"type": "bool",
                       "description": "Whether or not the RNASeq fastq files should be downloaded from DNA Nexus",
                       "default": False},
    "is_medullo": {"type": "bool",
                   "description": "Whether or not this sample has been classified as medulloblastoma"},
    "control": {"type": "choice",
                "description": "Whether or not to use a custom control",
                "choices": {
                    "original": "original",
                    "custom": "custom",
                }},
    "custom_control_expression": {"type": "file",
                                  "kinds": ["gct"],
                                  "description": "The file (or path to the GCT file) which contains the gene expression of the custom control.",
                                  "default": None},
})
def read_user_input(case_id, patient_dir, dna_nexus_bool=False, is_medullo=False, control='original',custom_control_expression=None):
    """Collect the user's choices and derive every path used by the workflow.

    Creates the per-case directory tree under ``<cwd>/patients``, sources the
    DNANexus toolkit environment, pickles the resulting settings as
    ``<case_id>_backup1_input.p`` and returns them.

    Parameters
    ----------
    case_id : str
        Name of the case; used in output file names.
    patient_dir : str
        Name of the patient directory. Only honoured when ``dna_nexus_bool``
        is true; otherwise it is replaced by ``case_id``.
    dna_nexus_bool : bool
        When false, ``patient_dir`` is set to ``case_id``.
    is_medullo : bool
        Selects the ``cerebellar_stem`` control (true) or ``neural_stem``
        control (false) when ``control == 'original'``.
    control : str
        ``'original'`` or ``'custom'``.
    custom_control_expression : str or None
        Custom control expression file; stored as given.

    Returns
    -------
    Bunch
        All settings and derived paths.

    Raises
    ------
    ValueError
        If ``control`` is not ``'original'``/``'custom'`` or the platform is
        neither Linux nor macOS.
    """
    # Select control for DiSCoVER and Connectivity Map.
    # Generally, a medulloblastoma uses `cerebellar_stem`; any other kind of
    # brain tumor uses `neural_stem`.
    if control == 'original':
        expression_control = 'cerebellar_stem' if is_medullo else 'neural_stem'
    elif control == 'custom':
        expression_control = 'custom_control'
    else:
        # Fail here with a clear message; previously this only printed and
        # then crashed further down because expression_control was never set.
        raise ValueError('Unexpected value for variable named control, value: {!r}'.format(control))

    # Truthiness covers both None (the default) and an empty value; compare
    # strings by value, not identity.
    if custom_control_expression and control != 'custom':
        print("Reminder: if you want to use a custom control expresion, you must set control to 'custom'")

    base_dir = os.getcwd()
    utilities_dir = '/build'
    patients_dir = os.path.join(base_dir, 'patients')
    if not dna_nexus_bool:
        log('Setting patient_dir = case_id')
        patient_dir = case_id

    in_dir = os.path.join(patients_dir, patient_dir)
    out_dir = in_dir
    os.makedirs(out_dir, exist_ok=True)

    platform = sys.platform
    if platform.startswith('linux'):
        os_string = 'linux'
    elif platform == 'darwin':
        os_string = 'mac'
    else:
        raise ValueError('Platform "{}" not supported'.format(platform))

    # RNASeq quantification
    kallisto_dir = '/build/kallisto'
    kallisto_path = os.path.join(kallisto_dir, 'kallisto_{}-v0.44.0/kallisto'.format(os_string))
    transcriptome_index_path = os.path.join(kallisto_dir, 'GRCh38.ensembl.transcriptome.idx')
    local_fastqs_dir = os.path.join(in_dir, 'fastqs')
    os.makedirs(local_fastqs_dir, exist_ok=True)
    patient_gexp_file = os.path.join(out_dir, 'gene_abundance.sleuth.csv')

    # Medulloblastoma classification
    medullo_classify_out_dir = os.path.join(out_dir, 'medulloblastoma_classification')
    os.makedirs(medullo_classify_out_dir, exist_ok=True)
    cavalli_subgroup_file = os.path.join(medullo_classify_out_dir, 'cavalli_subgroups.csv')
    cavalli_subgroup_direct_file = os.path.join(medullo_classify_out_dir, 'cavalli_subgroups_direct.csv')
    cavalli_subtype_file = os.path.join(medullo_classify_out_dir, 'cavalli_subtypes.csv')
    cho_subtype_file = os.path.join(medullo_classify_out_dir, 'cho_subtypes.csv')
    cho_subgroup_file = os.path.join(medullo_classify_out_dir, 'cho_subgroups.csv')
    northcott_subgroup_file = os.path.join(medullo_classify_out_dir, 'northcott_subgroups.csv')

    drug_suggestion_out_dir = os.path.join(out_dir, 'drug_suggestions')
    os.makedirs(drug_suggestion_out_dir, exist_ok=True)

    # DiSCoVER
    discover_out_dir = os.path.join(drug_suggestion_out_dir, 'discover/{}'.format(expression_control))
    os.makedirs(discover_out_dir, exist_ok=True)
    discover_heatmap_file = os.path.join(discover_out_dir, 'ctrp.png')
    full_discover_results_file = os.path.join(discover_out_dir, 'discover.all.csv')
    rdrugs_discover_file = os.path.join(discover_out_dir, '{}.discover.{}.reasonable.annotated.csv'.format(case_id, expression_control))

    # Connectivity Map
    cmap_out_dir = os.path.join(drug_suggestion_out_dir, 'cmap/{}'.format(expression_control))
    os.makedirs(cmap_out_dir, exist_ok=True)
    cmap_all_ranked_drugs_file = os.path.join(cmap_out_dir, '{}.cmap.{}.all.csv'.format(case_id, expression_control))
    cmap_reasonable_ranked_drugs_file = os.path.join(cmap_out_dir, '{}.cmap.{}.reasonable.annotated.csv'.format(case_id, expression_control))

    # Powerpoint for MTB
    mtb_ppt_file = os.path.join(out_dir, '{}.mtb_slides.pptx'.format(case_id))

    # DNANexus
    dx_source_path = os.path.join(utilities_dir, 'dx-toolkit/environment')
    dnanexus_project = 'UW_UCSD_RNAseq_collaboration_share'
    # Replace the contents of this file with your own DNANexus token.
    dnanexus_token_file = os.path.join(base_dir, 'dnanexus_token.txt')
    # To use the dx command, we must update some environment variables.
    # From the command line, this is done with source dx-toolkit/environment,
    # but from Python we have to use a workaround, because normally any changes
    # to environment variables done in a subprocess are not reflected in the
    # parent process. The workaround runs the source command in a subprocess,
    # fetches the environment variables from the subprocess and updates those
    # of the parent process.
    # (source_and_update_env_vars was originally `from utils import ...`; it is
    # expected to be in scope through the notebook's star imports.)
    source_and_update_env_vars(dx_source_path)

    # The directory recorded as `out_dir` is keyed by case_id, whereas the
    # paths above are keyed by patient_dir. The two coincide unless
    # dna_nexus_bool is true.
    # NOTE(review): confirm this divergence is intended for DNA Nexus cases.
    final_out_dir = os.path.join(patients_dir, case_id)
    os.makedirs(final_out_dir, exist_ok=True)

    out = {
        'case_id': case_id,
        'patient_dir': patient_dir,
        'is_medullo': is_medullo,
        'dna_nexus_bool': dna_nexus_bool,
        'expression_control': expression_control,
        'custom_control_expression': custom_control_expression,
        'dnanexus_token_file': dnanexus_token_file,
        'local_fastqs_dir': local_fastqs_dir,
        'dnanexus_project': dnanexus_project,
        'transcriptome_index_path': transcriptome_index_path,
        'kallisto_path': kallisto_path,
        'kallisto_dir': kallisto_dir,
        'out_dir': final_out_dir,
        # Backslashes doubled so the path can be embedded in R code.
        'r_out_dir': out_dir.replace('\\', r'\\'),
        'patient_gexp_file': patient_gexp_file,
        'in_dir': in_dir,
        'cavalli_subgroup_file': cavalli_subgroup_file,
        'cavalli_subtype_file': cavalli_subtype_file,
        'cavalli_subgroup_direct_file': cavalli_subgroup_direct_file,
        'cho_subgroup_file': cho_subgroup_file,
        'cho_subtype_file': cho_subtype_file,
        'northcott_subgroup_file': northcott_subgroup_file,
        'mtb_ppt_file': mtb_ppt_file,
        'full_discover_results_file': full_discover_results_file,
        'discover_out_dir': discover_out_dir,
        'discover_heatmap_file': discover_heatmap_file,
        'rdrugs_discover_file': rdrugs_discover_file,
        'cmap_out_dir': cmap_out_dir,
        'cmap_all_ranked_drugs_file': cmap_all_ranked_drugs_file,
        'cmap_reasonable_ranked_drugs_file': cmap_reasonable_ranked_drugs_file,
    }

    print('Setup done!')
    # Use a context manager so the backup file is flushed and closed.
    with open(os.path.join(final_out_dir, case_id+'_backup1_input.p'), 'wb') as backup_file:
        pickle.dump(out, file=backup_file)
    return Bunch(out)

UIBuilder(function_import='read_user_input', name='read_user_input', params=[{'name': 'case_id', 'label': 'cas…

<div class="well">
Running all cells below this point will execute all the analyses except for one: the Connectivity Map analysis at the end of the notebook, which requires two manual steps.
</div>

# Download RNAseq data

In [3]:
@genepattern.build_ui(parameters={
    "setup": {"default": "setup",
              "hide": False,
              "description": "The variable which has the setup information"},
    "input_expression_dir":{"hide":True},
    "output_var": {
        "default": "setup",
        "hide": True,
    },
})
def download_and_preprocess_rnaseq(setup, input_expression_dir = None):
    """Obtain the gene-expression table for the case and store it on ``setup``.

    When ``setup.dna_nexus_bool`` is true, the fastq files are downloaded from
    DNA Nexus with the ``dx`` command line tool, quantified with
    ``preprocess_rna_seq`` and aggregated with ``run_sleuth``. Otherwise the
    expression table is read from a local, case-specific CSV. In both cases
    the table is written to ``setup.patient_gexp_file`` and ``setup`` is
    pickled as ``<case_id>_backup2_download.p``.

    Parameters
    ----------
    setup : Bunch
        Settings produced by ``read_user_input``.
    input_expression_dir : optional
        Currently unused: the local path is always derived from
        ``setup.case_id``. NOTE(review): confirm whether a caller-supplied
        value was meant to override the derived path.

    Returns
    -------
    Bunch
        ``setup`` with ``input_expression_dir`` and ``patient_exp`` set (plus
        ``expression_input`` on the local-file path).

    Raises
    ------
    FileNotFoundError
        If the local expression CSV is required but does not exist.
    subprocess.CalledProcessError
        If the ``dx`` login or find command fails.
    """
    setup.input_expression_dir = f'/pdx-hts/Notebooks/data/preprocessed/exp/{setup.case_id}.csv'
    if setup.dna_nexus_bool:
        if input_expression_dir is not None:
            # Report the value that is actually being ignored (the argument).
            log(f"input_expression_dir has a value ({input_expression_dir}), and it will be ignored!")
        log('About to download fastqfiles from DNA Nexus. This may take a while.')
        with open(setup.dnanexus_token_file, 'r') as f:
            dnanexus_token = f.readline().strip()
        login_command = 'dx login --token {} --noprojects; dx select {}'.format(dnanexus_token, setup.dnanexus_project)
        subprocess.check_output(login_command, shell=True)

        find_fastq_command = 'dx find data --name "*.fastq.gz" --path {}:{}'.format(setup.dnanexus_project, setup.patient_dir)
        find_fastq_return_lines = subprocess.check_output(find_fastq_command, shell=True).decode().strip().split('\n')
        # Raw string with the directory name escaped, so characters in
        # patient_dir are always matched literally.
        fastq_path_re = re.compile(r'.*(/{}/.*\.fastq\.gz) .*'.format(re.escape(setup.patient_dir)))

        # First pass: resolve every remote path and create its local folder.
        downloads = []
        for line in find_fastq_return_lines:
            match = fastq_path_re.search(line)
            if match is None:
                # e.g. the single empty line produced when nothing was found
                continue
            remote_fastq_path = match.group(1)
            fastq_subdir = os.path.basename(os.path.dirname(remote_fastq_path))
            local_fastq_subdir = os.path.join(setup.local_fastqs_dir, fastq_subdir)
            os.makedirs(local_fastq_subdir, exist_ok=True)
            downloads.append((remote_fastq_path, local_fastq_subdir))

        if not downloads:
            log('No fastq files were found on DNA Nexus for {}:{}'.format(setup.dnanexus_project, setup.patient_dir))

        # Second pass: download; a dx failure is treated as "already present".
        for remote_fastq_path, local_fastq_subdir in downloads:
            download_command = 'dx download "{}" -o "{}"'.format(remote_fastq_path, local_fastq_subdir)
            print('\t'+download_command)
            try:
                subprocess.check_output(download_command, shell=True)
            except subprocess.CalledProcessError:
                print('\tEncountered a dx error, this likely means you already have the file indicated above.')
                print('\tContinuing...\n')
        log('Done downloading the fastq files.')
        log('Preprocessing RNASeq data now:')
        log('Using kallisto to compute transcript abundance.')
        preprocess_rna_seq(setup)
        log('Done with tanscript abundance.')
        log('Using sleuth to aggregate transcript abundance into gene abbundance.')
        run_sleuth(setup)
        # Transpose the table and label its single row with the case id.
        patient_exp = pd.read_csv(setup.patient_gexp_file, index_col=0).T
        patient_exp.index = [setup.case_id]
        setup.patient_exp = patient_exp
        patient_exp.to_csv(setup.patient_gexp_file)
        log('Habemus Genus Expressium *release the white smoke*')
    else:
        log(f'Checking if local file ({setup.input_expression_dir}) exist.')
        if not os.path.isfile(setup.input_expression_dir):
            log('File could not be located please check and run again.')
            # Stop here; continuing would fail on an undefined dataframe.
            raise FileNotFoundError(setup.input_expression_dir)
        df = pd.read_csv(setup.input_expression_dir, index_col=0)
        setup.expression_input = df
        log("This file containes the expression of the PDXs. Printing dataframe's info:")
        # DataFrame.info() prints its report itself and returns None, so it
        # must not be wrapped in log().
        df.info()
        patient_exp = df
        setup.patient_exp = patient_exp
        patient_exp.to_csv(setup.patient_gexp_file)
        log(f'File {setup.patient_gexp_file} saved successfully')

    with open(os.path.join(setup.out_dir, setup.case_id+'_backup2_download.p'), 'wb') as backup_file:
        pickle.dump(setup, file=backup_file)
    log('Done preprocessing!')
    return setup

UIBuilder(function_import='download_and_preprocess_rnaseq', name='download_and_preprocess_rnaseq', params=[{'n…

# Classify the tumor by medulloblastoma subgroup and subtype

In [4]:
@genepattern.build_ui(
    description="This function classifies a medulloblastoma sample into a subgroup. Non-medulloblastoma samples are ignored.",
    parameters={
    "setup": {"default": "setup",
              "hide": False,
              "description": "The variable which has the setup information"},
    "output_var": {
        "default": "setup",
        "hide": True,
    },
})
def classify_sample(setup):
    """Classify a medulloblastoma sample by subgroup and subtype.

    When ``setup.is_medullo`` is true, the expression table in
    ``setup.patient_gexp_file`` is classified against three reference cohorts,
    the resulting tables are written to the CSV paths stored on ``setup`` and
    a classification slide is added to ``setup.mtb_ppt_file``. Otherwise
    nothing is classified. In both cases ``setup`` is pickled as
    ``<case_id>_backup3_classify.p`` and returned.

    Reference cohorts:

    - Cavalli et al. 2017
      (http://www.sciencedirect.com/science/article/pii/S1535610817302015):
      763 tumors; used to define 12 finer-grained subtypes nested in the 4
      subgroups. Both expression and methylation data are available.
    - Cho et al. 2011 (http://www.mesirovlab.org/medulloblastoma/cho/):
      194 tumors; two subtypes within G3 and two within G4, six in total.
    - Northcott et al. 2017
      (http://www.nature.com/nature/journal/v547/n7663/full/nature22973.html):
      223 tumors; labels for the 4 basic subgroups only.

    When finer-grained subtypes are known, the finer-grained classification is
    performed first and the subtypes are also collapsed to the 4 basic
    subgroups, so that both subtype and subgroup probabilities are reported.
    Classification is done using random forests. Since the patient data are
    from the same platform and contain the same features each time, pre-fit
    models can be used; the classification methods also have a fallback in
    case the data looks different.

    The tumor board is arranging for methylation data to be obtained from
    patient samples as well, since it may be more informative than expression.
    Methylation data would also allow comparison to a large and varied
    collection of brain tumors, currently available through a DKFZ web portal
    (https://www.molecularneuropathology.org/mnp).
    """
    if setup.is_medullo:
        # Read in patient's gene-level RNASeq TPM data
        patient_exp = pd.read_csv(setup.patient_gexp_file, index_col=0)

        cavalli_subgroups, cavalli_subtypes = classify_cavalli(patient_exp)
        cavalli_subgroups.to_csv(setup.cavalli_subgroup_file)
        cavalli_subtypes.to_csv(setup.cavalli_subtype_file)

        cho_subgroups, cho_subtypes = classify_cho(patient_exp)
        cho_subtypes.to_csv(setup.cho_subtype_file)
        cho_subgroups.to_csv(setup.cho_subgroup_file)

        northcott_subgroups = classify_northcott(patient_exp)
        northcott_subgroups.to_csv(setup.northcott_subgroup_file)

        make_medullo_classification_slide(setup.mtb_ppt_file,
                                          setup.cavalli_subgroup_file,
                                          setup.cavalli_subtype_file,
                                          setup.cho_subgroup_file,
                                          setup.cho_subtype_file,
                                          setup.northcott_subgroup_file)
        log('Done! Move along')
    else:
        log('This is not medulloblastoma. Nothing to do here. Move along')

    # Context manager guarantees the backup file is flushed and closed.
    with open(os.path.join(setup.out_dir, setup.case_id+'_backup3_classify.p'), 'wb') as backup_file:
        pickle.dump(setup, file=backup_file)

    return setup

UIBuilder(description='This function classifies a medulloblastoma sample into a subgroup. Non-medulloblastoma …

In [32]:
# Inspect the expression table that download_and_preprocess_rnaseq stored on
# setup when reading the local CSV.
setup.expression_input

Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A4GNT,AAAS,AACS,AACSP1,...,ZUP1,ZWILCH,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
DMB006,3.359795,3.317396,2.982897,3.272958,5.974374,2.700876,3.623813,4.469528,8.874965,3.177392,...,7.441817,8.600564,6.104677,5.542006,6.384078,3.845576,9.761523,4.290367,5.395606,8.178586


# Suggest drugs based on RNAseq data (DiSCoVER)

In [5]:
@genepattern.build_ui(
  description="Run DiSCoVER on the provided sample and control.",
  parameters={
    "setup": {"default": "setup",
              "hide": False,
              "description": "The variable which has the setup information"},
    "output_var": {
        "default": "setup",
        "hide": True,
    },
})
def run_discover(setup):
    """Run DiSCoVER for the case, annotate the drugs and add a slide.

    Steps:

    1. Run ``discover_from_expression`` on the patient expression table and
       the control selected by ``setup.expression_control``.
    2. Write the scores, sorted by the case's value in descending order, to
       ``setup.full_discover_results_file``.
    3. Call ``subset_to_reasonable_drugs`` and then ``format_drugs`` with the
       same output prefix and directory.
    4. Read ``setup.rdrugs_discover_file``, split and rank it, and add the
       top 20 rows to the PowerPoint in ``setup.mtb_ppt_file``.
    5. Store the intermediate objects on ``setup``, save the ranked table as
       ``<case_id>_formated_DISCoVER_results.csv`` and pickle ``setup`` as
       ``<case_id>_backup4_DISCoVER.p``.

    Returns
    -------
    Bunch
        ``setup`` with ``raw_discover_results``, ``discover_results``,
        ``disco2cid``, ``control_exp``, ``reasonable_results`` and ``df`` set.
    """
    from rpy2.robjects import numpy2ri
    numpy2ri.activate()
    try:
        from discover import discover_from_expression, plot_discover_from_expression
        from drug_suggestion.expression.controls import load_control_exp
        patient_exp = pd.read_csv(setup.patient_gexp_file, index_col=0)
        control_exp = load_control_exp(setup.expression_control)
        log("About to perform DiSCoVER.")
        discover_results = discover_from_expression(exp=patient_exp,
                                                    control_exp=control_exp,
                                                    verbose=False)
        setup.raw_discover_results = discover_results
        log("DiSCoVER done!")
    finally:
        # Always undo the global numpy<->R conversion, even if DiSCoVER fails.
        numpy2ri.deactivate()

    log('Saving results to file.')
    discover_results.T.sort_values(by=setup.case_id, ascending=False).to_csv(setup.full_discover_results_file)
    log("Saving done!")
    log("NOT Restricting to clinically relevant drugs.")
    # Not all drugs in CCLE, CTRP, and GDSC are realistic candidates for
    # treatment. We compiled a list of medications that are FDA-approved or in
    # late-stage clinical trials, and Dr. Wechsler-Reya curated it to include
    # only those that are relevant for treating brain tumors. To enable
    # comparison of drug lists, drugs from the different sources have been
    # mapped to PubChem compound IDs (CIDs) using PubChemPy
    # (http://pubchempy.readthedocs.io/en/latest/).
    from drug_suggestion.drug_annotation import subset_to_reasonable_drugs
    from drug_suggestion.expression.discover import load_discover_drug_to_cids
    disco2cid = load_discover_drug_to_cids()
    out_prefix = 'discover.{}'.format(setup.expression_control)
    reasonable_results = subset_to_reasonable_drugs(discover_results,
                                                    disco2cid,
                                                    out_prefix=out_prefix,
                                                    out_dir=setup.discover_out_dir)
    # This will override the file setup.rdrugs_discover_file, so that every
    # drug (not only the curated subset) is available below. The return value
    # is not needed here.
    format_drugs(discover_results,
                 disco2cid,
                 out_prefix=out_prefix,
                 out_dir=setup.discover_out_dir)
    log('Done restricting to clinically relevant drugs!')

    log('Making the DiSCoVER powerpoint.')
    rdrugs_discover = pd.read_csv(setup.rdrugs_discover_file, index_col=None)

    # Using all of the drugs
    log('Using all of the drugs')
    df = split_discover_dataframe(df=rdrugs_discover)
    df = rank_drugs_discover(df)
    make_exp_drug_ranking_results_slide(setup.mtb_ppt_file, df.head(20), setup.expression_control, method='DiSCoVER')
    log('Done making the DiSCoVER powerpoint slide!')
    log('Savig the variables to a file.')
    setup.discover_results = discover_results
    setup.disco2cid = disco2cid
    setup.control_exp = control_exp
    setup.reasonable_results = reasonable_results
    setup.df = df
    log('Saving the formatted results of DiSCoVER to a csv')
    df.to_csv(os.path.join(setup.out_dir, setup.case_id+'_formated_DISCoVER_results.csv'))
    # Context manager guarantees the backup file is flushed and closed.
    with open(os.path.join(setup.out_dir, setup.case_id+'_backup4_DISCoVER.p'), 'wb') as backup_file:
        pickle.dump(setup, file=backup_file)
    log('Done savig the variables to a file!')

    log('Done with all the taks in this cell. Move along.')
    return setup

UIBuilder(description='Run DiSCoVER on the provided sample and control.', function_import='run_discover', name…

In [77]:
# Re-save the ranked DiSCoVER table held in setup.df to the same CSV path that
# run_discover writes (<out_dir>/<case_id>_formated_DISCoVER_results.csv).
setup.df.to_csv(os.path.join(setup.out_dir, setup.case_id+'_formated_DISCoVER_results.csv'))

In [34]:
# Inspect the DiSCoVER scores stored by run_discover right after the analysis.
setup.raw_discover_results

Unnamed: 0,ctrp_ML311,ctrp_zebularine,ctrp_BRD-A02303741,ctrp_importazole,ctrp_SR8278,ctrp_elocalcitol,ctrp_BRD-A05715709,ctrp_docetaxel,ctrp_cyclophosphamide,ctrp_3-Cl-AHPC,...,ccle_PHA-665752,ccle_PLX4720,ccle_Paclitaxel,ccle_Panobinostat,ccle_RAF265,ccle_Sorafenib,ccle_TAE684,ccle_TKI258,ccle_Topotecan,ccle_ZD-6474
DMB006,0.24922,-0.368372,0.224963,0.284399,-0.359116,0.345501,-0.279495,0.216308,0.289871,0.244442,...,-0.262857,-0.29992,-0.21247,-0.277326,-0.207103,-0.195914,-0.231196,-0.245791,0.21383,-0.257422


In [35]:
# Inspect the DiSCoVER scores stored by run_discover at the end of the cell
# (the same object as setup.raw_discover_results).
setup.discover_results

Unnamed: 0,ctrp_ML311,ctrp_zebularine,ctrp_BRD-A02303741,ctrp_importazole,ctrp_SR8278,ctrp_elocalcitol,ctrp_BRD-A05715709,ctrp_docetaxel,ctrp_cyclophosphamide,ctrp_3-Cl-AHPC,...,ccle_PHA-665752,ccle_PLX4720,ccle_Paclitaxel,ccle_Panobinostat,ccle_RAF265,ccle_Sorafenib,ccle_TAE684,ccle_TKI258,ccle_Topotecan,ccle_ZD-6474
DMB006,0.24922,-0.368372,0.224963,0.284399,-0.359116,0.345501,-0.279495,0.216308,0.289871,0.244442,...,-0.262857,-0.29992,-0.21247,-0.277326,-0.207103,-0.195914,-0.231196,-0.245791,0.21383,-0.257422


In [51]:
# Scratch cell: repeat the subset_to_reasonable_drugs step of run_discover on
# the stored raw DiSCoVER scores and display the result.
from drug_suggestion.drug_annotation import subset_to_reasonable_drugs
from drug_suggestion.expression.discover import load_discover_drug_to_cids
disco2cid = load_discover_drug_to_cids()
reasonable_results = subset_to_reasonable_drugs(setup.raw_discover_results, 
                                                disco2cid, 
                                                out_prefix='discover.{}'.format(setup.expression_control), 
                                                out_dir=setup.discover_out_dir)
reasonable_results

  return func(*args, **kwargs)


Unnamed: 0,DMB006
ctrp_elocalcitol,0.345501
ctrp_RAF265,0.211422
gdsc_Vinblastine,-0.282946
gdsc_Tamoxifen,-0.344953
ctrp_temozolomide,0.273595
ccle_Paclitaxel,-0.212470
ctrp_quizartinib,0.269583
gdsc_Sorafenib,-0.487374
gdsc_BI-2536,-0.298340
ctrp_daporinad,0.248200


In [70]:
# Scratch cell: repeat the format_drugs step of run_discover on the stored raw
# DiSCoVER scores and display the result.
# NOTE(review): the result is bound to `reasonable_results`, overwriting the
# value from the previous cell even though it comes from format_drugs.
from companion_script import format_drugs
from drug_suggestion.expression.discover import load_discover_drug_to_cids
disco2cid = load_discover_drug_to_cids()
reasonable_results = format_drugs(setup.raw_discover_results, 
                                                disco2cid, 
                                                out_prefix='discover.{}'.format(setup.expression_control), 
                                                out_dir=setup.discover_out_dir)
reasonable_results

  return func(*args, **kwargs)


600 155


Unnamed: 0_level_0,score,moa
drug,Unnamed: 1_level_1,Unnamed: 2_level_1
gdsc_Navitoclax,0.647316,"Bcl-2 family inhibitor: esp Bcl-xL, Bcl-2 and ..."
gdsc_Linsitinib,0.623647,IGF-1R inhibitor
gdsc_TL-2-105,0.608240,Not Clinically Relevant
gdsc_Tubastatin A,0.584972,Not Clinically Relevant
gdsc_SB52334,0.584363,Not Clinically Relevant
gdsc_GSK1070916,0.565351,Not Clinically Relevant
gdsc_GSK429286A,0.538728,Not Clinically Relevant
gdsc_QL-XII-61,0.531059,Not Clinically Relevant
gdsc_VX-702,0.526880,Not Clinically Relevant
gdsc_GW-2580,0.506626,Not Clinically Relevant


In [78]:
# Inspect the split and ranked DiSCoVER table that run_discover stored on setup.
setup.df

Unnamed: 0,moa,GDSC,CTRP,CCLE,drug,score,evidence
tl-2-105,Not Clinically Relevant,0.608,,,gdsc_TL-2-105,0.608,+..
sb52334,Not Clinically Relevant,0.584,,,gdsc_SB52334,0.584,+..
gsk1070916,Not Clinically Relevant,0.565,,,gdsc_GSK1070916,0.565,+..
gsk429286a,Not Clinically Relevant,0.539,,,gdsc_GSK429286A,0.539,+..
linsitinib,IGF-1R inhibitor,0.624,0.449,,ctrp_linsitinib,0.536,++.
ql-xii-61,Not Clinically Relevant,0.531,,,gdsc_QL-XII-61,0.531,+..
vx-702,Not Clinically Relevant,0.527,,,gdsc_VX-702,0.527,+..
tubastatin a,Not Clinically Relevant,0.585,0.442,,ctrp_tubastatin A,0.514,++.
gw-2580,Not Clinically Relevant,0.507,,,gdsc_GW-2580,0.507,+..
bx-912,Not Clinically Relevant,0.498,,,gdsc_BX-912,0.498,+..


## Select signature genes for Connectivity Map

In [6]:
@genepattern.build_ui(
  description="This function parses CMap's results.",
  parameters={
    "setup": {"default": "setup",
              "hide": False,
              "description": "The variable which has the setup information"},
    "output_var": {
        "default": "setup",
        "hide": True,
    },
})
def make_cmap_slide(setup):
    """Write the CMap gene sets and, if a CMap result exists, parse it.

    The gene sets derived from the patient and control expression are always
    written to ``setup.cmap_out_dir``. If ``cmap_result.gct`` is present in
    that directory, it is read, saved to ``setup.cmap_all_ranked_drugs_file``,
    passed through ``subset_to_reasonable_drugs``, added to the PowerPoint and
    stored as ``setup.rdrugs_cmap``. Otherwise a hint on how to obtain the
    file is logged. ``setup`` is pickled as ``<case_id>_backup5_CMap.p`` in
    both cases.

    Returns
    -------
    Bunch
        ``setup``, with ``rdrugs_cmap`` set when a CMap result was found.
    """
    log('About to parse CMap results.')
    patient_exp = pd.read_csv(setup.patient_gexp_file, index_col=0)
    control_exp = load_control_exp(setup.expression_control)
    cmap_genesets = make_cmap_genesets(patient_exp, control_exp)
    write_cmap_genesets(cmap_genesets, setup.cmap_out_dir)

    # must match path to downloaded .gct file
    cmap_gct = os.path.join(setup.cmap_out_dir, 'cmap_result.gct')

    if os.path.exists(cmap_gct):
        cmap_ranked_drugs = read_cmap_gct(cmap_gct)
        cmap_ranked_drugs.columns = [setup.case_id]
        cmap_ranked_drugs.to_csv(setup.cmap_all_ranked_drugs_file)
        cmap2cid = load_cmap_drug_to_cids()
        # Return value is not used. NOTE(review): the call is presumably made
        # for the annotated CSV that is read on the next line -- confirm.
        subset_to_reasonable_drugs(cmap_ranked_drugs.T,
                                   cmap2cid,
                                   out_prefix='cmap.{}'.format(setup.expression_control),
                                   out_dir=setup.cmap_out_dir)
        rdrugs_cmap = pd.read_csv(setup.cmap_reasonable_ranked_drugs_file, index_col=None)
        make_exp_drug_ranking_results_slide(setup.mtb_ppt_file, rdrugs_cmap, setup.expression_control, method='CMap')
        setup.rdrugs_cmap = rdrugs_cmap
        # Path is relative to the current working directory.
        recommendations_path = os.path.join('patients', setup.patient_dir, 'drug_suggestions',
                                            setup.case_id+'_drug_recommendations.p')
        with open(recommendations_path, 'wb') as recommendations_file:
            pickle.dump(setup, file=recommendations_file)
        log("done!")
    else:
        log(f"cmap_result.gct not found! (It should be present in the directiory {setup.cmap_out_dir}).")
        log("Try again if you'd like to see CMap results.")
        log("Hint, you may want to go here:")
        log("https://clue.io/l1000-query#individual")

    # Context manager guarantees the backup file is flushed and closed.
    with open(os.path.join(setup.out_dir, setup.case_id+'_backup5_CMap.p'), 'wb') as backup_file:
        pickle.dump(setup, file=backup_file)
    return setup

UIBuilder(description="This function parses CMap's results.", function_import='make_cmap_slide', name='make_cm…

In the DiSCoVER results, "supporting information" (or "evidence") indicates in how many of the three drug databases (GDSC, CTRP and CCLE) the drug is significant.

In [7]:
@genepattern.build_ui(
  description="This function merges the results of DiSCoVER and CMap.",
  parameters={
    "setup": {"default": "setup",
              "hide": False,
              "description": "The variable which has the setup information"},
    "output_var": {
        "default": "setup",
        "hide": True,
    },
})
def merge_discover_and_cmap(setup):
    """Combine the DiSCoVER ranking with the CMap ranking and add a slide.

    If ``setup`` has no ``rdrugs_cmap`` attribute, the DiSCoVER table
    (``setup.df``) is used on its own. The combined table is ranked, added to
    the PowerPoint in ``setup.mtb_ppt_file``, stored on ``setup`` as
    ``combined_df`` / ``to_slide`` and ``setup`` is pickled twice: as
    ``<case_id>_drug_recommendations_merged.p`` under the patient's
    ``drug_suggestions`` folder and as ``<case_id>_backup6_merged.p`` in
    ``setup.out_dir``.

    Returns
    -------
    Bunch
        ``setup`` with ``combined_df`` and ``to_slide`` set.
    """
    log('Merging results from DiSCoVER and CMap.')
    # Check for the attribute explicitly instead of catching AttributeError
    # around the merge, which would also hide genuine errors raised inside
    # add_cmap_to_split_df.
    if hasattr(setup, 'rdrugs_cmap'):
        combined_df = add_cmap_to_split_df(discover=setup.df, cmap=setup.rdrugs_cmap)
    else:
        log("CMap info not present, proceeding with only DiSCoVER")
        combined_df = setup.df
    to_slide = rank_combined_df(combined_df)
    make_intersection_slide(setup.mtb_ppt_file, to_slide, setup.expression_control, method='DiSCoVER ∩ CMap')

    log("Done Merging results from DiSCoVER and CMap!")
    log("Saving combined_df and to_slide on setup variable")
    setup.combined_df = combined_df
    setup.to_slide = to_slide
    # Path is relative to the current working directory.
    merged_path = os.path.join('patients', setup.patient_dir, 'drug_suggestions',
                               setup.case_id+'_drug_recommendations_merged.p')
    with open(merged_path, 'wb') as merged_file:
        pickle.dump(setup, file=merged_file)
    with open(os.path.join(setup.out_dir, setup.case_id+'_backup6_merged.p'), 'wb') as backup_file:
        pickle.dump(setup, file=backup_file)

    log("Done done!")
    return setup

UIBuilder(description='This function merges the results of DiSCoVER and CMap.', function_import='merge_discove…