## RNASeq workflow for PDX-HTS paper


This notebook contains all the analyses...

In [9]:
# Performing common imports ... a few other imports are made automatically by UIBuilder cells below. Don't delete them!
from common_imports import *

In [10]:
# Seed NumPy's global random number generator so repeated runs are reproducible.
# Per the original author's note, scikit-learn's Random Forest classifier draws on this seed;
# the classifier is an optional part of the pipeline, but reproducibility is imperative.
np.random.seed(0)

In [11]:
# Pipeline switches read by run_pipeline.
# CLASSIFY: when True, step 3 calls classify_sample; when False the step only prints a skip message.
CLASSIFY = False
# RUN_DISCOVER: when True, step 4 calls run_discover; when False, step 4 instead loads the pickled
# '<case_id>_backup4_DISCoVER.p' backup from setup.out_dir.
RUN_DISCOVER = True

In [12]:
def run_pipeline(case_id):
    """Run the RNASeq analysis steps for one case, printing a banner before each step.

    Steps, in order:
      1. ``read_user_input`` builds the ``setup`` object; ``case_id`` is also passed
         as the patient directory.
      2. ``download_and_preprocess_rnaseq`` returns the updated ``setup``.
      3. ``classify_sample`` runs only when the module-level ``CLASSIFY`` flag is true.
      4. ``run_discover`` runs when the module-level ``RUN_DISCOVER`` flag is true;
         otherwise ``setup`` is replaced by the object unpickled from
         ``<setup.out_dir>/<setup.case_id>_backup4_DISCoVER.p``.

    Parameters
    ----------
    case_id : str
        Case identifier, e.g. ``'BT084'``.

    Returns
    -------
    None
        Progress is reported through ``print``; nothing is returned.
    """
    separator = '======================================'
    skip_message = "These are not the analyses you are looking for. Move along!"

    def announce(step_title):
        # Every step is introduced by the same three-line banner.
        print(separator)
        print(step_title)
        print(separator)

    announce("Step 1: read_user_input")
    setup = read_user_input(case_id, patient_dir=case_id, dna_nexus_bool=False, is_medullo=True, control='original',custom_control_expression='N/A')

    announce("Step 2: download_and_preprocess_rnaseq")
    setup = download_and_preprocess_rnaseq(setup)

    announce("Step 3: classify_sample")
    if CLASSIFY:
        setup = classify_sample(setup)
    else:
        print(skip_message)

    announce("Step 4: run_discover")
    if RUN_DISCOVER:
        setup = run_discover(setup)
    else:
        print(skip_message)
        backup_path = os.path.join(setup.out_dir, setup.case_id+'_backup4_DISCoVER.p')
        # NOTE: unpickling can execute arbitrary code; only load backups written by this pipeline.
        # The context manager closes the file handle, which the previous inline open() never did.
        with open(backup_path, 'rb') as backup_file:
            setup = pickle.load(backup_file)
    print(separator)
    # Step 5 (CMap) is currently disabled:
#     print("Step 5: run_cmap")
#     print('======================================')
#     setup = make_cmap_slide(setup)
#     setup = merge_discover_and_cmap(setup)
    print(f'All analyses completed suscessfully with "patient" {case_id}')
    return

In [13]:
# Run the pipeline for case BT084.
run_pipeline('BT084')

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/BT084.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, BT084 to BT084
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/BT084/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
==> About to perform DiSCoVER.
Writing file: cell_lines_IDs_and_types_ctrp.csv
Writing file: cell_lines_IDs_and_types_COSMIC_IDS_gdsc.csv
Writing file: cell_lines_IDs_and_types_ccle.csv
==> DiSCoVER done!
==> Moving files contaiend enrichment of cell lin

In [21]:
# Run the pipeline for case DMB006.
run_pipeline("DMB006")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/DMB006.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, DMB006 to DMB006
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/DMB006/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called DMB006_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" DMB006


In [22]:
# Run the pipeline for case ICB984.
run_pipeline("ICB984")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/ICB984.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, ICB984 to ICB984
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/ICB984/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called ICB984_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" ICB984


In [23]:
# Run the pipeline for case ICB1299.
run_pipeline("ICB1299")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/ICB1299.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, ICB1299 to ICB1299
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/ICB1299/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called ICB1299_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" ICB1299


In [243]:
# NOTE(review): cosmic_lines is not defined in any visible cell, and this cell's execution
# count (243) is out of sequence with its neighbours — looks like a leftover inspection cell;
# confirm whether it can be removed or where the variable is meant to come from.
cosmic_lines

Unnamed: 0,CellosaurusID,CosmicIDs,Databases,Disease,Names
CVCL_D231,[CVCL_D231],"[889083, 946049, 1066155]","[ChEMBL-Cells, ChEMBL-Targets, Cosmic, Cosmic,...",Lung adenocarcinoma,[1-87]
CVCL_7942,[CVCL_7942],"[876687, 905206]","[CGH-DB, Cosmic, Cosmic, Wikidata]",Melanoma,"[1011-mel, 1011-MEL, 1011 mel, 1011mel, 1011, ..."
CVCL_4806,[CVCL_4806],[721372],"[Cosmic, Wikidata]",Prostate carcinoma,"[1013L, 1013-L]"
CVCL_8029,[CVCL_8029],[876702],"[Cosmic, Wikidata]",Melanoma,"[1088-mel, 1088-MEL, 1088 Mel, 1088 mel, 1088]"
CVCL_VJ83,[CVCL_VJ83],[2543722],[Cosmic],Cutaneous melanoma,"[10CM, 10 CM]"
CVCL_D856,[CVCL_D856],[1731373],"[Cosmic, Wikidata]",Lung adenocarcinoma,"[11-18-ER1-7, 11-18/ER1-7]"
CVCL_D857,[CVCL_D857],[1731374],"[Cosmic, Wikidata]",Lung adenocarcinoma,"[11-18-ER2-1, 11-18/ER2-1]"
CVCL_D776,[CVCL_D776],[1731375],"[Cosmic, Wikidata]",Lung adenocarcinoma,"[11-18-GEF10-1, 11-18/GEF10-1]"
CVCL_D777,[CVCL_D777],[1731376],"[Cosmic, Wikidata]",Lung adenocarcinoma,"[11-18-GEF20-1, 11-18/GEF20-1]"
CVCL_S745,[CVCL_S745],[1890459],"[Cosmic, Wikidata]",Lung adenocarcinoma,"[11-18/R, 11-18R, 11-18/ER]"


In [24]:
# Run the pipeline for case ICB1487.
run_pipeline("ICB1487")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/ICB1487.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, ICB1487 to ICB1487
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/ICB1487/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called ICB1487_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" ICB1487


In [25]:
# Run the pipeline for case ICB1572.
run_pipeline("ICB1572")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/ICB1572.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, ICB1572 to ICB1572
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/ICB1572/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called ICB1572_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" ICB1572


In [26]:
# Run the pipeline for case MB002.
run_pipeline("MB002")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MB002.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MB002 to MB002
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MB002/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called MB002_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" MB002


In [27]:
# Run the pipeline for case MB009.
run_pipeline("MB009")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MB009.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MB009 to MB009
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MB009/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called MB009_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" MB009


In [28]:
# Run the pipeline for case MED211.
run_pipeline("MED211")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MED211.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MED211 to MED211
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MED211/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called MED211_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" MED211


In [29]:
# Run the pipeline for case MED411.
run_pipeline("MED411")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MED411.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MED411 to MED411
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MED411/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


Step 5: run_cmap
==> About to parse CMap results.
==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called MED411_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" MED411


In [30]:
# Run the pipeline for case MED1712.
run_pipeline("MED1712")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MED1712.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MED1712 to MED1712
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MED1712/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called MED1712_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" MED1712


In [31]:
# Run the pipeline for case MED1911.
run_pipeline("MED1911")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MED1911.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MED1911 to MED1911
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MED1911/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called MED1911_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" MED1911


In [32]:
# Run the pipeline for case MED2312.
run_pipeline("MED2312")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MED2312.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MED2312 to MED2312
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MED2312/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called MED2312_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" MED2312


In [33]:
# Run the pipeline for case RCMB18.
run_pipeline("RCMB18")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB18.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB18 to RCMB18
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB18/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called RCMB18_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" RCMB18


In [34]:
# Run the pipeline for case RCMB20.
run_pipeline("RCMB20")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB20.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB20 to RCMB20
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB20/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called RCMB20_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" RCMB20


In [35]:
# Run the pipeline for case RCMB24.
run_pipeline("RCMB24")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB24.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB24 to RCMB24
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB24/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called RCMB24_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" RCMB24


In [36]:
# Run the pipeline for case RCMB28.
run_pipeline("RCMB28")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB28.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB28 to RCMB28
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB28/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called RCMB28_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" RCMB28


In [37]:
# Run the pipeline for case RCMB32.
run_pipeline("RCMB32")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB32.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB32 to RCMB32
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB32/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called RCMB32_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" RCMB32


In [38]:
# Run the pipeline for case RCMB38.
run_pipeline("RCMB38")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB38.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB38 to RCMB38
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB38/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called RCMB38_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" RCMB38


In [39]:
# Run the pipeline for case RCMB40.
run_pipeline("RCMB40")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB40.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB40 to RCMB40
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB40/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample
These are not the analyses you are looking for. Move along!
Step 4: run_discover
These are not the analyses you are looking for. Move along!
Step 5: run_cmap
==> About to parse CMap results.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  return func(*args, **kwargs)


==> done!
==> Merging results from DiSCoVER and CMap.
done!
==> Done Merging results from DiSCoVER and CMap!
==> Saving combined_df and to_slide on setup variable
==> Saving the combined DiSCoVER+CMap resuls to a csv called RCMB40_combined_DISCoVER_and_CMap_results.csv
==> Done done!
All analyses completed suscessfully with "patient" RCMB40


In [40]:
# Prints a completion marker; presumably used to confirm that every per-case cell above has run.
print('done!')

done!


In [1]:
# Requires GenePattern Notebook: pip install genepattern-notebook
import gp
import genepattern

# Username and password removed for security reasons.
# The two empty strings passed to register() are where those credentials were — presumably
# username then password; confirm against the GenePattern API before filling them in, and
# prefer reading them from the environment or a prompt over hardcoding them here.
genepattern.display(genepattern.session.register("https://cloud.genepattern.org/gp", "", ""))

GPAuthWidget()

In [2]:
# UI cell for choosing which PDX case to analyse. The function itself only
# echoes the selection and hands it back.
@genepattern.build_ui(parameters={
    "output_var": {"default": "case_id", "hide": False},
    "case_id": {
        "type": "choice",
        "description": "The name of the case, e.g., 'BT084'",
        # Every case is offered under its own name, i.e. label == value.
        "choices": {pdx_name: pdx_name for pdx_name in (
            "BT084", "DMB006",
            "ICB984", "ICB1299", "ICB1487", "ICB1572",
            "MB002", "MB009",
            "MED211", "MED411", "MED1712", "MED1911", "MED2312",
            "RCMB18", "RCMB20", "RCMB24", "RCMB28", "RCMB32", "RCMB38", "RCMB40",
        )},
        "default": "BT084",
    },
})
def set_case_id(case_id):
    # Report the selection, then return it unchanged.
    selected_case = case_id
    print(f'case_id set to "{selected_case}"')
    return selected_case

UIBuilder(function_import='set_case_id', name='set_case_id', params=[{'name': 'case_id', 'label': 'case_id', '…

## User input

Select parameters before running the rest of the notebook.

<div class="alert alert-info">
<h3 style="margin-top: 0;"> Instructions <i class="fa fa-info-circle"></i></h3>
Select parameters before running the rest of the notebook.
</div>

In [3]:
from companion_script import *
# # Select case
# case_id = 'case17'
# # This patient directory should match the directory name on DNANexus.
# patient_dir = '18-10716_tumor-normal'
# is_medullo = True # set False if it is another kind of brain tumor
%load_ext autoreload
%autoreload 2
%matplotlib inline
import readline # required for rpy2 extension
%load_ext rpy2.ipython


def rmagic_warning(message,
                   category=rpy2.rinterface.RRuntimeWarning,
                   filename='',
                   lineno=-1,
                   file=None,
                   line=None):
    """Print only the text of a warning.

    The parameters have the same names and order as ``warnings.showwarning``;
    everything except ``message`` is accepted and ignored.
    NOTE(review): presumably meant to be installed as ``warnings.showwarning``
    to quiet R warnings -- no such assignment is visible in this cell.
    """
    print(message)
default_showwarning = warnings.showwarning


@genepattern.build_ui(parameters={
    "output_var": {
        "default": "setup",
        "hide": False,
    },
    "case_id": {"type": "text",
                "description": "The name of the case, e.g., 'PDX1'",
                "default": "PDX1"},
    "patient_dir": {"type": "text",
                    "description": 'For DNA Nexus downloads only. The name of the "patient" directory, e.g. "18-10716_tumor-normal" (quotes are required)',
                    "default": "PDX1_dir"},
    "dna_nexus_bool": {"type": "bool",
                       "description": "Whether or not the RNASeq FASTQ files should be downloaded from DNA Nexus (if not, the local preprocessed expression file is used and patient_dir is set to case_id)",
                       "default": False},
    "is_medullo": {"type": "bool",
                   "description": "Whether or not this sample has been classified as medulloblastoma",},
    "control": {"type": "choice",
                "description": "Whether or not to use a custom control",
                "choices": {
                    "original": "original",
                    "custom": "custom",
                            }
               },
    "custom_control_expression": {"type": "file",
                           "kinds": ["gct"],
                           "description": "The file (or path to the GCT file) which contains the gene expression of the custom control.",
                           "default": None},
})
def read_user_input(case_id, patient_dir, dna_nexus_bool=False, is_medullo=False, control='original',custom_control_expression=None):
    """Collect the user's choices and derive every path used by the workflow.

    Creates the per-case directory tree under ``<cwd>/patients``, sources the
    DNANexus toolkit environment, pickles the collected settings to
    ``<case_id>_backup1_input.p`` and returns them.

    Args:
        case_id: Name of the case; used in output file names.
        patient_dir: Name of the case directory under ``patients``. It is
            replaced by ``case_id`` when ``dna_nexus_bool`` is False.
        dna_nexus_bool: Stored in the result; when False, ``patient_dir`` is
            overridden with ``case_id``.
        is_medullo: With ``control == 'original'`` selects the
            ``'cerebellar_stem'`` control (True) or ``'neural_stem'`` (False).
        control: ``'original'`` or ``'custom'``.
        custom_control_expression: Stored unchanged in the result. ``None`` or
            an empty value means "not provided".

    Returns:
        A ``Bunch`` holding the settings and all derived paths.

    Raises:
        ValueError: If ``control`` is neither ``'original'`` nor ``'custom'``,
            or if ``sys.platform`` is neither Linux nor macOS.
    """
    # Select control for DiSCoVER and Connectivity Map.
    # Generally, if the tumor is a medulloblastoma we use `cerebellar_stem`,
    # and if it is any other kind of brain tumor we use `neural_stem`.
    if control == 'original':
        expression_control = 'cerebellar_stem' if is_medullo else 'neural_stem'
    elif control == 'custom':
        expression_control = 'custom_control'
    else:
        # Fail here: continuing would only crash later on the undefined
        # expression_control.
        raise ValueError("Unexpected value for variable named control, value: {!r} "
                         "(expected 'original' or 'custom')".format(control))

    # Truthiness covers both None (the declared default) and empty values.
    if custom_control_expression and control != 'custom':
        print("Reminder: if you want to use a custom control expresion, you must set control to 'custom'")

    base_dir = os.getcwd()
    utilities_dir = '/build'
    patients_dir = os.path.join(base_dir, 'patients')
    if not dna_nexus_bool:
        log('Setting patient_dir = case_id')
        patient_dir = case_id

    in_dir = os.path.join(patients_dir, patient_dir)
    out_dir = in_dir
    os.makedirs(out_dir, exist_ok=True)

    platform = sys.platform
    if platform.startswith('linux'):
        os_string = 'linux'
    elif platform == 'darwin':
        os_string = 'mac'
    else:
        raise ValueError('Platform "{}" not supported'.format(platform))

    # RNASeq quantification
    kallisto_dir = '/build/kallisto'
    kallisto_path = os.path.join(kallisto_dir, 'kallisto_{}-v0.44.0/kallisto'.format(os_string))
    transcriptome_index_path = os.path.join(kallisto_dir, 'GRCh38.ensembl.transcriptome.idx')
    local_fastqs_dir = os.path.join(in_dir, 'fastqs')
    os.makedirs(local_fastqs_dir, exist_ok=True)
    patient_gexp_file = os.path.join(out_dir, 'gene_abundance.sleuth.csv')

    # Medulloblastoma classification
    medullo_classify_out_dir = os.path.join(out_dir, 'medulloblastoma_classification')
    os.makedirs(medullo_classify_out_dir, exist_ok=True)
    cavalli_subgroup_file = os.path.join(medullo_classify_out_dir, 'cavalli_subgroups.csv')
    cavalli_subgroup_direct_file = os.path.join(medullo_classify_out_dir, 'cavalli_subgroups_direct.csv')
    cavalli_subtype_file = os.path.join(medullo_classify_out_dir, 'cavalli_subtypes.csv')
    cho_subtype_file = os.path.join(medullo_classify_out_dir, 'cho_subtypes.csv')
    cho_subgroup_file = os.path.join(medullo_classify_out_dir, 'cho_subgroups.csv')
    northcott_subgroup_file = os.path.join(medullo_classify_out_dir, 'northcott_subgroups.csv')

    drug_suggestion_out_dir = os.path.join(out_dir, 'drug_suggestions')
    os.makedirs(drug_suggestion_out_dir, exist_ok=True)

    # DiSCoVER
    discover_out_dir = os.path.join(drug_suggestion_out_dir, 'discover/{}'.format(expression_control))
    os.makedirs(discover_out_dir, exist_ok=True)
    discover_heatmap_file = os.path.join(discover_out_dir, 'ctrp.png')
    full_discover_results_file = os.path.join(discover_out_dir, 'discover.all.csv')
    rdrugs_discover_file = os.path.join(discover_out_dir, '{}.discover.{}.reasonable.annotated.csv'.format(case_id, expression_control))

    # Connectivity Map
    cmap_out_dir = os.path.join(drug_suggestion_out_dir, 'cmap/{}'.format(expression_control))
    os.makedirs(cmap_out_dir, exist_ok=True)
    cmap_all_ranked_drugs_file = os.path.join(cmap_out_dir, '{}.cmap.{}.all.csv'.format(case_id, expression_control))
    cmap_reasonable_ranked_drugs_file = os.path.join(cmap_out_dir, '{}.cmap.{}.reasonable.annotated.csv'.format(case_id, expression_control))

    # Powerpoint for MTB
    mtb_ppt_file = os.path.join(out_dir, '{}.mtb_slides.pptx'.format(case_id))

    # DNANexus
    dx_source_path = os.path.join(utilities_dir, 'dx-toolkit/environment')
    dnanexus_project = 'UW_UCSD_RNAseq_collaboration_share'
    # Replace the contents of this file with your own DNANexus token.
    dnanexus_token_file = os.path.join(base_dir, 'dnanexus_token.txt')
    # To use the dx command, we must update some environment variables.
    # From the command line, this is done with source dx-toolkit/environment,
    # but from Python we have to use a workaround, because normally any changes
    # to environment variables done in a subprocess are not reflected in the
    # parent process. The workaround runs the source command in a subprocess,
    # fetches the environment variables from the subprocess and updates those
    # of the parent process.
    source_and_update_env_vars(dx_source_path)

    # NOTE(review): the directory exposed as setup.out_dir is keyed by case_id,
    # whereas every file path above lives under patients/<patient_dir>. The two
    # only differ when dna_nexus_bool is True and patient_dir != case_id --
    # confirm this is intended.
    case_out_dir = os.path.join(patients_dir, case_id)

    out = {
        'case_id': case_id,
        'patient_dir': patient_dir,
        'is_medullo': is_medullo,
        'dna_nexus_bool': dna_nexus_bool,
        'expression_control': expression_control,
        'custom_control_expression': custom_control_expression,
        'dnanexus_token_file': dnanexus_token_file,
        'local_fastqs_dir': local_fastqs_dir,
        'dnanexus_project': dnanexus_project,
        'transcriptome_index_path': transcriptome_index_path,
        'kallisto_path': kallisto_path,
        'kallisto_dir': kallisto_dir,
        'out_dir': case_out_dir,
        # Same directory as the file paths above, with backslashes doubled.
        'r_out_dir': out_dir.replace('\\', r'\\'),
        'patient_gexp_file': patient_gexp_file,
        'in_dir': in_dir,
        'cavalli_subgroup_file': cavalli_subgroup_file,
        'cavalli_subtype_file': cavalli_subtype_file,
        'cavalli_subgroup_direct_file': cavalli_subgroup_direct_file,
        'cho_subgroup_file': cho_subgroup_file,
        'cho_subtype_file': cho_subtype_file,
        'northcott_subgroup_file': northcott_subgroup_file,
        'mtb_ppt_file': mtb_ppt_file,
        'full_discover_results_file': full_discover_results_file,
        'discover_out_dir': discover_out_dir,
        'discover_heatmap_file': discover_heatmap_file,
        'rdrugs_discover_file': rdrugs_discover_file,
        'cmap_out_dir': cmap_out_dir,
        'cmap_all_ranked_drugs_file': cmap_all_ranked_drugs_file,
        'cmap_reasonable_ranked_drugs_file': cmap_reasonable_ranked_drugs_file,
    }

    os.makedirs(case_out_dir, exist_ok=True)
    print('Setup done!')
    # Back up the settings; the context manager closes the file handle.
    with open(os.path.join(case_out_dir, case_id+'_backup1_input.p'), 'wb') as backup_file:
        pickle.dump(out, file=backup_file)
    return Bunch(out)

UIBuilder(function_import='read_user_input', name='read_user_input', params=[{'name': 'case_id', 'label': 'cas…

<div class="well">
Running all cells below this point will execute all the analyses except for one: the Connectivity Map analysis at the end of the notebook, which requires two manual steps.
</div>

# Download RNAseq data

In [4]:
def _download_fastqs_from_dnanexus(setup):
    # Log in to DNANexus and download every *.fastq.gz found under
    # setup.patient_dir into a matching sub-directory of setup.local_fastqs_dir.
    with open(setup.dnanexus_token_file, 'r') as f:
        dnanexus_token = f.readline().strip()
    # NOTE(review): the dx commands below are built by string interpolation and
    # run through the shell; the token, project and patient_dir are assumed to
    # be trusted values.
    login_command = 'dx login --token {} --noprojects; dx select {}'.format(dnanexus_token, setup.dnanexus_project)
    subprocess.check_output(login_command, shell=True)

    find_fastq_command = 'dx find data --name "*.fastq.gz" --path {}:{}'.format(setup.dnanexus_project, setup.patient_dir)
    find_fastq_return_lines = subprocess.check_output(find_fastq_command, shell=True).decode().strip().split('\n')
    # Raw string plus re.escape so the dots and the directory name are matched
    # literally.
    fastq_path_re = re.compile(r'.*(/{}/.*\.fastq\.gz) .*'.format(re.escape(setup.patient_dir)))

    downloads = []
    for line in find_fastq_return_lines:
        search = fastq_path_re.search(line)
        if search is None:
            # Blank or unrelated output line (e.g. nothing was found).
            continue
        remote_fastq_path = search.group(1)
        fastq_subdir = os.path.basename(os.path.dirname(remote_fastq_path))
        local_fastq_subdir = os.path.join(setup.local_fastqs_dir, fastq_subdir)
        os.makedirs(local_fastq_subdir, exist_ok=True)
        downloads.append((remote_fastq_path, local_fastq_subdir))

    for remote_fastq_path, local_fastq_subdir in downloads:
        download_command = 'dx download "{}" -o "{}"'.format(remote_fastq_path, local_fastq_subdir)
        print('\t'+download_command)
        try:
            subprocess.check_output(download_command, shell=True)
        except subprocess.CalledProcessError:
            # Deliberately best-effort: a failed download is reported and skipped.
            print('\tEncountered a dx error, this likely means you already have the file indicated above.')
            print('\tContinuing...\n')


@genepattern.build_ui(parameters={
    "setup": {"default": "setup",
              "hide": False,
              "description": "The variable which has the setup information"},
    "input_expression_dir":{"hide":True},
    "output_var": {
        "default": "setup",
        "hide": True,
    },
})
def download_and_preprocess_rnaseq(setup, input_expression_dir = None):
    """Obtain the gene expression of the case and save it to ``setup.patient_gexp_file``.

    When ``setup.dna_nexus_bool`` is True the FASTQ files are downloaded from
    DNANexus and quantified with kallisto and sleuth. Otherwise the expression
    is read from ``/pdx-hts/Notebooks/data/preprocessed/exp/<case_id>.csv``.
    In both cases the expression is stored on ``setup.patient_exp`` and
    ``setup`` is pickled to ``<case_id>_backup2_download.p``.

    Args:
        setup: The settings object returned by ``read_user_input``.
        input_expression_dir: Currently ignored; the expression path is always
            derived from ``setup.case_id``.

    Returns:
        The updated ``setup``.

    Raises:
        FileNotFoundError: If the local expression file does not exist (local
            mode only).
        subprocess.CalledProcessError: If the DNANexus login or file search
            command fails (DNANexus mode only).
    """
    setup.input_expression_dir = f'/pdx-hts/Notebooks/data/preprocessed/exp/{setup.case_id}.csv'
    if setup.dna_nexus_bool:
        if input_expression_dir is not None:
            log(f"input_expression_dir has a value ({input_expression_dir}), and it will be ignored!")
        log('About to download fastqfiles from DNA Nexus. This may take a while.')
        _download_fastqs_from_dnanexus(setup)
        log('Done downloading the fastq files.')
        log('Preprocessing RNASeq data now:')
        log('Using kallisto to compute transcript abundance.')
        preprocess_rna_seq(setup)
        log('Done with tanscript abundance.')
        log('Using sleuth to aggregate transcript abundance into gene abbundance.')
        run_sleuth(setup)
        # The file is transposed on read and re-indexed by the case id before
        # being written back.
        patient_exp = pd.read_csv(setup.patient_gexp_file, index_col=0).T
        patient_exp.index = [setup.case_id]
        setup.patient_exp = patient_exp
        patient_exp.to_csv(setup.patient_gexp_file)
        log('Habemus Genus Expressium *release the white smoke*')
    else:
        log(f'Checking if local file ({setup.input_expression_dir}) exist.')
        if not os.path.isfile(setup.input_expression_dir):
            log('File could not be located please check and run again.')
            # Stop here instead of failing later on an undefined dataframe.
            raise FileNotFoundError(setup.input_expression_dir)
        patient_exp = pd.read_csv(setup.input_expression_dir, index_col=0)
        setup.expression_input = patient_exp
        log("This file containes the expression of the PDXs. Printing dataframe's info:")
        # DataFrame.info() prints by itself and returns None, so it is not
        # passed through log().
        setup.expression_input.info()
        setup.patient_exp = patient_exp
        patient_exp.to_csv(setup.patient_gexp_file)
        log(f'File {setup.patient_gexp_file} saved successfully')

    with open(os.path.join(setup.out_dir, setup.case_id+'_backup2_download.p'), 'wb') as backup_file:
        pickle.dump(setup, file=backup_file)
    log('Done preprocessing!')
    return setup

UIBuilder(function_import='download_and_preprocess_rnaseq', name='download_and_preprocess_rnaseq', params=[{'n…

# Classify the tumor by medulloblastoma subgroup and subtype

In [5]:
@genepattern.build_ui(
    description="This function classifies a medulloblastoma sample into a subgroup. Non-medulloblastoma samples are ignored.",
    parameters={
    "setup": {"default": "setup",
              "hide": False,
              "description": "The variable which has the setup information"},
    "output_var": {
        "default": "setup",
        "hide": True,
    },
})
def classify_sample(setup):
    """Classify a medulloblastoma sample against three reference cohorts.

    Three datasets are used to classify based on expression:

    - Cavalli et al. 2017 (http://www.sciencedirect.com/science/article/pii/S1535610817302015):
      763 tumors, used to define 12 finer-grained subtypes nested in the 4
      subgroups.
    - Cho et al. 2011 (http://www.mesirovlab.org/medulloblastoma/cho/):
      194 tumors, 6 subtypes (two within G3 and two within G4).
    - Northcott et al. 2017 (http://www.nature.com/nature/journal/v547/n7663/full/nature22973.html):
      223 tumors, labelled with the 4 basic subgroups only.

    When finer-grained subtypes are known, both subtype and subgroup results
    are reported. The results are written to the CSV files named on ``setup``,
    added to the MTB slide deck, stored on ``setup`` and printed. When
    ``setup.is_medullo`` is False nothing is classified. In both cases
    ``setup`` is pickled to ``<case_id>_backup3_classify.p``.

    Args:
        setup: The settings object; ``setup.patient_gexp_file`` must already exist.

    Returns:
        The updated ``setup``.
    """
    if setup.is_medullo:
        # Read in patient's gene-level RNASeq TPM data
        patient_exp = pd.read_csv(setup.patient_gexp_file, index_col=0)

        cavalli_subgroups, cavalli_subtypes = classify_cavalli(patient_exp)
        cavalli_subgroups.to_csv(setup.cavalli_subgroup_file)
        cavalli_subtypes.to_csv(setup.cavalli_subtype_file)
        setup.cavalli_subgroups = cavalli_subgroups
        setup.cavalli_subgroups.name='Cavalli'
        setup.cavalli_subtypes = cavalli_subtypes
        setup.cavalli_subtypes.name='Cavalli'

        cho_subgroups, cho_subtypes = classify_cho(patient_exp)
        cho_subtypes.to_csv(setup.cho_subtype_file)
        cho_subgroups.to_csv(setup.cho_subgroup_file)
        setup.cho_subtypes = cho_subtypes
        setup.cho_subtypes.name = 'Cho'
        setup.cho_subgroups = cho_subgroups
        setup.cho_subgroups.name = 'Cho'

        northcott_subgroups = classify_northcott(patient_exp)
        northcott_subgroups.to_csv(setup.northcott_subgroup_file)
        setup.northcott_subgroups = northcott_subgroups
        setup.northcott_subgroups.name = 'Northcott'

        make_medullo_classification_slide(setup.mtb_ppt_file,
                                          setup.cavalli_subgroup_file,
                                          setup.cavalli_subtype_file,
                                          setup.cho_subgroup_file,
                                          setup.cho_subtype_file,
                                          setup.northcott_subgroup_file)

        # One column per cohort ('Cavalli', 'Cho', 'Northcott').
        subgroups = pd.DataFrame(data=setup.cavalli_subgroups).join(setup.cho_subgroups).join(setup.northcott_subgroups)
        setup.subgroups = subgroups
        print(setup.subgroups)

        cavalli_subtypes = pd.DataFrame(data=setup.cavalli_subtypes)
        setup.cavalli_subtypes = cavalli_subtypes
        print(cavalli_subtypes)

        # Attach a readable name to each numeric Cho subtype before joining
        # the classification result.
        cho_subtype_names = pd.Series(index=[3,1,6,5,4,2],
                                      data=['SHH','G3-MYC','WNT','G3-photoreceptor','G4-Mixed','G4-neuronal'],
                                      name='Subtypes')
        cho_subtypes = pd.DataFrame(cho_subtype_names).join(setup.cho_subtypes)
        setup.cho_subtypes = cho_subtypes
        print(cho_subtypes)

        log('Done! Move along')
    else:
        log('This is not medulloblastoma. Nothing to do here. Move along')

    # The context manager closes the backup file once the dump is complete.
    with open(os.path.join(setup.out_dir, setup.case_id+'_backup3_classify.p'), 'wb') as backup_file:
        pickle.dump(setup, file=backup_file)

    return setup

UIBuilder(description='This function classifies a medulloblastoma sample into a subgroup. Non-medulloblastoma …

# Suggest drugs based on RNAseq data (DiSCoVER)

In [177]:
setup.discover_out_dir

'/pdx-hts/Notebooks/patients/DMB006/drug_suggestions/discover/cerebellar_stem'

In [6]:
@genepattern.build_ui(
  description="Run DiSCoVER on the provided sample and control.",
  parameters={
    "setup": {"default": "setup",
              "hide": False,
              "description": "The variable which has the setup information"},
    "output_var": {
        "default": "setup",
        "hide": True,
    },
})
def run_discover(setup):
    """Run DiSCoVER for the case, save the results and add a slide to the deck.

    Steps: run DiSCoVER on the patient and control expression, move the
    cell-line enrichment CSVs from the working directory into
    ``setup.discover_out_dir``, rank cell lines, write the full results, write
    the annotated drug tables, build the DiSCoVER slide from the top 20 drugs,
    store the intermediate objects on ``setup`` and pickle it to
    ``<case_id>_backup4_DISCoVER.p``.

    Args:
        setup: The settings object; ``setup.patient_gexp_file`` must already exist.

    Returns:
        The updated ``setup``.
    """
    from rpy2.robjects import numpy2ri
    numpy2ri.activate()
    try:
        from discover_temp import discover_from_expression
        from drug_suggestion.expression.controls import load_control_exp
        patient_exp = pd.read_csv(setup.patient_gexp_file, index_col=0)
        control_exp = load_control_exp(setup.expression_control)
        log("About to perform DiSCoVER.")
        discover_results = discover_from_expression(exp=patient_exp,
                                                    control_exp=control_exp,
                                                    verbose=False, extra_outputs=True)
    finally:
        # Always undo the global numpy<->R conversion, even if DiSCoVER fails.
        numpy2ri.deactivate()
    setup.raw_discover_results = discover_results
    log("DiSCoVER done!")

    log("Moving files contaiend enrichment of cell lines")
    # These files are expected in the current working directory at this point.
    for current_file in ['cell_lines_IDs_and_types_ccle.csv','cell_lines_IDs_and_types_COSMIC_IDS_gdsc.csv','cell_lines_IDs_and_types_ctrp.csv']:
        destination = os.path.join(setup.discover_out_dir, current_file)
        os.rename(current_file, destination)
        log(f'Moved {current_file} to {destination}')

    log("Ranking cell lines by enrichment and saving those")
    ranked_diseases_from_enrichment = rank_cell_lines(setup)

    log('Saving results to file.')
    discover_results.T.sort_values(by=setup.case_id, ascending=False).to_csv(setup.full_discover_results_file)
    log("Saving done!")
    log("NOT Restricting to clinically relevant drugs.")
    # Not all drugs in CCLE, CTRP, and GDSC are realistic candidates for
    # treatment. We compiled a list of medications that are FDA-approved or in
    # late-stage clinical trials, and Dr. Wechsler-Reya curated it to include
    # only those that are relevant for treating brain tumors. To enable
    # comparison of drug lists, drugs from the different sources have been
    # mapped to PubChem compound IDs (CIDs) using PubChemPy
    # (http://pubchempy.readthedocs.io/en/latest/).
    from drug_suggestion.drug_annotation import subset_to_reasonable_drugs
    from drug_suggestion.expression.discover import load_discover_drug_to_cids
    disco2cid = load_discover_drug_to_cids()
    reasonable_results = subset_to_reasonable_drugs(discover_results,
                                                    disco2cid,
                                                    out_prefix='discover.{}'.format(setup.expression_control),
                                                    out_dir=setup.discover_out_dir)
    # This will override the file setup.rdrugs_discover_file, so the table read
    # back below holds all drugs rather than only the curated ones. The return
    # value is not needed.
    format_drugs(discover_results,
                 disco2cid,
                 out_prefix='discover.{}'.format(setup.expression_control),
                 out_dir=setup.discover_out_dir)
    log('Done restricting to clinically relevant drugs!')

    log('Making the DiSCoVER powerpoint.')
    rdrugs_discover = pd.read_csv(setup.rdrugs_discover_file, index_col=None)

    # Using all of the drugs
    log('Using all of the drugs')
    df = split_discover_dataframe(df=rdrugs_discover)
    df = rank_drugs_discover(df)
    make_exp_drug_ranking_results_slide(setup.mtb_ppt_file, df.head(20), setup.expression_control, method='DiSCoVER')
    log('Done making the DiSCoVER powerpoint slide!')

    log('Savig the variables to a file.')
    setup.ranked_diseases_from_enrichment = ranked_diseases_from_enrichment
    setup.discover_results = discover_results
    setup.disco2cid = disco2cid
    setup.control_exp = control_exp
    setup.reasonable_results = reasonable_results
    setup.df = df
    log('Saving the formatted results of DiSCoVER to a csv')
    df.to_csv(os.path.join(setup.out_dir, setup.case_id+'_formated_DISCoVER_results.csv'))
    # The context manager closes the backup file once the dump is complete.
    with open(os.path.join(setup.out_dir, setup.case_id+'_backup4_DISCoVER.p'), 'wb') as backup_file:
        pickle.dump(setup, file=backup_file)
    log('Done savig the variables to a file!')

    log('Done with all the taks in this cell. Move along.')
    return setup

UIBuilder(description='Run DiSCoVER on the provided sample and control.', function_import='run_discover', name…

# Run CMap

In [None]:
# Scratch cell marked for deletion by the author: it runs the whole workflow,
# including the CMap steps, for one hard-coded case.
# NOTE: make_cmap_slide and merge_discover_and_cmap are defined in cells
# further down, so on a fresh kernel those cells must be run before this one.
case_id='BT084'
print('======================================')
print("Step 1: read_user_input")
print('======================================')
# custom_control_expression='N/A' is a non-empty placeholder; together with
# control='original' it makes read_user_input print its "Reminder" message.
setup = read_user_input(case_id, patient_dir=case_id, dna_nexus_bool=False, is_medullo=True, control='original',custom_control_expression='N/A')
print('======================================')
print("Step 2: download_and_preprocess_rnaseq")
print('======================================')
setup = download_and_preprocess_rnaseq(setup)
print('======================================')
print("Step 3: classify_sample")
print('======================================')
# Classification only runs when the notebook-level CLASSIFY switch is on.
if CLASSIFY:
    setup = classify_sample(setup)
else:
     print("These are not the analyses you are looking for. Move along!")   
print('======================================')
print("Step 4: run_discover")
print('======================================')
# DiSCoVER is always run here; the RUN_DISCOVER switch is not consulted.
setup = run_discover(setup)
print('======================================')
print("Step 5: run_cmap")
print('======================================')
# Parse the CMap result, then merge it with the DiSCoVER ranking.
setup = make_cmap_slide(setup)
setup = merge_discover_and_cmap(setup)
print(f'All analyses completed suscessfully with "patient" {case_id}')


In [7]:
from cmap import make_cmap_genesets, write_cmap_genesets
from cmap import read_cmap_gct, load_cmap_drug_to_cids
from controls import load_control_exp

@genepattern.build_ui(
  description="This function parses CMap's results.",
  parameters={
    "setup": {"default": "setup",
              "hide": False,
              "description": "The variable which has the setup information"},
    "output_var": {
        "default": "setup",
        "hide": True,
    },
})
def make_cmap_slide(setup):
    """Write the CMap gene sets and, if a CMap result is present, parse it.

    The gene sets are always written to ``setup.cmap_out_dir``. If
    ``cmap_result.gct`` exists in that directory, the ranked drugs are saved, a
    CMap slide is added to the deck, ``setup.rdrugs_cmap`` is set and ``setup``
    is pickled to ``<case_id>_backup5_CMap.p``. ``setup.cmap_flag`` records
    whether the result file was found.

    Args:
        setup: The settings object; ``setup.patient_gexp_file`` must already exist.

    Returns:
        The updated ``setup``.
    """
    log('About to parse CMap results.')
    patient_exp = pd.read_csv(setup.patient_gexp_file, index_col=0)
    control_exp = load_control_exp(setup.expression_control)
    cmap_genesets = make_cmap_genesets(patient_exp, control_exp)
    write_cmap_genesets(cmap_genesets, setup.cmap_out_dir)

    # must match path to downloaded .gct file
    cmap_gct = os.path.join(setup.cmap_out_dir, 'cmap_result.gct')

    if not os.path.exists(cmap_gct):
        setup.cmap_flag = False
        log(f"cmap_result.gct not found! (It should be present in the directiory {setup.cmap_out_dir}).")
        log("Try again if you'd like to see CMap results.")
        log("Hint, you may want to go here:")
        log("https://clue.io/l1000-query#individual")
        return setup

    cmap_ranked_drugs = read_cmap_gct(cmap_gct)
    cmap_ranked_drugs.columns = [setup.case_id]
    cmap_ranked_drugs.to_csv(setup.cmap_all_ranked_drugs_file)
    cmap2cid = load_cmap_drug_to_cids()
    # The returned frame is not used. Presumably this call writes
    # setup.cmap_reasonable_ranked_drugs_file, which is read back on the next
    # line -- TODO confirm.
    subset_to_reasonable_drugs(cmap_ranked_drugs.T,
                               cmap2cid,
                               out_prefix='cmap.{}'.format(setup.expression_control),
                               out_dir=setup.cmap_out_dir)
    rdrugs_cmap = pd.read_csv(setup.cmap_reasonable_ranked_drugs_file, index_col=None)
    make_exp_drug_ranking_results_slide(setup.mtb_ppt_file, rdrugs_cmap, setup.expression_control, method='CMap')
    setup.rdrugs_cmap = rdrugs_cmap
    setup.cmap_flag = True
    # The context manager closes the backup file once the dump is complete.
    with open(os.path.join(setup.out_dir, setup.case_id+'_backup5_CMap.p'), 'wb') as backup_file:
        pickle.dump(setup, file=backup_file)
    log("done!")
    return setup

UIBuilder(description="This function parses CMap's results.", function_import='make_cmap_slide', name='make_cm…

In [None]:
setup = make_cmap_slide(setup)

In [8]:
def add_cmap_to_split_df(discover,cmap):
    """Add the CMap scores to the DiSCoVER drug table.

    Every row of ``cmap`` is matched to ``discover`` by lower-cased drug name
    (the index of ``discover``). Matching drugs receive a ``CMAP`` score
    (averaged with the stored one when the drug shows up more than once in
    ``cmap``). Drugs that are new to the table are appended with their CMap
    ``moa`` and a ``'cmap_'``-prefixed drug name. One letter, looked up in the
    module-level ``sign_to_letter`` by the sign of the score, is appended to
    the ``evidence`` string; rows without a CMap score get a ``'.'`` instead.

    Args:
        discover: DataFrame indexed by drug name with at least the columns
            ``drug``, ``moa``, ``score``, ``GDSC``, ``CTRP``, ``CCLE`` and
            ``evidence``. It is not modified.
        cmap: DataFrame with the columns ``drug``, ``score`` and ``moa``.

    Returns:
        A new DataFrame with the columns ``drug``, ``MoA``, ``GDSC``, ``CTRP``,
        ``CCLE``, ``DiSCoVER``, ``CMAP`` and ``evidence``.
    """
    df = discover.rename(index=str, columns={"score": "DiSCoVER", "moa":"MoA"},inplace=False)
    # Create the CMAP column up front. Otherwise the first lookup fails on the
    # missing column and an existing drug is mistaken for a new one, which
    # overwrites its MoA and drug name.
    if 'CMAP' not in df.columns:
        df['CMAP'] = np.nan

    for _, row in cmap.iterrows():
        drug = row['drug'].lower()
        if drug in df.index:
            previous_score = df.loc[drug,'CMAP']
            if pd.isna(previous_score):
                # drug already exists but a CMAP score has not been added
                df.loc[drug,'CMAP'] = row['score']
            else:
                # drug already exists and a CMAP score has been added
                df.loc[drug,'CMAP'] = np.nanmean([row['score'], previous_score])
        else:
            # drug didn't exist therefore a CMAP score had not been added
            df.loc[drug,'CMAP'] = row['score']
            df.loc[drug,'MoA'] = row['moa']
            df.loc[drug,'drug'] = 'cmap_'+drug

        letter = sign_to_letter[str(np.sign(row['score']))]
        current_evidence = df.loc[drug,'evidence']
        if pd.isna(current_evidence):
            # Row created from CMap only: pad the three leading positions with dots.
            df.loc[drug,'evidence'] = '...'+letter
        else:
            df.loc[drug,'evidence'] = str(current_evidence)+letter

    # Rows that were not in cmap still have a three-character evidence string;
    # pad them so every row has a position for CMap.
    # NOTE(review): a drug listed more than once in cmap gets one extra letter
    # per occurrence -- confirm that is acceptable.
    for index, row in df.iterrows():
        if len(row['evidence'])==3:
            df.loc[index,'evidence']  = df.loc[index,'evidence']+'.'

    return df[['drug','MoA','GDSC','CTRP','CCLE','DiSCoVER','CMAP','evidence']]


def custom_mean(ser):
    """Combine the two ranks of one drug without penalising a missing method.

    ``ser`` is one row with the entries 'DiSCoVER', 'CMAP', 'DiSCoVER rank'
    and 'CMAP rank'. If the DiSCoVER score is missing the CMAP rank is
    returned; otherwise, if the CMAP score is missing, the DiSCoVER rank is
    returned; otherwise the mean of the two ranks.
    """
    if pd.isna(ser['DiSCoVER']):
        return ser['CMAP rank']
    if pd.isna(ser['CMAP']):
        return ser['DiSCoVER rank']
    both_ranks = ser[['DiSCoVER rank','CMAP rank']]
    return both_ranks.mean()


def rank_combined_df(df, strict_rules = False):
    """Rank the drugs of the combined DiSCoVER + CMap table.

    ``df`` is modified in place: it gains the columns 'DiSCoVER rank',
    'CMAP rank' and 'Average rank' and ends up sorted by 'Average rank'
    (best first). Each rank is the 1-based position after sorting by that
    score in descending order. The average is computed by ``custom_mean`` so
    that a drug found by only one method is not penalised, as a plain mean of
    both ranks would do.

    With ``strict_rules`` only rows whose DiSCoVER and CMAP scores are both
    above 0.001 are kept in the returned frame.

    Returns a copy of the (optionally filtered) table without the 'GDSC',
    'CTRP' and 'CCLE' columns.
    """
    n_drugs = len(df)
    for score_column in ('DiSCoVER', 'CMAP'):
        df.sort_values(by=[score_column],ascending=False,axis=0,inplace=True)
        df[score_column + ' rank'] = range(1, n_drugs + 1)

    rank_inputs = df[['DiSCoVER','CMAP','DiSCoVER rank','CMAP rank']]
    df['Average rank'] = rank_inputs.apply(custom_mean, axis=1)
    df.sort_values(by=['Average rank'],ascending=True,axis=0,inplace=True)

    ranked = df
    if strict_rules:
        discover_ok = (df['DiSCoVER']>0.001).values
        cmap_ok = (df['CMAP']>0.001).values
        ranked = df[discover_ok & cmap_ok]

    return ranked.drop(['GDSC','CTRP','CCLE'],axis=1,inplace=False)

@genepattern.build_ui(
  description="This function merges the results of DiSCoVER and CMap.",
  parameters={
    "setup": {"default": "setup",
              "hide": False,
              "description": "The variable which has the setup information"},
    "output_var": {
        "default": "setup",
        "hide": True,
    },
})
def merge_discover_and_cmap(setup):
    """Merge the DiSCoVER and CMap drug tables and save the combined result.

    Does nothing unless ``setup.cmap_flag`` is set and true. Otherwise the
    combined table and its ranked version are stored on ``setup`` as
    ``combined_df`` and ``to_slide``, an intersection slide is added to the
    deck, the combined table is written to
    ``<case_id>_combined_DISCoVER_and_CMap_results.csv`` and ``setup`` is
    pickled to ``<case_id>_backup6_combined_results.p``.

    Args:
        setup: The settings object carrying ``df`` (DiSCoVER) and
            ``rdrugs_cmap`` (CMap).

    Returns:
        The updated ``setup``.
    """
    # cmap_flag is only set by make_cmap_slide; treat a missing flag as
    # "CMap has not run" instead of failing with AttributeError.
    if getattr(setup, 'cmap_flag', False):
        log('Merging results from DiSCoVER and CMap.')
        combined_df = add_cmap_to_split_df(discover=setup.df,cmap=setup.rdrugs_cmap)
        # rank_combined_df sorts combined_df and adds the rank columns to it
        # in place, so the CSV saved below includes them.
        to_slide = rank_combined_df(combined_df)

        # NOTE(review): an earlier comment said this slide was disabled because
        # it is not used for the paper, but the call is live -- confirm whether
        # it should stay.
        make_intersection_slide(setup.mtb_ppt_file, to_slide, setup.expression_control, method='DiSCoVER ∩ CMap')

        log("Done Merging results from DiSCoVER and CMap!")
        log("Saving combined_df and to_slide on setup variable")
        setup.combined_df = combined_df
        setup.to_slide = to_slide
        log(f"Saving the combined DiSCoVER+CMap resuls to a csv called {setup.case_id+'_combined_DISCoVER_and_CMap_results.csv'}")
        setup.combined_df.to_csv(os.path.join(setup.out_dir, setup.case_id+'_combined_DISCoVER_and_CMap_results.csv'))
        # The context manager closes the backup file once the dump is complete.
        with open(os.path.join(setup.out_dir, setup.case_id+'_backup6_combined_results.p'), 'wb') as backup_file:
            pickle.dump(setup, file=backup_file)
        log("Done done!")

    else:
        log('CMap has not run! Nothing to do here.')

    return setup

UIBuilder(description='This function merges the results of DiSCoVER and CMap.', function_import='merge_discove…

In [None]:
setup.to_slide

In [None]:
# Lower-cased names of the drugs to look up in the index of the combined table.
d_names = list(map(str.lower, (
    'monafide',
    'bx-912',
    'linsitinib',
    'ABT-737',
    'anisomycin',
    'GSK1070916',
    'topotecan',
    'etoposide',
)))

In [None]:
setup.combined_df[setup.combined_df.index.isin(d_names)]