## RNASeq workflow for PDX-HTS paper


This notebook contains all the analyses...

In [7]:
# Perform the common imports. A few other imports are made automatically by the UIBuilder cells below. Don't delete them!
from common_imports import *

In [8]:
# Set the seed of NumPy's global random number generator.
# The seed is used by scikit-learn's Random Forest Classifier. That classifier is not a
# crucial part of the pipeline (more like an optional display), but reproducibility is imperative.
np.random.seed(0)

In [9]:
def _announce_step(title):
    """Print ``title`` framed above and below by a rule of '=' characters."""
    rule = '======================================'
    print(rule)
    print(title)
    print(rule)


def run_pipeline(case_id):
    """Run the four workflow steps, in order, for a single case.

    The steps are ``read_user_input``, ``download_and_preprocess_rnaseq``,
    ``classify_sample`` and ``run_discover``. These helpers are not defined in
    this notebook (presumably they come from ``common_imports`` -- confirm).
    The object returned by ``read_user_input`` is threaded through the
    remaining steps: each one receives the previous step's return value.

    Parameters
    ----------
    case_id
        Identifier of the case to process. It is passed to ``read_user_input``
        both as the first positional argument and as ``patient_dir``.

    Returns
    -------
    None
        Progress is reported with ``print``; nothing is returned.
    """
    _announce_step("Step 1: read_user_input")
    # Every case is run with the same fixed options; only the case id varies.
    setup = read_user_input(case_id, patient_dir=case_id, dna_nexus_bool=False, is_medullo=True, control='original', custom_control_expression='N/A')

    _announce_step("Step 2: download_and_preprocess_rnaseq")
    setup = download_and_preprocess_rnaseq(setup)

    _announce_step("Step 3: classify_sample")
    setup = classify_sample(setup)

    _announce_step("Step 4: run_discover")
    # Last step: its return value is not needed, so it is not rebound.
    run_discover(setup)

    print(f'All analyses completed successfully with "patient" {case_id}')

In [10]:
# Run the full four-step pipeline for case "BT084".
run_pipeline('BT084')

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/BT084.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, BT084 to BT084
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/BT084/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
SHH   0.5975  0.6827     0.6285
G3    0.1642  0.1750     0.1852
WNT   0.1258  0.0618     0.0676
G4    0.1125  0.0805     0.1187
           Cavalli
SHH_alpha   0.3594
SHH_delta   0.1047
G3_gamma    0.0791
SHH_gamma   0.0723
WNT_alpha   0.0638
G4_alpha    0.0638
WNT_beta    0.0620
SHH_beta    0.0611
G3_beta     0.0455
G3_alpha    0.0396
G4_gamma    0.0260
G4_beta     0.0227
           Subtypes     Cho
3               SHH  0.6827
1            G3-MYC  0.1266
6               WNT  0.0618
5  G3-photoreceptor  0.0484
4          G4-Mixed  0.0468
2       G4-neuronal  0.0337
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" BT084


In [11]:
# Run the full four-step pipeline for case "DMB006".
run_pipeline("DMB006")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/DMB006.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, DMB006 to DMB006
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/DMB006/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G4    0.4795  0.5229     0.5443
G3    0.2099  0.2740     0.3128
SHH   0.2029  0.1448     0.0860
WNT   0.1077  0.0583     0.0569
           Cavalli
G4_beta     0.2024
G4_alpha    0.1977
SHH_alpha   0.0903
G3_gamma    0.0808
G3_beta     0.0798
G4_gamma    0.0794
WNT_beta    0.0568
WNT_alpha   0.0509
G3_alpha    0.0493
SHH_gamma   0.0463
SHH_delta   0.0444
SHH_beta    0.0219
           Subtypes     Cho
3               SHH  0.1448
1            G3-MYC  0.1650
6               WNT  0.0583
5  G3-photoreceptor  0.1090
4          G4-Mixed  0.1894
2       G4-neuronal  0.3335
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" DMB006


In [12]:
# Run the full four-step pipeline for case "ICB984".
run_pipeline("ICB984")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/ICB984.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, ICB984 to ICB984
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/ICB984/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
SHH   0.5263  0.5827     0.5420
G4    0.1773  0.1195     0.1530
G3    0.1597  0.2061     0.1984
WNT   0.1367  0.0917     0.1066
           Cavalli
SHH_alpha   0.2917
G4_alpha    0.1004
SHH_delta   0.0970
SHH_gamma   0.0803
G3_gamma    0.0787
WNT_alpha   0.0709
WNT_beta    0.0658
SHH_beta    0.0573
G3_beta     0.0461
G4_beta     0.0397
G4_gamma    0.0372
G3_alpha    0.0349
           Subtypes     Cho
3               SHH  0.5827
1            G3-MYC  0.1383
6               WNT  0.0917
5  G3-photoreceptor  0.0678
4          G4-Mixed  0.0675
2       G4-neuronal  0.0520
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" ICB984


In [13]:
# Run the full four-step pipeline for case "ICB1299".
run_pipeline("ICB1299")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/ICB1299.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, ICB1299 to ICB1299
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/ICB1299/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G3    0.3766  0.4581     0.4818
SHH   0.2626  0.2630     0.1735
G4    0.2298  0.1744     0.2598
WNT   0.1310  0.1045     0.0849
           Cavalli
G3_gamma    0.2191
SHH_alpha   0.1349
G4_alpha    0.1209
G3_beta     0.0929
WNT_beta    0.0658
WNT_alpha   0.0652
G3_alpha    0.0646
G4_gamma    0.0564
G4_beta     0.0525
SHH_gamma   0.0511
SHH_delta   0.0511
SHH_beta    0.0255
           Subtypes     Cho
3               SHH  0.2630
1            G3-MYC  0.3323
6               WNT  0.1045
5  G3-photoreceptor  0.1258
4          G4-Mixed  0.0985
2       G4-neuronal  0.0759
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" ICB1299


In [14]:
# Run the full four-step pipeline for case "ICB1487".
run_pipeline("ICB1487")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/ICB1487.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, ICB1487 to ICB1487
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/ICB1487/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G4    0.4831  0.5280     0.5384
G3    0.2534  0.2956     0.3065
SHH   0.1639  0.1121     0.1058
WNT   0.0996  0.0643     0.0493
           Cavalli
G4_alpha    0.2141
G4_gamma    0.1751
G3_beta     0.0973
G4_beta     0.0939
G3_gamma    0.0805
G3_alpha    0.0756
SHH_alpha   0.0696
WNT_alpha   0.0532
WNT_beta    0.0464
SHH_gamma   0.0403
SHH_delta   0.0294
SHH_beta    0.0246
           Subtypes     Cho
3               SHH  0.1121
1            G3-MYC  0.1383
6               WNT  0.0643
5  G3-photoreceptor  0.1573
4          G4-Mixed  0.2591
2       G4-neuronal  0.2689
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" ICB1487


In [15]:
# Run the full four-step pipeline for case "ICB1572".
run_pipeline("ICB1572")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/ICB1572.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, ICB1572 to ICB1572
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/ICB1572/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G3    0.4070  0.5376     0.5146
G4    0.2295  0.1556     0.2332
SHH   0.2280  0.1990     0.1528
WNT   0.1355  0.1078     0.0994
           Cavalli
G3_gamma    0.2091
G4_alpha    0.1205
SHH_alpha   0.1150
G3_alpha    0.1033
G3_beta     0.0946
WNT_alpha   0.0682
WNT_beta    0.0673
G4_gamma    0.0568
G4_beta     0.0522
SHH_gamma   0.0483
SHH_delta   0.0411
SHH_beta    0.0236
           Subtypes     Cho
3               SHH  0.1990
1            G3-MYC  0.3481
6               WNT  0.1078
5  G3-photoreceptor  0.1895
4          G4-Mixed  0.0885
2       G4-neuronal  0.0671
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" ICB1572


In [16]:
# Run the full four-step pipeline for case "MB002".
run_pipeline("MB002")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MB002.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MB002 to MB002
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MB002/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G3    0.3920  0.4765     0.4839
SHH   0.2562  0.2780     0.1794
G4    0.2004  0.1332     0.2325
WNT   0.1514  0.1123     0.1042
           Cavalli
G3_gamma    0.2194
SHH_alpha   0.1354
G4_alpha    0.1023
G3_alpha    0.0889
G3_beta     0.0837
WNT_beta    0.0758
WNT_alpha   0.0756
G4_gamma    0.0526
SHH_delta   0.0506
G4_beta     0.0455
SHH_gamma   0.0448
SHH_beta    0.0254
           Subtypes     Cho
3               SHH  0.2780
1            G3-MYC  0.3458
6               WNT  0.1123
5  G3-photoreceptor  0.1307
4          G4-Mixed  0.0721
2       G4-neuronal  0.0611
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" MB002


In [17]:
# Run the full four-step pipeline for case "MB009".
run_pipeline("MB009")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MB009.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MB009 to MB009
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MB009/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G3    0.5379  0.6753     0.6232
SHH   0.1759  0.1212     0.0965
G4    0.1583  0.1297     0.2148
WNT   0.1279  0.0738     0.0655
           Cavalli
G3_gamma    0.2673
G3_beta     0.1399
G3_alpha    0.1307
SHH_alpha   0.0803
G4_alpha    0.0766
WNT_alpha   0.0680
WNT_beta    0.0599
G4_gamma    0.0510
SHH_gamma   0.0405
SHH_delta   0.0347
G4_beta     0.0307
SHH_beta    0.0204
           Subtypes     Cho
3               SHH  0.1212
1            G3-MYC  0.4590
6               WNT  0.0738
5  G3-photoreceptor  0.2163
4          G4-Mixed  0.0764
2       G4-neuronal  0.0533
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" MB009


In [18]:
# Run the full four-step pipeline for case "MED211".
run_pipeline("MED211")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MED211.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MED211 to MED211
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MED211/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G3    0.3530  0.4910     0.4678
SHH   0.3138  0.2588     0.1872
G4    0.1852  0.1370     0.2286
WNT   0.1480  0.1132     0.1164
           Cavalli
G3_gamma    0.1836
SHH_alpha   0.1616
G3_beta     0.1001
G4_alpha    0.0995
WNT_alpha   0.0806
G3_alpha    0.0693
WNT_beta    0.0674
SHH_gamma   0.0659
SHH_delta   0.0540
G4_gamma    0.0463
G4_beta     0.0394
SHH_beta    0.0323
           Subtypes     Cho
3               SHH  0.2588
1            G3-MYC  0.3476
6               WNT  0.1132
5  G3-photoreceptor  0.1434
4          G4-Mixed  0.0814
2       G4-neuronal  0.0556
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" MED211


In [19]:
# Run the full four-step pipeline for case "MED411".
run_pipeline("MED411")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MED411.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MED411 to MED411
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MED411/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G3    0.4931  0.6395     0.5700
SHH   0.2033  0.1375     0.0934
G4    0.1572  0.1374     0.2508
WNT   0.1464  0.0856     0.0858
           Cavalli
G3_gamma    0.2687
G3_beta     0.1160
G3_alpha    0.1084
SHH_alpha   0.0978
G4_alpha    0.0802
WNT_alpha   0.0761
WNT_beta    0.0703
SHH_delta   0.0473
G4_gamma    0.0426
SHH_gamma   0.0395
G4_beta     0.0344
SHH_beta    0.0187
           Subtypes     Cho
3               SHH  0.1375
1            G3-MYC  0.4615
6               WNT  0.0856
5  G3-photoreceptor  0.1780
4          G4-Mixed  0.0782
2       G4-neuronal  0.0592
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" MED411


In [20]:
# Run the full four-step pipeline for case "MED1712".
run_pipeline("MED1712")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MED1712.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MED1712 to MED1712
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MED1712/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
SHH   0.6349  0.8259     0.7436
G3    0.1333  0.0798     0.1033
G4    0.1193  0.0483     0.0857
WNT   0.1125  0.0460     0.0674
           Cavalli
SHH_alpha   0.3480
SHH_delta   0.1143
SHH_gamma   0.1059
SHH_beta    0.0667
G4_alpha    0.0659
WNT_alpha   0.0617
G3_gamma    0.0599
WNT_beta    0.0508
G3_beta     0.0446
G3_alpha    0.0288
G4_gamma    0.0284
G4_beta     0.0250
           Subtypes     Cho
3               SHH  0.8259
1            G3-MYC  0.0580
6               WNT  0.0460
5  G3-photoreceptor  0.0218
4          G4-Mixed  0.0264
2       G4-neuronal  0.0219
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" MED1712


In [21]:
# Run the full four-step pipeline for case "MED1911".
run_pipeline("MED1911")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MED1911.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MED1911 to MED1911
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MED1911/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G3    0.5167  0.6678     0.5936
G4    0.1894  0.1501     0.2606
SHH   0.1680  0.1188     0.0847
WNT   0.1259  0.0633     0.0611
           Cavalli
G3_gamma    0.2010
G3_alpha    0.1977
G3_beta     0.1180
G4_alpha    0.0947
SHH_alpha   0.0784
WNT_alpha   0.0661
WNT_beta    0.0598
G4_gamma    0.0588
SHH_gamma   0.0388
G4_beta     0.0359
SHH_delta   0.0297
SHH_beta    0.0211
           Subtypes     Cho
3               SHH  0.1188
1            G3-MYC  0.3713
6               WNT  0.0633
5  G3-photoreceptor  0.2965
4          G4-Mixed  0.0910
2       G4-neuronal  0.0591
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" MED1911


In [22]:
# Run the full four-step pipeline for case "MED2312".
run_pipeline("MED2312")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/MED2312.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, MED2312 to MED2312
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/MED2312/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G4    0.3566  0.3127     0.3420
G3    0.3466  0.4662     0.4647
SHH   0.2075  0.1429     0.1196
WNT   0.0893  0.0782     0.0737
           Cavalli
G4_alpha    0.2114
G3_gamma    0.1422
G3_beta     0.1187
SHH_alpha   0.1003
G3_alpha    0.0857
G4_gamma    0.0770
G4_beta     0.0682
WNT_beta    0.0487
SHH_gamma   0.0444
WNT_alpha   0.0406
SHH_delta   0.0365
SHH_beta    0.0263
           Subtypes     Cho
3               SHH  0.1429
1            G3-MYC  0.2868
6               WNT  0.0782
5  G3-photoreceptor  0.1794
4          G4-Mixed  0.2002
2       G4-neuronal  0.1125
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" MED2312


In [23]:
# Run the full four-step pipeline for case "RCMB18".
run_pipeline("RCMB18")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB18.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB18 to RCMB18
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB18/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
SHH   0.5419  0.5989     0.5614
G3    0.1889  0.2016     0.1955
WNT   0.1428  0.0853     0.0888
G4    0.1264  0.1142     0.1543
           Cavalli
SHH_alpha   0.3039
SHH_delta   0.1044
G3_gamma    0.0921
SHH_gamma   0.0814
WNT_alpha   0.0767
G4_alpha    0.0732
WNT_beta    0.0661
G3_beta     0.0544
SHH_beta    0.0522
G3_alpha    0.0424
G4_gamma    0.0277
G4_beta     0.0255
           Subtypes     Cho
3               SHH  0.5989
1            G3-MYC  0.1400
6               WNT  0.0853
5  G3-photoreceptor  0.0616
4          G4-Mixed  0.0703
2       G4-neuronal  0.0439
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" RCMB18


In [24]:
# Run the complete four-step pipeline (see run_pipeline) for case "RCMB20".
run_pipeline("RCMB20")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB20.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB20 to RCMB20
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB20/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G3    0.5179  0.6719     0.5700
G4    0.1694  0.1453     0.2650
SHH   0.1598  0.1002     0.0897
WNT   0.1529  0.0826     0.0753
           Cavalli
G3_gamma    0.2554
G3_beta     0.1326
G3_alpha    0.1299
G4_alpha    0.0846
WNT_alpha   0.0830
SHH_alpha   0.0778
WNT_beta    0.0699
G4_gamma    0.0519
SHH_gamma   0.0349
G4_beta     0.0329
SHH_delta   0.0307
SHH_beta    0.0164
           Subtypes     Cho
3               SHH  0.1002
1            G3-MYC  0.4437
6               WNT  0.0826
5  G3-photoreceptor  0.2282
4          G4-Mixed  0.0865
2       G4-neuronal  0.0588
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" RCMB20


In [25]:
# Run the complete four-step pipeline (see run_pipeline) for case "RCMB24".
run_pipeline("RCMB24")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB24.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB24 to RCMB24
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB24/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
SHH   0.6501  0.7451     0.7121
WNT   0.1322  0.0620     0.0779
G3    0.1225  0.1258     0.1196
G4    0.0952  0.0671     0.0904
           Cavalli
SHH_alpha   0.3869
SHH_delta   0.1288
SHH_gamma   0.0850
WNT_alpha   0.0691
WNT_beta    0.0631
G3_gamma    0.0616
G4_alpha    0.0536
SHH_beta    0.0494
G3_beta     0.0346
G3_alpha    0.0263
G4_beta     0.0227
G4_gamma    0.0189
           Subtypes     Cho
3               SHH  0.7451
1            G3-MYC  0.0879
6               WNT  0.0620
5  G3-photoreceptor  0.0379
4          G4-Mixed  0.0361
2       G4-neuronal  0.0310
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" RCMB24


In [26]:
# Run the complete four-step pipeline (see run_pipeline) for case "RCMB28".
run_pipeline("RCMB28")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB28.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB28 to RCMB28
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB28/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G3    0.5086  0.6521     0.6009
G4    0.1845  0.1510     0.2513
SHH   0.1726  0.1307     0.0847
WNT   0.1343  0.0662     0.0631
           Cavalli
G3_gamma    0.2723
G3_beta     0.1271
G3_alpha    0.1092
G4_alpha    0.0981
SHH_alpha   0.0895
WNT_alpha   0.0738
WNT_beta    0.0605
G4_gamma    0.0493
G4_beta     0.0371
SHH_delta   0.0330
SHH_gamma   0.0323
SHH_beta    0.0178
           Subtypes     Cho
3               SHH  0.1307
1            G3-MYC  0.4778
6               WNT  0.0662
5  G3-photoreceptor  0.1743
4          G4-Mixed  0.0835
2       G4-neuronal  0.0675
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" RCMB28


In [27]:
# Run the complete four-step pipeline (see run_pipeline) for case "RCMB32".
run_pipeline("RCMB32")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB32.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB32 to RCMB32
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB32/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
SHH   0.4895  0.5919     0.5032
G4    0.2000  0.1307     0.1775
G3    0.1758  0.2030     0.2124
WNT   0.1347  0.0744     0.1069
           Cavalli
SHH_alpha   0.2368
G4_alpha    0.1115
SHH_delta   0.1032
G3_gamma    0.0785
SHH_gamma   0.0760
SHH_beta    0.0735
WNT_alpha   0.0681
WNT_beta    0.0666
G3_beta     0.0583
G4_gamma    0.0446
G4_beta     0.0439
G3_alpha    0.0390
           Subtypes     Cho
3               SHH  0.5919
1            G3-MYC  0.1353
6               WNT  0.0744
5  G3-photoreceptor  0.0677
4          G4-Mixed  0.0729
2       G4-neuronal  0.0578
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" RCMB32


In [28]:
# Run the complete four-step pipeline (see run_pipeline) for case "RCMB38".
run_pipeline("RCMB38")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB38.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB38 to RCMB38
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB38/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G4    0.5081  0.5459     0.5517
G3    0.2685  0.3398     0.3134
SHH   0.1376  0.0684     0.0905
WNT   0.0858  0.0459     0.0444
           Cavalli
G4_gamma    0.2129
G4_alpha    0.2058
G3_alpha    0.1006
G3_beta     0.0955
G4_beta     0.0894
G3_gamma    0.0724
SHH_alpha   0.0545
WNT_alpha   0.0451
WNT_beta    0.0407
SHH_gamma   0.0348
SHH_delta   0.0250
SHH_beta    0.0233
           Subtypes     Cho
3               SHH  0.0684
1            G3-MYC  0.1312
6               WNT  0.0459
5  G3-photoreceptor  0.2086
4          G4-Mixed  0.2792
2       G4-neuronal  0.2667
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" RCMB38


In [29]:
# Run the complete four-step pipeline (see run_pipeline) for case "RCMB40".
run_pipeline("RCMB40")

Step 1: read_user_input
Reminder: if you want to use a custom control expresion, you must set control to 'custom'
==> Setting patient_dir = case_id
Setup done!
Step 2: download_and_preprocess_rnaseq
==> Checking if local file (/pdx-hts/Notebooks/data/preprocessed/exp/RCMB40.csv) exist.
==> This file containes the expression of the PDXs. Printing dataframe's info:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, RCMB40 to RCMB40
Columns: 16247 entries, A1BG to ZZZ3
dtypes: float64(16247)
memory usage: 126.9+ KB
==> None
==> File /pdx-hts/Notebooks/patients/RCMB40/gene_abundance.sleuth.csv saved successfully
==> Done preprocessing!
Step 3: classify_sample


  infer_datetime_format=infer_datetime_format)


     Cavalli     Cho  Northcott
G3    0.4511  0.5835     0.5616
G4    0.2304  0.1947     0.2778
SHH   0.2037  0.1603     0.1029
WNT   0.1148  0.0615     0.0577
           Cavalli
G3_gamma    0.2370
G3_beta     0.1243
G4_alpha    0.1182
SHH_alpha   0.1054
G3_alpha    0.0898
G4_gamma    0.0614
WNT_alpha   0.0593
WNT_beta    0.0555
G4_beta     0.0508
SHH_gamma   0.0409
SHH_delta   0.0396
SHH_beta    0.0178
           Subtypes     Cho
3               SHH  0.1603
1            G3-MYC  0.4282
6               WNT  0.0615
5  G3-photoreceptor  0.1553
4          G4-Mixed  0.1192
2       G4-neuronal  0.0755
==> Done! Move along
Step 4: run_discover
==> About to perform DiSCoVER.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


==> DiSCoVER done!
==> Saving results to file.
==> Saving done!
==> NOT Restricting to clinically relevant drugs.


  return func(*args, **kwargs)


==> Done restricting to clinically relevant drugs!
==> Making the DiSCoVER powerpoint.
==> Using all of the drugs
==> Done making the DiSCoVER powerpoint slide!
==> Savig the variables to a file.
==> Saving the formatted results of DiSCoVER to a csv
==> Done savig the variables to a file!
==> Done with all the taks in this cell. Move along.
All analyses completed suscessfully with "patient" RCMB40


In [30]:
# Marker printed once execution reaches the end of the per-case pipeline runs.
print('done!')

done!


In [1]:
# Requires GenePattern Notebook: pip install genepattern-notebook
import gp
import genepattern

# Register a session against the public GenePattern cloud server and display
# the result (rendered as the GPAuthWidget login widget).
# Username and password removed for security reasons, so both credential
# arguments are intentionally empty strings.
genepattern.display(genepattern.session.register("https://cloud.genepattern.org/gp", "", ""))

GPAuthWidget()

In [2]:
# UI cell for choosing which case the rest of the notebook operates on.
# The explanation is kept in comments rather than a docstring so the rendered
# widget is left exactly as it was (NOTE(review): the UI builder may surface a
# docstring as the widget description -- confirm before converting).
@genepattern.build_ui(
    parameters=dict(
        # The returned value is stored in the notebook variable `case_id`.
        output_var=dict(default="case_id", hide=False),
        case_id=dict(
            type="choice",
            description="The name of the case, e.g., 'BT084'",
            # Every choice is listed under the same text it evaluates to.
            choices={
                label: label
                for label in (
                    "BT084",
                    "DMB006",
                    "ICB984",
                    "ICB1299",
                    "ICB1487",
                    "ICB1572",
                    "MB002",
                    "MB009",
                    "MED211",
                    "MED411",
                    "MED1712",
                    "MED1911",
                    "MED2312",
                    "RCMB18",
                    "RCMB20",
                    "RCMB24",
                    "RCMB28",
                    "RCMB32",
                    "RCMB38",
                    "RCMB40",
                )
            },
            default="BT084",
        ),
    )
)
def set_case_id(case_id):
    # Echo the selection for the reader, then hand it back unchanged.
    message = f'case_id set to "{case_id}"'
    print(message)
    return case_id

UIBuilder(function_import='set_case_id', name='set_case_id', params=[{'name': 'case_id', 'label': 'case_id', '…

## User input

Select parameters before running the rest of the notebook.

<div class="alert alert-info">
<h3 style="margin-top: 0;"> Instructions <i class="fa fa-info-circle"></i></h3>
Select parameters before running the rest of the notebook.
</div>

In [3]:
from companion_script import *
# # Select case
# case_id = 'case17'
# # This patient directory should match the directory name on DNANexus.
# patient_dir = '18-10716_tumor-normal'
# is_medullo = True # set False if it is another kind of brain tumor
%load_ext autoreload
%autoreload 2
%matplotlib inline
import readline # required for rpy2 extension
%load_ext rpy2.ipython


def rmagic_warning(message,
                   category=rpy2.rinterface.RRuntimeWarning,
                   filename='',
                   lineno=-1,
                   file=None,
                   line=None):
    """Print only the warning's message, ignoring every other argument.

    The parameter list follows the shape of ``warnings.showwarning``
    (message, category, filename, lineno, file, line), so this is presumably
    meant as a quieter replacement for it. Nothing in this cell installs it.
    """
    print(message)


# Keep a handle on the stock warning printer (NOTE(review): presumably so it
# can be restored after being swapped out -- the swap is not in this cell).
default_showwarning = warnings.showwarning


# read_user_input: step 1 of the pipeline.
#
# Turns the user's choices for one case into a "setup" object holding every
# path and flag that the later steps read, creates the output directories,
# and pickles the raw settings as <case_id>_backup1_input.p.
#
# Parameters
#   case_id                    name of the case; used in file names and as the
#                              output directory name.
#   patient_dir                directory name used for DNA Nexus downloads. It is
#                              replaced by case_id when dna_nexus_bool is False.
#   dna_nexus_bool             True to work from DNA Nexus fastq downloads,
#                              False to work from a local expression file.
#   is_medullo                 True when the sample is a medulloblastoma; picks
#                              the 'cerebellar_stem' control instead of
#                              'neural_stem' when control == 'original'.
#   control                    'original' or 'custom'.
#   custom_control_expression  file/path of the custom control expression; it is
#                              only stored on the setup object here.
# Returns
#   Bunch wrapping the settings dictionary.
# Raises
#   ValueError if `control` is neither 'original' nor 'custom', or if the
#   platform is neither Linux nor macOS.
#
# These notes are comments rather than a docstring on purpose: the UI builder
# may show a docstring as the widget description (NOTE(review): confirm).
@genepattern.build_ui(parameters={
    "output_var": {
        "default": "setup",
        "hide": False,
    },
    "case_id": {"type": "text",
                "description": "The name of the case, e.g., 'PDX1'",
               "default":"PDX1"},
    "patient_dir": {"type": "text",
                    "description": 'For DNA Nexus downloads only. The name of the "patient" directory, e.g. "18-10716_tumor-normal" (quotes are required)',
                    "default":"PDX1_dir"},
    "dna_nexus_bool": {"type": "bool",
                   # Fixed: this text used to be a copy of the is_medullo description.
                   "description": "Whether or not to download this sample's fastq files from DNA Nexus",
                  "default":False},
    "is_medullo": {"type": "bool",
                   "description": "Whether or not this sample has been classified as medulloblastoma",},
    "control": {"type": "choice",
                "description": "Whether or not to use a custom control",
                "choices": {
                    "original": "original",
                    "custom": "custom",
                            }
               },
    "custom_control_expression": {"type": "file",
                           "kinds": ["gct"],
                           "description": "The file (or path to the GCT file) which contains the gene expression of the custom control.",
                           "default":None},
})
def read_user_input(case_id, patient_dir, dna_nexus_bool=False, is_medullo=False, control='original',custom_control_expression=None):
    # Select control for DiSCoVER and Connectivity Map
    # Generally, if the tumor is a medulloblastoma, we use `cerebellar_stem`.
    # And if it is any other kind of brain tumor, we use `neural_stem`.
    if control == 'original':
        expression_control = 'cerebellar_stem' if is_medullo else 'neural_stem'
    elif control == 'custom':
        expression_control = 'custom_control'
    else:
        # Fail before any directory is created; previously this case was only
        # printed and the function crashed later on an unbound variable.
        raise ValueError('Unexpected value for variable named control, value: {}'.format(control))

    # Truthiness covers both None (the default) and an empty value, and the
    # comparison is by equality rather than object identity.
    if custom_control_expression and control != 'custom':
        print("Reminder: if you want to use a custom control expresion, you must set control to 'custom'")

    base_dir = os.getcwd()
    utilities_dir = '/build'
    patients_dir = os.path.join(base_dir, 'patients')
    if not dna_nexus_bool:
        log('Setting patient_dir = case_id')
        patient_dir = case_id

    # Input and output share the same per-patient directory.
    in_dir = os.path.join(patients_dir, patient_dir)
    out_dir = in_dir
    os.makedirs(out_dir, exist_ok=True)

    platform = sys.platform
    if platform.startswith('linux'):
        os_string = 'linux'
    elif platform == 'darwin':
        os_string = 'mac'
    else:
        raise ValueError('Platform "{}" not supported'.format(platform))

    # RNASeq quantification
    kallisto_dir = '/build/kallisto'
    kallisto_path = os.path.join(kallisto_dir, 'kallisto_{}-v0.44.0/kallisto'.format(os_string))
    transcriptome_index_path = os.path.join(kallisto_dir, 'GRCh38.ensembl.transcriptome.idx')
    local_fastqs_dir = os.path.join(in_dir, 'fastqs')
    os.makedirs(local_fastqs_dir, exist_ok=True)
    patient_gexp_file = os.path.join(out_dir, 'gene_abundance.sleuth.csv')

    # Medulloblastoma classification
    medullo_classify_out_dir = os.path.join(out_dir, 'medulloblastoma_classification')
    os.makedirs(medullo_classify_out_dir, exist_ok=True)
    cavalli_subgroup_file = os.path.join(medullo_classify_out_dir, 'cavalli_subgroups.csv')
    cavalli_subgroup_direct_file = os.path.join(medullo_classify_out_dir, 'cavalli_subgroups_direct.csv')
    cavalli_subtype_file = os.path.join(medullo_classify_out_dir, 'cavalli_subtypes.csv')
    cho_subtype_file = os.path.join(medullo_classify_out_dir, 'cho_subtypes.csv')
    cho_subgroup_file = os.path.join(medullo_classify_out_dir, 'cho_subgroups.csv')
    northcott_subgroup_file = os.path.join(medullo_classify_out_dir, 'northcott_subgroups.csv')

    drug_suggestion_out_dir = os.path.join(out_dir, 'drug_suggestions')
    os.makedirs(drug_suggestion_out_dir, exist_ok=True)

    # DiSCoVER
    discover_out_dir = os.path.join(drug_suggestion_out_dir, 'discover/{}'.format(expression_control))
    os.makedirs(discover_out_dir, exist_ok=True)
    discover_heatmap_file = os.path.join(discover_out_dir, 'ctrp.png')
    full_discover_results_file = os.path.join(discover_out_dir, 'discover.all.csv')
    rdrugs_discover_file = os.path.join(discover_out_dir, '{}.discover.{}.reasonable.annotated.csv'.format(case_id, expression_control))

    # Connectivity Map
    cmap_out_dir = os.path.join(drug_suggestion_out_dir, 'cmap/{}'.format(expression_control))
    os.makedirs(cmap_out_dir, exist_ok=True)
    cmap_all_ranked_drugs_file = os.path.join(cmap_out_dir, '{}.cmap.{}.all.csv'.format(case_id, expression_control))
    cmap_reasonable_ranked_drugs_file = os.path.join(cmap_out_dir, '{}.cmap.{}.reasonable.annotated.csv'.format(case_id, expression_control))

    # Powerpoint for MTB
    mtb_ppt_file = os.path.join(out_dir, '{}.mtb_slides.pptx'.format(case_id))

    # DNANexus
    dx_source_path = os.path.join(utilities_dir, 'dx-toolkit/environment')
    dnanexus_project = 'UW_UCSD_RNAseq_collaboration_share'
    # Replace the contents of this file with your own DNANexus token.
    dnanexus_token_file = os.path.join(base_dir, 'dnanexus_token.txt')
    # To use the dx command, we must update some environment variables.
    # From the command line, this is done with source dx-toolkit/environment,
    # but from Python we have to use a workaround, because normally any changes
    # to environment variables done in a subprocess are not reflected in the
    # parent process. The workaround runs the source command in a subprocess,
    # fetches the environment variables from the subprocess and updates those
    # of the parent process.
    source_and_update_env_vars(dx_source_path)

    # One literal instead of a long run of assignments (several of which were
    # duplicated). Key order matches the order the keys were first inserted.
    out = {
        'case_id': case_id,
        'patient_dir': patient_dir,
        'is_medullo': is_medullo,
        'dna_nexus_bool': dna_nexus_bool,
        'expression_control': expression_control,
        'custom_control_expression': custom_control_expression,
        'dnanexus_token_file': dnanexus_token_file,
        'local_fastqs_dir': local_fastqs_dir,
        'dnanexus_project': dnanexus_project,
        'transcriptome_index_path': transcriptome_index_path,
        'kallisto_path': kallisto_path,
        'kallisto_dir': kallisto_dir,
        # The published out_dir is keyed on case_id, whereas every path above
        # was built under patient_dir. The two differ only when dna_nexus_bool
        # is True and patient_dir != case_id.
        'out_dir': os.path.join(patients_dir, case_id),
        # Same directory as in_dir, with backslashes doubled.
        'r_out_dir': out_dir.replace('\\', r'\\'),
        'patient_gexp_file': patient_gexp_file,
        'in_dir': in_dir,
        'cavalli_subgroup_file': cavalli_subgroup_file,
        'cavalli_subtype_file': cavalli_subtype_file,
        'cavalli_subgroup_direct_file': cavalli_subgroup_direct_file,
        'cho_subgroup_file': cho_subgroup_file,
        'cho_subtype_file': cho_subtype_file,
        'northcott_subgroup_file': northcott_subgroup_file,
        'mtb_ppt_file': mtb_ppt_file,
        'full_discover_results_file': full_discover_results_file,
        'discover_out_dir': discover_out_dir,
        'discover_heatmap_file': discover_heatmap_file,
        'rdrugs_discover_file': rdrugs_discover_file,
        'cmap_out_dir': cmap_out_dir,
        'cmap_all_ranked_drugs_file': cmap_all_ranked_drugs_file,
        'cmap_reasonable_ranked_drugs_file': cmap_reasonable_ranked_drugs_file,
    }

    os.makedirs(out['out_dir'], exist_ok=True)
    print('Setup done!')
    # Back up the plain settings dict; the context manager closes the file
    # even if pickling fails.
    backup_path = os.path.join(out['out_dir'], case_id+'_backup1_input.p')
    with open(backup_path, 'wb') as backup_file:
        pickle.dump(out, file=backup_file)
    return Bunch(out)

UIBuilder(function_import='read_user_input', name='read_user_input', params=[{'name': 'case_id', 'label': 'cas…

<div class="well">
Running all cells below this point will execute all the analyses except for one: the Connectivity Map analysis at the end of the notebook, which requires two manual steps.
</div>

# Download RNAseq data

In [4]:
# download_and_preprocess_rnaseq: step 2 of the pipeline.
#
# Produces the gene-expression table for the case and stores it on `setup`:
#   * setup.dna_nexus_bool is True  -> log in to DNA Nexus with the `dx` tool,
#     download every "*.fastq.gz" found under setup.patient_dir, then call
#     preprocess_rna_seq and run_sleuth (kallisto / sleuth according to the log
#     messages) and read the resulting setup.patient_gexp_file.
#   * otherwise -> read the pre-computed CSV for this case from
#     /pdx-hts/Notebooks/data/preprocessed/exp/<case_id>.csv.
# In both modes the table is written to setup.patient_gexp_file and the whole
# setup object is pickled as <case_id>_backup2_download.p in setup.out_dir.
#
# Parameters
#   setup                 the object returned by read_user_input.
#   input_expression_dir  currently unused: setup.input_expression_dir is always
#                         derived from setup.case_id. A non-None value only
#                         triggers an "ignored" notice in DNA Nexus mode.
# Returns
#   the same `setup` object, with input_expression_dir and patient_exp set
#   (and expression_input in local-file mode).
# Raises
#   FileNotFoundError when the local expression CSV does not exist.
#   subprocess.CalledProcessError when the dx login or find command fails.
#
# These notes are comments rather than a docstring on purpose: the UI builder
# may show a docstring as the widget description (NOTE(review): confirm).
@genepattern.build_ui(parameters={
    "setup": {"default": "setup",
              "hide": False,
              "description": "The variable which has the setup information"},
    "input_expression_dir":{"hide":True},
    "output_var": {
        "default": "setup",
        "hide": True,
    },
})
def download_and_preprocess_rnaseq(setup, input_expression_dir = None):
    setup.input_expression_dir = f'/pdx-hts/Notebooks/data/preprocessed/exp/{setup.case_id}.csv'
    if setup.dna_nexus_bool:
        if input_expression_dir is not None:
            # Report the value the caller actually passed (the one being ignored).
            log(f"input_expression_dir has a value ({input_expression_dir}), and it will be ignored!")
        log('About to download fastqfiles from DNA Nexus. This may take a while.')
        with open(setup.dnanexus_token_file, 'r') as f:
            dnanexus_token = f.readline().strip()
        login_command = 'dx login --token {} --noprojects; dx select {}'.format(dnanexus_token, setup.dnanexus_project)
        subprocess.check_output(login_command, shell=True)

        find_fastq_command = 'dx find data --name "*.fastq.gz" --path {}:{}'.format(setup.dnanexus_project, setup.patient_dir)
        find_fastq_return_lines = subprocess.check_output(find_fastq_command, shell=True).decode().strip().split('\n')
        # Raw string so the backslashes reach the regex engine untouched; both
        # dots of ".fastq.gz" are literal and patient_dir is matched verbatim.
        re_string = r'.*(/{}/.*\.fastq\.gz) .*'.format(re.escape(setup.patient_dir))
        fastq_path_re = re.compile(re_string)
        remote_fastq_paths = []
        local_fastq_subdirs = []

        for line in find_fastq_return_lines:
            search = fastq_path_re.search(line)
            if search is None:
                # Lines without a fastq path (for instance the single empty
                # line produced when the command prints nothing) are skipped.
                continue
            remote_fastq_path = search.group(1)
            remote_fastq_paths.append(remote_fastq_path)
            # Mirror the file's immediate parent directory locally.
            fastq_subdir = os.path.basename(os.path.dirname(remote_fastq_path))
            local_fastq_subdir = os.path.join(setup.local_fastqs_dir, fastq_subdir)
            os.makedirs(local_fastq_subdir, exist_ok=True)
            local_fastq_subdirs.append(local_fastq_subdir)

        if not remote_fastq_paths:
            log('No fastq files were found on DNA Nexus for this patient_dir.')

        for remote_fastq_path, local_fastq_subdir in zip(remote_fastq_paths, local_fastq_subdirs):
            download_command = 'dx download "{}" -o "{}"'.format(remote_fastq_path, local_fastq_subdir)
            print('\t'+download_command)
            try:
                subprocess.check_output(download_command, shell=True)
            except subprocess.CalledProcessError:
                # Deliberate best effort: a failed download is reported and
                # the remaining files are still attempted.
                print('\tEncountered a dx error, this likely means you already have the file indicated above.')
                print('\tContinuing...\n')
        log('Done downloading the fastq files.')
        log('Preprocessing RNASeq data now:')
        log('Using kallisto to compute transcript abundance.')
        preprocess_rna_seq(setup)
        log('Done with tanscript abundance.')
        log('Using sleuth to aggregate transcript abundance into gene abbundance.')
        run_sleuth(setup)
        # Transpose the table read from disk and label its single row with the case id.
        patient_exp = pd.read_csv(setup.patient_gexp_file, index_col=0).T
        patient_exp.index = [setup.case_id]
        setup.patient_exp = patient_exp
        patient_exp.to_csv(setup.patient_gexp_file)
        log('Habemus Genus Expressium *release the white smoke*')
    else:
        log(f'Checking if local file ({setup.input_expression_dir}) exist.')
        if not os.path.isfile(setup.input_expression_dir):
            log('File could not be located please check and run again.')
            # Stop here with a clear error; continuing used to fail on an
            # unbound variable a few lines later.
            raise FileNotFoundError(f'Expression file not found: {setup.input_expression_dir}')
        patient_exp = pd.read_csv(setup.input_expression_dir, index_col=0)
        setup.expression_input = patient_exp
        log("This file containes the expression of the PDXs. Printing dataframe's info:")
        # DataFrame.info() prints its summary itself and returns None, so it
        # is called directly instead of being passed to log().
        setup.expression_input.info()
        setup.patient_exp = patient_exp
        patient_exp.to_csv(setup.patient_gexp_file)
        log(f'File {setup.patient_gexp_file} saved successfully')

    # Checkpoint the whole setup object; the context manager closes the file.
    backup_path = os.path.join(setup.out_dir, setup.case_id+'_backup2_download.p')
    with open(backup_path, 'wb') as backup_file:
        pickle.dump(setup, file=backup_file)
    log('Done preprocessing!')
    return setup

UIBuilder(function_import='download_and_preprocess_rnaseq', name='download_and_preprocess_rnaseq', params=[{'n…

# Classify the tumor by medulloblastoma subgroup and subtype

In [5]:
@genepattern.build_ui(
    description="This function classifies a medulloblastoma sample into a subgroup. Non-medulloblastoma samples are ignored.",
    parameters={
    "setup": {"default": "setup",
              "hide": False,
              "description": "The variable which has the setup information"},
    "output_var": {
        "default": "setup",
        "hide": True,
    },
})
def classify_sample(setup):
    """Classify a medulloblastoma sample by subgroup and subtype from its expression.

    Three reference datasets are used to classify based on expression:

    - [Cavalli et al. 2017](http://www.sciencedirect.com/science/article/pii/S1535610817302015)
      cohort. This cohort includes 763 tumors, and was used to define 12
      finer-grained subtypes nested in the 4 subgroups. Both expression and
      methylation data are available.
    - [Cho et al. 2011](http://www.mesirovlab.org/medulloblastoma/cho/) cohort.
      This paper identified two subtypes within G3 and two within G4, for a
      total of 6. It contains 194 tumors.
    - [Northcott et al. 2017](http://www.nature.com/nature/journal/v547/n7663/full/nature22973.html)
      expression data (shared by Sebastian). The labels we have for this data
      are of the 4 basic subgroups only. There are 223 tumors.

    When finer-grained subtypes are known, the finer-grained classification is
    performed first and the subtypes are also collapsed to the 4 basic
    subgroups, so that both subtype and subgroup probabilities are reported.
    Classification is done using random forests. Since the patient data are
    from the same platform and contain the same features each time, pre-fit
    models can be used; the classification methods also have a fallback in
    case the data looks different.

    The tumor board is arranging for methylation data to be obtained from
    patient samples as well, since it seems it may be more informative than
    expression. Methylation data would also allow comparison to a large and
    varied collection of brain tumors, currently available through a DKFZ
    [web portal](https://www.molecularneuropathology.org/mnp).

    Parameters
    ----------
    setup
        Pipeline state object. Read here: ``is_medullo``, ``patient_gexp_file``,
        ``cavalli_subgroup_file``, ``cavalli_subtype_file``, ``cho_subgroup_file``,
        ``cho_subtype_file``, ``northcott_subgroup_file``, ``mtb_ppt_file``,
        ``out_dir`` and ``case_id``.

    Returns
    -------
    The same ``setup`` object. When ``setup.is_medullo`` is true it additionally
    carries ``cavalli_subgroups``, ``cavalli_subtypes``, ``cho_subgroups``,
    ``cho_subtypes``, ``northcott_subgroups`` and ``subgroups``. In both cases
    the object is pickled to ``<out_dir>/<case_id>_backup3_classify.p``.
    """
    if setup.is_medullo:
        # Read in patient's gene-level RNASeq TPM data
        patient_exp = pd.read_csv(setup.patient_gexp_file, index_col=0)

        # In each section below the CSV files are written *before* the results
        # are renamed; keep this order so the files on disk are unaffected by
        # the renaming.
        cavalli_subgroups, cavalli_subtypes = classify_cavalli(patient_exp)
        cavalli_subgroups.to_csv(setup.cavalli_subgroup_file)
        cavalli_subtypes.to_csv(setup.cavalli_subtype_file)
        setup.cavalli_subgroups = cavalli_subgroups
        setup.cavalli_subgroups.name = 'Cavalli'
        setup.cavalli_subtypes = cavalli_subtypes
        setup.cavalli_subtypes.name = 'Cavalli'

        cho_subgroups, cho_subtypes = classify_cho(patient_exp)
        cho_subtypes.to_csv(setup.cho_subtype_file)
        cho_subgroups.to_csv(setup.cho_subgroup_file)
        setup.cho_subtypes = cho_subtypes
        setup.cho_subtypes.name = 'Cho'
        setup.cho_subgroups = cho_subgroups
        setup.cho_subgroups.name = 'Cho'

        northcott_subgroups = classify_northcott(patient_exp)
        northcott_subgroups.to_csv(setup.northcott_subgroup_file)
        setup.northcott_subgroups = northcott_subgroups
        setup.northcott_subgroups.name = 'Northcott'

        # The slide helper is given the paths of the CSV files written above,
        # not the in-memory results.
        make_medullo_classification_slide(setup.mtb_ppt_file,
                                          setup.cavalli_subgroup_file,
                                          setup.cavalli_subtype_file,
                                          setup.cho_subgroup_file,
                                          setup.cho_subtype_file,
                                          setup.northcott_subgroup_file)

        # One column per reference cohort, joined on the shared index.
        subgroups = pd.DataFrame(data=setup.cavalli_subgroups).join(setup.cho_subgroups).join(setup.northcott_subgroups)
        setup.subgroups = subgroups
        print(setup.subgroups)

        # From here on setup.cavalli_subtypes holds a one-column DataFrame.
        cavalli_subtypes = pd.DataFrame(data=setup.cavalli_subtypes)
        setup.cavalli_subtypes = cavalli_subtypes
        print(cavalli_subtypes)

        # Attach readable labels to the Cho results, which are joined on the
        # integer index 1-6.
        # NOTE(review): presumably these integers are the Cho et al. cluster
        # ids -- confirm the id-to-label mapping against classify_cho.
        cho_labels = pd.Series(index=[3,1,6,5,4,2], data=['SHH','G3-MYC','WNT','G3-photoreceptor','G4-Mixed','G4-neuronal'],name='Subtypes')
        cho_subtypes = pd.DataFrame(cho_labels).join(setup.cho_subtypes)
        setup.cho_subtypes = cho_subtypes
        print(cho_subtypes)

        log('Done! Move along')
    else:
        log('This is not medulloblastoma. Nothing to do here. Move along')

    # Checkpoint the pipeline state. The context manager guarantees the file
    # is flushed and closed even if pickling fails part-way.
    backup_path = os.path.join(setup.out_dir, setup.case_id+'_backup3_classify.p')
    with open(backup_path, 'wb') as backup_file:
        pickle.dump(setup, file=backup_file)

    return setup

UIBuilder(description='This function classifies a medulloblastoma sample into a subgroup. Non-medulloblastoma …

# Suggest drugs based on RNAseq data (DiSCoVER)

In [6]:
@genepattern.build_ui(
  description="Run DiSCoVER on the provided sample and control.",
  parameters={
    "setup": {"default": "setup",
              "hide": False,
              "description": "The variable which has the setup information"},
    "output_var": {
        "default": "setup",
        "hide": True,
    },
})
def run_discover(setup):
    """Run DiSCoVER on the sample's expression and report the ranked drugs.

    Steps, in order:

    1. Read the sample expression from ``setup.patient_gexp_file`` and load the
       control expression selected by ``setup.expression_control``.
    2. Call ``discover_from_expression`` while the rpy2 numpy conversion is
       active, and store the result in ``setup.raw_discover_results``.
    3. Write the transposed results, sorted in descending order by the
       ``setup.case_id`` column, to ``setup.full_discover_results_file``.
    4. Call ``subset_to_reasonable_drugs`` and ``format_drugs`` with the same
       arguments, then read ``setup.rdrugs_discover_file`` back.
    5. Split and rank that table, add a slide with the top 20 rows to
       ``setup.mtb_ppt_file``, and save the ranked table as
       ``<out_dir>/<case_id>_formated_DISCoVER_results.csv``.
    6. Pickle ``setup`` to ``<out_dir>/<case_id>_backup4_DISCoVER.p``.

    Parameters
    ----------
    setup
        Pipeline state object. Read here: ``patient_gexp_file``,
        ``expression_control``, ``case_id``, ``full_discover_results_file``,
        ``discover_out_dir``, ``rdrugs_discover_file``, ``mtb_ppt_file`` and
        ``out_dir``.

    Returns
    -------
    The same ``setup`` object, with ``raw_discover_results``,
    ``discover_results``, ``disco2cid``, ``control_exp``,
    ``reasonable_results`` and ``df`` (the ranked drug table) set.
    """
    # Imports stay local and in their original positions relative to
    # activate()/deactivate().
    from rpy2.robjects import numpy2ri
    numpy2ri.activate()
    try:
        from discover import discover_from_expression
        from drug_suggestion.expression.controls import load_control_exp
        patient_exp = pd.read_csv(setup.patient_gexp_file, index_col=0)
        control_exp = load_control_exp(setup.expression_control)
        log("About to perform DiSCoVER.")
        discover_results = discover_from_expression(exp=patient_exp,
                                                    control_exp=control_exp,
                                                    verbose=False)
        setup.raw_discover_results = discover_results
        log("DiSCoVER done!")
    finally:
        # activate() changes rpy2's conversion rules globally; always undo it,
        # even when DiSCoVER fails, so later cells in the same kernel are not
        # affected.
        numpy2ri.deactivate()

    log('Saving results to file.')
    discover_results.T.sort_values(by=setup.case_id, ascending=False).to_csv(setup.full_discover_results_file)
    log("Saving done!")

    log("NOT Restricting to clinically relevant drugs.")
    # Not all drugs in CCLE, CTRP, and GDSC are realistic candidates for treatment. We compiled a list of medications
    # that are FDA-approved or in late-stage clinical trials, and Dr. Wechsler-Reya curated it to include only those
    # that are relevant for treating brain tumors. To enable comparison of drug lists, drugs from the different sources
    # have been mapped to PubChem compound IDs (CIDs) using [PubChemPy](http://pubchempy.readthedocs.io/en/latest/).
    from drug_suggestion.drug_annotation import subset_to_reasonable_drugs
    from drug_suggestion.expression.discover import load_discover_drug_to_cids
    disco2cid = load_discover_drug_to_cids()
    out_prefix = 'discover.{}'.format(setup.expression_control)
    reasonable_results = subset_to_reasonable_drugs(discover_results,
                                                    disco2cid,
                                                    out_prefix=out_prefix,
                                                    out_dir=setup.discover_out_dir)
    # Per the original author's note, this call overrides the file
    # setup.rdrugs_discover_file (read back below) so that it covers all of
    # the drugs. Its return value is not used.
    format_drugs(discover_results,
                 disco2cid,
                 out_prefix=out_prefix,
                 out_dir=setup.discover_out_dir)
    log('Done restricting to clinically relevant drugs!')

    # NOTE: the illustrative DiSCoVER heatmap slide (plot_discover_from_expression
    # followed by make_discover_workflow_slide) is currently disabled.
    log('Making the DiSCoVER powerpoint.')
    rdrugs_discover = pd.read_csv(setup.rdrugs_discover_file, index_col=None)

    # Using all of the drugs
    log('Using all of the drugs')
    ranked_drugs = split_discover_dataframe(df=rdrugs_discover)
    ranked_drugs = rank_drugs_discover(ranked_drugs)
    make_exp_drug_ranking_results_slide(setup.mtb_ppt_file, ranked_drugs.head(20), setup.expression_control, method='DiSCoVER')
    log('Done making the DiSCoVER powerpoint slide!')

    log('Savig the variables to a file.')
    setup.discover_results = discover_results
    setup.disco2cid = disco2cid
    setup.control_exp = control_exp
    setup.reasonable_results = reasonable_results
    setup.df = ranked_drugs
    log('Saving the formatted results of DiSCoVER to a csv')
    ranked_drugs.to_csv(os.path.join(setup.out_dir, setup.case_id+'_formated_DISCoVER_results.csv'))

    # Checkpoint the pipeline state. The context manager guarantees the file
    # is flushed and closed even if pickling fails part-way.
    backup_path = os.path.join(setup.out_dir, setup.case_id+'_backup4_DISCoVER.p')
    with open(backup_path, 'wb') as backup_file:
        pickle.dump(setup, file=backup_file)
    log('Done savig the variables to a file!')

    log('Done with all the taks in this cell. Move along.')
    return setup

UIBuilder(description='Run DiSCoVER on the provided sample and control.', function_import='run_discover', name…