# Collect RNAseq level3 data into a data library

In [1]:
import sys
import os
from glob import glob as GlobToFilePaths
from bioblend.galaxy import GalaxyInstance
from bioblend.galaxy.histories import HistoryClient
from bioblend.galaxy.tools import ToolClient
from bioblend.galaxy.workflows import WorkflowClient
from bioblend.galaxy.datasets import DatasetClient
from bioblend.galaxy.libraries import LibraryClient

## I. Connect to Galaxy Instance

#### You will need a `.env` file with your API_KEY

In [2]:
import environ

# Locate the .env file relative to the current working directory.
ROOT_DIR = environ.Path('.')
env_file = str(ROOT_DIR.path('.env'))

# Env() reads operating system environment variables. Values from the .env
# file are only used for variables that are NOT already defined in the
# operating system environment, i.e. the OS environment takes precedence.
env = environ.Env()

print('Loading : {}'.format(env_file))
env.read_env(env_file)
print('The .env file has been loaded.')

Loading : /Users/alex/Documents/galaxy-neurolincs/bioblend/.env
The .env file has been loaded.


In [3]:
# Base URL of the Galaxy server used by the rest of this notebook.
GALAXY_URL = "https://answer.csbi.mit.edu"
# The API key is read through `env`, so it is never written in the notebook.
API_KEY = env("API_KEY")
# Sanity check shown as the cell output (expected: False).
# `None` is a singleton, so it is tested by identity rather than equality.
API_KEY is None

False

In [4]:
# Client for the Galaxy server at GALAXY_URL, authenticated with API_KEY.
# Every later cell talks to Galaxy through this object.
galaxyInstance = GalaxyInstance(url=GALAXY_URL, key=API_KEY)

## II. Get All RNAseq Histories

In [5]:
# Keep only the histories whose 'published' flag is truthy.
published_histories = [
    entry
    for entry in galaxyInstance.histories.get_histories()
    if entry['published']
]
# Display the result as the cell output.
published_histories

[{'annotation': None,
  'deleted': False,
  'id': 'd0bfe935d0f5258d',
  'model_class': 'History',
  'name': 'NeuroLINCS Exp 2: iMNs - ATAC pipeline',
  'published': True,
  'purged': False,
  'tags': [],
  'url': '/api/histories/d0bfe935d0f5258d'},
 {'annotation': None,
  'deleted': False,
  'id': 'c851ab275e52f8af',
  'model_class': 'History',
  'name': 'NeuroLINCS Exp 2: iMNs - RNAseq Pipeline',
  'published': True,
  'purged': False,
  'tags': [],
  'url': '/api/histories/c851ab275e52f8af'},
 {'annotation': None,
  'deleted': False,
  'id': '24d84bcf64116fe7',
  'model_class': 'History',
  'name': 'NeuroLINCS Exp 1: iPSC - RNAseq Pipeline',
  'published': True,
  'purged': False,
  'tags': [],
  'url': '/api/histories/24d84bcf64116fe7'},
 {'annotation': None,
  'deleted': False,
  'id': 'c24141d7e4e77705',
  'model_class': 'History',
  'name': 'iMNs/Exp 2 Level 1 data',
  'published': True,
  'purged': False,
  'tags': [],
  'url': '/api/histories/c24141d7e4e77705'},
 {'annotation':

In [6]:
# Hand-picked IDs of the two RNAseq pipeline histories from the published list.
# Index 0 is 'NeuroLINCS Exp 1: iPSC - RNAseq Pipeline',
# index 1 is 'NeuroLINCS Exp 2: iMNs - RNAseq Pipeline'.
RNAseq_history_IDs = ['24d84bcf64116fe7', 'c851ab275e52f8af']

In [7]:
# Narrow the published histories down to the hand-picked RNAseq ones.
# The result keeps the order of published_histories, not of RNAseq_history_IDs.
published_RNAseq_histories = [
    history
    for history in published_histories
    if history['id'] in RNAseq_history_IDs
]
# Display the result as the cell output.
published_RNAseq_histories

[{'annotation': None,
  'deleted': False,
  'id': 'c851ab275e52f8af',
  'model_class': 'History',
  'name': 'NeuroLINCS Exp 2: iMNs - RNAseq Pipeline',
  'published': True,
  'purged': False,
  'tags': [],
  'url': '/api/histories/c851ab275e52f8af'},
 {'annotation': None,
  'deleted': False,
  'id': '24d84bcf64116fe7',
  'model_class': 'History',
  'name': 'NeuroLINCS Exp 1: iPSC - RNAseq Pipeline',
  'published': True,
  'purged': False,
  'tags': [],
  'url': '/api/histories/24d84bcf64116fe7'}]

## III. Get Destination Data Library

In [58]:
# Find the destination data library by its exact name.
transcriptomics_level3_library = [
    item
    for item in galaxyInstance.libraries.get_libraries()
    if item['name'] == 'NeuroLINCS - Transcriptomics - Level 3 (counts)'
]
# Stop with a readable message when the library is missing, instead of the
# bare IndexError that indexing an empty list would raise.
if not transcriptomics_level3_library:
    raise LookupError(
        "Data library 'NeuroLINCS - Transcriptomics - Level 3 (counts)' "
        "was not found on the Galaxy server"
    )
# If several libraries share the name, the first match is used.
transcriptomics_level3_library_id = transcriptomics_level3_library[0]['id']
# Display the matching library record(s) as the cell output.
transcriptomics_level3_library

[{'can_user_add': True,
  'can_user_manage': True,
  'can_user_modify': True,
  'create_time': '2017-10-20T19:31:11.948329',
  'create_time_pretty': '2 months ago',
  'deleted': False,
  'description': '',
  'id': 'ba751ee0539fff04',
  'model_class': 'Library',
  'name': 'NeuroLINCS - Transcriptomics - Level 3 (counts)',
  'root_folder_id': 'Fbefc5c2fe5c30689',
  'synopsis': ''}]

## IV. Upload all output datasets from each RNAseq history

### i. `NeuroLINCS Exp 1: iPSC - RNAseq Pipeline`

In [11]:
# Contents listing of the Exp 1 (iPSC) RNAseq history (RNAseq_history_IDs[0]).
# The next cell filters it down to dataset collections.
history_contents = galaxyInstance.histories.show_history(RNAseq_history_IDs[0], contents=True)

In [None]:
# (id, name) pairs for every dataset collection in the history that is not
# marked as deleted.
collections = [
    (entry['id'], entry['name'])
    for entry in history_contents
    if entry['history_content_type'] == 'dataset_collection' and not entry['deleted']
]
# Display the pairs so the output collection IDs can be picked by hand.
collections

In [31]:
# Hand-picked collection IDs for the Exp 1 history.
# NOTE(review): the listing cell has no saved output, so these IDs cannot be
# checked against it here. Judging by the dataset names printed further down,
# [0] is the htseq-count "(no feature)" collection and [1] the plain
# htseq-count collection.
output_collection_ids = ['56fc5a09f8ae2546', '21b91b9198fe5ccf']

In [43]:
# Elements of the first output collection (output_collection_ids[0]) of the
# Exp 1 history; only the 'elements' list of the response is kept.
collection1 = galaxyInstance.histories.show_dataset_collection(RNAseq_history_IDs[0],output_collection_ids[0])['elements']

In [44]:
# Sanity check: list any element whose model_class is not
# 'DatasetCollectionElement'. An empty list means every element is one.
[
    element
    for element in collection1
    if element['model_class'] != 'DatasetCollectionElement'
]

[]

In [59]:
# One (element identifier, dataset id, dataset name) tuple per element of
# collection1. The dataset id at index 1 is what the copy step uses later.
datasets = [
    (element['element_identifier'], element['object']['id'], element['object']['name'])
    for element in collection1
]
# Preview the first five tuples.
datasets[:5]

[('R219-L1-P01-ATCACG-',
  '617484c96abcde2a',
  'htseq-count on data 2237 and data 1677 (no feature)'),
 ('R219-L1-P02-CGATGT-',
  '2a1afc8f9490a8b7',
  'htseq-count on data 2237 and data 1678 (no feature)'),
 ('R219-L1-P03-TTAGGC-',
  '248bbd3445e61bc5',
  'htseq-count on data 2237 and data 1679 (no feature)'),
 ('R219-L1-P04-TGACCA-',
  'd9315556a02ade5a',
  'htseq-count on data 2237 and data 1680 (no feature)'),
 ('R219-L1-P05-ACAGTG-',
  'eabb909765704043',
  'htseq-count on data 2237 and data 1681 (no feature)')]

In [45]:
# Elements of the second output collection (output_collection_ids[1]) of the
# Exp 1 history; only the 'elements' list of the response is kept.
collection2 = galaxyInstance.histories.show_dataset_collection(RNAseq_history_IDs[0],output_collection_ids[1])['elements']

In [46]:
# Sanity check: list any element whose model_class is not
# 'DatasetCollectionElement'. An empty list means every element is one.
[
    element
    for element in collection2
    if element['model_class'] != 'DatasetCollectionElement'
]

[]

In [60]:
# One (element identifier, dataset id, dataset name) tuple per element of
# collection2. The dataset id at index 1 is what the copy step uses later.
mo_datasets = [
    (element['element_identifier'], element['object']['id'], element['object']['name'])
    for element in collection2
]
# Preview the first five tuples.
mo_datasets[:5]

[('R219-L1-P01-ATCACG-',
  '652c6e153d8fab36',
  'htseq-count on data 2237 and data 1677'),
 ('R219-L1-P02-CGATGT-',
  'eac4142f40a115d5',
  'htseq-count on data 2237 and data 1678'),
 ('R219-L1-P03-TTAGGC-',
  '540c073e67632cec',
  'htseq-count on data 2237 and data 1679'),
 ('R219-L1-P04-TGACCA-',
  'cb7b3da348f26a93',
  'htseq-count on data 2237 and data 1680'),
 ('R219-L1-P05-ACAGTG-',
  '6f20225e3cebcd9a',
  'htseq-count on data 2237 and data 1681')]

In [62]:
galaxyInstance.libraries.create_folder(transcriptomics_level3_library_id, "iPSC")

[{'id': 'F712bb19b076d5e1a',
  'name': 'iPSC',
  'url': '/api/libraries/ba751ee0539fff04/contents/F712bb19b076d5e1a'}]

In [63]:
for dataset in datasets:
    galaxyInstance.libraries.copy_from_dataset(transcriptomics_level3_library_id, dataset[1], folder_id='F712bb19b076d5e1a')

In [64]:
for dataset in mo_datasets:
    galaxyInstance.libraries.copy_from_dataset(transcriptomics_level3_library_id, dataset[1], folder_id='F712bb19b076d5e1a')

### ii. `NeuroLINCS Exp 2: iMNs - RNAseq Pipeline`

In [65]:
# Contents listing of the Exp 2 (iMNs) RNAseq history (RNAseq_history_IDs[1]).
# This rebinds history_contents, replacing the Exp 1 listing.
history_contents = galaxyInstance.histories.show_history(RNAseq_history_IDs[1], contents=True)

In [66]:
# (id, name) pairs for every dataset collection in the history that is not
# marked as deleted.
collections = [
    (entry['id'], entry['name'])
    for entry in history_contents
    if entry['history_content_type'] == 'dataset_collection' and not entry['deleted']
]
# Display the pairs so the output collection IDs can be picked by hand.
collections

[('53baad0929431091', 'LINCS_iMN_30JUL2015_rawreads - fastq'),
 ('fdb0150c645315f6', 'FastQC on collection 163: Webpage'),
 ('da8c1c7e2e6e28c7', 'FastQC on collection 163: RawData'),
 ('9301843aab84a406', 'Trimmomatic on collection 163: paired'),
 ('df6be339a52c816d', 'Trimmomatic on collection 163: unpaired'),
 ('f09f4fc56bdd0c92', 'TopHat on collection 815: deletions'),
 ('c247aa4ca71d0a48', 'TopHat on collection 815: align_summary'),
 ('687964d2abc1d496', 'TopHat on collection 815: insertions'),
 ('c5770bc5f3721970', 'TopHat on collection 815: accepted_hits'),
 ('64ed206b5444cf1a', 'TopHat on collection 815: splice junctions'),
 ('e4c0e60daf76dabb', 'Sort on collection 1225'),
 ('424aa8a9e0f64384', 'htseq-count on collection 1308 (no feature)'),
 ('4d6b7ca01eb9ee3d', 'htseq-count on collection 1308')]

In [67]:
# Hand-picked from the listing above (the last two entries):
# [0] is 'htseq-count on collection 1308 (no feature)',
# [1] is 'htseq-count on collection 1308'.
# This rebinds output_collection_ids, replacing the Exp 1 values.
output_collection_ids = ['424aa8a9e0f64384', '4d6b7ca01eb9ee3d']

In [68]:
# Elements of the "(no feature)" htseq-count collection (output_collection_ids[0])
# of the Exp 2 history; only the 'elements' list of the response is kept.
collection3 = galaxyInstance.histories.show_dataset_collection(RNAseq_history_IDs[1],output_collection_ids[0])['elements']

In [69]:
# Sanity check: list any element whose model_class is not
# 'DatasetCollectionElement'. An empty list means every element is one.
[
    element
    for element in collection3
    if element['model_class'] != 'DatasetCollectionElement'
]

[]

In [71]:
# One (element identifier, dataset id, dataset name) tuple per element of
# collection3. The dataset id at index 1 is what the copy step uses later.
datasets3 = [
    (element['element_identifier'], element['object']['id'], element['object']['name'])
    for element in collection3
]
# Preview the first five tuples.
datasets3[:5]

[('R231-L3-P01-ATCACG-',
  '73f6f654445026ee',
  'htseq-count on data 1556 and data 1227 (no feature)'),
 ('R231-L3-P02-CGATGT-',
  '3ee0e0207e20aff6',
  'htseq-count on data 1556 and data 1228 (no feature)'),
 ('R231-L3-P03-TTAGGC-',
  'f4cef392d41f7f78',
  'htseq-count on data 1556 and data 1229 (no feature)'),
 ('R231-L3-P04-TGACCA-',
  'f324b775074c7e75',
  'htseq-count on data 1556 and data 1230 (no feature)'),
 ('R231-L3-P05-ACAGTG-',
  '384a4cca7cdb80b9',
  'htseq-count on data 1556 and data 1231 (no feature)')]

In [72]:
# Elements of the plain htseq-count collection (output_collection_ids[1])
# of the Exp 2 history; only the 'elements' list of the response is kept.
collection4 = galaxyInstance.histories.show_dataset_collection(RNAseq_history_IDs[1],output_collection_ids[1])['elements']

In [74]:
# Sanity check: list any element whose model_class is not
# 'DatasetCollectionElement'. An empty list means every element is one.
[
    element
    for element in collection4
    if element['model_class'] != 'DatasetCollectionElement'
]

[]

In [75]:
# One (element identifier, dataset id, dataset name) tuple per element of
# collection4. The dataset id at index 1 is what the copy step uses later.
datasets4 = [
    (element['element_identifier'], element['object']['id'], element['object']['name'])
    for element in collection4
]
# Preview the first five tuples.
datasets4[:5]

[('R231-L3-P01-ATCACG-',
  'cb3193d5b0baf0f9',
  'htseq-count on data 1556 and data 1227'),
 ('R231-L3-P02-CGATGT-',
  '6b8fb61ab0fcf61f',
  'htseq-count on data 1556 and data 1228'),
 ('R231-L3-P03-TTAGGC-',
  '02f5b613a7345076',
  'htseq-count on data 1556 and data 1229'),
 ('R231-L3-P04-TGACCA-',
  'a3bb5e2a6ccb4d65',
  'htseq-count on data 1556 and data 1230'),
 ('R231-L3-P05-ACAGTG-',
  'acf1972cab0da102',
  'htseq-count on data 1556 and data 1231')]

In [76]:
galaxyInstance.libraries.create_folder(transcriptomics_level3_library_id, "iMN")

[{'id': 'Fa8146cfb819e54ec',
  'name': 'iMN',
  'url': '/api/libraries/ba751ee0539fff04/contents/Fa8146cfb819e54ec'}]

In [77]:
for dataset in datasets3:
    galaxyInstance.libraries.copy_from_dataset(transcriptomics_level3_library_id, dataset[1], folder_id='Fa8146cfb819e54ec')

In [78]:
for dataset in datasets4:
    galaxyInstance.libraries.copy_from_dataset(transcriptomics_level3_library_id, dataset[1], folder_id='Fa8146cfb819e54ec')