# Collect RNAseq level3 data into a data library

In [1]:
import sys
import os
from glob import glob as GlobToFilePaths
from bioblend.galaxy import GalaxyInstance
from bioblend.galaxy.histories import HistoryClient
from bioblend.galaxy.tools import ToolClient
from bioblend.galaxy.workflows import WorkflowClient
from bioblend.galaxy.datasets import DatasetClient
from bioblend.galaxy.libraries import LibraryClient

## I. Connect to Galaxy Instance

#### You will need a `.env` file with your API_KEY

In [2]:
import environ

# Build the environment reader first. Variables already set in the operating
# system environment take precedence: entries from the .env file are only used
# for variables that are not defined there.
env = environ.Env()

# Locate the .env file relative to the current directory.
ROOT_DIR = environ.Path('.')
env_file = str(ROOT_DIR.path('.env'))

print('Loading : {}'.format(env_file))
env.read_env(env_file)
print('The .env file has been loaded.')

Loading : /Users/alex/Documents/galaxy-neurolincs/bioblend/.env
The .env file has been loaded.


In [3]:
# Galaxy server to talk to, and the API key read from the environment/.env.
GALAXY_URL = "https://answer.csbi.mit.edu"
API_KEY = env("API_KEY")
# Sanity check that displays only a boolean, never the key itself.
# `is None` is the correct (identity) test for None.
API_KEY is None

False

In [4]:
# Client for the Galaxy server at GALAXY_URL, authenticated with API_KEY.
galaxyInstance = GalaxyInstance(
    url=GALAXY_URL,
    key=API_KEY,
)

## II. Get All RNAseq Histories

In [5]:
# Keep only the histories whose 'published' flag is set.
published_histories = [
    entry
    for entry in galaxyInstance.histories.get_histories()
    if entry['published']
]
published_histories

[{'annotation': None,
  'deleted': False,
  'id': '68013dab1c13fb37',
  'model_class': 'History',
  'name': 'iPSC test 2',
  'published': True,
  'purged': False,
  'tags': [],
  'url': '/api/histories/68013dab1c13fb37'},
 {'annotation': None,
  'deleted': False,
  'id': 'ceefdfd6cf7aa5ad',
  'model_class': 'History',
  'name': 'iPSC NEW PIPELINE TEST 20180205',
  'published': True,
  'purged': False,
  'tags': [],
  'url': '/api/histories/ceefdfd6cf7aa5ad'},
 {'annotation': None,
  'deleted': False,
  'id': '49ee1ea2b297a18d',
  'model_class': 'History',
  'name': 'iMNS NEW PIPELINE TEST 20180202',
  'published': True,
  'purged': False,
  'tags': [],
  'url': '/api/histories/49ee1ea2b297a18d'},
 {'annotation': None,
  'deleted': False,
  'id': '9005c5112febe774',
  'model_class': 'History',
  'name': 'd32_diMNs NEW PIPELINE TEST 20180131',
  'published': True,
  'purged': False,
  'tags': [],
  'url': '/api/histories/9005c5112febe774'},
 {'annotation': None,
  'deleted': False,
  'id

In [9]:
# IDs of the published histories whose name contains "RNAseq" (case-sensitive).
RNAseq_history_IDs = [
    entry['id']
    for entry in published_histories
    if "RNAseq" in entry["name"]
]
RNAseq_history_IDs

['0b900c60f93c0654', '6b7e1d14aa0742ec']

In [10]:
# Full history records for the IDs selected in RNAseq_history_IDs.
published_RNAseq_histories = [
    entry
    for entry in published_histories
    if entry['id'] in RNAseq_history_IDs
]
published_RNAseq_histories

[{'annotation': None,
  'deleted': False,
  'id': '0b900c60f93c0654',
  'model_class': 'History',
  'name': 'NeuroLINCS iMNs: (level1 > level3) RNAseq pipeline history',
  'published': True,
  'purged': False,
  'tags': [],
  'url': '/api/histories/0b900c60f93c0654'},
 {'annotation': None,
  'deleted': False,
  'id': '6b7e1d14aa0742ec',
  'model_class': 'History',
  'name': 'NeuroLINCS iPSC: (level1 > level3) RNAseq pipeline history',
  'published': True,
  'purged': False,
  'tags': [],
  'url': '/api/histories/6b7e1d14aa0742ec'}]

## III. Get Destination Data Library

In [58]:
# Look the destination data library up by its exact name.
library_name = 'NeuroLINCS - Transcriptomics - Level 3 (counts)'
transcriptomics_level3_library = [
    item
    for item in galaxyInstance.libraries.get_libraries()
    if item['name'] == library_name
]
# Fail with a descriptive message (same exception type as before) instead of a
# bare "list index out of range" when no library carries that name.
if not transcriptomics_level3_library:
    raise IndexError('No data library named {!r} was found'.format(library_name))
# The first match is used as the upload destination.
transcriptomics_level3_library_id = transcriptomics_level3_library[0]['id']
transcriptomics_level3_library

[{'can_user_add': True,
  'can_user_manage': True,
  'can_user_modify': True,
  'create_time': '2017-10-20T19:31:11.948329',
  'create_time_pretty': '2 months ago',
  'deleted': False,
  'description': '',
  'id': 'ba751ee0539fff04',
  'model_class': 'Library',
  'name': 'NeuroLINCS - Transcriptomics - Level 3 (counts)',
  'root_folder_id': 'Fbefc5c2fe5c30689',
  'synopsis': ''}]

## IV. Upload all output datasets from each RNAseq history

### i. `NeuroLINCS Exp 1: iPSC - RNAseq Pipeline`

In [11]:
# Fetch the contents of the first RNAseq history.
# NOTE(review): in the recorded listing, index 0 of RNAseq_history_IDs is the
# 'NeuroLINCS iMNs' history, while this section's heading says iPSC — confirm
# which history is intended; selecting by list position is order-dependent.
history_contents = galaxyInstance.histories.show_history(
    RNAseq_history_IDs[0],
    contents=True,
)

In [12]:
# (id, name) of every non-deleted dataset collection in the history.
collections = [
    (entry['id'], entry['name'])
    for entry in history_contents
    if entry['history_content_type'] == 'dataset_collection'
    and not entry['deleted']
]
collections

[('10d2377e3553f019', 'NeuroLINCS iMNs: fastqs'),
 ('1f2ff62a1063e808', 'FastQC on collection 163: Webpage'),
 ('9f916718ff70f082', 'FastQC on collection 163: RawData'),
 ('b3d4dd46673b4cd9', 'Trimmomatic on collection 163: paired'),
 ('bc339828b9a45763', 'Trimmomatic on collection 163: unpaired'),
 ('1d5a38af5ddad1da', 'TopHat on collection 815: deletions'),
 ('1b4890ecbaeb5091', 'TopHat on collection 815: align_summary'),
 ('484e0b6740dcf97a', 'TopHat on collection 815: insertions'),
 ('aa9bc7d7deb6c67f', 'TopHat on collection 815: accepted_hits'),
 ('0bd9d7603257dfc1', 'TopHat on collection 815: splice junctions'),
 ('4758bcb5e6921b84', 'featureCounts on collection 1225: summary'),
 ('067ff85fccf28736', 'featureCounts on collection 1225')]

In [13]:
# Collection(s) to export from this history. This ID is the
# 'featureCounts on collection 1225' entry in the collections listing above.
output_collection_ids = ['067ff85fccf28736']

In [14]:
# Elements of the first selected output collection.
collection1 = galaxyInstance.histories.show_dataset_collection(
    RNAseq_history_IDs[0],
    output_collection_ids[0],
)['elements']

In [15]:
# Sanity check: show any element that is not a DatasetCollectionElement.
[
    element
    for element in collection1
    if element['model_class'] != 'DatasetCollectionElement'
]

[]

In [16]:
# One (element identifier, dataset id, dataset name) tuple per collection element.
datasets = [
    (
        element['element_identifier'],
        element['object']['id'],
        element['object']['name'],
    )
    for element in collection1
]
datasets[:5]

[('R231-L3-P01-ATCACG--Sequences',
  '9d5c8285b392eb55',
  'featureCounts on data 164 and data 821'),
 ('R231-L3-P02-CGATGT--Sequences',
  'dfa75c894b03b95d',
  'featureCounts on data 164 and data 826'),
 ('R231-L3-P03-TTAGGC--Sequences',
  '4e196eddbeeb8d11',
  'featureCounts on data 164 and data 831'),
 ('R231-L3-P04-TGACCA--Sequences',
  '2174cd1ac32b7d7a',
  'featureCounts on data 164 and data 836'),
 ('R231-L3-P05-ACAGTG--Sequences',
  '791ca9b7466fda64',
  'featureCounts on data 164 and data 841')]

In [22]:
# Download every dataset to a local file named after its element identifier.
# Failures are best-effort: the offending (identifier, id, name) tuple is
# printed and the loop moves on to the next dataset.
for dataset in datasets:
    try:
        galaxyInstance.datasets.download_dataset(
            dataset[1],
            file_path=dataset[0],
            use_default_filename=False,
        )
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit can still stop a long download loop.
    except Exception:
        print(dataset)

('R231-L7-P10-GTGAAA--Sequences', '477c23da7548c0da', 'featureCounts on data 164 and data 1141')
('R231-L7-P11-GTGGCC--Sequences', 'a2ab7b3b87093bc2', 'featureCounts on data 164 and data 1146')
('R231-L7-P12-ACTGAT--Sequences', '98c3ae82f4c28563', 'featureCounts on data 164 and data 1151')
('R231-L8-P03-TTAGGC--Sequences', '6aa067d17905128c', 'featureCounts on data 164 and data 1171')
('R231-L8-P09-ATGTCA--Sequences', 'c7a4d14636e052fb', 'featureCounts on data 164 and data 1201')
('R231-L8-P10-GTGAAA--Sequences', '7d60c85abc8c9d41', 'featureCounts on data 164 and data 1206')
('R231-L8-P11-GTGGCC--Sequences', '944a6f7e53c4b019', 'featureCounts on data 164 and data 1211')
('R231-L8-P12-ACTGAT--Sequences', '581fb13be8269fe0', 'featureCounts on data 164 and data 1216')


In [17]:
# NOTE(review): output_collection_ids is assigned a single-element list earlier
# in this section, so index 1 raises the IndexError recorded below. The ID of
# the second (htseq-count) collection needs to be added to that list — it is
# not among the collections listed for this history; TODO confirm the source.
collection2 = galaxyInstance.histories.show_dataset_collection(RNAseq_history_IDs[0],output_collection_ids[1])['elements']

IndexError: list index out of range

In [18]:
# Sanity check: show any element that is not a DatasetCollectionElement.
[
    element
    for element in collection2
    if element['model_class'] != 'DatasetCollectionElement'
]

NameError: name 'collection2' is not defined

In [60]:
# One (element identifier, dataset id, dataset name) tuple per collection element.
mo_datasets = [
    (
        element['element_identifier'],
        element['object']['id'],
        element['object']['name'],
    )
    for element in collection2
]
mo_datasets[:5]

[('R219-L1-P01-ATCACG-',
  '652c6e153d8fab36',
  'htseq-count on data 2237 and data 1677'),
 ('R219-L1-P02-CGATGT-',
  'eac4142f40a115d5',
  'htseq-count on data 2237 and data 1678'),
 ('R219-L1-P03-TTAGGC-',
  '540c073e67632cec',
  'htseq-count on data 2237 and data 1679'),
 ('R219-L1-P04-TGACCA-',
  'cb7b3da348f26a93',
  'htseq-count on data 2237 and data 1680'),
 ('R219-L1-P05-ACAGTG-',
  '6f20225e3cebcd9a',
  'htseq-count on data 2237 and data 1681')]

In [62]:
# Create the "iPSC" folder in the destination library; the returned record
# (including the new folder's id) is displayed as the cell output.
galaxyInstance.libraries.create_folder(
    transcriptomics_level3_library_id,
    "iPSC",
)

[{'id': 'F712bb19b076d5e1a',
  'name': 'iPSC',
  'url': '/api/libraries/ba751ee0539fff04/contents/F712bb19b076d5e1a'}]

In [63]:
# Copy each dataset (by its dataset id, tuple position 1) into the library
# folder. The hard-coded folder_id is the id shown in the output of the "iPSC"
# create_folder call.
# NOTE(review): the recorded preview of `datasets` shows R231-* samples, the
# same prefix as the htseq-count datasets filed under "iMN" further down —
# confirm these belong in the iPSC folder.
for dataset in datasets:
    galaxyInstance.libraries.copy_from_dataset(
        transcriptomics_level3_library_id,
        dataset[1],
        folder_id='F712bb19b076d5e1a',
    )

In [64]:
# Copy each dataset (by its dataset id, tuple position 1) into the library
# folder. The hard-coded folder_id is the id shown in the output of the "iPSC"
# create_folder call.
for dataset in mo_datasets:
    galaxyInstance.libraries.copy_from_dataset(
        transcriptomics_level3_library_id,
        dataset[1],
        folder_id='F712bb19b076d5e1a',
    )

### ii. `NeuroLINCS Exp 2: iMNs - RNAseq Pipeline`

In [23]:
# Fetch the contents of the second RNAseq history.
# NOTE(review): in the recorded listing, index 1 of RNAseq_history_IDs is the
# 'NeuroLINCS iPSC' history, while this section's heading says iMNs — confirm
# which history is intended; selecting by list position is order-dependent.
history_contents = galaxyInstance.histories.show_history(
    RNAseq_history_IDs[1],
    contents=True,
)

In [24]:
# (id, name) of every non-deleted dataset collection in the history.
collections = [
    (entry['id'], entry['name'])
    for entry in history_contents
    if entry['history_content_type'] == 'dataset_collection'
    and not entry['deleted']
]
collections

[('449edefbd273aad3', 'NeuroLINCS iPSC: fastqs'),
 ('5037c6a2beeaab0f', 'FastQC on collection 223: Webpage'),
 ('a022b13d513a18a6', 'FastQC on collection 223: RawData'),
 ('c89a7b7b675d5362', 'Trimmomatic on collection 223: paired'),
 ('e3ca3db0a290510f', 'Trimmomatic on collection 223: unpaired'),
 ('24a31d159cba9d38', 'TopHat on collection 1115: deletions'),
 ('5ad66ab0f6c619e8', 'TopHat on collection 1115: align_summary'),
 ('2924410f55e55fcb', 'TopHat on collection 1115: insertions'),
 ('7f52853230b573e4', 'TopHat on collection 1115: accepted_hits'),
 ('8fbe42edce4b2753', 'TopHat on collection 1115: splice junctions'),
 ('f0315bfa1fadf37a', 'featureCounts on collection 1675: summary'),
 ('abb4fffc3f994d02', 'featureCounts on collection 1675'),
 ('508ccef795c2a59e', 'featureCounts on collection 1675: summary'),
 ('f52c95b2c7ffaf93', 'featureCounts on collection 1675')]

In [25]:
# Collection(s) to export from this history. The listing above contains two
# collections named 'featureCounts on collection 1675'; this ID is the last one.
output_collection_ids = ['f52c95b2c7ffaf93']

In [26]:
# Elements of the first selected output collection of the second history.
collection3 = galaxyInstance.histories.show_dataset_collection(
    RNAseq_history_IDs[1],
    output_collection_ids[0],
)['elements']

In [27]:
# Sanity check: show any element that is not a DatasetCollectionElement.
[
    element
    for element in collection3
    if element['model_class'] != 'DatasetCollectionElement'
]

[]

In [28]:
# One (element identifier, dataset id, dataset name) tuple per collection element.
datasets3 = [
    (
        element['element_identifier'],
        element['object']['id'],
        element['object']['name'],
    )
    for element in collection3
]
datasets3[:5]

[('R219-L1-P01-ATCACG--Sequences',
  'a03002c7cfa04567',
  'featureCounts on data 224 and data 1121'),
 ('R219-L1-P02-CGATGT--Sequences',
  '4b1c0ea4a4c038c4',
  'featureCounts on data 224 and data 1126'),
 ('R219-L1-P03-TTAGGC--Sequences',
  '2a7179b07fe301d1',
  'featureCounts on data 224 and data 1131'),
 ('R219-L1-P04-TGACCA--Sequences',
  '1308e1c29bca46c9',
  'featureCounts on data 224 and data 1136'),
 ('R219-L1-P05-ACAGTG--Sequences',
  'd59465f19e1b900b',
  'featureCounts on data 224 and data 1141')]

In [30]:
# Download every dataset to a local file named after its element identifier.
# Failures are best-effort: the offending (identifier, id, name) tuple is
# printed and the loop moves on to the next dataset.
for dataset in datasets3:
    try:
        galaxyInstance.datasets.download_dataset(
            dataset[1],
            file_path=dataset[0],
            use_default_filename=False,
        )
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit can still stop a long download loop.
    except Exception:
        print(dataset)

In [72]:
# NOTE(review): output_collection_ids is assigned a single-element list earlier
# in this section, so index 1 will raise IndexError when the notebook is run
# top to bottom (the identical pattern in the first history section records
# that error). Add the second (htseq-count) collection ID to the list — TODO
# confirm which collection that is.
collection4 = galaxyInstance.histories.show_dataset_collection(RNAseq_history_IDs[1],output_collection_ids[1])['elements']

In [74]:
# Sanity check: show any element that is not a DatasetCollectionElement.
[
    element
    for element in collection4
    if element['model_class'] != 'DatasetCollectionElement'
]

[]

In [75]:
# One (element identifier, dataset id, dataset name) tuple per collection element.
datasets4 = [
    (
        element['element_identifier'],
        element['object']['id'],
        element['object']['name'],
    )
    for element in collection4
]
datasets4[:5]

[('R231-L3-P01-ATCACG-',
  'cb3193d5b0baf0f9',
  'htseq-count on data 1556 and data 1227'),
 ('R231-L3-P02-CGATGT-',
  '6b8fb61ab0fcf61f',
  'htseq-count on data 1556 and data 1228'),
 ('R231-L3-P03-TTAGGC-',
  '02f5b613a7345076',
  'htseq-count on data 1556 and data 1229'),
 ('R231-L3-P04-TGACCA-',
  'a3bb5e2a6ccb4d65',
  'htseq-count on data 1556 and data 1230'),
 ('R231-L3-P05-ACAGTG-',
  'acf1972cab0da102',
  'htseq-count on data 1556 and data 1231')]

In [76]:
# Create the "iMN" folder in the destination library; the returned record
# (including the new folder's id) is displayed as the cell output.
galaxyInstance.libraries.create_folder(
    transcriptomics_level3_library_id,
    "iMN",
)

[{'id': 'Fa8146cfb819e54ec',
  'name': 'iMN',
  'url': '/api/libraries/ba751ee0539fff04/contents/Fa8146cfb819e54ec'}]

In [77]:
# Copy each dataset (by its dataset id, tuple position 1) into the library
# folder. The hard-coded folder_id is the id shown in the output of the "iMN"
# create_folder call.
# NOTE(review): the recorded preview of `datasets3` shows R219-* samples, the
# same prefix as the htseq-count datasets filed under "iPSC" earlier —
# confirm these belong in the iMN folder.
for dataset in datasets3:
    galaxyInstance.libraries.copy_from_dataset(
        transcriptomics_level3_library_id,
        dataset[1],
        folder_id='Fa8146cfb819e54ec',
    )

In [78]:
# Copy each dataset (by its dataset id, tuple position 1) into the library
# folder. The hard-coded folder_id is the id shown in the output of the "iMN"
# create_folder call.
for dataset in datasets4:
    galaxyInstance.libraries.copy_from_dataset(
        transcriptomics_level3_library_id,
        dataset[1],
        folder_id='Fa8146cfb819e54ec',
    )