In [2]:
import os
import subprocess
import pandas as pd
import numpy as np
import re

In [101]:
# Job parameters -- edit these before each run.

# Exact, case-sensitive name of the ICA project.
target_project_NAME = "Second_Complete_data_RESTRICTED"

# Pipeline code; remains fixed for this script. The DRAGEN parameters passed at
# launch need to be modified if this is changed.
target_pipeline_CODE = "DRAGEN Germline Whole Genome 4-2-4-v2"

# Output folder; it needs to exist before the run is started.
out_folder_NAME = "germline"

# Sample name, and the run name derived from it (can be customised if needed).
RGSM = "PK6_Fibro"
run_name = f"{RGSM}_CLI"

# Run storage size.
storage_size = "Medium"

# Local copy of the FASTQ list, needed to map sample names to the FASTQ file
# names on ICA.
fastq_list = pd.read_csv("all_fastqlist.csv")

In [102]:
# Look up the ID of the target project in the table printed by `icav2 projects list`.
# check=True stops the notebook if the CLI call itself fails, instead of going on
# to parse an empty or partial table.
projects_request = subprocess.run(['icav2', 'projects', 'list'], stdout=subprocess.PIPE, check=True)
# The output is parsed as a tab-separated table: drop the header row and the
# last two (non-data) entries.
project_table = [x.split('\t') for x in projects_request.stdout.decode('utf-8').split('\n')][1:-2]
project_table = pd.DataFrame(project_table, columns=['ID','NAME','OWNER','Other'])
# Names carry trailing whitespace in the CLI output; strip it so the exact
# comparison below can match.
project_table.NAME = project_table.NAME.str.rstrip()
matching_project_IDs = project_table.loc[project_table.NAME==target_project_NAME,"ID"].to_list()
# Exactly one project must match; say which name failed rather than relying on
# the generic error raised by Series.item().
if len(matching_project_IDs) != 1:
    raise ValueError(
        f"Expected exactly one project named {target_project_NAME!r}, "
        f"found {len(matching_project_IDs)}"
    )
target_project_ID = matching_project_IDs[0]

In [103]:
# Get pipeline ID
# Enter the target project before listing its pipelines (presumably this sets the
# project that the following `icav2` calls operate on -- confirm against the CLI
# docs). check=True stops here if entering fails, so the listing below is never
# parsed after a failed `enter`.
subprocess.run(['icav2','projects','enter',target_project_NAME], check=True)
pipelines_request = subprocess.run(['icav2','projectpipelines','list'], stdout=subprocess.PIPE, check=True)
# The output is parsed as a tab-separated table: drop the header row and the
# last two (non-data) entries.
pipeline_table = [x.split('\t') for x in pipelines_request.stdout.decode('utf-8').split('\n')][1:-2]
pipeline_table = pd.DataFrame(pipeline_table, columns=['ID','CODE','DESCRIPTION','Other'])
# regex=False: match the pipeline code as a literal substring, so characters such
# as "." or "+" in a pipeline code are not interpreted as a regular expression.
pipeline_matches = pipeline_table.CODE.str.contains(target_pipeline_CODE, regex=False)
matching_pipeline_IDs = pipeline_table.loc[pipeline_matches,"ID"].to_list()
# Exactly one pipeline must match; report the offending code instead of the
# generic error raised by Series.item().
if len(matching_pipeline_IDs) != 1:
    raise ValueError(
        f"Expected exactly one pipeline whose code contains {target_pipeline_CODE!r}, "
        f"found {len(matching_pipeline_IDs)}"
    )
target_pipeline_ID = matching_pipeline_IDs[0]

# Fixed reference genome ID
reference_ID = "fil.05c5aae95c6b456c366f08db3aac4879" # Hg38 Alt-Masked v3, Graph Enabled

# Get output folder ID
# The command is passed as an argument list (no shell), so a folder name that
# contains spaces or shell metacharacters reaches the CLI unchanged.
out_folder_request = subprocess.run(
    ['icav2','projectdata','list','--data-type','FOLDER','--file-name',out_folder_NAME],
    stdout=subprocess.PIPE,
    check=True,
)
out_folder_rows = [x.split('\t') for x in out_folder_request.stdout.decode('utf-8').split('\n')]
# The ID is read from the 4th tab-separated field of the first row after the
# header. Verify that such a row exists before indexing into it.
if len(out_folder_rows) < 2 or len(out_folder_rows[1]) < 4:
    raise ValueError(
        f"Output folder {out_folder_NAME!r} was not found in the project; "
        "it needs to exist before running"
    )
out_folder_ID = out_folder_rows[1][3]

In [104]:
# Get the ID of this sample's fastq_list file
# The command is passed as an argument list; no shell is needed for a fixed command.
fastq_list_request = subprocess.run(
    ['icav2','projectdata','list','--data-type','FILE','--match-mode','FUZZY','--file-name','fastq_list'],
    stdout=subprocess.PIPE,
    check=True,
)
# The output is parsed as a tab-separated table: drop the header row and the
# last two (non-data) entries.
fastq_list_table = [x.split('\t') for x in fastq_list_request.stdout.decode('utf-8').split('\n')][1:-2]
fastq_list_table = pd.DataFrame(fastq_list_table, columns=['NAME','TYPE','STATUS','ID','OWNING','PROJECT ID','PATH','Other'])
# Keep only rows whose OWNING field equals the target project name. The .copy()
# makes the filtered frame independent, so the column assignment below does not
# write into a slice of the unfiltered frame (SettingWithCopyWarning).
fastq_list_table = fastq_list_table.loc[fastq_list_table.OWNING==target_project_NAME,].copy()
fastq_list_table.NAME = fastq_list_table.NAME.str.rstrip()
fastq_list_file_NAME = f'{RGSM}_fastq_list.csv'
matching_fastq_list_IDs = fastq_list_table.loc[fastq_list_table.NAME==fastq_list_file_NAME,"ID"].to_list()
# Exactly one file must match; name the expected file instead of relying on the
# generic error raised by Series.item().
if len(matching_fastq_list_IDs) != 1:
    raise ValueError(
        f"Expected exactly one file named {fastq_list_file_NAME!r} in project "
        f"{target_project_NAME!r}, found {len(matching_fastq_list_IDs)}"
    )
fastq_list_ID = matching_fastq_list_IDs[0]

In [105]:
# Get the IDs of the FASTQ files that belong to this sample
# The command is passed as an argument list; no shell is needed for a fixed command.
fastq_request = subprocess.run(
    ['icav2','projectdata','list','--data-type','FILE','--match-mode','FUZZY','--file-name','.fastq.gz','--sort-by','name'],
    stdout=subprocess.PIPE,
    check=True,
)
# The output is parsed as a tab-separated table: drop the header row and the
# last two (non-data) entries.
fastq_table = [x.split('\t') for x in fastq_request.stdout.decode('utf-8').split('\n')][1:-2]
fastq_table = pd.DataFrame(fastq_table, columns=['NAME','TYPE','STATUS','ID','OWNING','PROJECT ID','PATH','Other'])
# Keep only rows whose OWNING field equals the target project name. The .copy()
# makes the filtered frame independent, so the column assignment below does not
# write into a slice of the unfiltered frame (SettingWithCopyWarning).
fastq_table = fastq_table.loc[fastq_table.OWNING==target_project_NAME,].copy()
fastq_table.NAME = fastq_table.NAME.str.rstrip()

# File names for this sample from the local FASTQ list: all Read1 files, then all Read2 files.
sample_rows = fastq_list.loc[fastq_list.RGSM==RGSM]
fastq_NAMEs = sample_rows["Read1File"].to_list() + sample_rows["Read2File"].to_list()
fastq_IDs = fastq_table.loc[fastq_table.NAME.isin(fastq_NAMEs),"ID"].to_list()

# isin() silently ignores names that have no match, so check explicitly that every
# listed FASTQ was found; otherwise the run would be launched with incomplete inputs.
listed_NAMEs = set(fastq_table.NAME)
missing_NAMEs = [name for name in fastq_NAMEs if pd.notna(name) and name not in listed_NAMEs]
if not fastq_IDs or missing_NAMEs:
    raise ValueError(
        f"FASTQ files for sample {RGSM!r} not found in project "
        f"{target_project_NAME!r}: {missing_NAMEs if missing_NAMEs else 'no files matched'}"
    )

In [106]:
# Check IDs are non-empty
# Print every ID the launch command depends on (same order as before), then stop
# the notebook if any of them is empty, so that "Run All" cannot go on to start a
# run with a missing input.
required_IDs = {
    "target_pipeline_ID": target_pipeline_ID,
    "target_project_ID": target_project_ID,
    "out_folder_ID": out_folder_ID,
    "reference_ID": reference_ID,
    "fastq_IDs": fastq_IDs,
    "fastq_list_ID": fastq_list_ID,
}
for ID_value in required_IDs.values():
    print(ID_value)

# An empty string or an empty list both count as "empty" here.
empty_IDs = [ID_name for ID_name, ID_value in required_IDs.items() if not ID_value]
if empty_IDs:
    raise ValueError(f"Empty IDs, refusing to continue: {', '.join(empty_IDs)}")

54404e49-232c-4e9e-bb8a-1b6902e73711
85f42989-d886-4301-9430-9627be16c76a
fol.9664f6d011df412fbaa908dcb1b64d4b
fil.05c5aae95c6b456c366f08db3aac4879
['fil.8fe1059d38844ae8d0f308dc661c8867', 'fil.743014896f29493cd0f508dc661c8867', 'fil.479b91e2b0674f15712308dc83987b7e', 'fil.90296bb9b6504ba0b9dc08dc7a0ebac2', 'fil.b2abcdcd6c474d1cd0f208dc661c8867', 'fil.c15a085bd7e54193d0f108dc661c8867', 'fil.3c0c7eda2fe54546d0f408dc661c8867', 'fil.ad090f594e7e4bfbd0f608dc661c8867']
fil.d3d7a046cabe404d3a8b08dc85aebaa1


In [107]:
# Build the `icav2 projectpipelines start nextflow` command line and launch the run.

# Fixed leading part: pipeline, run name, project, storage size and output folder.
launch_cmd = [
    'icav2', 'projectpipelines', 'start', 'nextflow', target_pipeline_ID,
    '--user-reference', run_name,
    '--project-id', target_project_ID,
    '--storage-size', storage_size,
    '--output-parent-folder', out_folder_ID,
]

# Data inputs, each passed as its own `--input name:value` pair; the FASTQ IDs
# are joined into one comma-separated value.
analysis_inputs = [
    'ref_tar:' + reference_ID,
    'fastqs:' + ",".join(fastq_IDs),
    'fastq_list:' + fastq_list_ID,
]
for analysis_input in analysis_inputs:
    launch_cmd.extend(['--input', analysis_input])

# DRAGEN settings, each passed as its own `--parameters name:value` pair, in this order.
dragen_parameters = [
    'enable_map_align:true',
    'enable_map_align_output:true',
    'enable_duplicate_marking:true',
    'output_format:BAM',
    'enable_variant_caller:true',
    'vc_emit_ref_confidence:GVCF',
    'vc_enable_vcf_output:true',
    'enable_cnv:true',
    'enable_sv:true',
    'enable_gba:true',
    'enable_variant_annotation:true',
    'variant_annotation_assembly:GRCh38',
    'additional_args:--enable-maf-output true --maf-transcript-source Refseq',
    'enable_dragen_reports:true',
]
for dragen_parameter in dragen_parameters:
    launch_cmd.extend(['--parameters', dragen_parameter])

# Kept as the last expression of the cell so the CLI exit status is shown as the cell output.
subprocess.call(launch_cmd)

analysisPriority                      MEDIUM
analysisStorage.description           2.4TB
analysisStorage.id                    96b5a0a9-30d7-4bdb-b3f0-3113b095ef04
analysisStorage.name                  Medium
analysisStorage.ownerId               8ec463f6-1acb-341b-b321-043c39d8716a
analysisStorage.tenantId              f91bb1a0-c55f-4bce-8014-b2e60c0ec7d3
analysisStorage.tenantName            ica-cp-admin
analysisStorage.timeCreated           2021-11-05T10:28:20Z
analysisStorage.timeModified          2023-05-31T16:38:19Z
id                                    e2b3d88d-38bf-4be0-ae5c-f770acb19a90
ownerId                               0fc66a48-fa4a-376b-9399-c306a178bed9
pipeline.analysisStorage.description  2.4TB
pipeline.analysisStorage.id           96b5a0a9-30d7-4bdb-b3f0-3113b095ef04
pipeline.analysisStorage.name         Medium
pipeline.analysisStorage.ownerId      8ec463f6-1acb-341b-b321-043c39d8716a
pipeline.analysisStorage.tenantId     f91bb1a0-c55f-4bce-8014-b2e60c0ec7d3
pipeline

0