In [1]:
import os
import subprocess
import pandas as pd
import numpy as np
import re

In [260]:
# Initialise job parameters
# --- ICA targets ---
target_project_NAME = "Second_Complete_data_RESTRICTED"
target_pipeline_CODE = "DRAGEN Somatic Whole Genome 4-2-4-v2" # Remains fixed for this script. Need to modify DRAGEN parameters if changed
out_folder_NAME = 'somatic' # Folder needs to exist prior to running

# --- Samples ---
fastq_list_PATH = "all_fastqlist.csv" # Need a local copy to map sample names to fastq file names on ICA
fastq_list = pd.read_csv(fastq_list_PATH)
normal_RGSM = "KOLF2_Fibro"
tumour_RGSM = "KOLF2-1_New_iPS_p5"

# Fail fast: later cells select rows by RGSM and read Read1File/Read2File, so a
# missing column or a mistyped sample name would otherwise only show up as
# empty input lists much further down.
_missing_columns = {"RGSM", "Read1File", "Read2File"} - set(fastq_list.columns)
if _missing_columns:
    raise ValueError(f"{fastq_list_PATH} is missing required column(s): {sorted(_missing_columns)}")
_unknown_samples = [s for s in (normal_RGSM, tumour_RGSM) if not fastq_list.RGSM.eq(s).any()]
if _unknown_samples:
    raise ValueError(f"Sample(s) not found in the RGSM column of {fastq_list_PATH}: {_unknown_samples}")

# --- Run settings ---
run_name = f"{normal_RGSM}_vs_{tumour_RGSM}_w_TMB30" # Can customise if needed
storage_size = "Medium"
output_prefix = "KOLF2-1_New_iPS_p5"
sample_sex = "auto"
enable_germline_on_normal = 'false' # Passed as text to the pipeline, hence a string rather than a bool
tmb_thresh = 30 # Used for --vc-callability-tumor-thresh when the run is started

In [261]:
# Retrieve project ID
# check=True makes a failed icav2 call raise here instead of leaving empty output to be parsed below
projects_request = subprocess.run(['icav2', 'projects', 'list'], stdout=subprocess.PIPE, check=True)
project_table = [x.split('\t') for x in projects_request.stdout.decode('utf-8').split('\n')][1:-2] # Remove messy header and last 2 unnecessary rows
project_table = pd.DataFrame(project_table, columns=['ID','NAME','OWNER','Other'])
project_table.NAME = project_table.NAME.str.rstrip() # Strip trailing whitespace so the exact-name match below works
project_matches = project_table.loc[project_table.NAME==target_project_NAME,"ID"]
if len(project_matches) != 1:
    raise ValueError(f"Expected exactly one ICA project named {target_project_NAME!r}, found {len(project_matches)}")
target_project_ID = project_matches.item()

# Retrieve pipeline ID
# Enter the project first so the project-scoped icav2 commands below run against it
subprocess.run(['icav2','projects','enter',target_project_NAME], check=True)
pipelines_request = subprocess.run(['icav2','projectpipelines','list'], stdout=subprocess.PIPE, check=True)
pipeline_table = [x.split('\t') for x in pipelines_request.stdout.decode('utf-8').split('\n')][1:-2] # Remove messy header and last 2 unnecessary rows
pipeline_table = pd.DataFrame(pipeline_table, columns=['ID','CODE','DESCRIPTION','Other'])
# regex=False: the pipeline code is a literal name, not a pattern; na=False: rows without a CODE never match
pipeline_matches = pipeline_table.loc[pipeline_table.CODE.str.contains(target_pipeline_CODE, regex=False, na=False),"ID"]
if len(pipeline_matches) != 1:
    raise ValueError(f"Expected exactly one pipeline whose code contains {target_pipeline_CODE!r}, found {len(pipeline_matches)}")
target_pipeline_ID = pipeline_matches.item()

# Fixed reference genome ID (hard-coded, not looked up)
reference_ID = "fil.05c5aae95c6b456c366f08db3aac4879" # Hg38 Alt-Masked v3, Graph Enabled

# Retrieve OUT folder ID
# Argument list instead of a shell string: a folder name containing spaces or shell
# metacharacters is passed through to icav2 unchanged.
out_folder_request = subprocess.run(['icav2','projectdata','list','--data-type','FOLDER','--file-name',out_folder_NAME], stdout=subprocess.PIPE, check=True)
out_folder_rows = [x.split('\t') for x in out_folder_request.stdout.decode('utf-8').split('\n')]
# Row 0 is the header; the ID is the 4th tab-separated field of the first listed row.
# NOTE(review): only the first listed row is used - confirm this is the intended folder if several share the name.
if len(out_folder_rows) < 2 or len(out_folder_rows[1]) < 4:
    raise ValueError(f"Output folder {out_folder_NAME!r} was not found in project {target_project_NAME!r}; it needs to exist prior to running")
out_folder_ID = out_folder_rows[1][3]

In [262]:
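# Get Fastq list ID -- disabled: looks up a single file matching 'all_fastqlist' and takes the first listed ID; kept for reference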
#fastq_list_NAME = 'all_fastqlist'
#fastq_list_request = subprocess.run(f"icav2 projectdata list --data-type FILE --match-mode FUZZY --file-name {fastq_list_NAME}", shell = True, stdout=subprocess.PIPE)
#fastq_list_ID = [x.split('\t') for x in fastq_list_request.stdout.decode('utf-8').split('\n')][1][3]

In [263]:
# Get Fastq list ID
# List every project file whose name fuzzy-matches "fastq_list"; check=True stops on an icav2 failure.
fastq_list_request = subprocess.run(['icav2','projectdata','list','--data-type','FILE','--match-mode','FUZZY','--file-name','fastq_list'], stdout=subprocess.PIPE, check=True)
fastq_list_table = [x.split('\t') for x in fastq_list_request.stdout.decode('utf-8').split('\n')][1:-2] # Drop header row and the last 2 non-data rows
fastq_list_table = pd.DataFrame(fastq_list_table, columns=['NAME','TYPE','STATUS','ID','OWNING','PROJECT ID','PATH','Other'])
# .copy() so the NAME clean-up below writes to an independent frame rather than a slice of the full listing
fastq_list_table = fastq_list_table.loc[fastq_list_table.OWNING==target_project_NAME].copy()
fastq_list_table.NAME = fastq_list_table.NAME.str.rstrip()

def _fastq_list_ID_for(sample_RGSM):
    """Return the ICA file ID of '<sample_RGSM>_fastq_list.csv' in the target project.

    Raises ValueError unless exactly one file with that name is listed.
    """
    wanted_NAME = f'{sample_RGSM}_fastq_list.csv'
    matches = fastq_list_table.loc[fastq_list_table.NAME==wanted_NAME,"ID"]
    if len(matches) != 1:
        raise ValueError(f"Expected exactly one file named {wanted_NAME!r} in project {target_project_NAME!r}, found {len(matches)}")
    return matches.item()

normal_fastq_list_ID = _fastq_list_ID_for(normal_RGSM)
tumour_fastq_list_ID = _fastq_list_ID_for(tumour_RGSM)

In [264]:
#Get Relevant Fastq file IDs
# List every .fastq.gz file visible to the project, sorted by name; check=True stops on an icav2 failure.
fastq_request = subprocess.run(['icav2','projectdata','list','--data-type','FILE','--match-mode','FUZZY','--file-name','.fastq.gz','--sort-by','name'], stdout=subprocess.PIPE, check=True)
fastq_table = [x.split('\t') for x in fastq_request.stdout.decode('utf-8').split('\n')][1:-2] # Drop header row and the last 2 non-data rows
fastq_table = pd.DataFrame(fastq_table, columns=['NAME','TYPE','STATUS','ID','OWNING','PROJECT ID','PATH','Other'])
# .copy() so the NAME clean-up below writes to an independent frame rather than a slice of the full listing
fastq_table = fastq_table.loc[fastq_table.OWNING==target_project_NAME].copy()
fastq_table.NAME = fastq_table.NAME.str.rstrip()

# File names expected for each sample according to the local fastq list (all Read1 files, then all Read2 files)
normal_fastq_rows = fastq_list.loc[fastq_list.RGSM==normal_RGSM]
tumour_fastq_rows = fastq_list.loc[fastq_list.RGSM==tumour_RGSM]
normal_fastq_NAMEs = normal_fastq_rows["Read1File"].to_list() + normal_fastq_rows["Read2File"].to_list()
tumour_fastq_NAMEs = tumour_fastq_rows["Read1File"].to_list() + tumour_fastq_rows["Read2File"].to_list()

# Guard against starting a run on a partial input set: every expected file name must be present on ICA.
_available_NAMEs = set(fastq_table.NAME)
for _label, _expected_NAMEs in (("normal", normal_fastq_NAMEs), ("tumour", tumour_fastq_NAMEs)):
    if not _expected_NAMEs:
        raise ValueError(f"No fastq files are listed for the {_label} sample in the local fastq list")
    # Blank cells (NaN) are skipped; they name no file.
    _missing_NAMEs = [n for n in _expected_NAMEs if pd.notna(n) and n not in _available_NAMEs]
    if _missing_NAMEs:
        raise ValueError(f"{len(_missing_NAMEs)} {_label} fastq file(s) not found in project {target_project_NAME!r}: {_missing_NAMEs}")

# IDs come out in the listing's order (sorted by name), not in the order of the local fastq list
normal_fastq_IDs = fastq_table.loc[fastq_table.NAME.isin(normal_fastq_NAMEs),"ID"].to_list()
tumour_fastq_IDs = fastq_table.loc[fastq_table.NAME.isin(tumour_fastq_NAMEs),"ID"].to_list()

In [265]:
#Check IDs are non-empty
# Every ID that the run command consumes, keyed by variable name for the error message.
ids_to_check = {
    'target_pipeline_ID': target_pipeline_ID,
    'target_project_ID': target_project_ID,
    'out_folder_ID': out_folder_ID,
    'reference_ID': reference_ID,
    'normal_fastq_IDs': normal_fastq_IDs,
    'tumour_fastq_IDs': tumour_fastq_IDs,
    'normal_fastq_list_ID': normal_fastq_list_ID,
    'tumour_fastq_list_ID': tumour_fastq_list_ID,
}

# Print each value for visual inspection, one per line.
for _value in ids_to_check.values():
    print(_value)

# Enforce the check instead of relying on eyeballing the printout:
# an empty string or empty list here would be passed straight into the run command.
_empty_ids = [_name for _name, _value in ids_to_check.items() if not _value]
if _empty_ids:
    raise ValueError(f"Empty ID value(s), refusing to continue: {_empty_ids}")

c4314895-bcb9-49c3-997f-39d294d7d5b4
85f42989-d886-4301-9430-9627be16c76a
fol.f57f9ca46bb041c2b45208dc82039eee
fil.05c5aae95c6b456c366f08db3aac4879
['fil.a3ff7f6ff33c4dae76ba08dcaaf0c7b4', 'fil.67da79132bad43a3fc7508dcad1115ea']
['fil.f6a340d2c213426d5d6408dc8062247e', 'fil.e8e4504664a340b05d6e08dc8062247e', 'fil.b87d14d563d945c65d7808dc8062247e', 'fil.e38cfb3d2a234caa5d8108dc8062247e', 'fil.b630da78c4d64e855d8908dc8062247e', 'fil.56ccd7162b514e635d9408dc8062247e', 'fil.fa2ee16627764c6c5d9d08dc8062247e', 'fil.5d146db8526649ab5da708dc8062247e']
fil.9e9220ccbb4b44a1780908dcaaf0c7b4
fil.0c74245dc13645d64edb08dc859845c0


In [266]:
#Build run CMD line
# Data inputs, each passed to icav2 as "--input <name>:<file ID(s)>"
pipeline_inputs = [
    f'ref_tar:{reference_ID}',
    f'normal_fastqs:{",".join(normal_fastq_IDs)}',
    f'tumor_fastqs:{",".join(tumour_fastq_IDs)}',
    f'normal_fastq_list:{normal_fastq_list_ID}',
    f'tumor_fastq_list:{tumour_fastq_list_ID}',
]

# Pipeline settings, each passed to icav2 as "--parameters <name>:<value>"
pipeline_parameters = [
    # Disabled:
    #f'additional_args:"--fastq-list-sample-id {normal_RGSM} --tumor-fastq-list-sample-id {tumour_RGSM}"',
    f'output_file_prefix:{output_prefix}',
    f'sample_sex:{sample_sex}',
    f'enable_germline_vc_on_normal:{enable_germline_on_normal}',
    'enable_dragen_reports:true',
    'enable_map_align:true',
    'enable_map_align_output:true',
    'enable_duplicate_marking:true',
    'output_format:BAM',
    'enable_variant_caller:true',
    'enable_cnv:true',
    'cnv_use_somatic_vc_baf:true',
    'enable_sv:true',
    'enable_variant_annotation:true',
    'variant_annotation_assembly:GRCh38',
    'maf_transcript_source:ensembl',
    'enable_tmb:true',
    f'additional_args:--vc-callability-tumor-thresh="{tmb_thresh}"',
]

# Fixed head of the command, followed by one flag/value pair per input and per parameter
run_CMD = [
    'icav2', 'projectpipelines', 'start', 'nextflow', target_pipeline_ID,
    '--user-reference', run_name,
    '--project-id', target_project_ID,
    '--storage-size', storage_size,
    '--output-parent-folder', out_folder_ID,
]
for _input_arg in pipeline_inputs:
    run_CMD.extend(['--input', _input_arg])
for _parameter_arg in pipeline_parameters:
    run_CMD.extend(['--parameters', _parameter_arg])

# Last expression of the cell: the icav2 exit code is displayed as the cell result
subprocess.call(run_CMD)

analysisPriority                      MEDIUM
analysisStorage.description           2.4TB
analysisStorage.id                    96b5a0a9-30d7-4bdb-b3f0-3113b095ef04
analysisStorage.name                  Medium
analysisStorage.ownerId               8ec463f6-1acb-341b-b321-043c39d8716a
analysisStorage.tenantId              f91bb1a0-c55f-4bce-8014-b2e60c0ec7d3
analysisStorage.tenantName            ica-cp-admin
analysisStorage.timeCreated           2021-11-05T10:28:20Z
analysisStorage.timeModified          2023-05-31T16:38:19Z
id                                    d1fa7ff1-374e-4f2e-a4c9-2631f4b8d315
ownerId                               0fc66a48-fa4a-376b-9399-c306a178bed9
pipeline.analysisStorage.description  2.4TB
pipeline.analysisStorage.id           96b5a0a9-30d7-4bdb-b3f0-3113b095ef04
pipeline.analysisStorage.name         Medium
pipeline.analysisStorage.ownerId      8ec463f6-1acb-341b-b321-043c39d8716a
pipeline.analysisStorage.tenantId     f91bb1a0-c55f-4bce-8014-b2e60c0ec7d3
pipeline

0