In [1]:
import pandas as pd
import os
from genoslurm.genoslurm import chunks, GenCallJob

In [2]:
# cluster config variables
# passed to chunks() below as the size argument when splitting the idat list
max_nodes = 10

# paths- MAKE SURE TO SET TO YOUR OWN HOME PATH IN THE CLUSTER
userhome = '/home/dan_datatecnica_com'
# plain string literals are used wherever nothing is interpolated
datapath = '/tmp/data'
log_dir = f'{userhome}/logs'
ilmn_files_path = f'{userhome}/ilmn_files'
iaap = f'{userhome}/GenoTools/executables/iaap-cli-linux-x64-1.1.0-sha.80d7e5b3d9c1fdfc2e99b472a90652fd3848bbc7/iaap-cli/iaap-cli'

# copy these from gcs
bpm = f'{ilmn_files_path}/NeuroBooster_20042459_A2.bpm'
egt = f'{ilmn_files_path}/recluster_09272022.egt'

# paths for running pipelines
# plink_file_path is built under datapath; the gcs_* values are bucket-relative
# paths written without the gs:// scheme
plink_file_path = f'{datapath}/gp2_plink'
gcs_plink_path = 'gp2_uk/gp2_plink'
gcs_idat_path = 'gp2_uk/gp2_idats'

In [3]:
# list of idats in directory of choice
!gsutil ls gs://gp2_uk/gp2_idats/ > cluster_scripts/idats.txt
idats_in = pd.read_csv('cluster_scripts/idats.txt', header=None)
idat_list = idats_in.loc[1:,0]
idat_list = [x.replace('gs://gp2_uk/gp2_idats',f'{datapath}')[:-1] for x in idat_list]
gcs_idat_paths = [x.replace('gs://', '').rstrip('/') for x in idats_in.loc[1:,0]]

# chunk list by max nodes
idat_list_chunks = chunks(idat_list, max_nodes)
gcs_idat_paths_chunks = chunks(gcs_idat_paths, max_nodes)


In [4]:
# the following is run in batches of <= max_nodes jobs (1 job per sentrix barcode)
# sbatch_cmds saves the commands that will be run in the notebook using gcloud compute ...
sbatch_cmds = []

job_name = 'callgts'

# every row writes to the same gcs output prefix, so it is built once here.
# NOTE: the config variable gcs_idat_path is deliberately NOT reassigned in this
# cell; re-binding it inside the loop would make it grow on every iteration and
# corrupt it for any later (re-)run.
gcs_plink_path_out = f'{gcs_plink_path}/'

# make sure the local directory for the generated input files exists
os.makedirs('inputs', exist_ok=True)

# loop through chunks of <= max_nodes barcodes; the idat chunks and the gcs path
# chunks are walked in lockstep because they were built from the same listing
for i, (idat_list_chunk, gcs_idat_paths_chunk) in enumerate(zip(idat_list_chunks, gcs_idat_paths_chunks)):

    input_file_path = f'inputs/call_gts_{i}.inputs'

    # one array task per barcode in this chunk
    ntasks = len(idat_list_chunk)

    # this writes input_file path which contains 3 tab-separated columns:
    # 1. path to idat in the cluster
    # 2. path to idat directory in gcs
    # 3. path to output in gcs
    # the with-block closes the file, so no explicit close() is needed
    with open(input_file_path, 'w') as f:
        for idat, gcs_idat_dir in zip(idat_list_chunk, gcs_idat_paths_chunk):
            f.write(f"{idat}\t{gcs_idat_dir}\t{gcs_plink_path_out}\n")

    # now write command for this batch of <= max_nodes sentrix barcodes;
    # log files go to the configured log_dir
    sbatch_cmd = f"sbatch --job-name {job_name} --output={log_dir}/{job_name}.%j.o --error={log_dir}/{job_name}.%j.e --cpus-per-task=3 --time=01:00:00 --array=1-{ntasks} {userhome}/scripts/call_gts.sh {userhome}/{input_file_path}"
    sbatch_cmds.append(sbatch_cmd)
    print(sbatch_cmd)
# alternative log filename pattern for array jobs: %A_%a

sbatch --job-name callgts --output=/home/dan_datatecnica_com/logs/callgts.%j.o --error=/home/dan_datatecnica_com/logs/callgts.%j.e --cpus-per-task=3 --time=01:00:00 --array=1-10 /home/dan_datatecnica_com/scripts/call_gts.sh /home/dan_datatecnica_com/inputs/call_gts_0.inputs
sbatch --job-name callgts --output=/home/dan_datatecnica_com/logs/callgts.%j.o --error=/home/dan_datatecnica_com/logs/callgts.%j.e --cpus-per-task=3 --time=01:00:00 --array=1-10 /home/dan_datatecnica_com/scripts/call_gts.sh /home/dan_datatecnica_com/inputs/call_gts_1.inputs
sbatch --job-name callgts --output=/home/dan_datatecnica_com/logs/callgts.%j.o --error=/home/dan_datatecnica_com/logs/callgts.%j.e --cpus-per-task=3 --time=01:00:00 --array=1-10 /home/dan_datatecnica_com/scripts/call_gts.sh /home/dan_datatecnica_com/inputs/call_gts_2.inputs
sbatch --job-name callgts --output=/home/dan_datatecnica_com/logs/callgts.%j.o --error=/home/dan_datatecnica_com/logs/callgts.%j.e --cpus-per-task=3 --time=01:00:00 --array=1-

In [12]:
# copy scripts to vm
# !gcloud compute scp cluster_scripts/call_gts.sh genoslurm-uk-controller:{userhome}/scripts/ --project genotools --zone europe-west2-a
!gcloud compute scp cluster_scripts/call_gts.py genoslurm-uk-controller:{userhome}/scripts/ --project genotools --zone europe-west2-a
# !gcloud compute scp inputs/* genoslurm-uk-controller:{userhome}/inputs/ --project genotools --zone europe-west2-a

call_gts.py                                   100% 1665    11.2KB/s   00:00    


In [13]:
# launch commands
for cmd in sbatch_cmds[0:5]:
    !gcloud compute ssh genoslurm-uk-controller --zone europe-west2-a --project genotools --command "{cmd}"

Submitted batch job 152
Submitted batch job 162
Submitted batch job 172
Submitted batch job 182
Submitted batch job 192
