In [1]:
import pandas as pd
import os
from genoslurm.genoslurm import chunks, GenCallJob

In [2]:
# --- cluster sizing ---
# value handed to chunks() when the barcode list is split into batches
max_nodes = 100

# --- cluster-side locations ---
# MAKE SURE TO SET userhome TO YOUR OWN HOME PATH IN THE CLUSTER;
# every other cluster path below is derived from it.
userhome = '/home/dan_datatecnica_com'
datapath = userhome + '/data'
log_dir = userhome + '/logs'
ilmn_files_path = userhome + '/ilmn_files'
plink_file_path = datapath + '/gp2_plink'

# paths to the iaap-cli executable and the .bpm / .egt files
iaap = (
    userhome
    + '/GenoTools/executables'
    + '/iaap-cli-linux-x64-1.1.0-sha.80d7e5b3d9c1fdfc2e99b472a90652fd3848bbc7'
    + '/iaap-cli/iaap-cli'
)
bpm = ilmn_files_path + '/NeuroBooster_20042459_A2.bpm'
egt = ilmn_files_path + '/recluster_09272022.egt'

# --- GCS locations, written without the gs:// scheme ---
gcs_plink_path = 'gp2_uk/gp2_plink'
gcs_idat_path = 'gp2_uk/gp2_idats'

In [3]:
# list of idats in directory of choice
!gsutil ls gs://gp2_uk/gp2_idats/ > cluster_scripts/idats.txt
idats_in = pd.read_csv('cluster_scripts/idats.txt', header=None)
idat_list = idats_in.loc[1:,0]
idat_list = [x.replace('gs://gp2_uk/gp2_idats',f'{datapath}')[:-1] for x in idat_list]
gcs_idat_paths = [x.replace('gs://', '').rstrip('/') for x in idats_in.loc[1:,0]]
# chunk list by max nodes
idat_list_chunks = chunks(idat_list, max_nodes)
gcs_idat_paths_chunks = chunks(gcs_idat_paths, max_nodes)
# need this without gs://
gcs_plink_path = 'gp2_uk/gp2_idats'

In [5]:
# Build one sbatch array-job command per chunk of barcodes
# (1 array task per sentrix barcode in the chunk).
# sbatch_cmds saves the commands that will be run in the notebook using gcloud compute ...
sbatch_cmds = []

# the .inputs files are written locally first; make sure the directory exists
os.makedirs('inputs', exist_ok=True)

# walk the cluster-path chunks and the GCS-path chunks in lockstep
for i, (idat_list_chunk, gcs_idat_paths_chunk) in enumerate(zip(idat_list_chunks, gcs_idat_paths_chunks)):

    ntasks = len(idat_list_chunk)
    if ntasks == 0:
        # an empty chunk would produce an invalid "--array=1-0" range
        continue

    input_file_path = f'inputs/call_gts_{i}.inputs'
    job_name = f'callgts_{i}'

    # this writes input_file_path, one tab-separated line per barcode, with 3 columns:
    # 1. path to idat in the cluster
    # 2. path to idat directory in gcs
    # 3. path to output in gcs
    with open(input_file_path, 'w') as f:
        for idat, gcs_idat_dir in zip(idat_list_chunk, gcs_idat_paths_chunk):
            # last path component of the idat path names the per-barcode output
            code = idat.split('/')[-1]
            gcs_plink_path_out = f'{gcs_plink_path}/{code}'
            f.write(f"{idat}\t{gcs_idat_dir}\t{gcs_plink_path_out}\n")

    # now write command for this batch of sentrix barcodes;
    # %A / %a are left literal for sbatch to fill in the log file names
    sbatch_cmd = (
        f"sbatch --job-name {job_name} "
        f"--output={log_dir}/{job_name}.%A_%a.out "
        f"--error={log_dir}/{job_name}.%A_%a.err "
        f"--ntasks={ntasks} --cpus-per-task=3 --time=01:00:00 "
        f"--array=1-{ntasks} "
        f"{userhome}/scripts/call_gts.sh {userhome}/{input_file_path}"
    )
    sbatch_cmds.append(sbatch_cmd)
    print(sbatch_cmd)


sbatch --job-name callgts_0 --output=/home/dan_datatecnica_com/logs/callgts_0.%A_%a.out --error=/home/dan_datatecnica_com/logs/callgts_0.%A_%a.err --ntasks=12 --cpus-per-task=3 --time=01:00:00 --array=1-12 /home/dan_datatecnica_com/scripts/call_gts.sh /home/dan_datatecnica_com/inputs/call_gts_0.inputs


In [9]:
# copy scripts to vm
!gcloud compute scp cluster_scripts/call_gts.sh genoslurm-uk-v1-controller:{userhome}/scripts/ --project genotools --zone europe-west2-a
!gcloud compute scp cluster_scripts/call_gts.py genoslurm-uk-v1-controller:{userhome}/scripts/ --project genotools --zone europe-west2-a
!gcloud compute scp inputs/* genoslurm-uk-v1-controller:{userhome}/inputs/ --project genotools --zone europe-west2-a

call_gts.sh                                   100%  460     1.8KB/s   00:00    
call_gts.py                                   100% 1702     6.1KB/s   00:00    
call_gts_0.inputs                             100% 1040     3.2KB/s   00:00    
call_gts_1.inputs                             100%  208     1.0KB/s   00:00    


In [10]:
# launch commands
for cmd in sbatch_cmds:
    !gcloud compute ssh --zone europe-west2-a --project genotools genoslurm-uk-v1-controller --command "{cmd}"

Submitted batch job 48
