In [1]:
import pandas as pd
import os
from genoslurm.genoslurm import chunks, GenCallJob

In [2]:
# cluster config variables
max_nodes = 10

# paths -- everything lives under the user's home directory
userhome = '/home/dan_datatecnica_com'
datapath = f'{userhome}/data'
log_dir = f'{userhome}/logs'
ilmn_files_path = f'{userhome}/ilmn_files'
# iaap-cli executable plus the .bpm / .egt files handed to GenCallJob
iaap = f'{userhome}/GenoTools/executables/iaap-cli-linux-x64-1.1.0-sha.80d7e5b3d9c1fdfc2e99b472a90652fd3848bbc7/iaap-cli/iaap-cli'
bpm = f'{ilmn_files_path}/NeuroBooster_20042459_A2.bpm'
egt = f'{ilmn_files_path}/recluster_09272022.egt'
plink_file_path = f'{datapath}/gp2_plink'

# GCS locations, written as "<bucket>/<prefix>" without the gs:// scheme
gcs_plink_path = 'gp2_uk/gp2_plink'
gcs_idat_path = 'gp2_uk/gp2_idats'

In [3]:
# list of idats in directory of choice
!gsutil ls gs://gp2_uk/gp2_idats/ > cluster_scripts/idats.txt
idats_in = pd.read_csv('cluster_scripts/idats.txt', header=None)
idat_list = idats_in.loc[1:,0]
idat_list = [x.replace('gs://gp2_uk/gp2_idats',f'{datapath}')[:-1] for x in idat_list]
gcs_idat_paths = [x.replace('gs://', '').rstrip('/') for x in idats_in.loc[1:,0]]
# chunk list by max nodes
idat_list_chunks = chunks(idat_list, max_nodes)

In [4]:
# Write one sbatch script per chunk of IDAT directories:
# cluster_scripts/call_gts_<i>.sh for chunk i.
for i, idat_list_chunk in enumerate(idat_list_chunks):

    script_path = f'cluster_scripts/call_gts_{i}.sh'
    job_name = f'callgts_{i}'
    # Request one node and one task per IDAT directory in this chunk.
    nodes = len(idat_list_chunk)
    ntasks = len(idat_list_chunk)
    time_limit = '01:00:00'

    # NOTE(review): threads=4 is larger than cpus_per_task=3 -- confirm this
    # is intended. Both values are passed through exactly as before.
    job = GenCallJob(
        sbatch_path=script_path,
        idat_dir_ins=idat_list_chunk,
        gcs_idat_path=gcs_idat_path,
        iaap=iaap,
        bpm=bpm,
        egt=egt,
        gcs_plink_path=gcs_plink_path,
        log_path=log_dir,
        job_name=job_name,
        nodes=nodes,
        tasks_per_node=1,
        threads=4,
        ntasks=ntasks,
        cpus_per_task=3,
        mem_per_cpu='2G',
        time=time_limit,
    )

    job.write_sbatch_script()

In [5]:
# copy scripts to vm
# Upload every generated sbatch script (call_gts_*.sh) and the call_gts.py
# script to /home/dan_datatecnica_com/scripts/ on the genoslurm-uk-v1-login0
# host (project genotools, zone europe-west2-a).
!gcloud compute scp cluster_scripts/call_gts_*.sh genoslurm-uk-v1-login0:/home/dan_datatecnica_com/scripts/ --project genotools --zone europe-west2-a
!gcloud compute scp cluster_scripts/call_gts.py genoslurm-uk-v1-login0:/home/dan_datatecnica_com/scripts/ --project genotools --zone europe-west2-a

call_gts_0.sh                                 100% 2038    15.0KB/s   00:00    
call_gts_1.sh                                 100%  554     4.2KB/s   00:00    
call_gts.py                                   100% 1633    12.6KB/s   00:00    


In [6]:
# launch commands
# Submit a single script to Slurm by running `sbatch` over SSH on the
# genoslurm-uk-v1-login0 host. Only call_gts_1.sh is submitted here, with a
# fixed node count of 2 (`-N 2`).
# NOTE(review): any other call_gts_<i>.sh scripts are not submitted by this
# cell, and `-N 2` is not derived from the chunk size -- confirm both are
# intentional.
!gcloud compute ssh --zone europe-west2-a --project genotools genoslurm-uk-v1-login0 --command 'sbatch -N 2 /home/dan_datatecnica_com/scripts/call_gts_1.sh'

Submitted batch job 29


In [5]:
# Print a generated sbatch script for inspection. `{i}` is interpolated from
# the kernel namespace: it is the loop variable left over from the
# script-generation loop, so this shows the last script written and fails if
# that loop has not run in the current kernel.
!cat cluster_scripts/call_gts_{i}.sh

#!/bin/bash
srun python3 /home/dan_datatecnica_com/scripts/call_gts.py --input /home/dan_datatecnica_com/data/206451070115 --gcs_in gp2_uk/gp2_idats/206451070115 --gcs_out gp2_uk/gp2_plink &
srun python3 /home/dan_datatecnica_com/scripts/call_gts.py --input /home/dan_datatecnica_com/data/206451070117 --gcs_in gp2_uk/gp2_idats/206451070117 --gcs_out gp2_uk/gp2_plink &
wait