In [1]:
from QC.utils import shell_do
import os
import subprocess
import requests
import json
import time

In [2]:
# Output directory for the McGill test cohort; the imputed results go in a sub-directory of it.
out = '/data/vitaled2/test_data/mcgill'
# PLINK binary fileset prefix (no extension) that is passed to `plink --bfile`.
geno = out + '/MCGILL_all_callrate_sex'

# ref_path = '/data/vitaled2/GenoTools/ref'
# check_bim_pl_script = f'{ref_path}/HRC-1000G-check-bim.pl'
# ref_panel_path = f'{ref_path}/PASS.Variantsbravo-dbsnp-all.tab'

In [3]:
# download file to check
# bash1 = "wget http://www.well.ox.ac.uk/~wrayner/tools/HRC-1000G-check-bim.v4.2.5.zip -P " + out_path
# bash2 = "unzip " + out_path + "HRC-1000G-check-bim.v4.2.5.zip -d " + out_path
# bash3 = "wget ftp://ngs.sanger.ac.uk/production/hrc/HRC.r1-1/HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz -P " + out_path
# bash4 = "gunzip " + out_path + "HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz"

# bashes = [bash1,bash2,bash3,bash4]

# for bash in bashes:
#     shell_do(bash)

# !curl 'https://bravo.sph.umich.edu/freeze3a/hg19/download/all' -H 'Accept-Encoding: gzip, deflate, br' -H 'Cookie: remember_token="<REDACTED - supply your own BRAVO session cookie; never commit it>"' --compressed > {ref_path}/bravo-dbsnp-all.vcf.gz
# !wget https://www.well.ox.ac.uk/~wrayner/tools/CreateTOPMed.zip -P {ref_path}
# !unzip /data/vitaled2/GenoTools/ref/CreateTOPMed.zip -d {ref_path}
# !gunzip {ref_path}/PASS.Variantsbravo-dbsnp-all.tab.gz

In [15]:
def impute_data_prep(geno_path, out_path, ref_panel='ref/PASS.Variantsbravo-dbsnp-all.tab'):

    '''
    Prepare a PLINK binary fileset for upload to the imputation server.

    Steps, in order:
      1. `plink --freq` on the input fileset.
      2. The HRC-1000G-check-bim.pl script against the reference panel.
      3. The `Run-plink.sh` script (expected in out_path after step 2).
      4. Recode each `-updated-chr<N>` fileset to VCF for chromosomes 1..23.
      5. `vcf-sort | bgzip` each VCF into `<geno_path>_pre_impute_chr<N>.vcf.gz`.

    Steps 1-3 run with out_path as the working directory, so the relative
    `ref/...` paths (script and default ref_panel) are resolved against
    out_path. The original working directory is always restored, even when
    one of those commands raises.

    info here:
    https://www.well.ox.ac.uk/~wrayner/tools/

    Args:
        geno_path: PLINK fileset prefix (no extension). NOTE(review): should
            be absolute, since it is used both inside and outside out_path.
        out_path: directory to run the check-bim workflow in.
        ref_panel: reference panel file passed to the check script via `-r`.

    Returns:
        dict with key 'vcfs' mapping to the list of 23 bgzipped VCF paths
        (the paths are returned whether or not the commands succeeded).
    '''

    check_bim_pl_script = 'ref/HRC-1000G-check-bim.pl'
    chromosomes = range(1, 24)

    prep_cmds = [
        f'plink --bfile {geno_path} --freq --out {geno_path}',
        f'perl {check_bim_pl_script} -b {geno_path}.bim -f {geno_path}.frq -r {ref_panel} -h',
        'sh Run-plink.sh',
    ]

    workdir = os.getcwd()
    os.chdir(out_path)
    try:
        for cmd in prep_cmds:
            shell_do(cmd)
    finally:
        # Never leave the kernel stranded in out_path if a command fails.
        os.chdir(workdir)

    # Recode the per-chromosome filesets to VCF.
    for chrom in chromosomes:
        shell_do(f'plink --bfile {geno_path}-updated-chr{chrom} --recode vcf --chr {chrom} --out {geno_path}_chr{chrom}')

    # Then sort and bgzip; the pipe and redirect need a real shell.
    vcf_outpaths = []
    for chrom in chromosomes:
        vcf_gz = f'{geno_path}_pre_impute_chr{chrom}.vcf.gz'
        subprocess.run(f'vcf-sort {geno_path}_chr{chrom}.vcf | bgzip -c > {vcf_gz}', stdout=subprocess.PIPE, shell=True)
        vcf_outpaths.append(vcf_gz)

    return {'vcfs': vcf_outpaths}


def check_impute_status(token, job_id):
    '''
    Look up one job in the imputation server's job list and report its state.

    Fetches `/jobs` with the API token, finds the entry whose 'id' equals
    job_id, prints a one-line status message for the known states
    (1 launching, 2 running, 3 see web front end, 4 completed, 5 failed)
    and returns the raw state value.

    Args:
        token: API token sent in the 'X-Auth-Token' header.
        job_id: id of the job to look for.

    Returns:
        The job's 'state' value, or None when no job with that id is listed.

    Raises:
        Exception: if the server does not answer with HTTP 200.
    '''

    # imputation server url
    api_root = 'https://imputation.biodatacatalyst.nhlbi.nih.gov/api/v2'

    # the token travels in a header (see authentication)
    auth_headers = {'X-Auth-Token' : token }

    # get all jobs
    response = requests.get(api_root + "/jobs", headers=auth_headers)
    if response.status_code != 200:
        raise Exception('GET /jobs/ {}'.format(response.status_code))

    # (state code, function building the print() arguments for that state)
    reports = (
        (1, lambda jid: ("Launching Job:", jid)),
        (2, lambda jid: ("Running Job:", jid)),
        (3, lambda jid: (jid, "returned state '3', have a look at jobs on the web front for more information")),
        (5, lambda jid: (jid, "has failed. consult docs on data input to ensure your vcfs are correct")),
        (4, lambda jid: (jid, "COMPLETED!")),
    )

    for job in response.json()['data']:
        if job['id'] == job_id:
            for code, describe in reports:
                if job['state'] == code:
                    print(*describe(job['id']))
                    break
            return job['state']

    # job_id was not present in the listing
    return None

        
def pull_imputed_data(out_path, token, job_id, password):
    '''
    Download every output of a finished imputation job into out_path.

    Queries `/jobs/<job_id>` for the job's 'outputParams', builds one
    download command per output (id + hash) and runs them with out_path as
    the working directory. The caller's working directory is restored
    afterwards, including when the request fails or a command raises.

    Args:
        out_path: existing directory to download into.
        token: API token sent in the 'X-Auth-Token' header.
        job_id: id of the job whose outputs are pulled.
        password: not used by this function; kept so existing callers keep
            working. NOTE(review): presumably needed later to unzip the
            downloaded archives - confirm.

    Raises:
        Exception: if the server does not answer with HTTP 200.
    '''

    # imputation server url
    url = 'https://imputation.biodatacatalyst.nhlbi.nih.gov/api/v2'

    # add token to header (see authentication)
    headers = {'X-Auth-Token' : token }

    workdir = os.getcwd()
    os.chdir(out_path)
    try:
        r = requests.get(f'{url}/jobs/{job_id}', headers=headers)
        if r.status_code != 200:
            raise Exception(f'GET /jobs/ {r.status_code}')

        # output id -> download hash
        hashes_dict = {param['id']: param['hash'] for param in r.json()['outputParams']}

        # run a curl for each
        # SECURITY NOTE: this pipes a script served by the imputation server
        # straight into bash; only use it against a server you trust.
        curls = [f'curl -sL https://imputation.biodatacatalyst.nhlbi.nih.gov/get/{key}/{file_hash} | bash'
                 for key, file_hash in hashes_dict.items()]

        for curl in curls:
            print(f"Curling output data with the following command: {curl}")
            subprocess.run(curl, shell=True)
        print()
        print("Finished Pulling Imputed Data!")
        print()
    finally:
        # Always hand the caller back its original working directory.
        os.chdir(workdir)
    

def submit_job(vcf_list, server='topmed', password='imputer', token=None):
    '''
    Upload VCF files to the imputation server and queue an imputation job.

    The files are posted as multipart uploads to
    `/jobs/submit/imputationserver` with the TOPMed r2 reference panel,
    eagle phasing and population 'all'. Every file handle opened for the
    upload is closed again before returning, also when the request fails.

    Args:
        vcf_list: paths of the (bgzipped) VCF files to upload, in order.
        server: currently unused; the TOPMed server URL is fixed.
        password: sent as 'input-password' with the job.
        token: API token sent in the 'X-Auth-Token' header.

    Returns:
        The decoded JSON response of the server (contains at least 'id' and
        'message').

    Raises:
        Exception: if the server does not answer with HTTP 200.
    '''
    # test topmed server
    url = 'https://imputation.biodatacatalyst.nhlbi.nih.gov/api/v2'

    # add token to header (see Authentication)
    headers = {'X-Auth-Token' : token}

    data = {'input-mode' : 'imputation',
            'input-files-source': 'file-upload',
            'input-password': password,
            'input-refpanel': 'apps@topmed-r2@1.0.0',
            'input-phasing': 'eagle',
            'input-population': 'all'}

    open_vcfs = []
    try:
        for vcf in vcf_list:
            open_vcfs.append(open(vcf, 'rb'))

        # A list of (field, file) tuples keeps the upload order deterministic.
        files = [('input-files-upload', vcf) for vcf in open_vcfs]

        r = requests.post(url + "/jobs/submit/imputationserver", files=files, headers=headers, data=data)
    finally:
        # Release the handles whether or not the upload succeeded.
        for vcf in open_vcfs:
            vcf.close()

    if r.status_code != 200:
        raise Exception('POST /jobs/submit/imputationserver {}'.format(r.status_code))

    # Decode the response body once and reuse it.
    job_json = r.json()
    print(job_json['id'], job_json['message'])

    print('***************************')
    print('* * * * * * * * * * * * * *')

    return job_json


def run_imputation(vcf_list, out_path, password, token):
    '''
    Submit an imputation job, poll it until it stops, and download the result.

    The job is submitted with the given password and token, then its state
    is checked every 600 seconds. Polling stops once the state reaches 3 or
    higher; when the state is 4 (completed) the outputs are pulled into
    out_path.

    Args:
        vcf_list: paths of the VCF files to upload.
        out_path: directory the imputed data is downloaded into.
        password: password the job is submitted with (also handed to the
            download step).
        token: API token for the imputation server.

    Returns:
        The JSON response of the job submission.

    Raises:
        RuntimeError: if the submitted job can no longer be found on the
            server while polling.
    '''

    # Forward the caller's password so the job is not silently submitted
    # with submit_job's default.
    job_json = submit_job(vcf_list, password=password, token=token)
    job_id = job_json['id']

    imp_state = 0
    while imp_state < 3:
        time.sleep(600)
        os.system('clear')
        imp_state = check_impute_status(token, job_id)

        if imp_state is None:
            # check_impute_status returns None when the job is not listed;
            # comparing None with an int would fail with an opaque TypeError.
            raise RuntimeError(f'Job {job_id} was not found on the imputation server')

        if imp_state == 4:
            print("Pulling Completed Data from Imputation Server!")
            pull_imputed_data(out_path=out_path, token=token, job_id=job_id, password=password)

    return job_json


In [22]:
!mkdir /data/vitaled2/test_data/mcgill/imputed

In [16]:
# The imputation-server API token is a credential, so it is read from the
# environment instead of being committed with the notebook. Any token that was
# previously saved in this notebook should be revoked on the server.
key = os.environ.get('IMPUTATION_SERVER_TOKEN')
if not key:
    raise RuntimeError('Set the IMPUTATION_SERVER_TOKEN environment variable to your imputation server API token.')

imputed_out = f'{out}/imputed'
impute_data = impute_data_prep(geno, out)
run_imputation(impute_data['vcfs'], imputed_out, password='imputer', token=key)

Executing: plink --bfile /data/vitaled2/test_data/mcgill/MCGILL_all_callrate_sex --freq --out /data/vitaled2/test_data/mcgill/MCGILL_all_callrate_sex
Executing: perl ref/HRC-1000G-check-bim.pl -b /data/vitaled2/test_data/mcgill/MCGILL_all_callrate_sex.bim -f /data/vitaled2/test_data/mcgill/MCGILL_all_callrate_sex.frq -r ref/PASS.Variantsbravo-dbsnp-all.tab -h
Executing: sh Run-plink.sh
Executing: plink --bfile /data/vitaled2/test_data/mcgill/MCGILL_all_callrate_sex-updated-chr1 --recode vcf --chr 1 --out /data/vitaled2/test_data/mcgill/MCGILL_all_callrate_sex_chr1
Executing: plink --bfile /data/vitaled2/test_data/mcgill/MCGILL_all_callrate_sex-updated-chr2 --recode vcf --chr 2 --out /data/vitaled2/test_data/mcgill/MCGILL_all_callrate_sex_chr2
Executing: plink --bfile /data/vitaled2/test_data/mcgill/MCGILL_all_callrate_sex-updated-chr3 --recode vcf --chr 3 --out /data/vitaled2/test_data/mcgill/MCGILL_all_callrate_sex_chr3
Executing: plink --bfile /data/vitaled2/test_data/mcgill/MCGILL_a

job-20210621-204729-289 Your job was successfully added to the job queue.
***************************
* * * * * * * * * * * * * *
Launching Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-20210621-204729-289
Running Job: job-

{'success': True,
 'id': 'job-20210621-204729-289',
 'message': 'Your job was successfully added to the job queue.'}

In [12]:
# pull_imputed_data(out_path=imputed_out, token=key, job_id='job-20210618-214638-236',password='imputer')
# pull_imputed_data(out_path, token, impute_id, password)

Curling output data with the following command: curl -sL https://imputation.biodatacatalyst.nhlbi.nih.gov/get/322730/1a2c05d4879a154fe8e76d09476a5807d24525a504a0c0929b1359f3799768f3 | bash
Curling output data with the following command: curl -sL https://imputation.biodatacatalyst.nhlbi.nih.gov/get/322734/9520f9b23ff03166bf61e257bce7a3c21d864b06a1ab256d8198a318fbade616 | bash
Curling output data with the following command: curl -sL https://imputation.biodatacatalyst.nhlbi.nih.gov/get/322736/5d6889d44ef669eda2ef4ebc1d3c7109bb7c755be5240b43bc3d473dcf8da3f2 | bash
Curling output data with the following command: curl -sL https://imputation.biodatacatalyst.nhlbi.nih.gov/get/322737/ac5b2653ada5dc257b6b272d14626766ab027be87ddfcbd1ddc4f3ed380fbe92 | bash

Finished Pulling Imputed Data!



In [None]:


# mcgill_vcf_list = [f'{geno_path}_pre_impute_chr{str(i)}.vcf.gz' for i in range(1,24)]



# def check_impute_status(token, impute_id):
        
#     # imputation server url
#     url = 'https://imputation.biodatacatalyst.nhlbi.nih.gov/api/v2'

#     # add token to header (see authentication)
#     headers = {'X-Auth-Token' : token }

#     # get all jobs
#     r = requests.get(url + "/jobs", headers=headers)
#     if r.status_code != 200:
#         raise Exception('GET /jobs/ {}'.format(r.status_code))

#     status = r.json()
#     for stat in status['data']:
#         if stat['id'] == impute_id:
#             if stat['state'] == 1:
#                 print("Launching Job:", stat['id'])
#             elif stat['state'] == 2:
#                 print("Running Job:", stat['id'])
#             elif stat['state'] == 3:
#                 print(stat['id'], "returned state '3', have a look at jobs on the web front for more information")
#             elif stat['state'] == 5:
#                 print(stat['id'], "has failed. consult docs on data input to ensure your vcfs are correct")
#             elif stat['state'] == 4:
#                 print(stat['id'], "COMPLETED!")

#             return stat['state']

#         else:
#             pass

# def pull_imputed_data(out_path, token, impute_id, password):

#     # imputation server url
#     url = 'https://imputation.biodatacatalyst.nhlbi.nih.gov/api/v2'

#     # add token to header (see authentication)
#     headers = {'X-Auth-Token' : token }

#     r = requests.get(f'{url}/jobs/{_id}', headers=headers)
#     if r.status_code != 200:
#         raise Exception(f'GET /jobs/ {r.status_code}')

#     output_json = r.json()

#     hashes_dict = {output_json['outputParams'][i]['id'] : output_json['outputParams'][i]['hash'] for i in range(len(output_json['outputParams']))}

#     # run a curl for each
#     curls = [f'curl -sL https://imputation.biodatacatalyst.nhlbi.nih.gov/get/{str(key)}/{str(hashes_dict[key])} | bash' for key in hashes_dict]
    
#     for curl in curls:
#         print(f"Curling output data with the following command: {curl}")
#         subprocess.run(curl, shell=True)
#     print() 
#     print("Finished Pulling Imputed Data!")
#     print()
    

# def impute(vcf_list, out_path, server='topmed', password='imputer', token=None)
#     # test topmed server
#     url = 'https://imputation.biodatacatalyst.nhlbi.nih.gov/api/v2'
    
#     # add token to header (see Authentication)
#     headers = {'X-Auth-Token' : token}

#     open_vcfs = [open(vcf, 'rb') for vcf in vcf_list]

#     files = set([('input-files-upload', vcf) for vcf in open_vcfs])
#     # files = {'input-files': open(vcf_list[0], 'rb')}

#     data = {'input-mode' : 'imputation',
#             'input-files-source': 'file-upload',
#             'input-password': password,
#             'input-refpanel': 'apps@topmed-r2@1.0.0',
#             'input-phasing': 'eagle',
#             'input-population': 'all'}

#     r = requests.post(url + "/jobs/submit/imputationserver", files=files, headers=headers, data=data)
#     if r.status_code != 200:
#         raise Exception('POST /jobs/submit/imputationserver {}'.format(r.status_code))

#     impute_id = r.json()['id']
#     message = r.json()['message']
#     print(impute_id, message)
    
#     print('***************************')
#     print('* * * * * * * * * * * * * *') 
        
#     imp_state = 0
#     while imp_state < 3:
#         time.sleep(600)
#         os.system('clear')
#         imp_state = check_impute_status(token, impute_id)

#         if imp_state == 4:
#             print("Pulling Completed Data from Imputation Server!")
#             pull_imputed_data(token, impute_id, password)

In [None]:
# def impute_make_vcf(geno_path, out_path):
# then make vcf files
# def make_vcfs(plink_paths)
#     step1 = "RECODE PLINK FILES TO VCF"
#     mk_vcf_cmds = [f'plink --bfile {geno_path}-updated-chr{str(i)} --recode vcf --chr {str(i)} --out {geno_path}_chr{str(i)}' for i in range(1,24)]   

#     for cmd in mk_vcf_cmds:
#         shell_do(cmd)

#     # ## then sort and zip
#     step2 =  "vcf-sort AND bgzip VCFS"
#     sort_zip_cmds = [f'vcf-sort {geno_path}_chr{str(i)}.vcf | bgzip -c > {geno_path}_pre_impute_chr{str(i)}.vcf.gz' for i in range(1,24)]

#     for cmd in sort_zip_cmds:
#         subprocess.run(cmd, stdout=subprocess.PIPE, shell=True)

In [None]:


#     #now unzip all ".zip" files (one for each chromosome)
#     zip_list = glob.glob('{out_path}/imputed/*.zip')
#     unzip_cmds = ['unzip -P ' + pw + ' ' + file for file in zip_list]

#     for cmd in unzip_cmds:
#         print("Unzipping: " + cmd)
#         subprocess.run(cmd, shell=True)
#     print("Finished Unzipping")




In [None]:
# def impute(self, key, input_population='eur', pw='imputer', vcf_list=None):
#         geno_path = self.geno_path
#         vcf_list = self.vcf_list_for_impute
        
#         # imputation server url
#         url = 'https://imputationserver.sph.umich.edu/api/v2'

#         # add token to header (see Authentication)
#         headers = {'X-Auth-Token' : key}

#         open_vcfs = [open(vcf, 'rb') for vcf in vcf_list]
        
#         files = set([('input-files-upload', vcf) for vcf in open_vcfs])

#         data = {'input-mode' : 'imputation',
#                 'input-files-source': 'file-upload',
#                 'input-password': pw,
#                 'input-refpanel': 'apps@hrc-r1.1',
#                 'input-phasing': 'eagle',
#                 'input-population': input_population}

#         r = requests.post(url + "/jobs/submit/minimac4", files=files, headers=headers, data=data)
#         if r.status_code != 200:
#             raise Exception('POST /jobs/submit/minimac4 {}'.format(r.status_code))
        
#         impute_id = r.json()['id']
#         message = r.json()['message']

#         print(message)
#         print(impute_id)
#         print('***************************')
#         print('* * * * * * * * * * * * * *') 
        
#         imp_state = 0
#         while imp_state < 3:
#             time.sleep(60)
#             os.system('clear')
#             imp_state = self.check_impute_status(key, impute_id)
            
#             if imp_state == 4:
#                 print("Pulling Completed Data from Imputation Server!")
#                 self.pull_imputed_data(key, impute_id, pw)