In [1]:
import glob
import numpy as np
import os
import pandas as pd
import pickle
import prefect
import subprocess
import tempfile
import wolf

Error configuring prefect logger


In [2]:
# Import the "hapaseg" task definition from the parent directory ("../").
# TODO: make remote
hapaseg = wolf.ImportTask(task_path="../", task_name="hapaseg")

In [3]:
# Localize the reference files and sample inputs to RODISK. Each key below
# becomes a key of the task result, which later cells index to obtain the
# localized path.
localization_task = wolf.LocalizeToDisk(
    files={
        "ref_fasta": "gs://getzlab-workflows-reference_files-oa/hg19/Homo_sapiens_assembly19.fasta",
        "ref_fasta_idx": "gs://getzlab-workflows-reference_files-oa/hg19/Homo_sapiens_assembly19.fasta.fai",
        "ref_fasta_dict": "gs://getzlab-workflows-reference_files-oa/hg19/Homo_sapiens_assembly19.dict",
        "coverage_csv": "/home/opriebe/dev/HapASeg/exome/6_C1D1_META.cov",
        "allelic_clusters_object": "/home/opriebe/dev/HapASeg/exome/6_C1D1_META.DP_clusts.auto_ref_correct.overdispersion92.no_phase_correct.npz",
        "SNPs_pickle": "/home/opriebe/dev/HapASeg/exome/6_C1D1_META.SNPs.pickle",
        "covariate_dir": "/home/opriebe/dev/HapASeg/covars",
    }
)

In [4]:
# Run the localization task; the result maps each input key to its localized
# path and is indexed by the downstream task inputs.
loc_res = localization_task.run()

[2022-02-21 18:59:54+0000] INFO - prefect | Starting Slurm controller ...
[2022-02-21 18:59:54+0000] INFO - prefect | Waiting up to 60 seconds for Slurm controller to start ...
[2022-02-21 19:00:08+0000] INFO - prefect | Hashing file /home/opriebe/dev/HapASeg/covars/GSE137764_H1.hg19_raw_liftover.pickle (0 MiB)
[2022-02-21 19:00:08+0000] INFO - prefect | Hashing file /home/opriebe/dev/HapASeg/covars/GC.pickle (0 MiB)
[2022-02-21 19:00:08+0000] INFO - prefect | Hashing file /home/opriebe/dev/HapASeg/covars/GSE137764_H1_GaussiansGSE137764_mooth_scaled_autosome.mat (0 MiB)
[2022-02-21 19:00:08+0000] INFO - prefect | Hashing file /home/opriebe/dev/HapASeg/covars/GSE137764_H1.hg38.pickle (0 MiB)
[2022-02-21 19:00:08+0000] INFO - prefect | Hashing file /home/opriebe/dev/HapASeg/covars/GSE137764_H1.hg19_liftover.pickle (0 MiB)
[2022-02-21 19:00:08+0000] INFO - prefect | Localizing inputs...
[2022-02-21 19:00:14+0000] INFO - prefect | Disk name is canine-5eabafd987978c5006fcb69a64dd7a4e
[2022-

In [5]:
# Display the localized location recorded for each input key.
loc_res

{'SNPs_pickle': 'rodisk://canine-5eabafd987978c5006fcb69a64dd7a4e/SNPs_pickle/6_C1D1_META.SNPs.pickle',
 'allelic_clusters_object': 'rodisk://canine-5eabafd987978c5006fcb69a64dd7a4e/allelic_clusters_object/6_C1D1_META.DP_clusts.auto_ref_correct.overdispersion92.no_phase_correct.npz',
 'covariate_dir': 'rodisk://canine-5eabafd987978c5006fcb69a64dd7a4e/covariate_dir/covars',
 'coverage_csv': 'rodisk://canine-5eabafd987978c5006fcb69a64dd7a4e/coverage_csv/6_C1D1_META.cov',
 'ref_fasta': 'rodisk://canine-5eabafd987978c5006fcb69a64dd7a4e/ref_fasta/Homo_sapiens_assembly19.fasta',
 'ref_fasta_dict': 'rodisk://canine-5eabafd987978c5006fcb69a64dd7a4e/ref_fasta_dict/Homo_sapiens_assembly19.dict',
 'ref_fasta_idx': 'rodisk://canine-5eabafd987978c5006fcb69a64dd7a4e/ref_fasta_idx/Homo_sapiens_assembly19.fasta.fai'}

In [6]:
# Build the coverage-MCMC preparation task. The first four inputs are taken
# from the localization result under the same key names; the reference FASTA
# is passed under the name "ref_file_path".
prep_cov_mcmc_task = hapaseg.Hapaseg_prepare_coverage_mcmc(
    inputs={
        **{
            input_key: loc_res[input_key]
            for input_key in (
                "coverage_csv",
                "allelic_clusters_object",
                "SNPs_pickle",
                "covariate_dir",
            )
        },
        "allelic_sample": -1,
        "ref_file_path": loc_res["ref_fasta"],
    }
)

In [7]:
# Run the preparation job; its "preprocess_data" and "cov_df_pickle" outputs
# are consumed by later cells.
prep_cov_mcmc_res = prep_cov_mcmc_task.run()

[2022-02-21 19:03:17+0000] INFO - prefect | Starting Slurm controller ...
[2022-02-21 19:03:17+0000] INFO - prefect | Waiting up to 60 seconds for Slurm controller to start ...
[2022-02-21 19:03:18+0000] INFO - prefect | Localizing inputs...
[2022-02-21 19:03:18+0000] INFO - prefect | Job staged on SLURM controller in: /mnt/nfs/workspace/Hapaseg_prepare_coverage_mcmc__2022-02-21--19-03-18_leiwweq_0z03eha_vaetv3zjnipra
[2022-02-21 19:03:18+0000] INFO - prefect | Preparing pipeline script
[2022-02-21 19:05:49+0000] INFO - prefect | Finished with status COMPLETED


In [8]:
# Path of the preprocessed-data archive (.npz) produced by the preparation step.
prep_cov_mcmc_res["preprocess_data"]

'/mnt/nfs/workspace/Hapaseg_prepare_coverage_mcmc__2022-02-21--19-03-18_leiwweq_0z03eha_vaetv3zjnipra/outputs/0/preprocess_data/preprocess_data.npz'

In [9]:
# Size of the second dimension of the "Pi" array stored in the preprocessed
# archive; cluster_idxs is built from this count.
np.load(prep_cov_mcmc_res["preprocess_data"])["Pi"].shape[1]

18

In [10]:
# Stored "adp_cluster" value; a later cell casts it to int and passes it as
# "allelic_draw_index".
np.load(prep_cov_mcmc_res["preprocess_data"])["adp_cluster"]

array(-1)

In [11]:
# One index per column of "Pi"; the list is passed as "cluster_num" to the
# coverage MCMC task. The archive is opened in a `with` block so its file
# handle is closed promptly, and range() yields plain Python ints instead of
# the NumPy scalars produced by iterating over np.arange().
with np.load(prep_cov_mcmc_res["preprocess_data"]) as preprocess_npz:
    cluster_idxs = list(range(preprocess_npz["Pi"].shape[1]))

In [12]:
# Check the indices that will be passed as "cluster_num".
cluster_idxs

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [13]:
# Coverage MCMC task. "cluster_num" receives the whole list of cluster
# indices; the recorded run log reports 18 completed statuses, matching the
# 18 indices.
cov_mcmc_scatter_task = hapaseg.Hapaseg_coverage_mcmc(
    inputs=dict(
        preprocess_data=prep_cov_mcmc_res["preprocess_data"],
        num_draws=10,
        cluster_num=cluster_idxs,
    )
)

In [14]:
# Run the coverage MCMC jobs; "cov_segmentation_data" from the result is the
# input to the collection step.
cov_mcmc_scatter_res = cov_mcmc_scatter_task.run()

[2022-02-21 19:06:51+0000] INFO - prefect | Starting Slurm controller ...
[2022-02-21 19:06:51+0000] INFO - prefect | Waiting up to 60 seconds for Slurm controller to start ...
[2022-02-21 19:06:52+0000] INFO - prefect | Localizing inputs...
[2022-02-21 19:06:52+0000] INFO - prefect | Job staged on SLURM controller in: /mnt/nfs/workspace/Hapaseg_coverage_mcmc__2022-02-21--19-06-52_ug3blqi_0z03eha_0ywmku3e5jt4g
[2022-02-21 19:06:52+0000] INFO - prefect | Preparing pipeline script
[2022-02-21 19:17:28+0000] INFO - prefect | Finished with statuses COMPLETED: 18


In [15]:
# Collection task for the per-cluster MCMC outputs.
# NOTE(review): the scatter output is wrapped in an outer list, presumably so
# that all files are handed to a single job; confirm against the task
# definition.
cov_mcmc_gather_task = hapaseg.Hapaseg_collect_coverage_mcmc(
    inputs=dict(
        cov_mcmc_files=[cov_mcmc_scatter_res["cov_segmentation_data"]],
        cov_df_pickle=prep_cov_mcmc_res["cov_df_pickle"],
    )
)

In [16]:
# Run the collection job; "cov_collected_data" from the result is the input to
# the coverage DP step.
cov_mcmc_gather_res = cov_mcmc_gather_task.run()

[2022-02-21 19:19:32+0000] INFO - prefect | Starting Slurm controller ...
[2022-02-21 19:19:32+0000] INFO - prefect | Waiting up to 60 seconds for Slurm controller to start ...
[2022-02-21 19:19:33+0000] INFO - prefect | Localizing inputs...
[2022-02-21 19:19:33+0000] INFO - prefect | Job staged on SLURM controller in: /mnt/nfs/workspace/Hapaseg_collect_coverage_mcmc__2022-02-21--19-19-33_fueiipq_0z03eha_enjckqodcde22
[2022-02-21 19:19:33+0000] INFO - prefect | Preparing pipeline script
[2022-02-21 19:20:04+0000] INFO - prefect | Finished with status COMPLETED


In [21]:
# NOTE: the trailing comma turns this expression into a 1-tuple, which is why
# the path is displayed wrapped in parentheses.
cov_mcmc_gather_res["cov_collected_data"],

('/mnt/nfs/workspace/Hapaseg_collect_coverage_mcmc__2022-02-21--19-19-33_fueiipq_0z03eha_enjckqodcde22/outputs/0/cov_collected_data/cov_mcmc_collected_data.npz',)

In [22]:
# Path of the "cov_df_pickle" output (cov_df.pickle) from the preparation step.
prep_cov_mcmc_res["cov_df_pickle"]

'/mnt/nfs/workspace/Hapaseg_prepare_coverage_mcmc__2022-02-21--19-03-18_leiwweq_0z03eha_vaetv3zjnipra/outputs/0/cov_df_pickle/cov_df.pickle'

In [17]:
# Coverage DP task: takes the coverage dataframe pickle from the preparation
# step and the collected MCMC data from the collection step.
cov_dp_task = hapaseg.Hapaseg_coverage_dp(
    inputs=dict(
        f_cov_df=prep_cov_mcmc_res["cov_df_pickle"],
        cov_mcmc_data=cov_mcmc_gather_res["cov_collected_data"],
        num_segmentation_samples=10,
        num_draws=10,
    )
)

In [18]:
# Run the coverage DP job (about 2 h 44 min in the recorded log);
# "cov_dp_object" from the result is used by the ACDP cells.
cov_dp_res = cov_dp_task.run()

[2022-02-21 19:20:11+0000] INFO - prefect | Starting Slurm controller ...
[2022-02-21 19:20:11+0000] INFO - prefect | Waiting up to 60 seconds for Slurm controller to start ...
[2022-02-21 19:20:12+0000] INFO - prefect | Localizing inputs...
[2022-02-21 19:20:12+0000] INFO - prefect | Job staged on SLURM controller in: /mnt/nfs/workspace/Hapaseg_coverage_dp__2022-02-21--19-20-12_vr5yroi_0z03eha_3jsg0ejxgek5k
[2022-02-21 19:20:12+0000] INFO - prefect | Preparing pipeline script
[2022-02-21 22:04:31+0000] INFO - prefect | Finished with status COMPLETED


In [24]:
# Path of the pickled coverage DP model produced by the DP step.
cov_dp_res['cov_dp_object']

'/mnt/nfs/workspace/Hapaseg_coverage_dp__2022-02-21--19-20-12_vr5yroi_0z03eha_3jsg0ejxgek5k/outputs/0/cov_dp_object/Cov_DP_model.pickle'

In [None]:
# Read the stored "adp_cluster" value back from the preprocessed archive as a
# plain int, then build the ACDP dataframe-generation task from the localized
# inputs and the coverage DP model.
adp_draw_num = int(np.load(prep_cov_mcmc_res["preprocess_data"])["adp_cluster"])
gen_acdp_task = hapaseg.Hapaseg_acdp_generate_df(
    inputs=dict(
        SNPs_pickle=loc_res["SNPs_pickle"],
        allelic_clusters_object=loc_res["allelic_clusters_object"],
        coverage_dp_object=cov_dp_res["cov_dp_object"],
        allelic_draw_index=adp_draw_num,
        ref_file_path=loc_res["ref_fasta"],
    )
)

In [None]:
# Run the ACDP dataframe-generation job; "acdp_df_pickle" from the result
# feeds the ACDP run.
gen_acdp_results = gen_acdp_task.run()

[2022-02-21 22:04:31+0000] INFO - prefect | Starting Slurm controller ...
[2022-02-21 22:04:31+0000] INFO - prefect | Waiting up to 60 seconds for Slurm controller to start ...
[2022-02-21 22:04:32+0000] INFO - prefect | Localizing inputs...
[2022-02-21 22:04:32+0000] INFO - prefect | Job staged on SLURM controller in: /mnt/nfs/workspace/Hapaseg_acdp_generate_df__2022-02-21--22-04-32_b3hxleq_0z03eha_5hz5qdv0ftdpg
[2022-02-21 22:04:32+0000] INFO - prefect | Preparing pipeline script
[2022-02-21 22:08:04+0000] INFO - prefect | Finished with status COMPLETED


In [None]:
# Build the ACDP task from the coverage DP model and the generated ACDP
# dataframe.
#
# TODO: the original cell left "cytoband_df" without a value. Set
# cytoband_file to the cytoband table path before running; the guard below
# fails loudly instead of submitting a job with a missing input.
cytoband_file = None
if cytoband_file is None:
    raise ValueError(
        "cytoband_file must be set to the cytoband table path before building the ACDP task"
    )

acdp_task = hapaseg.Hapaseg_run_acdp(
    inputs={
        # The coverage DP result is stored in cov_dp_res (not cov_dp_results).
        "coverage_dp_object": cov_dp_res["cov_dp_object"],
        "acdp_df": gen_acdp_results["acdp_df_pickle"],
        "num_samples": 10,
        "cytoband_df": cytoband_file,
    }
)

In [25]:
# Scratch arithmetic: 2082 divided by 60 (presumably seconds -> minutes; the
# origin of 2082 is not recorded in this notebook — TODO confirm or remove).
2082/60

34.7

In [None]:
pd.read_pickle('../exome/filtered_allelic_segs.pickle