Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

trio-dual/bug for hic phasing #462

Merged
merged 1 commit into from
May 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CommandLines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ static ko_longopt_t long_options[] = {
{ "dbg-ovec", ko_no_argument, 345},
{ "path-max", ko_required_argument, 346},
{ "path-min", ko_required_argument, 347},
{ "trio-dual", ko_no_argument, 348},
// { "path-round", ko_required_argument, 348},
{ 0, 0, 0 }
};
Expand Down Expand Up @@ -129,6 +130,8 @@ void Print_H(hifiasm_opt_t* asm_opt)
fprintf(stderr, " --t-occ INT\n");
fprintf(stderr, " forcedly remove unitigs with >INT unexpected haplotype-specific reads;\n");
fprintf(stderr, " ignore graph topology; [%d]\n", asm_opt->trio_flag_occ_thres);
fprintf(stderr, " --trio-dual utilize homology information to correct trio phasing errors\n");


fprintf(stderr, " Purge-dups:\n");
fprintf(stderr, " -l INT purge level. 0: no purging; 1: light; 2/3: aggressive [0 for trio; 3 for unzip]\n");
Expand Down Expand Up @@ -286,6 +289,7 @@ void init_opt(hifiasm_opt_t* asm_opt)
asm_opt->max_path_drop_rate = 0.6;
asm_opt->hifi_pst_join = 1;
asm_opt->ul_pst_join = 1;
asm_opt->trio_cov_het_ovlp = -1;
}

void destory_enzyme(enzyme* f)
Expand Down Expand Up @@ -830,6 +834,7 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt)
else if (c == 345) asm_opt->dbg_ovec_cal = 1;
else if (c == 346) asm_opt->max_path_drop_rate = atof(opt.arg);
else if (c == 347) asm_opt->min_path_drop_rate = atof(opt.arg);
else if (c == 348) asm_opt->trio_cov_het_ovlp = 1;
else if (c == 'l') { ///0: disable purge_dup; 1: purge containment; 2: purge overlap
asm_opt->purge_level_primary = asm_opt->purge_level_trio = atoi(opt.arg);
}
Expand Down
4 changes: 2 additions & 2 deletions CommandLines.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <pthread.h>
#include <stdint.h>

#define HA_VERSION "0.19.5-r590"
#define HA_VERSION "0.19.5-r592"

#define VERBOSE 0

Expand Down Expand Up @@ -131,7 +131,7 @@ typedef struct {
float dp_e;
int64_t hg_size;
float kpt_rate;
int64_t infor_cov, s_hap_cov;
int64_t infor_cov, s_hap_cov, trio_cov_het_ovlp;
double ul_error_rate, ul_error_rate_low, ul_error_rate_hpc;
int32_t ul_ec_round;
uint8_t is_dbg_het_cnt;
Expand Down
65 changes: 63 additions & 2 deletions Overlaps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17034,6 +17034,62 @@ long long gap_fuzz, ug_opt_t *opt)
0.05, 0.9, max_hang, min_ovlp, gap_fuzz, b_mask_t, NULL, NULL);
}

void output_bp_trio_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name,
ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold,
R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, bub_label_t* b_mask_t,
long long gap_fuzz, ug_opt_t *opt)
{
hic_clean(sg);

kvec_asg_arc_t_warp new_rtg_edges;
kv_init(new_rtg_edges.a);
ma_ug_t *ug = NULL;
ug = ma_ug_gen_primary(sg, PRIMARY_LABLE);

hap_cov_t *cov = NULL;
trans_chain* t_ch = NULL;
if((asm_opt.flag & HA_F_VERBOSE_GFA)) t_ch = load_hc_trans(output_file_name);

if(!t_ch) {
new_rtg_edges.a.n = 0;
asg_t *copy_sg = copy_read_graph(sg);
ma_ug_t *copy_ug = copy_untig_graph(ug);
///asm_opt.purge_overlap_len = asm_opt.purge_overlap_len_hic;
///asm_opt.purge_simi_thres = asm_opt.purge_simi_rate_hic;
adjust_utg_by_primary(&copy_ug, copy_sg, TRIO_THRES, sources, reverse_sources, coverage_cut,
tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, drop_ratio,
max_hang, min_ovlp, &new_rtg_edges, &cov, b_mask_t, 1, 0);

ma_ug_destroy(copy_ug);
asg_destroy(copy_sg);

if((asm_opt.flag & HA_F_VERBOSE_GFA)) write_trans_chain(cov->t_ch, output_file_name);
}

// clean_u_trans_t_idx_filter_adv(&(cov?cov->t_ch->k_trans:t_ch->k_trans), ug, sg);
trio_phasing_refine(ug, sg, cov?&(cov->t_ch->k_trans):&(t_ch->k_trans), opt);

if(cov) destory_hap_cov_t(&cov);
if(t_ch) destory_trans_chain(&t_ch);


// char* gfa_name = (char*)malloc(strlen(output_file_name)+50);
// sprintf(gfa_name, "%s.after.clean_d_utg.noseq.gfa", output_file_name);
// FILE* output_file = fopen(gfa_name, "w");
// ma_ug_print_simple(ug, sg, coverage_cut, sources, ruIndex, "utg", output_file);
// fclose(output_file);
// free(gfa_name);



ma_ug_destroy(ug);
kv_destroy(new_rtg_edges.a);


output_trio_graph_joint(sg, coverage_cut, output_file_name, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex,
0.05, 0.9, max_hang, min_ovlp, gap_fuzz, b_mask_t, NULL, NULL);
}

ma_ug_t* merge_utg(ma_ug_t **dest, ma_ug_t **src)
{
asg_t *g_d = (*dest)->g, *g_s = (*src)->g;
Expand Down Expand Up @@ -36393,8 +36449,13 @@ ma_sub_t **coverage_cut_ptr, int debug_g)
if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION;
// output_trio_graph(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2),
// 0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, 0, gap_fuzz, &uopt, &b_mask_t);
output_trio_graph_joint(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2),
0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, gap_fuzz, &b_mask_t, NULL, NULL);
if(asm_opt.trio_cov_het_ovlp > 0) {
output_bp_trio_graph(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2),
0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, &b_mask_t, gap_fuzz, &uopt);
} else {
output_trio_graph_joint(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2),
0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, gap_fuzz, &b_mask_t, NULL, NULL);
}
}
else if(ha_opt_hic(&asm_opt))
{
Expand Down
1 change: 1 addition & 0 deletions gfa_ut.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,6 @@ void filter_sg_by_ug(asg_t *rg, ma_ug_t *ug, ug_opt_t *uopt);
void ug_ext_gfa(ug_opt_t *uopt, asg_t *sg, uint32_t max_len);
void update_sg_uo(asg_t *g, ma_hit_t_alloc *src);
uint32_t get_arcs(asg_t *g, uint32_t v, uint32_t* idx, uint32_t idx_n);
uint64_t ug_occ_w(uint64_t is, uint64_t ie, ma_utg_t *u);

#endif
184 changes: 184 additions & 0 deletions hic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17722,6 +17722,190 @@ void hic_analysis(ma_ug_t *ug, asg_t* read_g, trans_chain* t_ch, ug_opt_t *opt,
destory_hc_pt_index(ug_index);free(ug_index);
}

uint64_t ug_occ_hap_w(uint64_t is, uint64_t ie, ma_utg_t *u)
{
uint64_t l, i, us, ue, occ;
for (i = l = occ = 0; i < u->n; i++) {
us = l; ue = l + Get_READ_LENGTH(R_INF, (u->a[i]>>33));
if(is <= us && ie >= ue) {
if((R_INF.trio_flag[u->a[i]>>33] == FATHER) || (R_INF.trio_flag[u->a[i]>>33] == MOTHER)) {
occ++;
}
}
if(us >= ie) break;
l += (uint32_t)u->a[i];
}
return occ;
}

void trio_phasing_refine(ma_ug_t *iug, asg_t* sg, kv_u_trans_t *ref, ug_opt_t *opt)
{
ma_ug_t *ug = copy_untig_graph(iug); uint8_t *bf = NULL;
uint64_t ug_n0 = ug->g->n_seq, k, i, ul, cis_n, trans_n, w_n, tot_hap, tot_r, frid, mrid, flag;
kv_u_trans_t in; memset(&in, 0, sizeof(in)); ma_utg_t *p; u_trans_t *z; int64_t fh, mh;
bubble_type *bub = gen_bubble_chain(sg, ug, opt, &bf, 0); free(bf);
clean_u_trans_t_idx_filter_adv(ref, ug, sg);

///update ug itself
ul = FATHER; frid = ug->g->n_seq; asg_seq_set(ug->g, frid, ul, 0);
kv_pushp(ma_utg_t, ug->u, &p); memset(p, 0, sizeof((*p)));
p->s = 0; p->start = UINT32_MAX; p->end = UINT32_MAX; p->len = ul, p->n = p->m = 1; p->circ = 1;
kv_roundup32(p->m); p->a = (uint64_t*)malloc(8 * p->m);
p->a[0] = UINT32_MAX; p->a[0] <<= 32; p->a[0] |= ul;

ul = MOTHER; mrid = ug->g->n_seq; asg_seq_set(ug->g, mrid, ul, 0);
kv_pushp(ma_utg_t, ug->u, &p); memset(p, 0, sizeof((*p)));
p->s = 0; p->start = UINT32_MAX; p->end = UINT32_MAX; p->len = ul, p->n = p->m = 1; p->circ = 1;
kv_roundup32(p->m); p->a = (uint64_t*)malloc(8 * p->m);
p->a[0] = UINT32_MAX; p->a[0] <<= 32; p->a[0] |= ul;

free(ug->g->idx); ug->g->idx = 0; ug->g->is_srt = 0; asg_cleanup(ug->g);

///update bubble
REALLOC(bub->index, ug->g->n_seq);
for (k = ug_n0; k < ug->g->n_seq; k++) bub->index[k] = bub->f_bub+1;

bub->b_s_idx.n = ug->g->n_seq; kv_resize(uint64_t, bub->b_s_idx, ug->g->n_seq);
for (k = ug_n0; k < bub->b_s_idx.m; k++) bub->b_s_idx.a[k] = (uint64_t)-1;

///check if a node has haplotype markers
CALLOC(bf, ug->g->n_seq);
for (k = cis_n = tot_hap = tot_r = 0; k < ug_n0; k++) {
if(ug->g->seq[k].del) continue;
p = &(ug->u.a[k]); tot_r += p->n;
if(p->n == 0 || p->m == 0) continue;
for (i = fh = mh = 0; i < p->n; i++) {
if(R_INF.trio_flag[p->a[i]>>33] == FATHER) {
tot_hap++; fh = 1;
} else if(R_INF.trio_flag[p->a[i]>>33] == MOTHER) {
tot_hap++; mh = 1;
}
if((R_INF.trio_flag[p->a[i]>>33] == FATHER) || (R_INF.trio_flag[p->a[i]>>33] == MOTHER)) {
tot_hap++; bf[k] = 1;
}
}
if(fh > 0) {
bf[k] = 1; cis_n+=2;
}
if(mh > 0) {
bf[k] = 1; cis_n+=2;
}
}
for (; k < ug->g->n_seq; k++) bf[k] = 1;///for father/mother nodes


///update trans overlaps
kv_pushp(u_trans_t, *ref, &z); memset(z, 0, sizeof((*z)));
z->f = RC_2; z->nw = (DBL_MAX/2); z->rev = 0;
z->qn = ug_n0; z->tn = ug_n0+1; z->del = 0;
z->qs = 0; z->qe = ug->g->seq[z->qn].len;
z->ts = 0; z->te = ug->g->seq[z->tn].len;

kv_pushp(u_trans_t, *ref, &z); memset(z, 0, sizeof((*z)));
z->f = RC_2; z->nw = (DBL_MAX/2); z->rev = 0;
z->qn = ug_n0+1; z->tn = ug_n0; z->del = 0;
z->qs = 0; z->qe = ug->g->seq[z->qn].len;
z->ts = 0; z->te = ug->g->seq[z->tn].len;

kt_u_trans_t_idx(ref, ug->g->n_seq);
// clean_u_trans_t_idx_adv(ref, ug, sg);
// clean_u_trans_t_idx_filter_adv(ref, ug, sg);

///gen all links
for (k = trans_n = 0; k < ref->n; k++) {
if(ref->a[k].del) continue;
if((IF_HOM(ref->a[k].qn, (*bub))) || (IF_HOM(ref->a[k].tn, (*bub)))) continue;
///disable bf
// if((!bf[ref->a[k].qn]) || (!bf[ref->a[k].tn])) continue;
trans_n++;
}
kv_resize(u_trans_t, in, (trans_n+cis_n));

for (k = in.n = 0; k < ref->n; k++) {
if(ref->a[k].del) continue;
if((IF_HOM(ref->a[k].qn, (*bub))) || (IF_HOM(ref->a[k].tn, (*bub)))) continue;
///disable bf
// if((!bf[ref->a[k].qn]) || (!bf[ref->a[k].tn])) continue;

kv_pushp(u_trans_t, in, &z); memset(z, 0, sizeof((*z))); w_n = 0;
if((ref->a[k].qn < ug_n0) && (ref->a[k].tn < ug_n0)) {
w_n += ug_occ_hap_w(ref->a[k].qs, ref->a[k].qe, &(ug->u.a[ref->a[k].qn])) +
ug_occ_hap_w(ref->a[k].ts, ref->a[k].te, &(ug->u.a[ref->a[k].tn]));
w_n += ug_occ_w(ref->a[k].qs, ref->a[k].qe, &(ug->u.a[ref->a[k].qn])) +
ug_occ_w(ref->a[k].ts, ref->a[k].te, &(ug->u.a[ref->a[k].tn]));
} else {
w_n += tot_hap + tot_r;
}
w_n >>= 1;///wn/4
(*z) = ref->a[k]; z->nw = ((w_n)?(w_n):(1));
}

for (k = 0; k < ug_n0; k++) {
if(!bf[k]) continue;
p = &(ug->u.a[k]);
if(p->n == 0 || p->m == 0) continue;
fh = mh = 0;
for (i = 0; i < p->n; i++) {
if(R_INF.trio_flag[p->a[i]>>33] == FATHER) fh--;
if(R_INF.trio_flag[p->a[i]>>33] == MOTHER) mh--;
}
if(fh != 0) {
w_n = frid;

kv_pushp(u_trans_t, in, &z); memset(z, 0, sizeof((*z)));
z->f = RC_2; z->nw = fh; z->rev = 0;
z->qn = w_n; z->tn = k; z->del = 0;
z->qs = 0; z->qe = MIN(ug->g->seq[z->qn].len, ug->g->seq[z->tn].len);
z->ts = 0; z->te = MIN(ug->g->seq[z->qn].len, ug->g->seq[z->tn].len);

kv_pushp(u_trans_t, in, &z); memset(z, 0, sizeof((*z)));
z->f = RC_2; z->nw = fh; z->rev = 0;
z->qn = k; z->tn = w_n; z->del = 0;
z->qs = 0; z->qe = MIN(ug->g->seq[z->qn].len, ug->g->seq[z->tn].len);
z->ts = 0; z->te = MIN(ug->g->seq[z->qn].len, ug->g->seq[z->tn].len);
}

if(mh != 0) {
w_n = mrid;

kv_pushp(u_trans_t, in, &z); memset(z, 0, sizeof((*z)));
z->f = RC_2; z->nw = mh; z->rev = 0;
z->qn = w_n; z->tn = k; z->del = 0;
z->qs = 0; z->qe = MIN(ug->g->seq[z->qn].len, ug->g->seq[z->tn].len);
z->ts = 0; z->te = MIN(ug->g->seq[z->qn].len, ug->g->seq[z->tn].len);

kv_pushp(u_trans_t, in, &z); memset(z, 0, sizeof((*z)));
z->f = RC_2; z->nw = mh; z->rev = 0;
z->qn = k; z->tn = w_n; z->del = 0;
z->qs = 0; z->qe = MIN(ug->g->seq[z->qn].len, ug->g->seq[z->tn].len);
z->ts = 0; z->te = MIN(ug->g->seq[z->qn].len, ug->g->seq[z->tn].len);
}
}
free(bf);
assert(in.n == (trans_n+cis_n));
kt_u_trans_t_idx(&in, ug->g->n_seq);
// clean_u_trans_t_idx_adv(&in, ug, sg);

ps_t *s = init_ps_t(11, ug->g->n_seq); s->s[frid] = 1; s->s[mrid] = -1;
mc_solve(NULL, NULL, &in, ug, sg, 0.8, NULL, 0, s->s, 1, bub, ref, 0, 0);
// fprintf(stderr, "[M::%s::] s[frid]::%d, s->s[mrid]::%d\n", __func__, s->s[frid], s->s[mrid]);
if((s->s[frid] != 0) && (s->s[mrid] != 0) && (s->s[frid] != s->s[mrid])) {
memset(R_INF.trio_flag, AMBIGU, R_INF.total_reads * sizeof(uint8_t));
for (i = 0; i < ug_n0; i++) {
if(ug->g->seq[i].del) continue;
flag = AMBIGU;
if(s->s[i] == 0) continue;
flag = (s->s[i] == s->s[frid]? FATHER:MOTHER);
p = &ug->u.a[i];
if(p->m == 0) continue;
for (k = 0; k < p->n; k++) R_INF.trio_flag[p->a[k]>>33] = flag;
}
}

destory_bubbles(bub); free(bub); ma_ug_destroy(ug);
destory_ps_t(&s); kv_destroy(in); kv_destroy(in.idx);
}

spg_t *hic_pre_analysis(ma_ug_t *ug, asg_t* read_g, trans_chain* t_ch, ug_opt_t *opt, kvec_pe_hit **rhits)
{
ug_index = NULL;
Expand Down
1 change: 1 addition & 0 deletions hic.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,5 +117,6 @@ void hic_analysis(ma_ug_t *ug, asg_t* read_g, trans_chain* t_ch, ug_opt_t *opt,
spg_t *hic_pre_analysis(ma_ug_t *ug, asg_t* read_g, trans_chain* t_ch, ug_opt_t *opt, kvec_pe_hit **rhits);
void prt_bubble_gfa_adv(FILE *fp, bubble_type *bub, const char* utg_pre, const char* bub_pre, const char* chain_pre);
void bp_solve(ug_opt_t *opt, kv_u_trans_t *ref, ma_ug_t *ug, asg_t *sg, bubble_type *bub, double cis_rate);
void trio_phasing_refine(ma_ug_t *ug, asg_t* sg, kv_u_trans_t *ta, ug_opt_t *opt);

#endif
5 changes: 3 additions & 2 deletions horder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3969,8 +3969,8 @@ asg_t *i_rg, ma_ug_t* i_ug, bubble_type* bub, kv_u_trans_t *ref, ug_opt_t *opt,

int cmp_mc_edge_t_w(const void * a, const void * b)
{
if((*(osg_arc_t*)a).nw == (*(osg_arc_t*)b).nw) return 0;
return (*(osg_arc_t*)a).nw < (*(osg_arc_t*)b).nw ? -1 : 1;
if((*(mc_edge_t*)a).w == (*(mc_edge_t*)b).w) return 0;
return (*(mc_edge_t*)a).w < (*(mc_edge_t*)b).w ? -1 : 1;
}

void cal_chain_arch(scg_t *sg, const mc_match_t *ma, uint32_t v, uint32_t *va, uint32_t vn, uint64_t *idx, asg64_v *srt)
Expand Down Expand Up @@ -4303,6 +4303,7 @@ double min_cut, double max_cut, uint64_t cut_round)
kv_pushp(mc_edge_t, sm, &sp);
sp->w = sg->g->arc[k].nw; sp->x = k;
}
// fprintf(stderr, "sm.n::%lu\n", (uint64_t)sm.n);
qsort(sm.a, sm.n, sizeof(mc_edge_t), cmp_mc_edge_t_w);


Expand Down