Skip to content

Commit

Permalink
Merge pull request #140 from maxplanck-ie/improved_self_circles_count
Browse files Browse the repository at this point in the history
Improved self circles count
  • Loading branch information
fidelram committed Oct 24, 2017
2 parents 10815f6 + fca38b3 commit b8c68e8
Show file tree
Hide file tree
Showing 37 changed files with 862 additions and 355 deletions.
167 changes: 83 additions & 84 deletions hicexplorer/hicBuildMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,15 +184,15 @@ def parse_arguments(args=None):
# action='store_true'
)

parser.add_argument('--removeSelfCircles',
help='If set, outward facing reads, at a distance '
'of less than 25kbs are removed.',
parser.add_argument('--keepSelfCircles',
help='If set, outward facing reads without any restriction fragment (self circles) are kept. '
'They will be counted and shown in the QC plots.',
required=False,
action='store_true'
)

parser.add_argument('--minMappingQuality',
help='minimun mapping quality for reads to be accepted. '
help='minimum mapping quality for reads to be accepted. '
'Because the restriction enzyme site could be located '
'on top of the read, this may reduce the '
'reported quality of the read. Thus, this parameter '
Expand Down Expand Up @@ -679,7 +679,7 @@ def readBamFiles(pFileOneIterator, pFileTwoIterator, pNumberOfItemsPerBuffer, pS


def process_data(pMateBuffer1, pMateBuffer2, pMinMappingQuality,
pRemoveSelfCircles, pRestrictionSequence, pRemoveSelfLigation, pMatrixSize,
pKeepSelfCircles, pRestrictionSequence, pRemoveSelfLigation, pMatrixSize,
pRfPositions, pRefId2name,
pDanglingSequences, pBinsize, pResultIndex,
pQueueOut, pTemplate, pOutputBamSet, pCounter,
Expand All @@ -695,7 +695,7 @@ def process_data(pMateBuffer1, pMateBuffer2, pMinMappingQuality,
pMateBuffer1 : List of n reads of type 'pysam.libcalignedsegment.AlignedSegment' of sam input file 1
pMateBuffer2 : List of n reads of type 'pysam.libcalignedsegment.AlignedSegment' of sam input file 2
pMinMappingQuality : integer, minimum mapping quality of a read
pRemoveSelfCircles : boolean, if self circles should be removed
pKeepSelfCircles : boolean, if self circles should be kept
pRestrictionSequence : String, the restriction sequence
pRemoveSelfLigation : If self ligations should be removed
pMatrixSize : integer, the size of the interaction matrix
Expand Down Expand Up @@ -846,12 +846,29 @@ def process_data(pMateBuffer1, pMateBuffer2, pMinMappingQuality,
orientation = 'same-strand-right'

# check self-circles
# self circles are defined as pairs within 25kb
# with 'outward' orientation (Jin et al. 2013. Nature)
# self circles are defined as outward pairs that do not
# have a restriction sequence in between. The distance of < 25kb is
# used to only check close outward pairs as far apart pairs can not be self-circles
if abs(mate2.pos - mate1.pos) < 25000 and orientation == 'outward':
self_circle += 1
if pRemoveSelfCircles:
continue
if pRfPositions and pRestrictionSequence:
# check if in between the two mate
# ends the restriction fragment is found.

# the interval used is:
# start of fragment + length of restriction sequence
# end of fragment - length of restriction sequence
# the restriction sequence length is subtracted
# such that only fragments internally containing
# the restriction site are identified
frag_start = min(mate1.pos, mate2.pos) + len(pRestrictionSequence)
frag_end = max(mate1.pos + mate1.qlen, mate2.pos + mate2.qlen) - len(pRestrictionSequence)
mate_ref = pRefId2name[mate1.rname]
has_rf = sorted(pRfPositions[mate_ref][frag_start: frag_end])

if len(has_rf) == 0:
self_circle += 1
if not pKeepSelfCircles:
continue

# check for dangling ends if the restriction sequence
# is known:
Expand Down Expand Up @@ -1139,7 +1156,7 @@ def main(args=None):
pMateBuffer1=buffer_workers1[i],
pMateBuffer2=buffer_workers2[i],
pMinMappingQuality=args.minMappingQuality,
pRemoveSelfCircles=args.removeSelfCircles,
pKeepSelfCircles=args.keepSelfCircles,
pRestrictionSequence=args.restrictionSequence,
pRemoveSelfLigation=args.removeSelfLigation,
pMatrixSize=matrix_size,
Expand Down Expand Up @@ -1306,7 +1323,7 @@ def main(args=None):
else:
msg = " (not removed)"

mappable_pairs = iter_num - one_mate_unmapped
mappable_unique_high_quality_pairs = iter_num - (one_mate_unmapped + one_mate_low_quality + one_mate_not_unique)

log_file_name = os.path.join(args.QCfolder, "QC.log")
log_file = open(log_file_name, 'w')
Expand All @@ -1319,84 +1336,66 @@ def main(args=None):
""".format(args.outFileName.name, iter_num, args.minDistance,
args.maxDistance))

log_file.write("Pairs used\t{}\t({:.2f})\t({:.2f})\n".format(pair_added,
100 *
float(
pair_added) / iter_num,
100 * float(pair_added) / mappable_pairs))
log_file.write("One mate unmapped\t{}\t({:.2f})\t({:.2f})\n".format(one_mate_unmapped,
100 *
float(
one_mate_unmapped) / iter_num,
100 * float(one_mate_unmapped) / mappable_pairs))

log_file.write("One mate not unique\t{}\t({:.2f})\t({:.2f})\n".format(one_mate_not_unique,
100 *
float(
one_mate_not_unique) / iter_num,
100 * float(one_mate_not_unique) / mappable_pairs))

log_file.write("One mate low quality\t{}\t({:.2f})\t({:.2f})\n".format(one_mate_low_quality,
100 *
float(
one_mate_low_quality) / iter_num,
100 * float(one_mate_low_quality) / mappable_pairs))

log_file.write("dangling end\t{}\t({:.2f})\t({:.2f})\n".format(dangling_end,
100 *
float(
dangling_end) / iter_num,
100 * float(dangling_end) / mappable_pairs))

log_file.write("self ligation{}\t{}\t({:.2f})\t({:.2f})\n".format(msg, self_ligation,
100 *
float(
self_ligation) / iter_num,
100 * float(self_ligation) / mappable_pairs))

log_file.write("One mate not close to rest site\t{}\t({:.2f})\t({:.2f})\n".format(mate_not_close_to_rf,
100 *
float(
mate_not_close_to_rf) / iter_num,
100 * float(mate_not_close_to_rf) / mappable_pairs))

log_file.write("same fragment (800 bp)\t{}\t({:.2f})\t({:.2f})\n".format(same_fragment,
100 *
float(
same_fragment) / iter_num,
100 * float(same_fragment) / mappable_pairs))
log_file.write("self circle\t{}\t({:.2f})\t({:.2f})\n".format(self_circle,
100 *
float(
self_circle) / iter_num,
100 * float(self_circle) / mappable_pairs))
log_file.write("duplicated pairs\t{}\t({:.2f})\t({:.2f})\n".format(duplicated_pairs,
100 *
float(
duplicated_pairs) / iter_num,
100 * float(duplicated_pairs) / mappable_pairs))
log_file.write("#\tcount\t(percentage w.r.t. total sequenced reads)\n")

log_file.write("Pairs mappable, unique and high quality\t{}\t({:.2f})\n".
format(mappable_unique_high_quality_pairs,
100 * float(mappable_unique_high_quality_pairs) / iter_num))

log_file.write("Pairs used\t{}\t({:.2f})\n".
format(pair_added, 100 * float(pair_added) / iter_num))

log_file.write("One mate unmapped\t{}\t({:.2f})\n".
format(one_mate_unmapped, 100 * float(one_mate_unmapped) / iter_num))

log_file.write("One mate not unique\t{}\t({:.2f})\n".
format(one_mate_not_unique, 100 * float(one_mate_not_unique) / iter_num))

log_file.write("One mate low quality\t{}\t({:.2f})\n".
format(one_mate_low_quality, 100 * float(one_mate_low_quality) / iter_num))

log_file.write("\n#\tcount\t(percentage w.r.t. mappable, unique and high quality pairs)\n")

log_file.write("dangling end\t{}\t({:.2f})\n".
format(dangling_end, 100 * float(dangling_end) / mappable_unique_high_quality_pairs))

log_file.write("self ligation{}\t{}\t({:.2f})\n".
format(msg, self_ligation, 100 * float(self_ligation) / mappable_unique_high_quality_pairs))

log_file.write("One mate not close to rest site\t{}\t({:.2f})\n".
format(mate_not_close_to_rf, 100 * float(mate_not_close_to_rf) / mappable_unique_high_quality_pairs))

log_file.write("same fragment (800 bp)\t{}\t({:.2f})\n".
format(same_fragment, 100 * float(same_fragment) / mappable_unique_high_quality_pairs))

log_file.write("self circle\t{}\t({:.2f})\n".
format(self_circle, 100 * float(self_circle) / mappable_unique_high_quality_pairs))

log_file.write("duplicated pairs\t{}\t({:.2f})\n".
format(duplicated_pairs, 100 * float(duplicated_pairs) / mappable_unique_high_quality_pairs))

if pair_added > 0:
log_file.write("Of pairs used:\n")
log_file.write("inter chromosomal\t{}\t({:.2f})\n".format(
inter_chromosomal, 100 * float(inter_chromosomal) / pair_added))
log_file.write("\n#\tcount\t(percentage w.r.t. total valid pairs used)\n")
log_file.write("inter chromosomal\t{}\t({:.2f})\n".
format(inter_chromosomal, 100 * float(inter_chromosomal) / pair_added))

log_file.write("short range < 20kb\t{}\t({:.2f})\n".format(
short_range, 100 * float(short_range) / pair_added))
log_file.write("short range < 20kb\t{}\t({:.2f})\n".
format(short_range, 100 * float(short_range) / pair_added))

log_file.write("long range\t{}\t({:.2f})\n".format(
long_range, 100 * float(long_range) / pair_added))
log_file.write("long range\t{}\t({:.2f})\n".
format(long_range, 100 * float(long_range) / pair_added))

log_file.write("inward pairs\t{}\t({:.2f})\n".format(
count_inward, 100 * float(count_inward) / pair_added))
log_file.write("inward pairs\t{}\t({:.2f})\n".
format(count_inward, 100 * float(count_inward) / pair_added))

log_file.write("outward pairs\t{}\t({:.2f})\n".format(
count_outward, 100 * float(count_outward) / pair_added))
log_file.write("outward pairs\t{}\t({:.2f})\n".
format(count_outward, 100 * float(count_outward) / pair_added))

log_file.write("left pairs\t{}\t({:.2f})\n".format(
count_left, 100 * float(count_left) / pair_added))
log_file.write("left pairs\t{}\t({:.2f})\n".
format(count_left, 100 * float(count_left) / pair_added))

log_file.write("right pairs\t{}\t({:.2f})\n".format(
count_right, 100 * float(count_right) / pair_added))
log_file.write("right pairs\t{}\t({:.2f})\n".
format(count_right, 100 * float(count_right) / pair_added))

log_file.close()
QC.main("-l {} -o {}".format(log_file_name, args.QCfolder).split())
Expand Down
3 changes: 2 additions & 1 deletion hicexplorer/hicLog2Ratio.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ def parse_arguments(args=None):
"""
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Converts between different matrix file formats')
description='Computes the log2 ratio between two matrices. The larger matrix is scaled down to match the '
'total sum of the smaller matrix. ')

# define the arguments
parser.add_argument('--treatment', '-t',
Expand Down

0 comments on commit b8c68e8

Please sign in to comment.