-
Notifications
You must be signed in to change notification settings - Fork 294
/
count-median.py
executable file
·104 lines (81 loc) · 3.49 KB
/
count-median.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#! /usr/bin/env python
#
# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project@idyll.org
#
# pylint: disable=missing-docstring,invalid-name
"""
Count the median/avg k-mer abundance for each sequence in the input file.
The abundance is based on the k-mer counts in the given k-mer countgraph.
Can be used to estimate expression levels (mRNAseq) or coverage
(genomic/metagenomic).
% scripts/count-median.py <htname> <input seqs> <output counts>
Use '-h' for parameter help.
The output file contains sequence id, median, average, stddev, and seq length.
NOTE: All 'N's in the input sequences are converted to 'A's.
"""
from __future__ import print_function
import screed
import argparse
import sys
import csv
import textwrap
import khmer
from khmer.kfile import check_input_files, check_space
from khmer.khmer_args import info, sanitize_epilog
def get_parser():
epilog = """
Count the median/avg k-mer abundance for each sequence in the input file,
based on the k-mer counts in the given k-mer countgraph. Can be used
to estimate expression levels (mRNAseq) or coverage (genomic/metagenomic).
The output file contains sequence id, median, average, stddev, and
seq length, in comma-separated value (CSV) format.
Example::
count-median.py counts.ct tests/test-data/test-reads.fq.gz medians.txt
NOTE: All 'N's in the input sequences are converted to 'A's.
"""
parser = argparse.ArgumentParser(
description='Count k-mers summary stats for sequences',
epilog=textwrap.dedent(epilog))
parser.add_argument('countgraph', metavar='input_count_graph_filename',
help='input k-mer countgraph filename')
parser.add_argument('input', metavar='input_sequence_filename',
help='input FAST[AQ] sequence filename')
parser.add_argument('output', metavar='output_summary_filename',
help='output summary filename',
type=argparse.FileType('w'))
parser.add_argument('--version', action='version', version='%(prog)s ' +
khmer.__version__)
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
return parser
def main():
info('count-median.py', ['diginorm'])
args = sanitize_epilog(get_parser()).parse_args()
htfile = args.countgraph
input_filename = args.input
output = args.output
output_filename = str(output)
infiles = [htfile, input_filename]
for infile in infiles:
check_input_files(infile, args.force)
check_space(infiles, args.force)
print('loading k-mer countgraph from', htfile, file=sys.stderr)
countgraph = khmer.load_countgraph(htfile)
ksize = countgraph.ksize()
print('writing to', output_filename, file=sys.stderr)
output = csv.writer(output)
# write headers:
output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])
for record in screed.open(input_filename):
seq = record.sequence.upper()
if 'N' in seq:
seq = seq.replace('N', 'A')
if ksize <= len(seq):
medn, ave, stdev = countgraph.get_median_count(seq)
ave, stdev = [round(x, 9) for x in (ave, stdev)]
output.writerow([record.name, medn, ave, stdev, len(seq)])
if __name__ == '__main__':
main()