-
Notifications
You must be signed in to change notification settings - Fork 12
/
fastq.clj
181 lines (165 loc) · 6.02 KB
/
fastq.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
(ns cljam.io.fastq
"Functions to read and write the FASTQ format."
(:require [clojure.java.io :as cio]
[clojure.string :as string]
[cljam.io.protocols :as protocols]
[cljam.util :as util])
(:import [java.io Closeable]
[java.nio CharBuffer]))
(declare read-sequences write-sequences)
(deftype FASTQReader [reader url]
Closeable
(close [this]
(.close ^Closeable (.reader this)))
protocols/IReader
(reader-url [this] (.url this))
(read [this] (read-sequences this))
(read [this opts] (read-sequences this opts))
(indexed? [_] false))
(deftype FASTQWriter [writer url]
Closeable
(close [this]
(.close ^Closeable (.writer this)))
protocols/IWriter
(writer-url [this] (.url this)))
(defn ^FASTQReader reader
"Returns an open cljam.io.fastq.FASTQReader of f. Should be used inside
with-open to ensure the reader is properly closed."
[f]
(-> (util/compressor-input-stream f)
cio/reader
(FASTQReader. (util/as-url f))))
(defn ^FASTQWriter writer
"Returns an open cljam.io.fastq.FASTQWriter of f. Should be used inside
with-open to ensure the writer is properly closed."
[f]
(-> (util/compressor-output-stream f)
cio/writer
(FASTQWriter. (util/as-url f))))
(defrecord FASTQRead [^String name ^String sequence quality])
(defn- ^FASTQRead deserialize-fastq
"Deserialize a read from 4 lines of fastq file."
[[^String name-line ^String seq-line ^String plus-line ^String qual-line]
{:keys [decode-quality] :or {decode-quality :phred33}}]
{:pre [(not-empty name-line)
(not-empty seq-line)
(not-empty plus-line)
(not-empty qual-line)
(= (first name-line) \@)
(= (first plus-line) \+)
(not-empty (rest name-line))
(or (empty? (rest plus-line))
(= (rest plus-line) (rest name-line)))
(= (count seq-line) (count qual-line))]
:post [(every? (fn [q] (case decode-quality
:phred33 (<= 0 q 93)
:phred64 (<= 0 q 62)
true))
(:quality %))]}
(FASTQRead.
(subs name-line 1)
seq-line
(case decode-quality
:phred33 (map #(- (int %) 33) qual-line)
:phred64 (map #(- (int %) 64) qual-line)
qual-line)))
(defn read-sequences
"Returns a lazy sequence of FASTQReads deserialized from given reader."
([rdr]
(read-sequences rdr {}))
([^FASTQReader rdr opts]
(sequence
(comp (map string/trim)
(partition-all 4)
(map #(deserialize-fastq % opts)))
(line-seq (.reader rdr)))))
(defn- ^String serialize-fastq
"Serialize a FASTQRead to FASTQ format string."
[^FASTQRead {:keys [^String name ^String sequence quality]}
{:keys [encode-quality] :or {encode-quality :phred33}}]
{:pre [(not-empty name)
(not-empty sequence)
(not-empty quality)
(= (count sequence) (count quality))
(every? #(case encode-quality
:phred33 (<= 0 % 93)
:phred64 (<= 0 % 62)
true) quality)]}
(let [cb (CharBuffer/allocate (+ 6 (.length name) (.length sequence) (.length sequence)))]
(.put cb \@)
(.put cb name)
(.put cb \newline)
(.put cb sequence)
(.put cb \newline)
(.put cb \+)
(.put cb \newline)
(if (string? quality)
(.put cb ^String quality)
(doseq [q quality]
(.put cb (char (case encode-quality
:phred33 (+ q 33)
:phred64 (+ q 64)
q)))))
(.put cb \newline)
(.flip cb)
(.toString cb)))
(defn write-sequences
"Write given sequence of reads to a FASTQ file."
([wtr sequences]
(write-sequences wtr sequences {}))
([^FASTQWriter wtr sequences opts]
(let [w ^java.io.Writer (.writer wtr)]
(doseq [s sequences]
(.write w ^String (serialize-fastq s opts))))))
(def casava-pattern #"^@?([^\s^:]+):(\d+):(\d+):(\d+):(\d+)#(\d+)/(\d)+$")
(defn deserialize-casava-name
"Parse Casava-style name of fastq read."
[^String name]
(let [[match instrument lane tile x y index pair]
(re-matches casava-pattern name)]
(when match
{:instrument instrument
:lane (Integer/parseInt lane)
:tile (Integer/parseInt tile)
:x (Integer/parseInt x)
:y (Integer/parseInt y)
:index (Integer/parseInt index)
:pair (Integer/parseInt pair)})))
(defn ^String serialize-casava-name
"Encode fastq name map to Casava-style string."
[{:keys [instrument lane tile x y index pair]}]
(when (and instrument lane tile x y index pair)
(str instrument \: lane \: tile \: x \: y \# index \/ pair)))
(def casava-1_8-pattern
#"^@?([^\s^:]+):(\d+):([^\s^\:]+):(\d+):(\d+):(\d+):(\d+)\s+(\d+):(Y|N):(\d+):(\S+)$")
(defn deserialize-casava-1_8-name
"Parse Casava1.8-style name of fastq read."
[^String name]
(let [[match instrument run flowcell lane tile x y pair filtered control index]
(re-matches casava-1_8-pattern name)]
(when match
{:instrument instrument
:run (Integer/parseInt run)
:flowcell flowcell
:lane (Integer/parseInt lane)
:tile (Integer/parseInt tile)
:x (Integer/parseInt x)
:y (Integer/parseInt y)
:pair (Integer/parseInt pair)
:filtered (= filtered "Y")
:control (Integer/parseInt control)
:index index})))
(defn ^String serialize-casava-1_8-name
"Encode fastq name map to Casava1.8-style string."
[{:keys [instrument run flowcell lane tile x y pair filtered control index]}]
(when (and instrument run flowcell lane tile x y pair (not (nil? filtered)) control index)
(str instrument \: run \: flowcell \: lane \: tile \: x \: y " "
pair \: (if filtered \Y \N) \: control \: index)))
(defn deserialize-name
"Try parsing name of fastq read."
[^String name]
(first (keep #(% name) [deserialize-casava-1_8-name deserialize-casava-name])))
(defn ^String serialize-name
"Try encoding name of fastq read."
[name]
(first (keep #(% name) [serialize-casava-1_8-name serialize-casava-name])))