Skip to content

Commit

Permalink
Merge pull request #121 from chrovis/fix/sequence-out-of-range
Browse files Browse the repository at this point in the history
Remove N padding for out-of-range bases in read-sequence.
  • Loading branch information
totakke committed Nov 13, 2017
2 parents 310a0fa + 466bbc7 commit 8cb494b
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 106 deletions.
4 changes: 1 addition & 3 deletions src/cljam/io/fasta/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,7 @@

(defn read-sequence
[rdr {:keys [chr start end]} opts]
(if (and (nil? start) (nil? end))
(reader/read-whole-sequence rdr chr opts)
(reader/read-sequence rdr chr start end opts)))
(reader/read-sequence rdr chr start end opts))

(defn read
[rdr]
Expand Down
48 changes: 21 additions & 27 deletions src/cljam/io/fasta/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -75,33 +75,27 @@

(defn read-sequence
[^FASTAReader rdr name start end {:keys [mask?]}]
(let [fai @(.index-delay rdr)
{:keys [len]} (fasta-index/get-header fai name)
buf (CharBuffer/allocate (inc (- end start)))
r ^RandomAccessFile (.reader rdr)]
(when-let [[s e] (fasta-index/get-span fai name (dec start) end)]
(dotimes [_ (- 1 start)]
(.put buf \N))
(let [mbb (.. r getChannel (map FileChannel$MapMode/READ_ONLY s (- e s)))]
(if mask?
(while (.hasRemaining mbb)
(let [c (unchecked-char (.get mbb))]
(when-not (or (= \newline c) (= \return c))
(.put buf c))))
(while (.hasRemaining mbb)
(let [c (unchecked-long (.get mbb))]
(when-not (or (= 10 c) (= 13 c))
;; toUpperCase works only for ASCII chars.
(.put buf (unchecked-char (bit-and c 0x5f))))))))
(dotimes [_ (- end len)]
(.put buf \N))
(.flip buf)
(.toString buf))))

(defn read-whole-sequence
[^FASTAReader rdr name opts]
(let [{:keys [len]} (fasta-index/get-header @(.index-delay rdr) name)]
(read-sequence rdr name 1 len opts)))
(let [fai @(.index-delay rdr)]
(when-let [len (:len (fasta-index/get-header fai name))]
(let [start' (max 1 (or start 1))
end' (min len (or end len))]
(when (<= start' end')
(let [buf (CharBuffer/allocate (inc (- end' start')))
r ^RandomAccessFile (.reader rdr)]
(when-let [[s e] (fasta-index/get-span fai name (dec start') end')]
(let [mbb (.. r getChannel (map FileChannel$MapMode/READ_ONLY s (- e s)))]
(if mask?
(while (.hasRemaining mbb)
(let [c (unchecked-char (.get mbb))]
(when-not (or (= \newline c) (= \return c))
(.put buf c))))
(while (.hasRemaining mbb)
(let [c (unchecked-long (.get mbb))]
(when-not (or (= 10 c) (= 13 c))
;; toUpperCase works only for ASCII chars.
(.put buf (unchecked-char (bit-and c 0x5f))))))))
(.flip buf)
(.toString buf))))))))

(defn read
"Reads FASTA sequence data, returning its information as a lazy sequence."
Expand Down
38 changes: 19 additions & 19 deletions src/cljam/io/twobit/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -99,25 +99,25 @@
(when-let [[n {:keys [offset]}]
(first (filter (fn [[i {:keys [name]}]] (= name chr)) (map vector (range) (.file-index rdr))))]
(let [{:keys [len ambs masks header-offset]} @(nth (.seq-index rdr) n) ;; Potential seek & read.
start' (or start 1)
end' (or end len)
start-offset (quot (dec (max 1 start')) 4)
end-offset (quot (dec (min len end')) 4)
ba (byte-array (- end-offset start-offset -1))
cb (CharBuffer/allocate (inc (- end' start')))]
(.seek ^RandomAccessFile (.reader rdr) (+ offset header-offset start-offset))
(.readFully ^RandomAccessFile (.reader rdr) ba)
(dotimes [out-pos (inc (- end' start'))]
(let [ref-pos (+ out-pos start')
ba-pos (- (quot (dec ref-pos) 4) start-offset)
bit-pos (mod (dec ref-pos) 4)]
(if (<= 1 ref-pos len)
(.put cb (.charAt ^String (twobit-to-str (+ (aget ba ba-pos) 128)) bit-pos))
(.put cb \N))))
(replace-ambs! cb ambs start' end')
(when mask? (mask! cb masks start' end'))
(.rewind cb)
(.toString cb)))))
start' (max 1 (or start 1))
end' (min len (or end len))]
(when (<= start' end')
(let [start-offset (quot (dec start') 4)
end-offset (quot (dec end') 4)
ba (byte-array (- end-offset start-offset -1))
cb (CharBuffer/allocate (inc (- end' start')))]
(.seek ^RandomAccessFile (.reader rdr) (+ offset header-offset start-offset))
(.readFully ^RandomAccessFile (.reader rdr) ba)
(dotimes [out-pos (inc (- end' start'))]
(let [ref-pos (+ out-pos start')
ba-pos (- (quot (dec ref-pos) 4) start-offset)
bit-pos (mod (dec ref-pos) 4)]
(when (<= 1 ref-pos len)
(.put cb (.charAt ^String (twobit-to-str (+ (aget ba ba-pos) 128)) bit-pos)))))
(replace-ambs! cb ambs start' end')
(when mask? (mask! cb masks start' end'))
(.rewind cb)
(.toString cb)))))))

(defn- read-all-sequences*
[rdr chrs option]
Expand Down
122 changes: 65 additions & 57 deletions test/cljam/io/sequence_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -40,47 +40,57 @@
[{:name "ref", :len 45, :offset 33, :ambs [], :masks [], :header-offset 16}
{:name "ref2", :len 40, :offset 61, :ambs [], :masks [[1 40]], :header-offset 24}])))))

(deftest read-sequence-fasta-test
(with-open [rdr (cseq/fasta-reader test-fa-file)]
(is (= (cseq/read-sequence rdr {:chr "ref" :start 5 :end 10}) "TGTTAG"))
(is (= (cseq/read-sequence rdr {:chr "ref" :start 5 :end 10} {:mask? false})
"TGTTAG"))
(is (= (cseq/read-sequence rdr {:chr "ref" :start 5 :end 10} {:mask? true})
"TGTTAG")))
(with-open [rdr (cseq/fasta-reader test-fa-file)]
(is (= (cseq/read-sequence rdr {:chr "ref2" :start 1 :end 16})
(deftest read-sequence-test
(with-open [fa-rdr (cseq/reader test-fa-file)
tb-rdr (cseq/reader test-twobit-file)]
(are [?reg ?opts ?expect]
(= (cseq/read-sequence fa-rdr ?reg ?opts)
(cseq/read-sequence tb-rdr ?reg ?opts)
?expect)
{} {} nil
{:chr "badref"} {} nil
{:chr "ref" :start -1 :end 0} {} nil
{:chr "ref" :start 0 :end 0} {} nil
{:chr "ref" :start -1 :end 1} {} "A"
{:chr "ref" :start 0 :end 1} {} "A"
{:chr "ref" :start 1 :end 2} {} "AG"
{:chr "ref" :start 44 :end 45} {} "AT"
{:chr "ref" :start 45 :end 45} {} "T"
{:chr "ref" :start 45 :end 46} {} "T"
{:chr "ref" :start 46 :end 46} {} nil
{:chr "ref" :start 46 :end 47} {} nil
{:chr "ref" :start 5 :end 10} {:mask? false} "TGTTAG"
{:chr "ref" :start 5 :end 10} {:mask? true} "TGTTAG"
{:chr "ref2" :start 1 :end 16} {:mask? false} "AGGTTTTATAAAACAA"
{:chr "ref2" :start 1 :end 16} {:mask? true} "aggttttataaaacaa"
{:chr "ref2" :start 1 :end 45} {:mask? false} "AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG"
{:chr "ref2" :start 10} {:mask? false} "AAAACAATTAAGTCTACAGAGCAACTACGCG"
{:chr "ref2" :end 10} {:mask? false} "AGGTTTTATA"
{:chr "ref"} {:mask? false} "AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT"
{:chr "ref"} {:mask? true} "AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT"
{:chr "ref2"} {:mask? false} "AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG"
{:chr "ref2"} {:mask? true} "aggttttataaaacaattaagtctacagagcaactacgcg"))
(with-open [fa-rdr (cseq/reader test-fa-file)
tb-rdr (cseq/reader test-twobit-file)]
(are [?reg ?expect]
(= (cseq/read-sequence fa-rdr ?reg)
(cseq/read-sequence tb-rdr ?reg)
?expect)
{:chr "ref" :start 5 :end 10} "TGTTAG"
{:chr "ref2" :start 1 :end 16} "AGGTTTTATAAAACAA"
{:chr "ref2" :start 0 :end 45} "AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG"
{:chr "ref"} "AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT"
{:chr "ref2"} "AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG"))
(with-open [fa-rdr (cseq/reader test-fa-file)
tb-rdr (cseq/reader test-twobit-file)]
(is (= (protocols/read-in-region fa-rdr {:chr "ref2" :start 1 :end 16})
(protocols/read-in-region tb-rdr {:chr "ref2" :start 1 :end 16})
"AGGTTTTATAAAACAA"))
(is (= (cseq/read-sequence rdr {:chr "ref2" :start 1 :end 16} {:mask? false})
(is (= (protocols/read-in-region fa-rdr {:chr "ref2" :start 1 :end 16} {:mask? false})
(protocols/read-in-region tb-rdr {:chr "ref2" :start 1 :end 16} {:mask? false})
"AGGTTTTATAAAACAA"))
(is (= (cseq/read-sequence rdr {:chr "ref2" :start 1 :end 16} {:mask? true})
"aggttttataaaacaa")))
(with-open [rdr (cseq/fasta-reader test-fa-file)]
(is (= (cseq/read-sequence rdr {:chr "ref2" :start 0 :end 45})
"NAGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCGNNNNN"))
(is (= (cseq/read-sequence rdr {:chr "ref2" :start 0 :end 45} {:mask? false})
"NAGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCGNNNNN"))
(is (= (cseq/read-sequence rdr {:chr "ref2" :start 0 :end 45} {:mask? true})
"NaggttttataaaacaattaagtctacagagcaactacgcgNNNNN")))
(with-open [rdr (cseq/fasta-reader test-fa-file)]
(is (= (cseq/read-sequence rdr {:chr "ref"})
"AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT"))
(is (= (cseq/read-sequence rdr {:chr "ref"} {:mask? false})
"AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT"))
(is (= (cseq/read-sequence rdr {:chr "ref"} {:mask? true})
"AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT")))
(with-open [rdr (cseq/fasta-reader test-fa-file)]
(is (= (cseq/read-sequence rdr {:chr "ref2"})
"AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG"))
(is (= (cseq/read-sequence rdr {:chr "ref2"} {:mask? false})
"AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG"))
(is (= (cseq/read-sequence rdr {:chr "ref2"} {:mask? true})
"aggttttataaaacaattaagtctacagagcaactacgcg")))
(with-open [rdr (cseq/fasta-reader test-fa-file)]
(is (= (protocols/read-in-region rdr {:chr "ref2" :start 1 :end 16})
"AGGTTTTATAAAACAA"))
(is (= (protocols/read-in-region rdr {:chr "ref2" :start 1 :end 16} {:mask? false})
"AGGTTTTATAAAACAA"))
(is (= (protocols/read-in-region rdr {:chr "ref2" :start 1 :end 16} {:mask? true})
(is (= (protocols/read-in-region fa-rdr {:chr "ref2" :start 1 :end 16} {:mask? true})
(protocols/read-in-region tb-rdr {:chr "ref2" :start 1 :end 16} {:mask? true})
"aggttttataaaacaa"))))

(deftest read-sequence-medium-fasta-test
Expand All @@ -97,38 +107,36 @@
(testing "reference test"
(with-open [r (cseq/twobit-reader test-twobit-file)
c (cseq/reader r)]
(are [?region ?opt ?result] (= (cseq/read-sequence r ?region ?opt)
(cseq/read-sequence c ?region ?opt)
?result)
{:chr "ref"} {} "AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT"
{:chr "ref2"} {} "AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG"
{:chr "ref2"} {:mask? true} "aggttttataaaacaattaagtctacagagcaactacgcg"
{:chr "ref" :start 1 :end 4} {} "AGCA"
{:chr "ref" :start 0 :end 4} {} "NAGCA"
{:chr "ref" :start 41 :end 50} {} "GCCATNNNNN"
{:chr "ref" :start 1 :end 45} {} "AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT"
{:chr "ref2" :start 1 :end 40} {} "AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG"
{:chr "ref2" :start 1 :end 40} {:mask? true} "aggttttataaaacaattaagtctacagagcaactacgcg"
{:chr "chr1" :start 1 :end 40} {} nil)
(is (= (for [i (range 1 45) j (range i 46)]
(cseq/read-sequence r {:chr "ref" :start i :end j}))
(for [i (range 1 45) j (range i 46)]
(subs "AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT" (dec i) j))))
(is (= (protocols/read-in-region r {:chr "ref2" :start 1 :end 40}) "AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG"))
(is (= (protocols/read-in-region r {:chr "ref2" :start 1 :end 40} {:mask? false}) "AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG"))
(is (= (protocols/read-in-region r {:chr "ref2" :start 1 :end 40} {:mask? true}) "aggttttataaaacaattaagtctacagagcaactacgcg"))))
(is (= (protocols/read-in-region r {:chr "ref2" :start 1 :end 40})
"AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG"))
(is (= (protocols/read-in-region r {:chr "ref2" :start 1 :end 40} {:mask? false})
"AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG"))
(is (= (protocols/read-in-region r {:chr "ref2" :start 1 :end 40} {:mask? true})
"aggttttataaaacaattaagtctacagagcaactacgcg"))))
(testing "reference test with N"
(with-open [r (cseq/twobit-reader test-twobit-n-file)
c (cseq/reader r)]
(are [?region ?opt ?result] (= (cseq/read-sequence r ?region ?opt)
(cseq/read-sequence c ?region ?opt)
?result)
{} {} nil
{:chr "badref"} {} nil
{:chr "ref"} {} "NNNNNGTTAGATAAGATAGCNNTGCTAGTAGGCAGTCNNNNCCAT"
{:chr "ref2"} {} "AGNNNTTATAAAACAATTANNNCTACAGAGCAACTANNNN"
{:chr "ref2"} {:mask? true} "agNNNttataaaacaattaNNNctacagagcaactaNNNN"
{:chr "ref" :start 10} {} "GATAAGATAGCNNTGCTAGTAGGCAGTCNNNNCCAT"
{:chr "ref" :end 10} {} "NNNNNGTTAG"
{:chr "ref" :start -3 :end 0} {} nil
{:chr "ref" :start -3 :end 1} {} "N"
{:chr "ref" :start 46 :end 50} {} nil
{:chr "ref" :start 45 :end 50} {} "T"
{:chr "ref" :start 1 :end 4} {} "NNNN"
{:chr "ref" :start 0 :end 4} {} "NNNNN"
{:chr "ref" :start 41 :end 50} {} "NCCATNNNNN"
{:chr "ref" :start 0 :end 4} {} "NNNN"
{:chr "ref" :start 41 :end 50} {} "NCCAT"
{:chr "ref" :start 1 :end 45} {} "NNNNNGTTAGATAAGATAGCNNTGCTAGTAGGCAGTCNNNNCCAT"
{:chr "ref2" :start 1 :end 40} {} "AGNNNTTATAAAACAATTANNNCTACAGAGCAACTANNNN"
{:chr "ref2" :start 1 :end 40} {:mask? true} "agNNNttataaaacaattaNNNctacagagcaactaNNNN"
Expand Down

0 comments on commit 8cb494b

Please sign in to comment.