Skip to content

Commit

Permalink
Starting to work fairly well. Large sequences can be processed withou…
Browse files Browse the repository at this point in the history
…t running out of heap.
  • Loading branch information
John Jacobsen committed Jul 6, 2013
1 parent 44e63bc commit bfe8379
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 13 deletions.
Binary file added resources/sacCer3.2bit
Binary file not shown.
11 changes: 11 additions & 0 deletions src/jenome/rafile.clj
@@ -0,0 +1,11 @@
(ns jenome.rafile
(:import (java.io RandomAccessFile)))


(defn read-with-offset [fname offset len]
(let [raf (RandomAccessFile. fname "r")
bb (byte-array len)]
(.seek raf offset)
(.readFully raf bb)
(.close raf)
bb))
146 changes: 146 additions & 0 deletions src/jenome/sequential.clj
@@ -0,0 +1,146 @@
(ns jenome.sequential
"
Decode genome (human or otherwise) in 2-bit format as documented at:
http://genome.ucsc.edu/FAQ/FAQformat#format7
Sequential implementation, file must be processed end-to-end.
"
(:gen-class)
(:use midje.sweet
jenome.core
[clojure.math.numeric-tower :only [ceil]]
[clojure.java.io :only [resource]]
[gloss.core :only [defcodec repeated]]
[gloss.io :only [decode decode-all]]
[jenome.rafile :only [read-with-offset]]))


(defcodec u32 :uint32-le)
(defcodec hdr-codec (repeat 4 u32))

(defn get-bytes
[n inf]
(let [buf (byte-array n)]
(.read inf buf)
buf))

(defn get-sequence-count-from-file-header
[infile]
(let [hdr-bytes (get-bytes 16 infile)
[signature ver cnt reserved] (decode hdr-codec hdr-bytes)]
(assert (= signature 0x1A412743))
(assert (= ver reserved 0))
cnt))

(defn bytes-to-str
[bytes]
(apply str (map (comp char int) bytes)))

(defn read32
[infile]
(decode u32 (get-bytes 4 infile)))

(defn getwords
[n infile]
(decode-all u32 (get-bytes (* n 4) infile)))

(defn decode-sequence-block-header
[infile]
(let [[name-len & _] (get-bytes 1 infile)
seqname (bytes-to-str (get-bytes name-len infile))
offset (read32 infile)]
[seqname offset]))

(defn decode-sequence-fields
[infile]
(let [dna-size (read32 infile)
n-block-count (read32 infile)
n-block-starts (getwords n-block-count infile)
n-block-sizes (getwords n-block-count infile)
mask-block-count (read32 infile)
mask-block-starts (getwords mask-block-count infile)
mask-block-sizes (getwords mask-block-count infile)
reserved (read32 infile)]

(assert (monotonic? n-block-starts))
(assert (monotonic? mask-block-starts))
(assert (= reserved 0))
{:dna-size dna-size
:n-block-count n-block-count
:mask-block-count mask-block-count}))



(defn partition-buffer-sizes
"Return buffer sizes required to cleanly read a total of n bytes no more than m at a time"
[n m]
(let [remainder (mod n m)]
(if (zero? remainder)
(repeat (/ n m) m)
(concat (repeat (dec (/ n m)) m) [remainder]))))


(defn sequence-index [infile seqcount]
(for [i (range seqcount)]
(decode-sequence-block-header infile)))


(defn decode-genome
"
Make a lazy sequence of base pairs from a given .2bit genome file
"
([fname] (decode-genome 100000 fname))
([blocksiz fname]
(apply concat
(let [infile (clojure.java.io/input-stream fname)
seqcount (get-sequence-count-from-file-header infile)
;; Don't care about the index, but need to read the
;; bytes to get to the right position:
_ (doall (sequence-index infile seqcount))]
(for [i (range seqcount)
:let [header (decode-sequence-fields infile)
dna-size (:dna-size header)
dna-bytes (rounding-up-divide dna-size 4)
read-sizes (partition-buffer-sizes dna-bytes blocksiz)]
r read-sizes
:let [inner (get-bytes r infile)]
b inner]
(byte-to-base-pairs b))))))


(defn str-with-commas [n]
(->> n
str
reverse
(partition-all 3)
(interpose \,)
flatten
reverse
(apply str)))


(defn printing-counter
([s] (printing-counter 1000N s))
([ival s]
(loop [c 0
s s]
(if-not (seq s)
(println "\nTotal:" (str-with-commas c))
(do
(if (zero? (rem c ival)) (print (str "\r" (str-with-commas c))))
(recur (inc' c) (rest s)))))))


(defn -main-old [& args]
(let [[filename & extra] args]
(cond
(seq extra) (println "unknown extra argument(s)" extra)
(nil? filename) (println "expected file name argument (2bit genome format)")
:else
(do
(println (->> (first args)
decode-genome
(take 1000)
(map name)
(apply str)) "...")
(printing-counter (decode-genome (first args)))))))
20 changes: 7 additions & 13 deletions test/jenome/test_core.clj
Expand Up @@ -7,35 +7,29 @@
(facts "about utility functions"
(map nybs-to-bases [0 1 2 3]) => [:T :C :A :G]
(map nybs-to-bases [0 1 2 3]) => [:T :C :A :G]
(byte-to-base-pair 0X1B) => [:T :C :A :G]
(byte-to-base-pairs 0X1B) => [:T :C :A :G]
(rounding-up-divide 4 5) => 1
(rounding-up-divide 5 5) => 1
(rounding-up-divide 6 5) => 2
(rounding-up-divide 10 5) => 2
(rounding-up-divide 11 5) => 3
(partition-buffer-sizes 100 100) => [100]
(partition-buffer-sizes 200 100) => [100 100]
(partition-buffer-sizes 201 100) => [100 100 1])


(facts "about showing numbers with commas"
(str-with-commas 12345678) => "12,345,678")
(rounding-up-divide 11 5) => 3)


(defn yeast-section [from to]
(->> (resource "sacCer3.2bit")
decode-genome
as-file
genome-sequence
(drop from)
(take (- to from 1))
(map name)
(apply str)))


(facts "about example yeast Genome file"
; (count (sequence-index (clojure.java.io/input-stream (resource "sacCer3.2bit")))) => 17
(.exists (as-file (resource "sacCer3.2bit"))) => true
(->> (resource "sacCer3.2bit")
decode-genome
(->> (resource "sacCer3.2bit")
as-file
genome-sequence
(take 10)
(map name)
(apply str)) => "CCACACCACA"
Expand Down
42 changes: 42 additions & 0 deletions test/jenome/test_rafile.clj
@@ -0,0 +1,42 @@
(ns jenome.test-rafile
(:use [midje.sweet]
[jenome.rafile]
[clojure.java.io :only [as-file output-stream delete-file]]))


(defn rollover-byte-array [siz]
(let [ba (byte-array siz)]
(doseq [i (range siz)]
(aset-byte ba i (byte (rem i 128))))
ba))


(defn make-tmp-binary-file
[file-path siz]
(with-open [outfile (output-stream file-path)]
(.write outfile (rollover-byte-array siz))))


(defmacro with-tmp-binary-file [fname siz & body]
`(do
(make-tmp-binary-file ~fname ~siz)
(let [result# (do ~@body)]
(delete-file ~fname)
result#)))


(fact "with-tmp-binary-file works"
(let [fname "/tmp/adflkhadf"
f (as-file fname)
siz 120389]
(with-tmp-binary-file fname siz
(.exists f) => true
(.length f) => siz)
(.exists f) => false))


(fact "can read a file with random access, getting the right stuff out"
(let [fname "/tmp/sadklhjasdflkhs"]
(with-tmp-binary-file fname (+ 12800 128)
(into [] (read-with-offset fname 12800 128))
=> (range 128))))
14 changes: 14 additions & 0 deletions test/jenome/test_sequential.clj
@@ -0,0 +1,14 @@
(ns jenome.test-sequential
(:use [midje.sweet]
[clojure.java.io :only [as-file resource]]
[jenome.sequential]))


(facts "about partition-buffer-sizes"
(partition-buffer-sizes 100 100) => [100]
(partition-buffer-sizes 200 100) => [100 100]
(partition-buffer-sizes 201 100) => [100 100 1])


(facts "about showing numbers with commas"
(str-with-commas 12345678) => "12,345,678")

0 comments on commit bfe8379

Please sign in to comment.