Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BED reader/writer #73

Merged
merged 4 commits into from May 10, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
98 changes: 70 additions & 28 deletions src/cljam/bed.clj
@@ -1,8 +1,31 @@
(ns cljam.bed
(:require [clojure.string :as cstr]
(:require [clojure.java.io :as io]
[clojure.string :as cstr]
[cljam.util :as util]
[cljam.util.chromosome :as chr-util])
(:import [java.io BufferedReader BufferedWriter]))
(:import [java.io BufferedReader BufferedWriter Closeable]))

(defrecord BEDReader [^BufferedReader reader ^String f]
Closeable
(close [this]
(.close ^Closeable (.reader this))))

(defrecord BEDWriter [^BufferedWriter writer ^String f]
Closeable
(close [this]
(.close ^Closeable (.writer this))))

(defn ^BEDReader reader
"Returns BED file reader of f."
[f]
(let [abs (.getAbsolutePath (io/file f))]
(BEDReader. (io/reader (util/compressor-input-stream abs)) abs)))

(defn ^BEDWriter writer
"Returns BED file writer of f."
[f]
(let [abs (.getAbsolutePath (io/file f))]
(BEDWriter. (io/writer (util/compressor-output-stream abs)) abs)))

(def ^:const bed-columns
[:chr :start :end :name :score :strand :thick-start :thick-end :item-rgb :block-count :block-sizes :block-starts])
Expand Down Expand Up @@ -32,25 +55,24 @@
"Parse BED fields string and returns a map.
Based on information at https://genome.ucsc.edu/FAQ/FAQformat#format1."
[^String s]
{:post [(and (:chr %) (:start %) (:end %))
;; First 3 fields are required.
(< (:start %) (:end %))
{:post [;; First 3 fields are required.
(:chr %) (:start %) (:end %)
;; The chromEnd base is not included in the display of the feature.
(every? true? (drop-while false? (map nil? ((apply juxt bed-columns) %))))
(< (:start %) (:end %))
;; Lower-numbered fields must be populated if higher-numbered fields are used.
(if-let [s (:score %)] (<= 0 s 1000) true)
(every? true? (drop-while false? (map nil? ((apply juxt bed-columns) %))))
;; A score between 0 and 1000.
(if-let [s (:score %)] (<= 0 s 1000) true)
;; The number of items in this list should correspond to blockCount.
(if-let [xs (:block-sizes %)] (= (count xs) (:block-count %)) true)
;; The number of items in this list should correspond to blockCount.
(if-let [xs (:block-starts %)] (= (count xs) (:block-count %)) true)
;; The number of items in this list should correspond to blockCount.
(if-let [[f] (:block-starts %)] (= 0 f) true)
;; The first blockStart value must be 0.
(if-let [xs (:block-starts %)] (= (+ (last xs) (last (:block-sizes %))) (- (:end %) (:start %))) true)
(if-let [[f] (:block-starts %)] (= 0 f) true)
;; The final blockStart position plus the final blockSize value must equal chromEnd.
(if-let [xs (:block-starts %)] (apply <= (mapcat (fn [a b] [a (+ a b)]) xs (:block-sizes %))) true)
(if-let [xs (:block-starts %)] (= (+ (last xs) (last (:block-sizes %))) (- (:end %) (:start %))) true)
;; Blocks may not overlap.
]}
(if-let [xs (:block-starts %)] (apply <= (mapcat (fn [a b] [a (+ a b)]) xs (:block-sizes %))) true)]}
(reduce
(fn deserialize-bed-reduce-fn [m [k f]] (update-some m k f))
(zipmap bed-columns (cstr/split s #"\s+"))
Expand All @@ -67,6 +89,24 @@
(defn- serialize-bed
"Serialize bed fields into string."
[m]
{:pre [;; First 3 fields are required.
(:chr m) (:start m) (:end m)
;; The chromEnd base is not included in the display of the feature.
(< (:start m) (:end m))
;; Lower-numbered fields must be populated if higher-numbered fields are used.
(every? true? (drop-while false? (map nil? ((apply juxt bed-columns) m))))
;; A score between 0 and 1000.
(if-let [s (:score m)] (<= 0 s 1000) true)
;; The number of items in this list should correspond to blockCount.
(if-let [xs (:block-sizes m)] (= (count xs) (:block-count m)) true)
;; The number of items in this list should correspond to blockCount.
(if-let [xs (:block-starts m)] (= (count xs) (:block-count m)) true)
;; The first blockStart value must be 0.
(if-let [[f] (:block-starts m)] (= 0 f) true)
;; The final blockStart position plus the final blockSize value must equal chromEnd.
(if-let [xs (:block-starts m)] (= (+ (last xs) (last (:block-sizes m))) (- (:end m) (:start m))) true)
;; Blocks may not overlap.
(if-let [xs (:block-starts m)] (apply <= (mapcat (fn [a b] [a (+ a b)]) xs (:block-sizes m))) true)]}
(->> (-> m
(update-some :strand #(case % :plus "+" :minus "-" :no-strand "."))
(update-some :block-sizes long-list->str)
Expand Down Expand Up @@ -104,20 +144,20 @@

(defn read-raw-fields
"Returns a lazy sequence of unnormalized BED fields."
[^BufferedReader rdr]
[^BEDReader rdr]
(sequence
(comp (remove header-or-comment?)
(map deserialize-bed))
(line-seq rdr)))
(line-seq (.reader rdr))))

(defn read-fields
"Returns a lazy sequence of normalized BED fields."
[^BufferedReader rdr]
[^BEDReader rdr]
(sequence
(comp (remove header-or-comment?)
(map deserialize-bed)
(map normalize))
(line-seq rdr)))
(line-seq (.reader rdr))))

(defn sort-fields
"Sort BED fields based on :chr, :start and :end.
Expand Down Expand Up @@ -148,18 +188,20 @@

(defn write-raw-fields
"Write sequence of BED fields to writer without converting :start and :thick-start values."
[^BufferedWriter wtr xs]
(->> xs
(map serialize-bed)
(interpose "\n")
^String (apply str)
(.write wtr)))
[^BEDWriter wtr xs]
(let [w ^BufferedWriter (.writer wtr)]
(->> xs
(map serialize-bed)
(interpose "\n")
^String (apply str)
(.write w))))

(defn write-fields
"Write sequence of BED fields to writer."
[^BufferedWriter wtr xs]
(->> xs
(map (comp serialize-bed denormalize))
(interpose "\n")
^String (apply str)
(.write wtr)))
[^BEDWriter wtr xs]
(let [w ^BufferedWriter (.writer wtr)]
(->> xs
(map (comp serialize-bed denormalize))
(interpose "\n")
^String (apply str)
(.write w))))
Binary file added test-resources/bed/test1.bed.gz
Binary file not shown.
Binary file added test-resources/bed/test2.bed.bz2
Binary file not shown.
85 changes: 70 additions & 15 deletions test/cljam/t_bed.clj
Expand Up @@ -8,33 +8,34 @@
[cljam.bam :as bam]
[cljam.util.sam-util :as sam-util])
(:import [java.io BufferedReader InputStreamReader ByteArrayInputStream
ByteArrayOutputStream OutputStreamWriter BufferedWriter]))
ByteArrayOutputStream OutputStreamWriter BufferedWriter]
[cljam.bed BEDReader BEDWriter]))

(defn- str->bed [^String s]
(with-open [bais (ByteArrayInputStream. (.getBytes s))
isr (InputStreamReader. bais)
br (BufferedReader. isr)]
br (bed/BEDReader. (BufferedReader. isr) nil)]
(doall (bed/read-fields br))))

(defn- bed->str [xs]
(with-open [bao (ByteArrayOutputStream.)
osw (OutputStreamWriter. bao)
bw (BufferedWriter. osw)]
bw (bed/BEDWriter. (BufferedWriter. osw) nil)]
(bed/write-fields bw xs)
(.flush bw)
(.flush ^BufferedWriter (.writer bw))
(.toString bao)))

(defn- raw-str->bed [^String s]
(with-open [bais (ByteArrayInputStream. (.getBytes s))
isr (InputStreamReader. bais)
br (BufferedReader. isr)]
(doall (bed/read-raw-fields br))))
(doall (bed/read-raw-fields (bed/BEDReader. br nil)))))

(defn- bed->raw-str [xs]
(with-open [bao (ByteArrayOutputStream.)
osw (OutputStreamWriter. bao)
bw (BufferedWriter. osw)]
(bed/write-raw-fields bw xs)
(bed/write-raw-fields (bed/BEDWriter. bw nil) xs)
(.flush bw)
(.toString bao)))

Expand All @@ -47,14 +48,42 @@
[{:chr "chr1" :start 1 :end 100 :name "N" :score 0 :strand :plus :thick-start 1 :thick-end 0
:item-rgb "255,0,0" :block-count 2 :block-sizes [10 90] :block-starts [0 10]}]))

(with-open [r (io/reader test-bed-file1)]
(with-open [r (bed/reader test-bed-file1)]
(is (= (bed/read-fields r)
[{:chr "chr22" :start 1001 :end 5000 :name "cloneA" :score 960 :strand :plus :thick-start 1001
:thick-end 5000 :item-rgb "0" :block-count 2 :block-sizes [567 488] :block-starts [0 3512]}
{:chr "chr22" :start 2001 :end 6000 :name "cloneB" :score 900 :strand :minus :thick-start 2001
:thick-end 6000 :item-rgb "0" :block-count 2 :block-sizes [433 399] :block-starts [0 3601]}])))

(with-open [r (io/reader test-bed-file2)]
(with-open [r (bed/reader test-bed-file1-gz)]
(is (= (bed/read-fields r)
[{:chr "chr22" :start 1001 :end 5000 :name "cloneA" :score 960 :strand :plus :thick-start 1001
:thick-end 5000 :item-rgb "0" :block-count 2 :block-sizes [567 488] :block-starts [0 3512]}
{:chr "chr22" :start 2001 :end 6000 :name "cloneB" :score 900 :strand :minus :thick-start 2001
:thick-end 6000 :item-rgb "0" :block-count 2 :block-sizes [433 399] :block-starts [0 3601]}])))

(with-open [r (bed/reader test-bed-file2)]
(is (= (bed/read-fields r)
[{:chr "chr7" :start 127471197 :end 127472363 :name "Pos1" :score 0 :strand :plus :thick-start 127471197
:thick-end 127472363 :item-rgb "255,0,0"}
{:chr "chr7" :start 127472364 :end 127473530 :name "Pos2" :score 0 :strand :plus :thick-start 127472364
:thick-end 127473530 :item-rgb "255,0,0"}
{:chr "chr7" :start 127473531 :end 127474697 :name "Pos3" :score 0 :strand :plus :thick-start 127473531
:thick-end 127474697 :item-rgb "255,0,0"}
{:chr "chr7" :start 127474698 :end 127475864 :name "Pos4" :score 0 :strand :plus :thick-start 127474698
:thick-end 127475864 :item-rgb "255,0,0"}
{:chr "chr7" :start 127475865 :end 127477031 :name "Neg1" :score 0 :strand :minus :thick-start 127475865
:thick-end 127477031 :item-rgb "0,0,255"}
{:chr "chr7" :start 127477032 :end 127478198 :name "Neg2" :score 0 :strand :minus :thick-start 127477032
:thick-end 127478198 :item-rgb "0,0,255"}
{:chr "chr7" :start 127478199 :end 127479365 :name "Neg3" :score 0 :strand :minus :thick-start 127478199
:thick-end 127479365 :item-rgb "0,0,255"}
{:chr "chr7" :start 127479366 :end 127480532 :name "Pos5" :score 0 :strand :plus :thick-start 127479366
:thick-end 127480532 :item-rgb "255,0,0"}
{:chr "chr7" :start 127480533 :end 127481699 :name "Neg4" :score 0 :strand :minus :thick-start 127480533
:thick-end 127481699 :item-rgb "0,0,255"}])))

(with-open [r (bed/reader test-bed-file2-bz2)]
(is (= (bed/read-fields r)
[{:chr "chr7" :start 127471197 :end 127472363 :name "Pos1" :score 0 :strand :plus :thick-start 127471197
:thick-end 127472363 :item-rgb "255,0,0"}
Expand All @@ -75,7 +104,7 @@
{:chr "chr7" :start 127480533 :end 127481699 :name "Neg4" :score 0 :strand :minus :thick-start 127480533
:thick-end 127481699 :item-rgb "0,0,255"}])))

(with-open [r (io/reader test-bed-file3)]
(with-open [r (bed/reader test-bed-file3)]
(is (= (bed/read-fields r)
[{:chr "chr7" :start 127471197 :end 127472363 :name "Pos1" :score 0 :strand :plus}
{:chr "chr7" :start 127472364 :end 127473530 :name "Pos2" :score 0 :strand :plus}
Expand Down Expand Up @@ -174,9 +203,35 @@
(is (= (bed->str (str->bed "1 0 1")) "chr1 0 1"))
(is (= (bed->str (str->bed "1 0 10")) "chr1 0 10"))
(is (= (bed->str (str->bed "1 0 1\n1 1 2")) "chr1 0 1\nchr1 1 2"))
(is (= (with-open [r (io/reader test-bed-file1)] (str->bed (bed->str (bed/read-fields r))))
(with-open [r (io/reader test-bed-file1)] (doall (bed/read-fields r)))))
(is (= (with-open [r (io/reader test-bed-file2)] (str->bed (bed->str (bed/read-fields r))))
(with-open [r (io/reader test-bed-file2)] (doall (bed/read-fields r)))))
(is (= (with-open [r (io/reader test-bed-file3)] (str->bed (bed->str (bed/read-fields r))))
(with-open [r (io/reader test-bed-file3)] (doall (bed/read-fields r))))))
(is (= (with-open [r (bed/reader test-bed-file1)] (str->bed (bed->str (bed/read-fields r))))
(with-open [r (bed/reader test-bed-file1)] (doall (bed/read-fields r)))))
(is (= (with-open [r (bed/reader test-bed-file1-gz)] (str->bed (bed->str (bed/read-fields r))))
(with-open [r (bed/reader test-bed-file1-gz)] (doall (bed/read-fields r)))))
(is (= (with-open [r (bed/reader test-bed-file2)] (str->bed (bed->str (bed/read-fields r))))
(with-open [r (bed/reader test-bed-file2)] (doall (bed/read-fields r)))))
(is (= (with-open [r (bed/reader test-bed-file2-bz2)] (str->bed (bed->str (bed/read-fields r))))
(with-open [r (bed/reader test-bed-file2-bz2)] (doall (bed/read-fields r)))))
(is (= (with-open [r (bed/reader test-bed-file3)] (str->bed (bed->str (bed/read-fields r))))
(with-open [r (bed/reader test-bed-file3)] (doall (bed/read-fields r)))))

(with-before-after {:before (prepare-cache!)
:after (clean-cache!)}
(let [temp-file (str temp-dir "/test1.bed")
temp-file-gz (str temp-dir "/test1.bed.gz")
temp-file-bz2 (str temp-dir "/test1.bed.bz2")
xs (with-open [r (bed/reader test-bed-file1)] (doall (bed/read-fields r)))]
(with-open [wtr1 (bed/writer temp-file)
wtr2 (bed/writer temp-file-gz)
wtr3 (bed/writer temp-file-bz2)]
(bed/write-fields wtr1 xs)
(bed/write-fields wtr2 xs)
(bed/write-fields wtr3 xs))
(with-open [rdr1 (bed/reader temp-file)
rdr2 (bed/reader temp-file-gz)
rdr3 (bed/reader temp-file-bz2)]
(is (= xs
(bed/read-fields rdr1)))
(is (= xs
(bed/read-fields rdr2)))
(is (= xs
(bed/read-fields rdr3)))))))
2 changes: 2 additions & 0 deletions test/cljam/t_common.clj
Expand Up @@ -146,6 +146,8 @@
(def test-bed-file1 "test-resources/bed/test1.bed")
(def test-bed-file2 "test-resources/bed/test2.bed")
(def test-bed-file3 "test-resources/bed/test3.bed")
(def test-bed-file1-gz "test-resources/bed/test1.bed.gz")
(def test-bed-file2-bz2 "test-resources/bed/test2.bed.bz2")

;; ### TABIX files

Expand Down