diff --git a/src/cljam/bed.clj b/src/cljam/bed.clj index e125c03a..3b1b8f26 100644 --- a/src/cljam/bed.clj +++ b/src/cljam/bed.clj @@ -1,8 +1,31 @@ (ns cljam.bed - (:require [clojure.string :as cstr] + (:require [clojure.java.io :as io] + [clojure.string :as cstr] [cljam.util :as util] [cljam.util.chromosome :as chr-util]) - (:import [java.io BufferedReader BufferedWriter])) + (:import [java.io BufferedReader BufferedWriter Closeable])) + +(defrecord BEDReader [^BufferedReader reader ^String f] + Closeable + (close [this] + (.close ^Closeable (.reader this)))) + +(defrecord BEDWriter [^BufferedWriter writer ^String f] + Closeable + (close [this] + (.close ^Closeable (.writer this)))) + +(defn ^BEDReader reader + "Returns BED file reader of f." + [f] + (let [abs (.getAbsolutePath (io/file f))] + (BEDReader. (io/reader (util/compressor-input-stream abs)) abs))) + +(defn ^BEDWriter writer + "Returns BED file writer of f." + [f] + (let [abs (.getAbsolutePath (io/file f))] + (BEDWriter. (io/writer (util/compressor-output-stream abs)) abs))) (def ^:const bed-columns [:chr :start :end :name :score :strand :thick-start :thick-end :item-rgb :block-count :block-sizes :block-starts]) @@ -32,25 +55,24 @@ "Parse BED fields string and returns a map. Based on information at https://genome.ucsc.edu/FAQ/FAQformat#format1." [^String s] - {:post [(and (:chr %) (:start %) (:end %)) - ;; First 3 fields are required. - (< (:start %) (:end %)) + {:post [;; First 3 fields are required. + (:chr %) (:start %) (:end %) ;; The chromEnd base is not included in the display of the feature. - (every? true? (drop-while false? (map nil? ((apply juxt bed-columns) %)))) + (< (:start %) (:end %)) ;; Lower-numbered fields must be populated if higher-numbered fields are used. - (if-let [s (:score %)] (<= 0 s 1000) true) + (every? true? (drop-while false? (map nil? ((apply juxt bed-columns) %)))) ;; A score between 0 and 1000. + (if-let [s (:score %)] (<= 0 s 1000) true) + ;; The number of items in this list should correspond to blockCount. (if-let [xs (:block-sizes %)] (= (count xs) (:block-count %)) true) ;; The number of items in this list should correspond to blockCount. (if-let [xs (:block-starts %)] (= (count xs) (:block-count %)) true) - ;; The number of items in this list should correspond to blockCount. - (if-let [[f] (:block-starts %)] (= 0 f) true) ;; The first blockStart value must be 0. - (if-let [xs (:block-starts %)] (= (+ (last xs) (last (:block-sizes %))) (- (:end %) (:start %))) true) + (if-let [[f] (:block-starts %)] (= 0 f) true) ;; The final blockStart position plus the final blockSize value must equal chromEnd. - (if-let [xs (:block-starts %)] (apply <= (mapcat (fn [a b] [a (+ a b)]) xs (:block-sizes %))) true) + (if-let [xs (:block-starts %)] (= (+ (last xs) (last (:block-sizes %))) (- (:end %) (:start %))) true) ;; Blocks may not overlap. - ]} + (if-let [xs (:block-starts %)] (apply <= (mapcat (fn [a b] [a (+ a b)]) xs (:block-sizes %))) true)]} (reduce (fn deserialize-bed-reduce-fn [m [k f]] (update-some m k f)) (zipmap bed-columns (cstr/split s #"\s+")) @@ -67,6 +89,24 @@ (defn- serialize-bed "Serialize bed fields into string." [m] + {:pre [;; First 3 fields are required. + (:chr m) (:start m) (:end m) + ;; The chromEnd base is not included in the display of the feature. + (< (:start m) (:end m)) + ;; Lower-numbered fields must be populated if higher-numbered fields are used. + (every? true? (drop-while false? (map nil? ((apply juxt bed-columns) m)))) + ;; A score between 0 and 1000. + (if-let [s (:score m)] (<= 0 s 1000) true) + ;; The number of items in this list should correspond to blockCount. + (if-let [xs (:block-sizes m)] (= (count xs) (:block-count m)) true) + ;; The number of items in this list should correspond to blockCount. + (if-let [xs (:block-starts m)] (= (count xs) (:block-count m)) true) + ;; The first blockStart value must be 0. + (if-let [[f] (:block-starts m)] (= 0 f) true) + ;; The final blockStart position plus the final blockSize value must equal chromEnd. + (if-let [xs (:block-starts m)] (= (+ (last xs) (last (:block-sizes m))) (- (:end m) (:start m))) true) + ;; Blocks may not overlap. + (if-let [xs (:block-starts m)] (apply <= (mapcat (fn [a b] [a (+ a b)]) xs (:block-sizes m))) true)]} (->> (-> m (update-some :strand #(case % :plus "+" :minus "-" :no-strand ".")) (update-some :block-sizes long-list->str) @@ -104,20 +144,20 @@ (defn read-raw-fields "Returns a lazy sequence of unnormalized BED fields." - [^BufferedReader rdr] + [^BEDReader rdr] (sequence (comp (remove header-or-comment?) (map deserialize-bed)) - (line-seq rdr))) + (line-seq (.reader rdr)))) (defn read-fields "Returns a lazy sequence of normalized BED fields." - [^BufferedReader rdr] + [^BEDReader rdr] (sequence (comp (remove header-or-comment?) (map deserialize-bed) (map normalize)) - (line-seq rdr))) + (line-seq (.reader rdr)))) (defn sort-fields "Sort BED fields based on :chr, :start and :end. @@ -148,18 +188,20 @@ (defn write-raw-fields "Write sequence of BED fields to writer without converting :start and :thick-start values." - [^BufferedWriter wtr xs] - (->> xs - (map serialize-bed) - (interpose "\n") - ^String (apply str) - (.write wtr))) + [^BEDWriter wtr xs] + (let [w ^BufferedWriter (.writer wtr)] + (->> xs + (map serialize-bed) + (interpose "\n") + ^String (apply str) + (.write w)))) (defn write-fields "Write sequence of BED fields to writer." - [^BufferedWriter wtr xs] - (->> xs - (map (comp serialize-bed denormalize)) - (interpose "\n") - ^String (apply str) - (.write wtr))) + [^BEDWriter wtr xs] + (let [w ^BufferedWriter (.writer wtr)] + (->> xs + (map (comp serialize-bed denormalize)) + (interpose "\n") + ^String (apply str) + (.write w)))) diff --git a/test-resources/bed/test1.bed.gz b/test-resources/bed/test1.bed.gz new file mode 100644 index 00000000..66cb204c Binary files /dev/null and b/test-resources/bed/test1.bed.gz differ diff --git a/test-resources/bed/test2.bed.bz2 b/test-resources/bed/test2.bed.bz2 new file mode 100644 index 00000000..14c0569e Binary files /dev/null and b/test-resources/bed/test2.bed.bz2 differ diff --git a/test/cljam/t_bed.clj b/test/cljam/t_bed.clj index 9dc0c153..facf694a 100644 --- a/test/cljam/t_bed.clj +++ b/test/cljam/t_bed.clj @@ -8,33 +8,34 @@ [cljam.bam :as bam] [cljam.util.sam-util :as sam-util]) (:import [java.io BufferedReader InputStreamReader ByteArrayInputStream - ByteArrayOutputStream OutputStreamWriter BufferedWriter])) + ByteArrayOutputStream OutputStreamWriter BufferedWriter] + [cljam.bed BEDReader BEDWriter])) (defn- str->bed [^String s] (with-open [bais (ByteArrayInputStream. (.getBytes s)) isr (InputStreamReader. bais) - br (BufferedReader. isr)] + br (bed/BEDReader. (BufferedReader. isr) nil)] (doall (bed/read-fields br)))) (defn- bed->str [xs] (with-open [bao (ByteArrayOutputStream.) osw (OutputStreamWriter. bao) - bw (BufferedWriter. osw)] + bw (bed/BEDWriter. (BufferedWriter. osw) nil)] (bed/write-fields bw xs) - (.flush bw) + (.flush ^BufferedWriter (.writer bw)) (.toString bao))) (defn- raw-str->bed [^String s] (with-open [bais (ByteArrayInputStream. (.getBytes s)) isr (InputStreamReader. bais) br (BufferedReader. isr)] - (doall (bed/read-raw-fields br)))) + (doall (bed/read-raw-fields (bed/BEDReader. br nil))))) (defn- bed->raw-str [xs] (with-open [bao (ByteArrayOutputStream.) osw (OutputStreamWriter. bao) bw (BufferedWriter. osw)] - (bed/write-raw-fields bw xs) + (bed/write-raw-fields (bed/BEDWriter. bw nil) xs) (.flush bw) (.toString bao))) @@ -47,14 +48,42 @@ [{:chr "chr1" :start 1 :end 100 :name "N" :score 0 :strand :plus :thick-start 1 :thick-end 0 :item-rgb "255,0,0" :block-count 2 :block-sizes [10 90] :block-starts [0 10]}])) - (with-open [r (io/reader test-bed-file1)] + (with-open [r (bed/reader test-bed-file1)] (is (= (bed/read-fields r) [{:chr "chr22" :start 1001 :end 5000 :name "cloneA" :score 960 :strand :plus :thick-start 1001 :thick-end 5000 :item-rgb "0" :block-count 2 :block-sizes [567 488] :block-starts [0 3512]} {:chr "chr22" :start 2001 :end 6000 :name "cloneB" :score 900 :strand :minus :thick-start 2001 :thick-end 6000 :item-rgb "0" :block-count 2 :block-sizes [433 399] :block-starts [0 3601]}]))) - (with-open [r (io/reader test-bed-file2)] + (with-open [r (bed/reader test-bed-file1-gz)] + (is (= (bed/read-fields r) + [{:chr "chr22" :start 1001 :end 5000 :name "cloneA" :score 960 :strand :plus :thick-start 1001 + :thick-end 5000 :item-rgb "0" :block-count 2 :block-sizes [567 488] :block-starts [0 3512]} + {:chr "chr22" :start 2001 :end 6000 :name "cloneB" :score 900 :strand :minus :thick-start 2001 + :thick-end 6000 :item-rgb "0" :block-count 2 :block-sizes [433 399] :block-starts [0 3601]}]))) + + (with-open [r (bed/reader test-bed-file2)] + (is (= (bed/read-fields r) + [{:chr "chr7" :start 127471197 :end 127472363 :name "Pos1" :score 0 :strand :plus :thick-start 127471197 + :thick-end 127472363 :item-rgb "255,0,0"} + {:chr "chr7" :start 127472364 :end 127473530 :name "Pos2" :score 0 :strand :plus :thick-start 127472364 + :thick-end 127473530 :item-rgb "255,0,0"} + {:chr "chr7" :start 127473531 :end 127474697 :name "Pos3" :score 0 :strand :plus :thick-start 127473531 + :thick-end 127474697 :item-rgb "255,0,0"} + {:chr "chr7" :start 127474698 :end 127475864 :name "Pos4" :score 0 :strand :plus :thick-start 127474698 + :thick-end 127475864 :item-rgb "255,0,0"} + {:chr "chr7" :start 127475865 :end 127477031 :name "Neg1" :score 0 :strand :minus :thick-start 127475865 + :thick-end 127477031 :item-rgb "0,0,255"} + {:chr "chr7" :start 127477032 :end 127478198 :name "Neg2" :score 0 :strand :minus :thick-start 127477032 + :thick-end 127478198 :item-rgb "0,0,255"} + {:chr "chr7" :start 127478199 :end 127479365 :name "Neg3" :score 0 :strand :minus :thick-start 127478199 + :thick-end 127479365 :item-rgb "0,0,255"} + {:chr "chr7" :start 127479366 :end 127480532 :name "Pos5" :score 0 :strand :plus :thick-start 127479366 + :thick-end 127480532 :item-rgb "255,0,0"} + {:chr "chr7" :start 127480533 :end 127481699 :name "Neg4" :score 0 :strand :minus :thick-start 127480533 + :thick-end 127481699 :item-rgb "0,0,255"}]))) + + (with-open [r (bed/reader test-bed-file2-bz2)] (is (= (bed/read-fields r) [{:chr "chr7" :start 127471197 :end 127472363 :name "Pos1" :score 0 :strand :plus :thick-start 127471197 :thick-end 127472363 :item-rgb "255,0,0"} @@ -75,7 +104,7 @@ {:chr "chr7" :start 127480533 :end 127481699 :name "Neg4" :score 0 :strand :minus :thick-start 127480533 :thick-end 127481699 :item-rgb "0,0,255"}]))) - (with-open [r (io/reader test-bed-file3)] + (with-open [r (bed/reader test-bed-file3)] (is (= (bed/read-fields r) [{:chr "chr7" :start 127471197 :end 127472363 :name "Pos1" :score 0 :strand :plus} {:chr "chr7" :start 127472364 :end 127473530 :name "Pos2" :score 0 :strand :plus} @@ -174,9 +203,35 @@ (is (= (bed->str (str->bed "1 0 1")) "chr1 0 1")) (is (= (bed->str (str->bed "1 0 10")) "chr1 0 10")) (is (= (bed->str (str->bed "1 0 1\n1 1 2")) "chr1 0 1\nchr1 1 2")) - (is (= (with-open [r (io/reader test-bed-file1)] (str->bed (bed->str (bed/read-fields r)))) - (with-open [r (io/reader test-bed-file1)] (doall (bed/read-fields r))))) - (is (= (with-open [r (io/reader test-bed-file2)] (str->bed (bed->str (bed/read-fields r)))) - (with-open [r (io/reader test-bed-file2)] (doall (bed/read-fields r))))) - (is (= (with-open [r (io/reader test-bed-file3)] (str->bed (bed->str (bed/read-fields r)))) - (with-open [r (io/reader test-bed-file3)] (doall (bed/read-fields r)))))) + (is (= (with-open [r (bed/reader test-bed-file1)] (str->bed (bed->str (bed/read-fields r)))) + (with-open [r (bed/reader test-bed-file1)] (doall (bed/read-fields r))))) + (is (= (with-open [r (bed/reader test-bed-file1-gz)] (str->bed (bed->str (bed/read-fields r)))) + (with-open [r (bed/reader test-bed-file1-gz)] (doall (bed/read-fields r))))) + (is (= (with-open [r (bed/reader test-bed-file2)] (str->bed (bed->str (bed/read-fields r)))) + (with-open [r (bed/reader test-bed-file2)] (doall (bed/read-fields r))))) + (is (= (with-open [r (bed/reader test-bed-file2-bz2)] (str->bed (bed->str (bed/read-fields r)))) + (with-open [r (bed/reader test-bed-file2-bz2)] (doall (bed/read-fields r))))) + (is (= (with-open [r (bed/reader test-bed-file3)] (str->bed (bed->str (bed/read-fields r)))) + (with-open [r (bed/reader test-bed-file3)] (doall (bed/read-fields r))))) + + (with-before-after {:before (prepare-cache!) + :after (clean-cache!)} + (let [temp-file (str temp-dir "/test1.bed") + temp-file-gz (str temp-dir "/test1.bed.gz") + temp-file-bz2 (str temp-dir "/test1.bed.bz2") + xs (with-open [r (bed/reader test-bed-file1)] (doall (bed/read-fields r)))] + (with-open [wtr1 (bed/writer temp-file) + wtr2 (bed/writer temp-file-gz) + wtr3 (bed/writer temp-file-bz2)] + (bed/write-fields wtr1 xs) + (bed/write-fields wtr2 xs) + (bed/write-fields wtr3 xs)) + (with-open [rdr1 (bed/reader temp-file) + rdr2 (bed/reader temp-file-gz) + rdr3 (bed/reader temp-file-bz2)] + (is (= xs + (bed/read-fields rdr1))) + (is (= xs + (bed/read-fields rdr2))) + (is (= xs + (bed/read-fields rdr3))))))) diff --git a/test/cljam/t_common.clj b/test/cljam/t_common.clj index a54d73e6..dd17cc3e 100644 --- a/test/cljam/t_common.clj +++ b/test/cljam/t_common.clj @@ -146,6 +146,8 @@ (def test-bed-file1 "test-resources/bed/test1.bed") (def test-bed-file2 "test-resources/bed/test2.bed") (def test-bed-file3 "test-resources/bed/test3.bed") +(def test-bed-file1-gz "test-resources/bed/test1.bed.gz") +(def test-bed-file2-bz2 "test-resources/bed/test2.bed.bz2") ;; ### TABIX files