Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add some BED manipulation APIs #131

Merged
merged 9 commits into from May 31, 2018
90 changes: 90 additions & 0 deletions src/cljam/io/bed.clj
Expand Up @@ -208,6 +208,96 @@
0
(sort-fields xs)))

(defn- fields-<=
"Compare two BED fields on the basis of :chr and :end fields."
[x y]
(<= (compare [(chr/chromosome-order-key (:chr x)) (:end x)]
[(chr/chromosome-order-key (:chr y)) (:end y)])
0))

(defn intersect-fields
"Returns a lazy sequence that is the intersection of the two BED sequences.

The input sequences will first be sorted with sort-fields, which may cause
an extensive memory use for ones with a large number of elements.
Note also that this function assumes the input sequences contain only valid
regions, and thus :start <= :end holds for each region. Make sure yourself
the input sequences meet the condition, or the function may return a wrong
result."
[xs ys]
(letfn [(intersect [xs ys]
(lazy-seq
(if-let [x (first xs)]
(if-let [y (first ys)]
(cond->> (if (fields-<= x y)
(intersect (next xs) ys)
(intersect xs (next ys)))
(region/overlapped-regions? x y)
(cons (-> x
(update :start max (:start y))
(update :end min (:end y)))))
[])
[])))]
(intersect (sort-fields xs) (merge-fields ys))))

(defn subtract-fields
"Returns a lazy sequence that is the result of subtracting the BED fields
in the sequence ys from the sequence xs.

The input sequences will first be sorted with sort-fields, which may cause
an extensive memory use for ones with a large number of elements.
Note also that this function assumes the input sequences contain only valid
regions, and thus :start <= :end holds for each region. Make sure yourself
the input sequences meet the condition, or the function may return a wrong
result."
[xs ys]
(letfn [(subtract [xs ys]
(lazy-seq
(if-let [x (first xs)]
(if-let [y (first ys)]
(let [[r1 r2] (region/subtract-region x y)]
(if r2
(cons r1 (subtract (cons r2 (next xs)) (next ys)))
(if r1
(if (fields-<= r1 y)
(cons r1 (subtract (next xs) ys))
(subtract (cons r1 (next xs)) (next ys)))
(subtract (next xs) ys))))
xs)
[])))]
(subtract (sort-fields xs) (merge-fields ys))))

(defn complement-fields
"Takes a sequence of maps containing :name and :len keys (representing
chromosome's name and length, resp.) and a sequence of BED fields,
and returns a lazy sequence that is the complement of the BED sequence.

The input sequence will first be sorted with sort-fields, which may cause
an extensive memory use for ones with a large number of elements.
Note also that this function assumes the BED sequence contains only valid
regions, and thus :start <= :end holds for each region. Make sure yourself
the BED sequence meets the condition, or the function may return a wrong
result."
[refs xs]
(let [chr->len (into {} (map (juxt :name :len)) refs)
chrs (sort-by chr/chromosome-order-key (map :name refs))]
(when-first [{:keys [chr]} (filter #(not (chr->len (:chr %))) xs)]
(let [msg (str "Length of chromosome " chr " not specified")]
(throw (IllegalArgumentException. msg))))
(letfn [(complement [xs chrs pos]
(lazy-seq
(when-let [chr (first chrs)]
(let [len (get chr->len chr)
x (first xs)]
(if (and x (= (:chr x) chr))
(cond->> (complement (next xs) chrs (inc (:end x)))
(< pos (:start x))
(cons {:chr chr :start pos :end (dec (:start x))}))
(cond->> (complement xs (next chrs) 1)
(< pos len)
(cons {:chr chr :start pos :end len})))))))]
(complement (merge-fields xs) chrs 1))))

(defn write-raw-fields
"Write sequence of BED fields to writer without converting :start and :thick-start values."
[^BEDWriter wtr xs]
Expand Down
10 changes: 10 additions & 0 deletions src/cljam/util/region.clj
Expand Up @@ -3,6 +3,16 @@
(:require [clojure.string :as cstr]
[proton.core :as proton]))

;;; region predicates
;;; ----------

(defn overlapped-regions?
"Returns true if two regions are overlapped with each other."
[x y]
(and (= (:chr x) (:chr y))
(<= (:start y) (:end x))
(<= (:start x) (:end y))))

;;; region conversion
;;; ----------

Expand Down
98 changes: 98 additions & 0 deletions test/cljam/io/bed_test.clj
Expand Up @@ -172,6 +172,104 @@
(is (= (bed/merge-fields [{:chr "chr1" :start 1 :end 10 :name "chr1:1-10"} {:chr "chr1" :start 4 :end 13 :name "chr1:4-13"}])
[{:chr "chr1" :start 1 :end 13 :name "chr1:1-10+chr1:4-13"}])))

(deftest bed-intersect
(are [?xs ?ys ?result] (= ?result (bed/intersect-fields ?xs ?ys))
[]
[]
[]

[{:chr "chr1" :start 10 :end 40}]
[]
[]

[]
[{:chr "chr1" :start 10 :end 40}]
[]

[{:chr "chr1" :start 10 :end 40}]
[{:chr "chr1" :start 50 :end 60}]
[]

[{:chr "chr1" :start 10 :end 40} {:chr "chr1" :start 50 :end 80}]
[{:chr "chr1" :start 5 :end 20}]
[{:chr "chr1" :start 10 :end 20}]

[{:chr "chr1" :start 10 :end 40} {:chr "chr1" :start 50 :end 80}]
[{:chr "chr1" :start 20 :end 30}]
[{:chr "chr1" :start 20 :end 30}]

[{:chr "chr1" :start 10 :end 40} {:chr "chr1" :start 50 :end 80}]
[{:chr "chr1" :start 30 :end 60} {:chr "chr1" :start 70 :end 90}]
[{:chr "chr1" :start 30 :end 40} {:chr "chr1" :start 50 :end 60} {:chr "chr1" :start 70 :end 80}]

[{:chr "chr1" :start 10 :end 40} {:chr "chr1" :start 50 :end 80} {:chr "chr2" :start 10 :end 40}]
[{:chr "chr1" :start 10 :end 20} {:chr "chr2" :start 30 :end 40}]
[{:chr "chr1" :start 10 :end 20} {:chr "chr2" :start 30 :end 40}]

[{:chr "chr1" :start 10 :end 40} {:chr "chr1" :start 10 :end 50}]
[{:chr "chr1" :start 10 :end 40} {:chr "chr1" :start 30 :end 50}]
[{:chr "chr1" :start 10 :end 40} {:chr "chr1" :start 10 :end 50}]))

(deftest bed-subtract
(are [?xs ?ys ?result] (= ?result (bed/subtract-fields ?xs ?ys))
[]
[]
[]

[{:chr "chr1" :start 10 :end 40}]
[]
[{:chr "chr1" :start 10 :end 40}]

[]
[{:chr "chr1" :start 10 :end 40}]
[]

[{:chr "chr1" :start 10 :end 40}]
[{:chr "chr1" :start 50 :end 60}]
[{:chr "chr1" :start 10 :end 40}]

[{:chr "chr1" :start 10 :end 40} {:chr "chr1" :start 50 :end 80}]
[{:chr "chr1" :start 5 :end 20}]
[{:chr "chr1" :start 21 :end 40} {:chr "chr1" :start 50 :end 80}]

[{:chr "chr1" :start 10 :end 40} {:chr "chr1" :start 50 :end 80}]
[{:chr "chr1" :start 20 :end 30}]
[{:chr "chr1" :start 10 :end 19} {:chr "chr1" :start 31 :end 40} {:chr "chr1" :start 50 :end 80}]

[{:chr "chr1" :start 10 :end 40} {:chr "chr1" :start 50 :end 80}]
[{:chr "chr1" :start 30 :end 60} {:chr "chr1" :start 70 :end 90}]
[{:chr "chr1" :start 10 :end 29} {:chr "chr1" :start 61 :end 69}]

[{:chr "chr1" :start 10 :end 40} {:chr "chr1" :start 50 :end 80} {:chr "chr2" :start 10 :end 40}]
[{:chr "chr1" :start 10 :end 20} {:chr "chr2" :start 5 :end 30}]
[{:chr "chr1" :start 21 :end 40} {:chr "chr1" :start 50 :end 80} {:chr "chr2" :start 31 :end 40}]

[{:chr "chr1" :start 10 :end 40} {:chr "chr1" :start 10 :end 50}]
[{:chr "chr1" :start 10 :end 40} {:chr "chr1" :start 30 :end 50}]
[]))

(deftest bed-complement
(are [?xs ?result]
(= ?result
(bed/complement-fields [{:name "chr1" :len 1000}
{:name "chr2" :len 800}]
?xs))
[]
[{:chr "chr1" :start 1 :end 1000} {:chr "chr2" :start 1 :end 800}]

[{:chr "chr1" :start 1 :end 300}]
[{:chr "chr1" :start 301 :end 1000} {:chr "chr2" :start 1 :end 800}]

[{:chr "chr1" :start 1 :end 300} {:chr "chr1" :start 900 :end 1000} {:chr "chr2" :start 1 :end 300}]
[{:chr "chr1" :start 301 :end 899} {:chr "chr2" :start 301 :end 800}]

[{:chr "chr1" :start 1 :end 300} {:chr "chr1" :start 200 :end 500}]
[{:chr "chr1" :start 501 :end 1000} {:chr "chr2" :start 1 :end 800}])

(is (thrown? IllegalArgumentException
(bed/complement-fields [{:name "chr1" :len 1000}]
[{:chr "chr2" :start 1 :end 100}]))))

(deftest bed-reader-and-bam-reader
(with-open [bam (sam/bam-reader test-sorted-bam-file)]
(letfn [(ref-pos-end [m] {:rname (:rname m) :pos (:pos m) :end (sam-util/get-end m)})
Expand Down