From b579e88ab7d8dfbd4faf4ad95635a3e7ae9e9ded Mon Sep 17 00:00:00 2001 From: ctSkennerton Date: Sun, 12 Jul 2015 16:22:01 -0700 Subject: [PATCH] Added back in the interval search tree, needs testing --- CRISPRFinder.java | 9 ++- DNASequence.java | 12 +++- Interval.java | 79 ----------------------- IntervalNode.java | 166 ----------------------------------------------- IntervalSearchTree.java | 111 ++++++++++++++++++++++++++++++++ IntervalTree.java | 167 ------------------------------------------------ Makefile | 2 +- minced.java | 32 +++++----- 8 files changed, 146 insertions(+), 432 deletions(-) delete mode 100644 Interval.java delete mode 100644 IntervalNode.java create mode 100644 IntervalSearchTree.java delete mode 100644 IntervalTree.java diff --git a/CRISPRFinder.java b/CRISPRFinder.java index 3b47a0a..4e08ed7 100644 --- a/CRISPRFinder.java +++ b/CRISPRFinder.java @@ -128,7 +128,7 @@ public boolean goCRISPRFinder() continue; } */ - + sequence.mask(100); try { @@ -194,6 +194,13 @@ private boolean findRepeats( DNASequence sequence, int readNum ) if (endSearch < beginSearch) //should never occur endSearch = beginSearch; + int mask_end = sequence.masked(beginSearch, endSearch); + if(mask_end != -1) + { + j = mask_end; + continue; + } + String text = sequence.substring(beginSearch, endSearch); pattern = sequence.substring(j, j + searchWindowLength); diff --git a/DNASequence.java b/DNASequence.java index c994f21..4f84441 100644 --- a/DNASequence.java +++ b/DNASequence.java @@ -9,7 +9,7 @@ private String desc; private String errorLog = ""; private Pattern notDnaRe = Pattern.compile("([^ACGT])"); - private IntervalTree mask = new IntervalTree(); + private IntervalSearchTree mask = new IntervalSearchTree(); public DNASequence(String sequence) { @@ -359,9 +359,17 @@ public void mask(int minimum) } if(n >= minimum) { - mask.addInterval(i, j, 0); + // add in the interval and save the end location as the data + mask.add(i, j); } + i = j; + } } + public int masked(int 
start, int end) + { + return mask.overlapEnd(start, end); + } + } diff --git a/Interval.java b/Interval.java deleted file mode 100644 index 0d23d26..0000000 --- a/Interval.java +++ /dev/null @@ -1,79 +0,0 @@ - -/** - * The Interval class maintains an interval with some associated data - * @author Kevin Dolan - * - * @param The type of data being stored - */ -public class Interval implements Comparable> { - - private long start; - private long end; - private Type data; - - public Interval(long start, long end, Type data) { - this.start = start; - this.end = end; - this.data = data; - } - - public long getStart() { - return start; - } - - public void setStart(long start) { - this.start = start; - } - - public long getEnd() { - return end; - } - - public void setEnd(long end) { - this.end = end; - } - - public Type getData() { - return data; - } - - public void setData(Type data) { - this.data = data; - } - - /** - * @param time - * @return true if this interval contains time (invlusive) - */ - public boolean contains(long time) { - return time < end && time > start; - } - - /** - * @param other - * @return return true if this interval intersects other - */ - public boolean intersects(Interval other) { - return other.getEnd() > start && other.getStart() < end; - } - - /** - * Return -1 if this interval's start time is less than the other, 1 if greater - * In the event of a tie, -1 if this interval's end time is less than the other, 1 if greater, 0 if same - * @param other - * @return 1 or -1 - */ - public int compareTo(Interval other) { - if(start < other.getStart()) - return -1; - else if(start > other.getStart()) - return 1; - else if(end < other.getEnd()) - return -1; - else if(end > other.getEnd()) - return 1; - else - return 0; - } - -} diff --git a/IntervalNode.java b/IntervalNode.java deleted file mode 100644 index 7c4d72c..0000000 --- a/IntervalNode.java +++ /dev/null @@ -1,166 +0,0 @@ - -import java.util.ArrayList; -import java.util.List; -import 
java.util.SortedMap; -import java.util.SortedSet; -import java.util.TreeMap; -import java.util.TreeSet; -import java.util.Map.Entry; - -/** - * The Node class contains the interval tree information for one single node - * - * @author Kevin Dolan - */ -public class IntervalNode { - - private SortedMap, List>> intervals; - private long center; - private IntervalNode leftNode; - private IntervalNode rightNode; - - public IntervalNode() { - intervals = new TreeMap, List>>(); - center = 0; - leftNode = null; - rightNode = null; - } - - public IntervalNode(List> intervalList) { - - intervals = new TreeMap, List>>(); - - SortedSet endpoints = new TreeSet(); - - for(Interval interval: intervalList) { - endpoints.add(interval.getStart()); - endpoints.add(interval.getEnd()); - } - - long median = getMedian(endpoints); - center = median; - - List> left = new ArrayList>(); - List> right = new ArrayList>(); - - for(Interval interval : intervalList) { - if(interval.getEnd() < median) - left.add(interval); - else if(interval.getStart() > median) - right.add(interval); - else { - List> posting = intervals.get(interval); - if(posting == null) { - posting = new ArrayList>(); - intervals.put(interval, posting); - } - posting.add(interval); - } - } - - if(left.size() > 0) - leftNode = new IntervalNode(left); - if(right.size() > 0) - rightNode = new IntervalNode(right); - } - - /** - * Perform a stabbing query on the node - * @param time the time to query at - * @return all intervals containing time - */ - public List> stab(long time) { - List> result = new ArrayList>(); - - for(Entry, List>> entry : intervals.entrySet()) { - if(entry.getKey().contains(time)) - for(Interval interval : entry.getValue()) - result.add(interval); - else if(entry.getKey().getStart() > time) - break; - } - - if(time < center && leftNode != null) - result.addAll(leftNode.stab(time)); - else if(time > center && rightNode != null) - result.addAll(rightNode.stab(time)); - return result; - } - - /** - * Perform 
an interval intersection query on the node - * @param target the interval to intersect - * @return all intervals containing time - */ - public List> query(Interval target) { - List> result = new ArrayList>(); - - for(Entry, List>> entry : intervals.entrySet()) { - if(entry.getKey().intersects(target)) - for(Interval interval : entry.getValue()) - result.add(interval); - else if(entry.getKey().getStart() > target.getEnd()) - break; - } - - if(target.getStart() < center && leftNode != null) - result.addAll(leftNode.query(target)); - if(target.getEnd() > center && rightNode != null) - result.addAll(rightNode.query(target)); - return result; - } - - public long getCenter() { - return center; - } - - public void setCenter(long center) { - this.center = center; - } - - public IntervalNode getLeft() { - return leftNode; - } - - public void setLeft(IntervalNode left) { - this.leftNode = left; - } - - public IntervalNode getRight() { - return rightNode; - } - - public void setRight(IntervalNode right) { - this.rightNode = right; - } - - /** - * @param set the set to look on - * @return the median of the set, not interpolated - */ - private Long getMedian(SortedSet set) { - int i = 0; - int middle = set.size() / 2; - for(Long point : set) { - if(i == middle) - return point; - i++; - } - return null; - } - - @Override - public String toString() { - StringBuffer sb = new StringBuffer(); - sb.append(center + ": "); - for(Entry, List>> entry : intervals.entrySet()) { - sb.append("[" + entry.getKey().getStart() + "," + entry.getKey().getEnd() + "]:{"); - for(Interval interval : entry.getValue()) { - sb.append("("+interval.getStart()+","+interval.getEnd()+","+interval.getData()+")"); - } - sb.append("} "); - } - return sb.toString(); - } - -} diff --git a/IntervalSearchTree.java b/IntervalSearchTree.java new file mode 100644 index 0000000..f0d1e49 --- /dev/null +++ b/IntervalSearchTree.java @@ -0,0 +1,111 @@ +public class IntervalSearchTree { + + private IntervalNode root; + + 
private class IntervalNode { + IntervalNode left; + int start; + int end; + int maxEnd; + IntervalNode right; + + public IntervalNode(IntervalNode left, int start, int end, int maxEnd, IntervalNode right) { + this.left = left; + this.start = start; + this.end = end; + this.maxEnd = maxEnd; + this.right = right; + } + } + + /** + * Adds an interval to the tree + * + * @param start the start of interval + * @param end the end of the interval. + */ + public void add (int start, int end) { + if (start >= end) throw new IllegalArgumentException("The end " + end + " should be greater than start " + start); + + IntervalNode inode = root; + while (inode != null) { + inode.maxEnd = (end > inode.maxEnd) ? end : inode.maxEnd; + if (start < inode.start) { + if (inode.left == null) { + inode.left = new IntervalNode(null, start, end, end, null); + return; + } + inode = inode.left; + } else { + if (inode.right == null) { + inode.right = new IntervalNode(null, start, end, end, null); + return; + } + inode = inode.right; + } + } + root = new IntervalNode(null, start, end, end, null); + } + + /** + * Tests if the input interval overlaps with the existing intervals. + * + * Rules: + * 1. If interval intersects return true. obvious. + * 2. if (leftsubtree == null || leftsubtree.max <= low) go right + * 3. else go left + * + * @param start the start of the interval + * @param end the end of the interval + * @return true if overlap, else false. 
+ */ + public boolean overlap(int start, int end) { + if (start >= end) throw new IllegalArgumentException("The end " + end + " should be greater than start " + start); + + IntervalNode intervalNode = root; + + while (intervalNode != null) { + if (intersection(start, end, intervalNode.start, intervalNode.end)) return true; + + if (goLeft(start, end, intervalNode.left)) { + intervalNode = intervalNode.left; + } else { + intervalNode = intervalNode.right; + } + } + return false; + } + + public int overlapEnd(int start, int end) { + if (start >= end) throw new IllegalArgumentException("The end " + end + " should be greater than start " + start); + + IntervalNode intervalNode = root; + if (root == null || start >= root.maxEnd || end <= root.start) + return -1; + + while (intervalNode != null) { + if (intersection(start, end, intervalNode.start, intervalNode.end)) + return intervalNode.end; + + if (goLeft(start, end, intervalNode.left)) { + intervalNode = intervalNode.left; + } else { + intervalNode = intervalNode.right; + } + } + return -1; + } + + /** + * Returns if there is an intersection in the two intervals + * Two intervals such that one of the points coincide: + * eg: [10, 20] and [20, 40] are NOT considered as intersecting. 
+ */ + private boolean intersection (int start, int end, int intervalStart, int intervalEnd) { + return start < intervalEnd && end > intervalStart; + } + + private boolean goLeft(int start, int end, IntervalNode intervalLeftSubtree) { + return intervalLeftSubtree != null && intervalLeftSubtree.maxEnd > start; + } +} diff --git a/IntervalTree.java b/IntervalTree.java deleted file mode 100644 index bf36797..0000000 --- a/IntervalTree.java +++ /dev/null @@ -1,167 +0,0 @@ - -import java.util.ArrayList; -import java.util.List; - -/** - * An Interval Tree is essentially a map from intervals to objects, which - * can be queried for all data associated with a particular interval of - * time - * @author Kevin Dolan - * - * @param the type of objects to associate - */ -public class IntervalTree { - - private IntervalNode head; - private List> intervalList; - private boolean inSync; - private int size; - - /** - * Instantiate a new interval tree with no intervals - */ - public IntervalTree() { - this.head = new IntervalNode(); - this.intervalList = new ArrayList>(); - this.inSync = true; - this.size = 0; - } - - /** - * Instantiate and build an interval tree with a preset list of intervals - * @param intervalList the list of intervals to use - */ - public IntervalTree(List> intervalList) { - this.head = new IntervalNode(intervalList); - this.intervalList = new ArrayList>(); - this.intervalList.addAll(intervalList); - this.inSync = true; - this.size = intervalList.size(); - } - - /** - * Perform a stabbing query, returning the associated data - * Will rebuild the tree if out of sync - * @param time the time to stab - * @return the data associated with all intervals that contain time - */ - public List get(long time) { - List> intervals = getIntervals(time); - List result = new ArrayList(); - for(Interval interval : intervals) - result.add(interval.getData()); - return result; - } - - /** - * Perform a stabbing query, returning the interval objects - * Will rebuild the tree if 
out of sync - * @param time the time to stab - * @return all intervals that contain time - */ - public List> getIntervals(long time) { - build(); - return head.stab(time); - } - - /** - * Perform an interval query, returning the associated data - * Will rebuild the tree if out of sync - * @param start the start of the interval to check - * @param end the end of the interval to check - * @return the data associated with all intervals that intersect target - */ - public List get(long start, long end) { - List> intervals = getIntervals(start, end); - List result = new ArrayList(); - for(Interval interval : intervals) - result.add(interval.getData()); - return result; - } - - /** - * Perform an interval query, returning the interval objects - * Will rebuild the tree if out of sync - * @param start the start of the interval to check - * @param end the end of the interval to check - * @return all intervals that intersect target - */ - public List> getIntervals(long start, long end) { - build(); - return head.query(new Interval(start, end, null)); - } - - /** - * Add an interval object to the interval tree's list - * Will not rebuild the tree until the next query or call to build - * @param interval the interval object to add - */ - public void addInterval(Interval interval) { - intervalList.add(interval); - inSync = false; - } - - /** - * Add an interval object to the interval tree's list - * Will not rebuild the tree until the next query or call to build - * @param begin the beginning of the interval - * @param end the end of the interval - * @param data the data to associate - */ - public void addInterval(long begin, long end, Type data) { - intervalList.add(new Interval(begin, end, data)); - inSync = false; - } - - /** - * Determine whether this interval tree is currently a reflection of all intervals in the interval list - * @return true if no changes have been made since the last build - */ - public boolean inSync() { - return inSync; - } - - /** - * Build the 
interval tree to reflect the list of intervals, - * Will not run if this is currently in sync - */ - public void build() { - if(!inSync) { - head = new IntervalNode(intervalList); - inSync = true; - size = intervalList.size(); - } - } - - /** - * @return the number of entries in the currently built interval tree - */ - public int currentSize() { - return size; - } - - /** - * @return the number of entries in the interval list, equal to .size() if inSync() - */ - public int listSize() { - return intervalList.size(); - } - - @Override - public String toString() { - return nodeString(head,0); - } - - private String nodeString(IntervalNode node, int level) { - if(node == null) - return ""; - - StringBuffer sb = new StringBuffer(); - for(int i = 0; i < level; i++) - sb.append("\t"); - sb.append(node + "\n"); - sb.append(nodeString(node.getLeft(), level + 1)); - sb.append(nodeString(node.getRight(), level + 1)); - return sb.toString(); - } -} diff --git a/Makefile b/Makefile index 8625647..600e297 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ JAR = jar .java.class: $(JC) $(JFLAGS) $*.java -CLASSES = CRISPR.java CRISPRFinder.java CRISPRUtil.java DNASequence.java FASTAReader.java SearchUtil.java minced.java Interval.java IntervalNode.java IntervalTree.java +CLASSES = CRISPR.java CRISPRFinder.java CRISPRUtil.java DNASequence.java FASTAReader.java SearchUtil.java minced.java IntervalSearchTree.java default: classes minced.jar diff --git a/minced.java b/minced.java index 56ecdbd..a8edb57 100644 --- a/minced.java +++ b/minced.java @@ -2,7 +2,7 @@ public class minced { - public static final String VERSION = "0.1.6"; + public static final String VERSION = "0.2.0"; public static void main(String[] args) { //default values @@ -19,7 +19,7 @@ public static void main(String[] args) // 1 = summary gff // 2 = full gff int outformat = 0; - + int numOptions = 0; if (args.length == 0) @@ -35,13 +35,13 @@ public static void main(String[] args) printUsage(); System.exit(1); } - + if 
(args[0].equals("--version")) { printVersion(); System.exit(1); } - + int i = 0; while (args[i].charAt(0) == '-') { @@ -118,7 +118,7 @@ else if (args[i].endsWith("maxSL")) } maxSpacerLength = Integer.parseInt(args[i]); numOptions += 2; - } + } else if (args[i].endsWith("searchWL")) { i++; @@ -173,14 +173,14 @@ else if (args[i].endsWith("screen")) System.exit(1); } } // end while - - + + // Last options should be an input file and optional output file String inputFileName = ""; String outputFileName = ""; boolean outputFileSpecified = false; int numArgsRemaining = args.length - numOptions; - + if (numArgsRemaining == 1) inputFileName = args[i]; else if (numArgsRemaining == 2) @@ -196,8 +196,8 @@ else if (numArgsRemaining == 2) System.out.println("Try 'minced --help' for more information."); System.exit(1); } - - + + File inputFile = new File(inputFileName); if (!inputFile.exists()) { @@ -212,18 +212,18 @@ else if (numArgsRemaining == 2) } - + File outputFile; if (outputFileSpecified) { outputFile = new File(outputFileName); - + if (outputFile.isDirectory()) { System.out.println("You have entered an existing directory name. An output file name is required: " + outputFile.getAbsolutePath()); System.exit(1); } - + if (!outputFile.getAbsoluteFile().getParentFile().isDirectory()) { System.out.println("You did not enter a valid file output path: " + outputFile.getAbsolutePath()); @@ -246,9 +246,9 @@ else if (numArgsRemaining == 2) client.goCRISPRFinder(); } - + public static void printUsage() - { + { System.out.println("MinCED, a program to find CRISPRs in shotgun DNA sequences or full genomes"); System.out.println(); System.out.println("Usage: minced [options] file.fa [outputFile]"); @@ -274,7 +274,7 @@ public static void printUsage() } public static void printVersion() - { + { System.out.println("minced " + VERSION); System.out.println("MinCED - Mining CRISPRs in Environmental Datasets (version " + VERSION + ")"); System.out.println("Copyright 2011 Florent ANGLY ");