diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
index ff8e26998..5ba92ce31 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
@@ -51,22 +51,15 @@ public String select(String html) {
         text.setLength(0);
 
         for (int i = 0; i < indexDistribution.size() - 1; i++) {
-            if (indexDistribution.get(i) > threshold && ! boolstart) {
-                if (indexDistribution.get(i+1).intValue() != 0
-                        || indexDistribution.get(i+2).intValue() != 0
-                        || indexDistribution.get(i+3).intValue() != 0) {
-                    boolstart = true;
-                    start = i;
-                    continue;
-                }
-            }
-            if (boolstart) {
-                if (indexDistribution.get(i).intValue() == 0
-                        || indexDistribution.get(i+1).intValue() == 0) {
-                    end = i;
-                    boolend = true;
-                }
-            }
+            if (indexDistribution.get(i) > threshold && !boolstart
+                    && !areAllIndexDistributionsZero(indexDistribution, i + 1, i + 2, i + 3)) {
+                boolstart = true;
+                start = i;
+                continue;
+            }
+            if (boolstart && isAnyIndexDistributionZero(indexDistribution, i, i + 1)) {
+                end = i;
+                boolend = true;
+            }
             StringBuilder tmp = new StringBuilder();
             if (boolend) {
                 //System.out.println(start+1 + "\t\t" + end+1);
@@ -89,3 +82,41 @@ public List selectList(String text) {
         throw new UnsupportedOperationException();
     }
+
+    /**
+     * Returns whether every listed position of the distribution is zero.
+     *
+     * <p>Positions are read in the given order and evaluation stops at the first
+     * non-zero value, mirroring a short-circuit {@code &&} chain.
+     *
+     * @param indexDistribution values to inspect
+     * @param indices positions to read, in evaluation order
+     * @return {@code true} if every inspected position holds zero
+     */
+    private static boolean areAllIndexDistributionsZero(List<Integer> indexDistribution, int... indices) {
+        for (int index : indices) {
+            if (indexDistribution.get(index).intValue() != 0) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Returns whether at least one listed position of the distribution is zero.
+     *
+     * <p>Positions are read in the given order and evaluation stops at the first
+     * zero value, mirroring a short-circuit {@code ||} chain.
+     *
+     * @param indexDistribution values to inspect
+     * @param indices positions to read, in evaluation order
+     * @return {@code true} if any inspected position holds zero
+     */
+    private static boolean isAnyIndexDistributionZero(List<Integer> indexDistribution, int... indices) {
+        for (int index : indices) {
+            if (indexDistribution.get(index).intValue() == 0) {
+                return true;
+            }
+        }
+        return false;
+    }
 }