Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
向 webmagic-saxon 组件提供若干新 API,更优雅更灵活更强大 (#1108)
* Feature: * webmagic-saxon 组件新增若干新 API; * Update: 更优雅的写代码。 * Update: JaxpSelectorUtils 工具类增加 final 关键字。
- Loading branch information
1 parent
f47038d
commit 7179311
Showing
4 changed files
with
216 additions
and
89 deletions.
There are no files selected for viewing
61 changes: 61 additions & 0 deletions
61
webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
package us.codecraft.webmagic.selector; | ||
|
||
import org.w3c.dom.Node; | ||
import org.w3c.dom.NodeList; | ||
|
||
import javax.xml.transform.OutputKeys; | ||
import javax.xml.transform.Transformer; | ||
import javax.xml.transform.TransformerException; | ||
import javax.xml.transform.TransformerFactory; | ||
import javax.xml.transform.dom.DOMSource; | ||
import javax.xml.transform.stream.StreamResult; | ||
import java.io.StringWriter; | ||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.List; | ||
|
||
/** | ||
* @author hooy | ||
*/ | ||
public final class JaxpSelectorUtils { | ||
|
||
private JaxpSelectorUtils() { | ||
throw new RuntimeException("The util class cannot be instanced"); | ||
} | ||
|
||
public static List<Node> NodeListToArrayList(NodeList nodes) { | ||
List<Node> list = new ArrayList<>(nodes.getLength()); | ||
for (int i = 0; i < nodes.getLength(); i++) { | ||
list.add(nodes.item(i)); | ||
} | ||
return list; | ||
} | ||
|
||
public static String nodeToString(Node node) throws TransformerException { | ||
List<Node> before = Collections.singletonList(node); | ||
List<String> after = nodesToStrings(before); | ||
if (after.size() > 0) { | ||
return after.get(0); | ||
} else { | ||
return null; | ||
} | ||
} | ||
|
||
public static List<String> nodesToStrings(List<Node> nodes) throws TransformerException { | ||
List<String> results = new ArrayList<>(nodes.size()); | ||
Transformer transformer = TransformerFactory.newInstance().newTransformer(); | ||
StreamResult xmlOutput = new StreamResult(); | ||
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); | ||
for (Node node : nodes) { | ||
if (node.getNodeType() == Node.ATTRIBUTE_NODE || node.getNodeType() == Node.TEXT_NODE) { | ||
results.add(node.getTextContent()); | ||
} else { | ||
xmlOutput.setWriter(new StringWriter()); | ||
transformer.transform(new DOMSource(node), xmlOutput); | ||
results.add(xmlOutput.getWriter().toString()); | ||
} | ||
} | ||
return results; | ||
} | ||
|
||
} |
32 changes: 32 additions & 0 deletions
32
webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
package us.codecraft.webmagic.selector; | ||
|
||
import org.w3c.dom.Node; | ||
|
||
import java.util.List; | ||
|
||
/** | ||
* Selector (extractor) that produces text results from an HTML node.<br> | ||
* Implementations receive a DOM {@link Node} and return the extracted text, | ||
* either as a single value or as a list of values. | ||
* | ||
* @author hooy <br> | ||
* @since 0.8.0 | ||
*/ | ||
public interface NodeSelector { | ||
|
||
/** | ||
* Extracts a single result as text.<br> | ||
* If there is more than one result, only the first is returned. | ||
* NOTE(review): the result when nothing matches is not specified here; confirm against implementations. | ||
* | ||
* @param node the node to extract from | ||
* @return the first extracted result as text | ||
*/ | ||
String select(Node node); | ||
|
||
/** | ||
* Extracts all results as text.<br> | ||
* | ||
* @param node the node to extract from | ||
* @return every extracted result as text | ||
*/ | ||
List<String> selectList(Node node); | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
57 changes: 40 additions & 17 deletions
57
webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
Large diffs are not rendered by default.
Oops, something went wrong.