From 717931166a5ea6e0931f85cb3efc195982ca7b91 Mon Sep 17 00:00:00 2001 From: hooy <56918789+hooyantsing@users.noreply.github.com> Date: Sat, 11 Feb 2023 02:14:11 +0800 Subject: [PATCH] =?UTF-8?q?=E5=90=91=20webmagic-saxon=20=E7=BB=84=E4=BB=B6?= =?UTF-8?q?=E6=8F=90=E4=BE=9B=E8=8B=A5=E5=B9=B2=E6=96=B0=20API=EF=BC=8C?= =?UTF-8?q?=E6=9B=B4=E4=BC=98=E9=9B=85=E6=9B=B4=E7=81=B5=E6=B4=BB=E6=9B=B4?= =?UTF-8?q?=E5=BC=BA=E5=A4=A7=20(#1108)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Feature: * webmagic-saxon 组件新增若干新 API; * Update: 更优雅的写代码。 * Update: JaxpSelectorUtils 工具类增加 final 关键字。 --- .../webmagic/selector/JaxpSelectorUtils.java | 61 +++++++ .../webmagic/selector/NodeSelector.java | 32 ++++ .../webmagic/selector/Xpath2Selector.java | 155 ++++++++++-------- .../webmagic/selector/XpathSelectorTest.java | 57 +++++-- 4 files changed, 216 insertions(+), 89 deletions(-) create mode 100644 webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java create mode 100644 webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java new file mode 100644 index 000000000..b03f3a2ab --- /dev/null +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.selector; + +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * @author hooy + */ +public final class JaxpSelectorUtils { + + private JaxpSelectorUtils() { + throw new RuntimeException("The util class cannot be instanced"); + } + + public static List NodeListToArrayList(NodeList nodes) { + List list = new ArrayList<>(nodes.getLength()); + for (int i = 0; i < nodes.getLength(); i++) { + list.add(nodes.item(i)); + } + return list; + } + + public static String nodeToString(Node node) throws TransformerException { + List before = Collections.singletonList(node); + List after = nodesToStrings(before); + if (after.size() > 0) { + return after.get(0); + } else { + return null; + } + } + + public static List nodesToStrings(List nodes) throws TransformerException { + List results = new ArrayList<>(nodes.size()); + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + StreamResult xmlOutput = new StreamResult(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + for (Node node : nodes) { + if (node.getNodeType() == Node.ATTRIBUTE_NODE || node.getNodeType() == Node.TEXT_NODE) { + results.add(node.getTextContent()); + } else { + xmlOutput.setWriter(new StringWriter()); + transformer.transform(new DOMSource(node), xmlOutput); + results.add(xmlOutput.getWriter().toString()); + } + } + return results; + } + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java new file mode 100644 index 000000000..3e6339dda --- /dev/null +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.selector; + +import org.w3c.dom.Node; + +import java.util.List; + +/** + * Selector(extractor) for html node.
+ * + * @author hooy
+ * @since 0.8.0 + */ +public interface NodeSelector { + + /** + * Extract single result in text.
+ * If there are more than one result, only the first will be chosen. + * + * @param node node + * @return result + */ + String select(Node node); + + /** + * Extract all results in text.
+ * + * @param node node + * @return results + */ + List selectList(Node node); + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index b63213b62..6c5d7b332 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -1,19 +1,10 @@ package us.codecraft.webmagic.selector; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; import javax.xml.parsers.ParserConfigurationException; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; @@ -32,20 +23,22 @@ import net.sf.saxon.xpath.XPathEvaluator; import us.codecraft.webmagic.utils.BaseSelectorUtils; +import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*; + /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
* - * @author code4crafter@gmail.com
+ * @author code4crafter@gmail.com, hooy
* Date: 13-4-21 * Time: 上午9:39 */ -public class Xpath2Selector implements Selector { +public class Xpath2Selector implements Selector, NodeSelector { - private String xpathStr; + private final String xpathStr; private XPathExpression xPathExpression; - private Logger logger = LoggerFactory.getLogger(getClass()); + private final Logger logger = LoggerFactory.getLogger(getClass()); public Xpath2Selector(String xpathStr) { this.xpathStr = xpathStr; @@ -56,25 +49,25 @@ public Xpath2Selector(String xpathStr) { } } + public static Xpath2Selector newInstance(String xpathStr) { + return new Xpath2Selector(xpathStr); + } + enum XPath2NamespaceContext implements NamespaceContext { INSTANCE; - private final Map prefix2NamespaceMap = new ConcurrentHashMap(); + private final Map prefix2NamespaceMap = new ConcurrentHashMap<>(); - private final Map> namespace2PrefixMap = new ConcurrentHashMap>(); + private final Map> namespace2PrefixMap = new ConcurrentHashMap<>(); private void put(String prefix, String namespaceURI) { prefix2NamespaceMap.put(prefix, namespaceURI); - List prefixes = namespace2PrefixMap.get(namespaceURI); - if (prefixes == null) { - prefixes = new ArrayList(); - namespace2PrefixMap.put(namespaceURI, prefixes); - } + List prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>()); prefixes.add(prefix); } - private XPath2NamespaceContext() { + XPath2NamespaceContext() { put("fn", NamespaceConstant.FN); put("xslt", NamespaceConstant.XSLT); put("xhtml", NamespaceConstant.XHTML); @@ -113,29 +106,18 @@ private void init() throws XPathExpressionException { @Override public String select(String text) { try { - Object result; - try { - result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); - } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); - } - if (result instanceof NodeList) { - NodeList nodeList = (NodeList) result; - if (nodeList.getLength() == 0) { - return null; - } - Node item = nodeList.item(0); - if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { - return item.getTextContent(); - } else { - StreamResult xmlOutput = new StreamResult(new StringWriter()); - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - transformer.transform(new DOMSource(item), xmlOutput); - return xmlOutput.getWriter().toString(); - } - } - return result.toString(); + Document doc = parse(text); + return select(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + @Override + public String select(Node node) { + try { + return (String) xPathExpression.evaluate(node, XPathConstants.STRING); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } @@ -144,43 +126,72 @@ public String select(String text) { @Override public List selectList(String text) { - List results = new ArrayList(); try { - Object result; - try { - result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); - } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); - } - if (result instanceof NodeList) { - NodeList nodeList = (NodeList) result; - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - StreamResult xmlOutput = new StreamResult(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - for (int i = 0; i < nodeList.getLength(); i++) { - Node item = nodeList.item(i); - if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { - results.add(item.getTextContent()); - } else { - xmlOutput.setWriter(new StringWriter()); - transformer.transform(new DOMSource(item), xmlOutput); - results.add(xmlOutput.getWriter().toString()); - } - } - } else { - results.add(result.toString()); - } + Document doc = parse(text); + return selectList(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + @Override + public List selectList(Node node) { + try { + NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); + List nodes = NodeListToArrayList(result); + return nodesToStrings(nodes); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } - return results; + return null; } - private Document parse(String text) throws ParserConfigurationException { + public Node selectNode(String text) { + try { + Document doc = parse(text); + return selectNode(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public Node selectNode(Node node) { + try { + return (Node) xPathExpression.evaluate(node, XPathConstants.NODE); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public List selectNodes(String text) { + try { + Document doc = parse(text); + return selectNodes(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public List selectNodes(Node node) { + try { + NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); + return NodeListToArrayList(result); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + protected static Document parse(String text) throws ParserConfigurationException { // HtmlCleaner could not parse or tag directly text = BaseSelectorUtils.preParse(text); HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); return new DomSerializer(new CleanerProperties()).createDOM(tagNode); } + } diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 8ac721934..4033fcfbd 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -11,12 +11,15 @@ import org.junit.Ignore; import org.junit.Test; +import org.w3c.dom.Node; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; +import javax.xml.transform.TransformerException; + /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06 */ @@ -1388,23 +1391,6 @@ public void testXpath2Selector() { Assert.assertEquals("http://www.oschina.net/", selectList.get(0)); } - @Ignore("test parse
tag") - @Test - public void htmlCleanerParseTest() { - Spider.create(new RuoxiaPageProcessor()).addUrl("http://www.ruoxia.com/top/dianji/month").thread(1).run(); - } - - class RuoxiaPageProcessor implements PageProcessor { - @Override - public void process(Page page) { - List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(page.getRawText()); - for (String item : items) { - String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(item); - System.out.println(name); - } - } - } - @Ignore("take long time") @Test public void performanceTest() { @@ -1496,4 +1482,41 @@ public void parserPerformanceTest() throws XPatherException { } + /** + * New api test + * + * @author hooy + * @since 8.0 + */ + private String rank = "

点击榜

排名分类书名/最新章节作者推荐更新时间
1.现实
0
11-24 22:32
2.架空
1047
03-04 14:44
3.现实
0
07-20 09:06
4.豪门
0
12-03 09:12
5.现实
0
02-01 21:12
6.玄奇
3455
02-28 12:31
7.玄奇
20614
03-31 12:37
8.复仇
55
06-03 11:43
9.穿越
0
10-27 18:50
10.宫斗
320
10-31 13:58
11.宫斗
6268
07-12 20:23
12.现实
0
01-18 23:00
13.婚恋
0
12-14 20:50
14.修真
0
02-03 23:40
15.豪门
0
11-06 23:38
16.穿越
191
12-02 23:37
17.穿越
412
10-13 22:39
18.豪门
635
07-01 13:15
19.架空
144
06-18 09:35
20.宅斗
1032
08-15 19:03
21.宫斗
0
09-30 20:32
22.豪门
0
06-05 11:31
23.重生
80
11-25 19:56
24.异世
68
01-12 10:06
25.豪门
0
05-29 18:46
26.婚恋
2778
11-04 17:48
27.玄奇
207
12-06 16:57
28.穿越
260
01-04 23:26
29.豪门
0
12-07 21:39
30.架空
1127
06-06 17:28
31.穿越
113
09-13 09:06
32.架空
597
02-14 18:47
33.玄奇
528
06-04 22:04
34.穿越
328
06-06 22:09
35.架空
539
05-24 14:42
36.架空
0
03-05 23:27
37.穿越
3215
08-21 16:38
38.宫斗
905
08-04 20:24
39.玄奇
1328
07-25 10:58
40.穿越
203
01-27 20:53
41.宫斗
407
08-31 09:03
42.宅斗
16
05-03 17:38
43.豪门
0
11-10 08:00
44.婚恋
0
07-12 21:37
45.架空
0
06-23 21:02
46.玄奇
1382
05-31 20:36
47.重生
334
07-16 19:19
48.婚恋
505
11-01 16:42
49.婚恋
0
10-19 18:32
50.豪门
540
09-19 19:18
51.婚恋
226
03-18 13:09
52.穿越
1026
03-08 16:28
53.重生
304
02-19 10:25
54.玄奇
2617
02-15 20:57
55.穿越
199
09-04 19:43
56.同人
768
07-19 20:00
57.宅斗
0
02-13 18:13
58.豪门
0
11-12 22:23
59.架空
0
07-28 23:42
60.婚恋
0
02-03 23:09
61.豪门
285
01-07 19:21
62.重生
654
10-12 18:16
63.异能
617
06-18 20:23
64.宫斗
27
06-02 21:05
65.种田
206
08-31 19:23
66.宅斗
2444
08-19 15:51
67.宅斗
818
08-07 23:38
68.现代
0
12-23 17:02
69.玄奇
0
07-23 12:00
70.婚恋
0
11-01 16:43
71.豪门
0
09-12 00:01
72.架空
0
04-27 22:42
73.豪门
0
04-19 13:55
74.异能
62
07-30 00:00
75.穿越
1307
07-20 16:41
76.玄奇
12820
07-15 23:46
77.架空
828
06-06 17:54
78.宅斗
985
05-20 23:53
79.玄奇
4960
04-12 15:58
80.玄奇
245
03-02 23:11
81.宅斗
34
12-21 10:11
82.宅斗
1411
07-21 00:00
83.现代
0
07-31 10:10
84.玄奇
0
06-18 13:53
85.架空
0
12-03 23:41
86.玄奇
0
11-28 22:13
87.豪门
0
11-07 22:48
88.婚恋
0
08-29 23:15
89.种田
1831
08-21 16:38
90.豪门
0
07-11 21:25
91.豪门
0
06-13 15:37
92.豪门
0
05-07 22:10
93.豪门
0
02-28 00:01
94.豪门
304
12-16 07:30
95.婚恋
669
11-07 18:16
96.仙侠
54
09-25 19:51
97.豪门
655
08-31 13:02
98.现实
374
06-29 09:55
99.穿越
373
06-19 18:07
100.婚恋
159
06-04 21:05
"; + + @Test + public void testStringAPI() { + // testAPI: selectList(String) -> selectList(Node) + List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(rank); + Assert.assertSame(100, items.size()); + // testAPI: select(String) -> select(Node) + String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(items.get(10)); + Assert.assertEquals("深宫安容传", name); + } + + @Test + public void testNodeAPI() { + // testAPI: selectNodes(String) -> selectNodes(Node) + List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectNodes(rank); + Assert.assertSame(100, items.size()); + // testAPI: selectNode(Node) + Node item = new Xpath2Selector("./td[3]/div/a[1]").selectNode(items.get(10)); + String name = new Xpath2Selector("./text()").select(item); + Assert.assertEquals("深宫安容传", name); + } + + @Test + public void testUtilAPI() throws TransformerException { + Node item = Xpath2Selector.newInstance("//div[@class=\"bd\"]//tbody/tr[11]/td[3]/div/a[1]/text()").selectNode(rank); + // testAPI: nodeToString(Node) -> nodesToStrings(List) + String name = JaxpSelectorUtils.nodeToString(item); + Assert.assertEquals("深宫安容传", name); + } + }