Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

向 webmagic-saxon 组件提供若干新 API,更优雅更灵活更强大 #1108

Merged
merged 4 commits into from Feb 10, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
@@ -0,0 +1,61 @@
package us.codecraft.webmagic.selector;

import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Static helpers that convert JAXP DOM results ({@link Node} / {@link NodeList})
 * into Java lists and into their textual form.
 *
 * @author hooy
 */
public final class JaxpSelectorUtils {

    /**
     * Not instantiable; the class only hosts static helpers.
     */
    private JaxpSelectorUtils() {
        throw new UnsupportedOperationException("The util class cannot be instanced");
    }

    /**
     * Copies the items of a DOM {@link NodeList} into a new mutable list, keeping their order.
     *
     * @param nodes node list to copy; {@code null} is treated as an empty list
     * @return a new list holding every item of {@code nodes}, never {@code null}
     */
    public static List<Node> NodeListToArrayList(NodeList nodes) {
        if (nodes == null) {
            return new ArrayList<>();
        }
        int length = nodes.getLength();
        List<Node> list = new ArrayList<>(length);
        for (int i = 0; i < length; i++) {
            list.add(nodes.item(i));
        }
        return list;
    }

    /**
     * Converts a single node to text using the same rules as {@link #nodesToStrings(List)}.
     *
     * @param node node to convert, may be {@code null}
     * @return the text of the node, or {@code null} when {@code node} is {@code null}
     * @throws TransformerException if the node cannot be serialized
     */
    public static String nodeToString(Node node) throws TransformerException {
        if (node == null) {
            // A selector that matched nothing may hand us null; mirror that instead of failing with an NPE.
            return null;
        }
        return nodesToStrings(Collections.singletonList(node)).get(0);
    }

    /**
     * Converts each node to text. Attribute and text nodes yield their text content;
     * every other node is serialized as markup without an XML declaration.
     *
     * @param nodes nodes to convert; {@code null} is treated as an empty list
     * @return one string per input node, in input order; a {@code null} node yields a {@code null} entry
     * @throws TransformerException if the transformer cannot be created or a node cannot be serialized
     */
    public static List<String> nodesToStrings(List<Node> nodes) throws TransformerException {
        if (nodes == null || nodes.isEmpty()) {
            return new ArrayList<>();
        }
        List<String> results = new ArrayList<>(nodes.size());
        // Created on first use: factory lookup is wasted work when only attribute/text nodes are present.
        Transformer transformer = null;
        for (Node node : nodes) {
            if (node == null) {
                // Keep result positions aligned with input positions.
                results.add(null);
                continue;
            }
            short type = node.getNodeType();
            if (type == Node.ATTRIBUTE_NODE || type == Node.TEXT_NODE) {
                results.add(node.getTextContent());
                continue;
            }
            if (transformer == null) {
                transformer = newFragmentTransformer();
            }
            StringWriter writer = new StringWriter();
            transformer.transform(new DOMSource(node), new StreamResult(writer));
            results.add(writer.toString());
        }
        return results;
    }

    /**
     * Creates an identity transformer configured to omit the XML declaration.
     */
    private static Transformer newFragmentTransformer() throws TransformerException {
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        return transformer;
    }

}
@@ -0,0 +1,32 @@
package us.codecraft.webmagic.selector;

import org.w3c.dom.Node;

import java.util.List;

/**
* Selector (extractor) that works on an already parsed DOM {@link Node}
* and returns its results as text.<br>
*
* @author hooy <br>
* @since 0.8.0
*/
public interface NodeSelector {

/**
* Extracts a single result as text.<br>
* If there is more than one result, only the first is chosen.
*
* @param node DOM node the selection is evaluated against
* @return the selected text; NOTE(review): this interface does not define what is
* returned when nothing matches, so callers should check the implementation
*/
String select(Node node);

/**
* Extracts all results as text.<br>
*
* @param node DOM node the selection is evaluated against
* @return the selected texts; NOTE(review): the Xpath2Selector implementation returns
* {@code null} when evaluation throws, so callers should be prepared for {@code null}
*/
List<String> selectList(Node node);

}
@@ -1,19 +1,10 @@
package us.codecraft.webmagic.selector;

import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;

import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
Expand All @@ -32,20 +23,22 @@
import net.sf.saxon.xpath.XPathEvaluator;
import us.codecraft.webmagic.utils.BaseSelectorUtils;

import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*;

/**
* Selector supporting XPath 2.0. Wraps HtmlCleaner and Saxon HE.<br>
*
* @author code4crafter@gmail.com <br>
* @author code4crafter@gmail.com, hooy <br>
* Date: 13-4-21
* Time: 9:39 AM
*/
public class Xpath2Selector implements Selector {
public class Xpath2Selector implements Selector, NodeSelector {

private String xpathStr;
private final String xpathStr;

private XPathExpression xPathExpression;

private Logger logger = LoggerFactory.getLogger(getClass());
private final Logger logger = LoggerFactory.getLogger(getClass());

public Xpath2Selector(String xpathStr) {
this.xpathStr = xpathStr;
Expand All @@ -56,25 +49,25 @@ public Xpath2Selector(String xpathStr) {
}
}

/**
* Static factory equivalent to calling {@code new Xpath2Selector(xpathStr)}.
*
* @param xpathStr XPath expression passed unchanged to the constructor
* @return a new selector instance
*/
public static Xpath2Selector newInstance(String xpathStr) {
return new Xpath2Selector(xpathStr);
}

enum XPath2NamespaceContext implements NamespaceContext {

INSTANCE;

private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<String, String>();
private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<>();

private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<String, List<String>>();
private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<>();

private void put(String prefix, String namespaceURI) {
prefix2NamespaceMap.put(prefix, namespaceURI);
List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
if (prefixes == null) {
prefixes = new ArrayList<String>();
namespace2PrefixMap.put(namespaceURI, prefixes);
}
List<String> prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>());
prefixes.add(prefix);
}

private XPath2NamespaceContext() {
XPath2NamespaceContext() {
put("fn", NamespaceConstant.FN);
put("xslt", NamespaceConstant.XSLT);
put("xhtml", NamespaceConstant.XHTML);
Expand Down Expand Up @@ -113,29 +106,18 @@ private void init() throws XPathExpressionException {
@Override
public String select(String text) {
try {
Object result;
try {
result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET);
} catch (XPathExpressionException e) {
result = xPathExpression.evaluate(parse(text), XPathConstants.STRING);
}
if (result instanceof NodeList) {
NodeList nodeList = (NodeList) result;
if (nodeList.getLength() == 0) {
return null;
}
Node item = nodeList.item(0);
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
return item.getTextContent();
} else {
StreamResult xmlOutput = new StreamResult(new StringWriter());
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
transformer.transform(new DOMSource(item), xmlOutput);
return xmlOutput.getWriter().toString();
}
}
return result.toString();
Document doc = parse(text);
return select(doc);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
return null;
}

@Override
public String select(Node node) {
try {
return (String) xPathExpression.evaluate(node, XPathConstants.STRING);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
Expand All @@ -144,43 +126,72 @@ public String select(String text) {

@Override
public List<String> selectList(String text) {
List<String> results = new ArrayList<String>();
try {
Object result;
try {
result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET);
} catch (XPathExpressionException e) {
result = xPathExpression.evaluate(parse(text), XPathConstants.STRING);
}
if (result instanceof NodeList) {
NodeList nodeList = (NodeList) result;
Transformer transformer = TransformerFactory.newInstance().newTransformer();
StreamResult xmlOutput = new StreamResult();
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
for (int i = 0; i < nodeList.getLength(); i++) {
Node item = nodeList.item(i);
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
results.add(item.getTextContent());
} else {
xmlOutput.setWriter(new StringWriter());
transformer.transform(new DOMSource(item), xmlOutput);
results.add(xmlOutput.getWriter().toString());
}
}
} else {
results.add(result.toString());
}
Document doc = parse(text);
return selectList(doc);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
return null;
}

@Override
public List<String> selectList(Node node) {
try {
NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
List<Node> nodes = NodeListToArrayList(result);
return nodesToStrings(nodes);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
return results;
return null;
}

private Document parse(String text) throws ParserConfigurationException {
/**
 * Parses the given markup into a DOM document and selects a single node from it.
 *
 * @param text markup to parse
 * @return the result of {@link #selectNode(Node)} on the parsed document,
 * or {@code null} if parsing or selection throws (the exception is logged)
 */
public Node selectNode(String text) {
    Node selected = null;
    try {
        selected = selectNode(parse(text));
    } catch (Exception e) {
        logger.error("select text error! " + xpathStr, e);
    }
    return selected;
}

/**
 * Evaluates the compiled XPath expression against the given node, requesting a single node.
 *
 * @param node context node for the evaluation
 * @return the evaluation result cast to {@link Node},
 * or {@code null} if evaluation throws (the exception is logged)
 */
public Node selectNode(Node node) {
    Node selected = null;
    try {
        Object evaluated = xPathExpression.evaluate(node, XPathConstants.NODE);
        selected = (Node) evaluated;
    } catch (Exception e) {
        logger.error("select text error! " + xpathStr, e);
    }
    return selected;
}

/**
 * Parses the given markup into a DOM document and selects all matching nodes from it.
 *
 * @param text markup to parse
 * @return the result of {@link #selectNodes(Node)} on the parsed document,
 * or {@code null} if parsing or selection throws (the exception is logged)
 */
public List<Node> selectNodes(String text) {
    List<Node> selected = null;
    try {
        selected = selectNodes(parse(text));
    } catch (Exception e) {
        logger.error("select text error! " + xpathStr, e);
    }
    return selected;
}

/**
 * Evaluates the compiled XPath expression against the given node, requesting a node set,
 * and copies the result into a list.
 *
 * @param node context node for the evaluation
 * @return the matched nodes as a list,
 * or {@code null} if evaluation throws (the exception is logged)
 */
public List<Node> selectNodes(Node node) {
    List<Node> selected = null;
    try {
        Object evaluated = xPathExpression.evaluate(node, XPathConstants.NODESET);
        selected = NodeListToArrayList((NodeList) evaluated);
    } catch (Exception e) {
        logger.error("select text error! " + xpathStr, e);
    }
    return selected;
}

/**
 * Builds a DOM document from markup: the text is pre-processed with
 * {@code BaseSelectorUtils.preParse}, cleaned with HtmlCleaner and then
 * serialized to DOM.
 *
 * @param text markup to parse
 * @return the DOM document created from the cleaned tag tree
 * @throws ParserConfigurationException declared by the DOM serialization step
 */
protected static Document parse(String text) throws ParserConfigurationException {
    // Pre-processing is needed because HtmlCleaner could not parse <tr></tr> or <td></td> tags directly.
    String prepared = BaseSelectorUtils.preParse(text);
    TagNode root = new HtmlCleaner().clean(prepared);
    DomSerializer serializer = new DomSerializer(new CleanerProperties());
    return serializer.createDOM(root);
}

}

Large diffs are not rendered by default.