Skip to content

Commit

Permalink
fix #1211 css attribute selector support
Browse files Browse the repository at this point in the history
  • Loading branch information
marevol committed Aug 7, 2017
1 parent 53a59b5 commit 8e987df
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 52 deletions.
Expand Up @@ -15,10 +15,10 @@
*/
package org.codelibs.fess.app.web.base;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
Expand Down
Expand Up @@ -35,7 +35,6 @@
import org.codelibs.fess.app.service.LabelTypeService;
import org.codelibs.fess.entity.SearchRequestParams.SearchRequestType;
import org.codelibs.fess.es.config.exentity.LabelType;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down
37 changes: 21 additions & 16 deletions src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java
Expand Up @@ -35,6 +35,7 @@
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

Expand All @@ -47,6 +48,7 @@
import org.codelibs.core.misc.Pair;
import org.codelibs.core.misc.Tuple3;
import org.codelibs.fess.Constants;
import org.codelibs.fess.exception.FessSystemException;
import org.codelibs.fess.helper.PermissionHelper;
import org.codelibs.fess.mylasta.action.FessUserBean;
import org.codelibs.fess.taglib.FessFunctions;
Expand Down Expand Up @@ -635,23 +637,26 @@ public default PrunedTag[] getCrawlerDocumentHtmlPrunedTagsAsArray() {
PrunedTag[] tags = (PrunedTag[]) propMap.get("crawlerDocumentHtmlPrunedTags");
if (tags == null) {
tags = split(getCrawlerDocumentHtmlPrunedTags(), ",").get(stream -> stream.filter(StringUtil::isNotBlank).map(v -> {
final String[] cssValues = v.split("\\.", 2);
final String css;
if (cssValues.length == 2) {
css = cssValues[1];
} else {
css = null;
}

final String[] idValues = cssValues[0].split("#", 2);
final String id;
if (idValues.length == 2) {
id = idValues[1];
} else {
id = null;
final Pattern pattern = Pattern.compile("(\\w+)(\\[[^\\]]+\\])?(\\.\\w+)?(#\\w+)?");
final Matcher matcher = pattern.matcher(v.trim());
if (matcher.matches()) {
final PrunedTag tag = new PrunedTag(matcher.group(1));
if (matcher.group(2) != null) {
final String attrPair = matcher.group(2).substring(1, matcher.group(2).length() - 1);
final Matcher equalMatcher = Pattern.compile("(\\w+)=(\\w+)").matcher(attrPair);
if (equalMatcher.matches()) {
tag.setAttr(equalMatcher.group(1), equalMatcher.group(2));
}
}
if (matcher.group(3) != null) {
tag.setCss(matcher.group(3).substring(1));
}
if (matcher.group(4) != null) {
tag.setId(matcher.group(4).substring(1));
}
return tag;
}

return new PrunedTag(idValues[0], id, css);
throw new FessSystemException("Invalid pruned tag: " + v);
}).toArray(n -> new PrunedTag[n]));
propMap.put("crawlerDocumentHtmlPrunedTags", tags);
}
Expand Down
68 changes: 35 additions & 33 deletions src/main/java/org/codelibs/fess/util/PrunedTag.java
Expand Up @@ -15,24 +15,30 @@
*/
package org.codelibs.fess.util;

import org.apache.commons.lang3.StringUtils;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.stream.StreamUtil;
import org.w3c.dom.Node;

public class PrunedTag {
private final String tag;
private final String id;
private final String css;
private String id;
private String css;
private String attrName;
private String attrValue;

public PrunedTag(final String tag, final String id, final String css) {
public PrunedTag(final String tag) {
this.tag = tag;
this.id = id;
this.css = css;

}

public boolean matches(final Node node) {
if (tag.equalsIgnoreCase(node.getNodeName())) {
if (attrName != null) {
Node attr = node.getAttributes().getNamedItem(attrName);
if (attr == null || !attrValue.equals(attr.getNodeValue())) {
return false;
}
}
if (id == null) {
if (css == null) {
return true;
Expand All @@ -56,11 +62,6 @@ public boolean matches(final Node node) {
return false;
}

@Override
public String toString() {
return "PrunedTag [tag=" + tag + ", id=" + id + ", css=" + css + "]";
}

@Override
public int hashCode() {
final int prime = 31;
Expand All @@ -83,27 +84,28 @@ public boolean equals(final Object obj) {
return false;
}
final PrunedTag other = (PrunedTag) obj;
if (css == null) {
if (other.css != null) {
return false;
}
} else if (!css.equals(other.css)) {
return false;
}
if (id == null) {
if (other.id != null) {
return false;
}
} else if (!id.equals(other.id)) {
return false;
}
if (tag == null) {
if (other.tag != null) {
return false;
}
} else if (!tag.equals(other.tag)) {
return false;
}
return true;
return StringUtils.compare(tag, other.tag) == 0 //
&& StringUtils.compare(css, other.css) == 0 //
&& StringUtils.compare(id, other.id) == 0 //
&& StringUtils.compare(attrName, other.attrName) == 0 //
&& StringUtils.compare(attrValue, other.attrValue) == 0;
}

/**
 * Sets the id part of this pruned-tag selector.
 *
 * @param id the id value; the tag-list parser in FessProp passes the text that follows '#'
 */
public void setId(final String id) {
    this.id = id;
}

/**
 * Sets the CSS class part of this pruned-tag selector.
 *
 * @param css the class name; the tag-list parser in FessProp passes the text that follows '.'
 */
public void setCss(final String css) {
    this.css = css;
}

/**
 * Sets the attribute name/value pair this pruned tag requires.
 * <p>
 * Once a name is set, {@code matches(Node)} rejects a node whose attribute of that
 * name is missing or whose value is not equal to {@code value}.
 *
 * @param name the attribute name to look up on the node
 * @param value the exact attribute value expected; NOTE(review): matches() calls
 *        {@code attrValue.equals(...)}, so a null value with a non-null name looks unsafe
 */
public void setAttr(final String name, final String value) {
    this.attrName = name;
    this.attrValue = value;
}

/**
 * Renders every selector component (tag, id, css, attribute name and value)
 * in a single bracketed line; unset components appear as {@code null}.
 */
@Override
public String toString() {
    final StringBuilder buf = new StringBuilder("PrunedTag [tag=");
    buf.append(tag);
    buf.append(", id=").append(id);
    buf.append(", css=").append(css);
    buf.append(", attrName=").append(attrName);
    buf.append(", attrValue=").append(attrValue);
    return buf.append("]").toString();
}
}
2 changes: 1 addition & 1 deletion src/main/resources/fess_config.properties
Expand Up @@ -119,7 +119,7 @@ crawler.document.html.content.xpath=//BODY
crawler.document.html.lang.xpath=//HTML/@lang
crawler.document.html.digest.xpath=//META[@name='description']/@content
crawler.document.html.canonical.xpath=//LINK[@rel='canonical']/@href
crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav
# Attribute selectors must be written unquoted (tag[name=value]): the parser only
# accepts word characters on both sides of '=', so a quoted value is silently
# dropped and the entry would then prune every element with that tag name.
crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav,a[rel=nofollow]
crawler.document.html.max.digest.length=200

# file
Expand Down
Expand Up @@ -15,6 +15,7 @@
*/
package org.codelibs.fess.mylasta.direction;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
Expand All @@ -23,7 +24,12 @@
import org.codelibs.core.io.FileUtil;
import org.codelibs.core.misc.DynamicProperties;
import org.codelibs.fess.unit.UnitFessTestCase;
import org.codelibs.fess.util.PrunedTag;
import org.cyberneko.html.parsers.DOMParser;
import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;

public class FessPropTest extends UnitFessTestCase {

Expand Down Expand Up @@ -120,6 +126,39 @@ public String getCrawlerDocumentSpaceChars() {
assertEquals(12288, spaceChars[1]);
}

/**
 * Verifies that the comma-separated pruned-tag setting is parsed into one
 * PrunedTag per entry, covering the plain tag, tag#id, tag.class and
 * tag[attr=value] selector forms.
 */
public void test_getCrawlerDocumentHtmlPrunedTagsAsArray() throws Exception {
    // The parsed array is cached in propMap, so start from a clean cache.
    FessProp.propMap.clear();
    FessConfig fessConfig = new FessConfig.SimpleImpl() {
        @Override
        public String getCrawlerDocumentHtmlPrunedTags() {
            return "script,div#main,p.image,a[rel=nofollow]";
        }
    };

    PrunedTag[] tags = fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
    assertEquals(4, tags.length);

    // Plain tag: matches regardless of any attributes on the element.
    assertTrue(matchesTag(tags[0], "<script></script>"));
    assertTrue(matchesTag(tags[0], "<script id=\"main\"></script>"));
    assertFalse(matchesTag(tags[0], "<a></a>"));

    // tag#id
    assertTrue(matchesTag(tags[1], "<div id=\"main\"></div>"));
    assertFalse(matchesTag(tags[1], "<div></div>"));

    // tag.class
    assertTrue(matchesTag(tags[2], "<p class=\"image\"></p>"));
    assertFalse(matchesTag(tags[2], "<p></p>"));

    // tag[attr=value]: the attribute must be present and carry exactly that value.
    assertTrue(matchesTag(tags[3], "<a rel=\"nofollow\"></a>"));
    assertFalse(matchesTag(tags[3], "<a></a>"));
    assertFalse(matchesTag(tags[3], "<a rel=\"follow\"></a>"));
}

/**
 * Wraps the given markup in a minimal html/body document, parses it, and checks
 * the pruned tag against the first node found under the document's first child's
 * last child (presumably the first element inside body).
 *
 * @param tag the pruned tag under test
 * @param text the markup fragment placed inside body
 * @return the result of {@code tag.matches} for the located node
 * @throws Exception if encoding or parsing fails
 */
private boolean matchesTag(final PrunedTag tag, final String text) throws Exception {
    final byte[] htmlBytes = ("<html><body>" + text + "</body></html>").getBytes("UTF-8");
    final DOMParser domParser = new DOMParser();
    domParser.parse(new InputSource(new ByteArrayInputStream(htmlBytes)));
    final Document document = domParser.getDocument();
    final Node rootNode = document.getFirstChild();
    final Node lastOfRoot = rootNode.getLastChild();
    return tag.matches(lastOfRoot.getFirstChild());
}

public void test_normalizeQueryLanguages() {
FessProp.propMap.clear();
FessConfig fessConfig = new FessConfig.SimpleImpl() {
Expand Down

0 comments on commit 8e987df

Please sign in to comment.