Skip to content

Commit

Permalink
Improve sitemap parsing
Browse files Browse the repository at this point in the history
- ignore query part of URL to determine sitemap location prefix
  for URL validation, fixes #202
- resolve relative links in RSS feeds, fixes #203
- allow non-continuous content (containing XML entities or CDATA)
  when parsing links in RSS feeds, fixes #204
- extract links from <guid> elements in RSS feeds, fixes #201
  • Loading branch information
sebastian-nagel committed Apr 25, 2018
1 parent a9277ac commit 0ef7cf8
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 35 deletions.
4 changes: 4 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Crawler-Commons Change Log

Current Development 0.10-SNAPSHOT (yyyy-mm-dd)
- Sitemap file location to ignore query part of URL (sebastian-nagel) #202
- [RSS sitemaps] Link extraction from RSS feeds fails on XML entities (sebastian-nagel) #204
- [RSS sitemaps] Resolve relative links in RSS feeds (sebastian-nagel) #203
- [RSS sitemaps] Extract links from <guid> elements (sebastian-nagel) #201
- [Sitemaps] Limit on "bad url" log messages (sebastian-nagel) #145
- EffectiveTldFinder to parse Internationalized Domain Names (sebastian-nagel) #179
- Add main() to EffectiveTldFinder (sebastian-nagel) #187
Expand Down
23 changes: 20 additions & 3 deletions src/main/java/crawlercommons/sitemaps/SiteMap.java
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,26 @@ public String toString() {
* @param sitemapUrl
*/
private void setBaseUrl(URL sitemapUrl) {
// Remove everything back to last slash.
// So http://foo.org/abc/sitemap.xml becomes http://foo.org/abc/
baseUrl = sitemapUrl.toString().replaceFirst("/[^/]*$", "/");
// Remove everything back to last slash on the file path:
// - http://example.com/abc/sitemap.xml
// becomes
// - http://example.com/abc/
// But ignore slashes in the query part:
// - http://example.com/index.php?route=feed/imagemap
// becomes
// - http://example.com/
String path = sitemapUrl.getPath();
int lastPathDelimPos = path.lastIndexOf('/');
if (lastPathDelimPos < 0) {
path = "/";
} else {
path = path.substring(0, lastPathDelimPos + 1);
}
try {
baseUrl = new URL(sitemapUrl.getProtocol(), sitemapUrl.getHost(), sitemapUrl.getPort(), path).toString();
} catch (MalformedURLException e) {
baseUrl = sitemapUrl.toString();
}
}

/**
Expand Down
10 changes: 1 addition & 9 deletions src/main/java/crawlercommons/sitemaps/SiteMapParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -443,14 +443,6 @@ public InputSource resolveEntity(String publicId, String systemId) {
* @return true if testUrl is under sitemapBaseUrl, false otherwise
*/
public static boolean urlIsValid(String sitemapBaseUrl, String testUrl) {
boolean ret = false;

// Don't try a comparison if the URL is too short to match
if (sitemapBaseUrl != null && sitemapBaseUrl.length() <= testUrl.length()) {
String u = testUrl.substring(0, sitemapBaseUrl.length());
ret = sitemapBaseUrl.equals(u);
}

return ret;
return testUrl.startsWith(sitemapBaseUrl);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,8 @@ private void startRootElement(String uri, String localName, String qName, Attrib
delegate = new AtomHandler(url, elementStack, strict);
}
// See if it is a RSS feed by looking for the localName "channel"
// element .
// This avoids the issue
// of having the outer tag named <rdf:RDF> that was causing this code to
// fail. Inside of
// element. This avoids the issue of having the outer tag named
// <rdf:RDF> that was causing this code to fail. Inside of
// the <rss> or <rdf> tag is a <channel> tag, so we can use that.
// See https://github.com/crawler-commons/crawler-commons/issues/87
// and also RSS 1.0 specification http://web.resource.org/rss/1.0/spec
Expand Down
54 changes: 35 additions & 19 deletions src/main/java/crawlercommons/sitemaps/sax/RSSHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,16 @@
class RSSHandler extends DelegatorHandler {

private SiteMap sitemap;
private URL loc;
private StringBuilder loc;
private URL locURL;
private String lastMod;
boolean valid;
private int i = 0;

RSSHandler(URL url, LinkedList<String> elementStack, boolean strict) {
super(elementStack, strict);
sitemap = new SiteMap(url);
sitemap.setType(SitemapType.RSS);
loc = new StringBuilder();
}

/*
Expand All @@ -107,7 +108,9 @@ public void startElement(String uri, String localName, String qName, Attributes
*/
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
if ("item".equals(currentElement())) {
if ("link".equals(currentElement()) || "guid".equals(currentElement())) {
setLocURL();
} else if ("item".equals(currentElement())) {
maybeAddSiteMapUrl();
} else if ("rss".equals(currentElement())) {
sitemap.setProcessed(true);
Expand All @@ -130,14 +133,13 @@ public void characters(char[] ch, int start, int length) throws SAXException {
sitemap.setLastModified(lastMod);
}
} else if ("link".equals(localName)) {
String href = value;
LOG.debug("href = {}", href);
try {
loc = new URL(href);
valid = urlIsValid(sitemap.getBaseUrl(), href);
} catch (MalformedURLException e) {
LOG.trace("Can't create an entry with a bad URL", e);
LOG.debug("Bad url: [{}]", href);
loc.append(value);
} else if ("guid".equals(localName)) {
// accept as link if
// - a valid absolute URL (not a URN, UUID or similar)
// - and no <link> found yet
if (locURL == null) {
loc.append(value);
}
}
}
Expand All @@ -146,18 +148,32 @@ public AbstractSiteMap getSiteMap() {
return sitemap;
}

private void setLocURL() {
String value = loc.toString().trim();
if (value.isEmpty()) {
return;
}
try {
// check that the value is a valid URL
locURL = new URL(sitemap.getUrl(), value);
} catch (MalformedURLException e) {
LOG.debug("Bad url: [{}]", value);
LOG.trace("Can't create an entry with a bad URL", e);
} finally {
loc = new StringBuilder();
}
}

private void maybeAddSiteMapUrl() {
if (valid || !isStrict()) {
if (loc == null) {
LOG.debug("Missing url");
LOG.trace("Can't create an entry with a missing URL");
} else {
SiteMapURL sUrl = new SiteMapURL(loc.toString(), lastMod, null, null, valid);
if (locURL != null) {
boolean valid = urlIsValid(sitemap.getBaseUrl(), locURL.toString());
if (!isStrict() || valid) {
SiteMapURL sUrl = new SiteMapURL(locURL, valid);
sUrl.setLastModified(lastMod);
sitemap.addSiteMapUrl(sUrl);
LOG.debug(" {}. {}", (++i), sUrl);
}
}
loc = null;
locURL = null;
lastMod = null;
}

Expand Down
35 changes: 35 additions & 0 deletions src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,21 @@ public void testAtomFormat() throws UnknownFormatException, IOException {
assertEquals(new URL("http://example.org/2003/12/13/atom03"), sm.getSiteMapUrls().iterator().next().getUrl());
}

@Test
public void testRSSFormat() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser(true, false);
byte[] content = getResourceAsBytes("src/test/resources/rss/feed.rss");
URL url = new URL("https://www.example.com/index.php?feed/rss");

SiteMap sm = (SiteMap) parser.parseSiteMap(content, url);
assertEquals(4, sm.getSiteMapUrls().size());
Iterator<SiteMapURL> it = sm.getSiteMapUrls().iterator();
assertEquals(new URL("https://www.example.com/blog/post/1"), it.next().getUrl());
assertEquals(new URL("https://www.example.com/guid.html"), it.next().getUrl());
assertEquals(new URL("https://www.example.com/foo?q=a&l=en"), it.next().getUrl());
assertEquals(new URL("https://www.example.com/foo?q=a&l=fr"), it.next().getUrl());
}

/**
* Test processing RSS 1.0 sitemaps, which don't have an <rss> tag. E.g.
* http://rss.slashdot.org/slashdot/slashdotMain?format=xml
Expand Down Expand Up @@ -480,6 +495,26 @@ public void testPartialSitemapsAllowed() throws UnknownFormatException, IOExcept
assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
}

@Test
public void testFileLocationValidation() {
// examples from https://www.sitemaps.org/protocol.html#location
String baseUrl = "http://example.com/catalog/sitemap.xml";
String testUrl = "http://example.com/catalog/show?item=233&user=3453";
SiteMap sitemap = new SiteMap(baseUrl);
assertTrue(SiteMapParser.urlIsValid(sitemap.getBaseUrl(), testUrl));

testUrl = "http://example.com/image/show?item=233&user=3453";
assertFalse(SiteMapParser.urlIsValid(sitemap.getBaseUrl(), testUrl));

testUrl = "https://example.com/catalog/page1.php";
assertFalse(SiteMapParser.urlIsValid(sitemap.getBaseUrl(), testUrl));

// do not use / in query part to determine base location
baseUrl = "https://example.com/index.php?route=path/sitemap";
sitemap = new SiteMap(baseUrl);
assertTrue(SiteMapParser.urlIsValid(sitemap.getBaseUrl(), testUrl));
}

@Test
public void testUrlLocUrl() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser(false);
Expand Down
37 changes: 37 additions & 0 deletions src/test/resources/rss/feed.rss
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@

<!-- extended from https://en.wikipedia.org/wiki/RSS#Example -->

<rss version="2.0">
<channel>
<title>RSS Title</title>
<description>This is an example of an RSS feed</description>
<link>https://www.example.com/main.html</link>
<lastBuildDate>Mon, 06 Sep 2010 00:01:00 +0000 </lastBuildDate>
<pubDate>Sun, 06 Sep 2009 16:20:00 +0000</pubDate>
<ttl>1800</ttl>

<item>
<title>Example entry</title>
<description>Here is some text containing an interesting description.</description>
<link>https://www.example.com/blog/post/1</link>
<guid isPermaLink="false">7bd204c6-1655-4c27-aeee-53f933c5395f</guid>
<pubDate>Sun, 06 Sep 2009 16:20:00 +0000</pubDate>
</item>

<item>
<title>&amp;guid&amp; element instead of &amp;link&amp;</title>
<guid isPermaLink="false">https://www.example.com/guid.html</guid>
</item>

<item>
<title>XML entity in link content</title>
<link>https://www.example.com/foo?q=a&amp;l=en</link>
</item>

<item>
<title>CDATA</title>
<link>https://www.example.com/foo<![CDATA[?q=a&l=fr]]></link>
</item>

</channel>
</rss>

0 comments on commit 0ef7cf8

Please sign in to comment.