Improve sitemap parsing

- ignore query part of URL to determine sitemap location prefix for URL validation, fixes #202 - resolve relative links in RSS feeds, fixes #203 - allow non-continuous content (containing XML entities or CDATA) when parsing links in RSS feeds, fixes #204 - extract links from <guid> elements in RSS feeds, fixes #201
crawler-commons · Apr 25, 2018 · 0ef7cf8 · 0ef7cf8
1 parent a9277ac
commit 0ef7cf8
Show file tree

Hide file tree

Showing 7 changed files with 134 additions and 35 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,6 +1,10 @@
 Crawler-Commons Change Log
 
 Current Development 0.10-SNAPSHOT (yyyy-mm-dd)
+- Sitemap file location to ignore query part of URL (sebastian-nagel) #202
+- [RSS sitemaps] Link extraction from RSS feeds fails on XML entities (sebastian-nagel) #204
+- [RSS sitemaps] Resolve relative links in RSS feeds (sebastian-nagel) #203
+- [RSS sitemaps] Extract links from <guid> elements (sebastian-nagel) #201
 - [Sitemaps] Limit on "bad url" log messages (sebastian-nagel) #145
 - EffectiveTldFinder to parse Internationalized Domain Names (sebastian-nagel) #179
 - Add main() to EffectiveTldFinder (sebastian-nagel) #187

diff --git a/src/main/java/crawlercommons/sitemaps/SiteMap.java b/src/main/java/crawlercommons/sitemaps/SiteMap.java
@@ -108,9 +108,26 @@ public String toString() {
      * @param sitemapUrl
      */
     private void setBaseUrl(URL sitemapUrl) {
-        // Remove everything back to last slash.
-        // So http://foo.org/abc/sitemap.xml becomes http://foo.org/abc/
-        baseUrl = sitemapUrl.toString().replaceFirst("/[^/]*$", "/");
+        // Remove everything back to last slash on the file path:
+        // - http://example.com/abc/sitemap.xml
+        // becomes
+        // - http://example.com/abc/
+        // But ignore slashes in the query part:
+        // - http://example.com/index.php?route=feed/imagemap
+        // becomes
+        // - http://example.com/
+        String path = sitemapUrl.getPath();
+        int lastPathDelimPos = path.lastIndexOf('/');
+        if (lastPathDelimPos < 0) {
+            path = "/";
+        } else {
+            path = path.substring(0, lastPathDelimPos + 1);
+        }
+        try {
+            baseUrl = new URL(sitemapUrl.getProtocol(), sitemapUrl.getHost(), sitemapUrl.getPort(), path).toString();
+        } catch (MalformedURLException e) {
+            baseUrl = sitemapUrl.toString();
+        }
     }
 
     /**

diff --git a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java
@@ -443,14 +443,6 @@ public InputSource resolveEntity(String publicId, String systemId) {
      * @return true if testUrl is under sitemapBaseUrl, false otherwise
      */
     public static boolean urlIsValid(String sitemapBaseUrl, String testUrl) {
-        boolean ret = false;
-
-        // Don't try a comparison if the URL is too short to match
-        if (sitemapBaseUrl != null && sitemapBaseUrl.length() <= testUrl.length()) {
-            String u = testUrl.substring(0, sitemapBaseUrl.length());
-            ret = sitemapBaseUrl.equals(u);
-        }
-
-        return ret;
+        return testUrl.startsWith(sitemapBaseUrl);
     }
 }
diff --git a/src/main/java/crawlercommons/sitemaps/sax/DelegatorHandler.java b/src/main/java/crawlercommons/sitemaps/sax/DelegatorHandler.java
@@ -102,10 +102,8 @@ private void startRootElement(String uri, String localName, String qName, Attrib
             delegate = new AtomHandler(url, elementStack, strict);
         }
         // See if it is a RSS feed by looking for the localName "channel"
-        // element .
-        // This avoids the issue
-        // of having the outer tag named <rdf:RDF> that was causing this code to
-        // fail. Inside of
+        // element. This avoids the issue of having the outer tag named
+        // <rdf:RDF> that was causing this code to fail. Inside of
         // the <rss> or <rdf> tag is a <channel> tag, so we can use that.
         // See https://github.com/crawler-commons/crawler-commons/issues/87
         // and also RSS 1.0 specification http://web.resource.org/rss/1.0/spec

diff --git a/src/main/java/crawlercommons/sitemaps/sax/RSSHandler.java b/src/main/java/crawlercommons/sitemaps/sax/RSSHandler.java
@@ -78,15 +78,16 @@
 class RSSHandler extends DelegatorHandler {
 
     private SiteMap sitemap;
-    private URL loc;
+    private StringBuilder loc;
+    private URL locURL;
     private String lastMod;
     boolean valid;
-    private int i = 0;
 
     RSSHandler(URL url, LinkedList<String> elementStack, boolean strict) {
         super(elementStack, strict);
         sitemap = new SiteMap(url);
         sitemap.setType(SitemapType.RSS);
+        loc = new StringBuilder();
     }
 
     /*
@@ -107,7 +108,9 @@ public void startElement(String uri, String localName, String qName, Attributes
      */
     @Override
     public void endElement(String uri, String localName, String qName) throws SAXException {
-        if ("item".equals(currentElement())) {
+        if ("link".equals(currentElement()) || "guid".equals(currentElement())) {
+            setLocURL();
+        } else if ("item".equals(currentElement())) {
             maybeAddSiteMapUrl();
         } else if ("rss".equals(currentElement())) {
             sitemap.setProcessed(true);
@@ -130,14 +133,13 @@ public void characters(char[] ch, int start, int length) throws SAXException {
                 sitemap.setLastModified(lastMod);
             }
         } else if ("link".equals(localName)) {
-            String href = value;
-            LOG.debug("href = {}", href);
-            try {
-                loc = new URL(href);
-                valid = urlIsValid(sitemap.getBaseUrl(), href);
-            } catch (MalformedURLException e) {
-                LOG.trace("Can't create an entry with a bad URL", e);
-                LOG.debug("Bad url: [{}]", href);
+            loc.append(value);
+        } else if ("guid".equals(localName)) {
+            // accept as link if
+            // - a valid absolute URL (not a URN, UUID or similar)
+            // - and no <link> found yet
+            if (locURL == null) {
+                loc.append(value);
             }
         }
     }
@@ -146,18 +148,32 @@ public AbstractSiteMap getSiteMap() {
         return sitemap;
     }
 
+    private void setLocURL() {
+        String value = loc.toString().trim();
+        if (value.isEmpty()) {
+            return;
+        }
+        try {
+            // check that the value is a valid URL
+            locURL = new URL(sitemap.getUrl(), value);
+        } catch (MalformedURLException e) {
+            LOG.debug("Bad url: [{}]", value);
+            LOG.trace("Can't create an entry with a bad URL", e);
+        } finally {
+            loc = new StringBuilder();
+        }
+    }
+
     private void maybeAddSiteMapUrl() {
-        if (valid || !isStrict()) {
-            if (loc == null) {
-                LOG.debug("Missing url");
-                LOG.trace("Can't create an entry with a missing URL");
-            } else {
-                SiteMapURL sUrl = new SiteMapURL(loc.toString(), lastMod, null, null, valid);
+        if (locURL != null) {
+            boolean valid = urlIsValid(sitemap.getBaseUrl(), locURL.toString());
+            if (!isStrict() || valid) {
+                SiteMapURL sUrl = new SiteMapURL(locURL, valid);
+                sUrl.setLastModified(lastMod);
                 sitemap.addSiteMapUrl(sUrl);
-                LOG.debug("  {}. {}", (++i), sUrl);
             }
         }
-        loc = null;
+        locURL = null;
         lastMod = null;
     }
 

diff --git a/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java b/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java
@@ -372,6 +372,21 @@ public void testAtomFormat() throws UnknownFormatException, IOException {
         assertEquals(new URL("http://example.org/2003/12/13/atom03"), sm.getSiteMapUrls().iterator().next().getUrl());
     }
 
+    @Test
+    public void testRSSFormat() throws UnknownFormatException, IOException {
+        SiteMapParser parser = new SiteMapParser(true, false);
+        byte[] content = getResourceAsBytes("src/test/resources/rss/feed.rss");
+        URL url = new URL("https://www.example.com/index.php?feed/rss");
+
+        SiteMap sm = (SiteMap) parser.parseSiteMap(content, url);
+        assertEquals(4, sm.getSiteMapUrls().size());
+        Iterator<SiteMapURL> it = sm.getSiteMapUrls().iterator();
+        assertEquals(new URL("https://www.example.com/blog/post/1"), it.next().getUrl());
+        assertEquals(new URL("https://www.example.com/guid.html"), it.next().getUrl());
+        assertEquals(new URL("https://www.example.com/foo?q=a&l=en"), it.next().getUrl());
+        assertEquals(new URL("https://www.example.com/foo?q=a&l=fr"), it.next().getUrl());
+    }
+
     /**
      * Test processing RSS 1.0 sitemaps, which don't have an <rss> tag. E.g.
      * http://rss.slashdot.org/slashdot/slashdotMain?format=xml
@@ -480,6 +495,26 @@ public void testPartialSitemapsAllowed() throws UnknownFormatException, IOExcept
         assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
     }
 
+    @Test
+    public void testFileLocationValidation() {
+        // examples from https://www.sitemaps.org/protocol.html#location
+        String baseUrl = "http://example.com/catalog/sitemap.xml";
+        String testUrl = "http://example.com/catalog/show?item=233&user=3453";
+        SiteMap sitemap = new SiteMap(baseUrl);
+        assertTrue(SiteMapParser.urlIsValid(sitemap.getBaseUrl(), testUrl));
+
+        testUrl = "http://example.com/image/show?item=233&user=3453";
+        assertFalse(SiteMapParser.urlIsValid(sitemap.getBaseUrl(), testUrl));
+
+        testUrl = "https://example.com/catalog/page1.php";
+        assertFalse(SiteMapParser.urlIsValid(sitemap.getBaseUrl(), testUrl));
+
+        // do not use / in query part to determine base location
+        baseUrl = "https://example.com/index.php?route=path/sitemap";
+        sitemap = new SiteMap(baseUrl);
+        assertTrue(SiteMapParser.urlIsValid(sitemap.getBaseUrl(), testUrl));
+    }
+
     @Test
     public void testUrlLocUrl() throws UnknownFormatException, IOException {
         SiteMapParser parser = new SiteMapParser(false);

diff --git a/src/test/resources/rss/feed.rss b/src/test/resources/rss/feed.rss
@@ -0,0 +1,37 @@
+
+<!-- extended from https://en.wikipedia.org/wiki/RSS#Example -->
+
+<rss version="2.0">
+<channel>
+ <title>RSS Title</title>
+ <description>This is an example of an RSS feed</description>
+ <link>https://www.example.com/main.html</link>
+ <lastBuildDate>Mon, 06 Sep 2010 00:01:00 +0000 </lastBuildDate>
+ <pubDate>Sun, 06 Sep 2009 16:20:00 +0000</pubDate>
+ <ttl>1800</ttl>
+
+ <item>
+  <title>Example entry</title>
+  <description>Here is some text containing an interesting description.</description>
+  <link>https://www.example.com/blog/post/1</link>
+  <guid isPermaLink="false">7bd204c6-1655-4c27-aeee-53f933c5395f</guid>
+  <pubDate>Sun, 06 Sep 2009 16:20:00 +0000</pubDate>
+ </item>
+
+ <item>
+  <title>&amp;guid&amp; element instead of &amp;link&amp;</title>
+  <guid isPermaLink="false">https://www.example.com/guid.html</guid>
+ </item>
+
+ <item>
+  <title>XML entity in link content</title>
+  <link>https://www.example.com/foo?q=a&amp;l=en</link>
+ </item>
+
+ <item>
+  <title>CDATA</title>
+  <link>https://www.example.com/foo<![CDATA[?q=a&l=fr]]></link>
+ </item>
+
+</channel>
+</rss>