Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sitemap index: stop URL at closing </loc> #227

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
Crawler-Commons Change Log

Current Development 0.11-SNAPSHOT (yyyy-mm-dd)
- [Sitemaps] Sitemap index: stop URL at closing </loc> (sebastian-nagel, kkrugler) #213
- [Sitemaps] Allow empty price in video sitemaps (sebastian-nagel) #221
- [Sitemaps] In case of the use of a different locale, price tag can be formatted with ',' instead of '.' leading to a NPE (goldenlink) #220
- [Sitemaps] Add support for sitemap extensions (tuxnco, sebastian-nagel) #35, #36, #149, #162
Expand Down
10 changes: 10 additions & 0 deletions src/main/java/crawlercommons/sitemaps/sax/DelegatorHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -208,4 +208,14 @@ public void fatalError(SAXParseException e) throws SAXException {
delegate.fatalError(e);
}
}

public static boolean isAllBlank(CharSequence charSeq) {
for (int i = 0; i < charSeq.length(); i++) {
if (!Character.isWhitespace(charSeq.charAt(i))) {
return false;
}
}
return true;
}

}
9 changes: 3 additions & 6 deletions src/main/java/crawlercommons/sitemaps/sax/XMLHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,9 @@ public void startElement(String uri, String localName, String qName, Attributes

// flush any unclosed or missing URL element
if (loc.length() > 0 && ("loc".equals(localName) || "url".equals(localName))) {
// check whether loc isn't white space only
for (int i = 0; i < loc.length(); i++) {
if (!Character.isWhitespace(loc.charAt(i))) {
maybeAddSiteMapUrl();
return;
}
if (!isAllBlank(loc)) {
maybeAddSiteMapUrl();
return;
}
loc = new StringBuilder();
if ("url".equals(localName)) {
Expand Down
13 changes: 13 additions & 0 deletions src/main/java/crawlercommons/sitemaps/sax/XMLIndexHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,19 @@ class XMLIndexHandler extends DelegatorHandler {
}

public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
// flush any unclosed or missing <sitemap> element
if (loc.length() > 0 && ("loc".equals(localName) || "sitemap".equals(localName))) {
if (!isAllBlank(loc)) {
maybeAddSiteMap();
return;
}
loc = new StringBuilder();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't you want to always reset loc, regardless of whether localName is loc or sitemap?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the start element holds an attribute (e.g., <lastmod>) the content of loc is still required. An example:

<sitemapindex>
   <sitemap>
      <loc>http://www.example.com/sitemap1.xml.gz</loc>
      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
   <sitemap>
      <loc>...

At second opening <sitemap> we can be sure that the previous sitemap is finished. But at the opening <lastmod> we need to wait for the modification date of the sitemap before we can call maybeAddSiteMap().

if ("sitemap".equals(localName)) {
// reset also attributes
locClosed = false;
lastMod = null;
}
}
}

public void endElement(String uri, String localName, String qName) throws SAXException {
Expand Down
47 changes: 47 additions & 0 deletions src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,53 @@ public void testMissingLocSitemapIndexFile() throws UnknownFormatException, IOEx
assertNotNull("Sitemap " + sitemap + " not found in sitemap index", sm.getSitemap(new URL(sitemap)));
}

@Test
public void testUnclosedSitemapIndexFile() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
StringBuilder scontent = new StringBuilder();
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>") //
.append("<sitemapindex>") //
.append(" <sitemap>") //
.append(" <loc>https://www.example.org/sitemap1.xml</loc>") //
.append(" <loc>https://www.example.org/sitemap2.xml</loc>") //
.append(" </sitemap>") //
.append("</sitemapindex>");
byte[] content = scontent.toString().getBytes(UTF_8);
URL url = new URL("http://www.example.com/sitemapindex.xml");

AbstractSiteMap asm = parser.parseSiteMap(content, url);
assertEquals(true, asm.isIndex());
assertEquals(true, asm instanceof SiteMapIndex);
SiteMapIndex sm = (SiteMapIndex) asm;
assertEquals(2, sm.getSitemaps().size());
String urlSecondSitemap = "https://www.example.org/sitemap2.xml";
AbstractSiteMap secondSitemap = sm.getSitemap(new URL(urlSecondSitemap));
assertNotNull("Sitemap " + urlSecondSitemap + " not found in sitemap index", secondSitemap);

// check reset of attributes (lastmod) when "autoclosing" <sitemap>
// elements
scontent = new StringBuilder();
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") //
.append("<sitemapindex>\n") //
.append(" <sitemap>\n") //
.append(" <loc>https://www.example.org/sitemap1.xml</loc>\n") //
.append(" <lastmod>2004-10-01T18:23:17+00:00</lastmod>\n") //
.append(" <loc>https://www.example.org/sitemap2.xml</loc>\n") //
.append(" <loc>https://www.example.org/sitemap3.xml</loc>\n") //
.append(" <lastmod>2005-11-02T19:24:18+00:00</lastmod>\n") //
.append(" </sitemap>\n") //
.append("</sitemapindex>");
content = scontent.toString().getBytes(UTF_8);
asm = parser.parseSiteMap(content, url);
assertEquals(true, asm.isIndex());
assertEquals(true, asm instanceof SiteMapIndex);
sm = (SiteMapIndex) asm;
assertEquals(3, sm.getSitemaps().size());
secondSitemap = sm.getSitemap(new URL(urlSecondSitemap));
assertNotNull("Sitemap " + urlSecondSitemap + " not found in sitemap index", secondSitemap);
assertNull("Sitemap " + urlSecondSitemap + " without modification date", secondSitemap.getLastModified());
}

@Test
public void testSitemapGZ() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
Expand Down