Skip to content

Commit

Permalink
Sitemaps limit on "bad url" log messages, fixes #145
Browse files Browse the repository at this point in the history
- degrade log level to debug for lines which are not valid
- only log first 1024 characters of line
  • Loading branch information
sebastian-nagel committed Apr 13, 2018
1 parent 1215588 commit af084f7
Showing 1 changed file with 17 additions and 40 deletions.
57 changes: 17 additions & 40 deletions src/main/java/crawlercommons/sitemaps/SiteMapParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -298,10 +298,23 @@ protected SiteMap processText(URL sitemapUrl, InputStream stream) throws IOExcep
BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs, UTF_8));

String line;
int i = 1;
while ((line = reader.readLine()) != null) {
if (line.length() > 0 && i <= MAX_URLS) {
addUrlIntoSitemap(line, textSiteMap, null, null, null, i++);
int i = 0;
while ((line = reader.readLine()) != null && ++i <= MAX_URLS) {
line = line.trim();
if (line.isEmpty())
continue;
try {
URL url = new URL(line);
boolean valid = urlIsValid(textSiteMap.getBaseUrl(), url.toString());
if (valid || !strict) {
SiteMapURL sUrl = new SiteMapURL(url, valid);
textSiteMap.addSiteMapUrl(sUrl);
LOG.debug(" {}. {}", i, sUrl);
} else {
LOG.debug("URL: {} is excluded from the sitemap as it is not a valid url = not under the base url: {}", url.toExternalForm(), textSiteMap.getBaseUrl());
}
} catch (MalformedURLException e) {
LOG.debug("Bad url: [{}]", line.substring(0, Math.min(1024, line.length())));
}
}
textSiteMap.setProcessed(true);
Expand Down Expand Up @@ -410,42 +423,6 @@ public InputSource resolveEntity(String publicId, String systemId) {
}
}

/**
 * Parses the given URL string and, if acceptable, adds it to the given
 * sitemap, logging the outcome.
 * <p>
 * The URL is added when it passes {@code urlIsValid} against the sitemap's
 * base URL, or unconditionally when {@code strict} is false (in which case
 * the entry carries {@code valid == false}). Strings that cannot be parsed
 * as a URL are never added; they are only reported in the log. Because the
 * input may come from an untrusted or even binary document, the rejected
 * string is truncated to 1024 characters before it is logged.
 *
 * @param urlStr
 *            the URL string to add to the
 *            {@link crawlercommons.sitemaps.SiteMap}
 * @param siteMap
 *            the sitemap to add URL(s) to
 * @param lastMod
 *            last time the {@link crawlercommons.sitemaps.SiteMapURL} was
 *            modified
 * @param changeFreq
 *            the {@link crawlercommons.sitemaps.SiteMapURL} change frequency
 * @param priority
 *            priority of this {@link crawlercommons.sitemaps.SiteMapURL}
 * @param urlIndex
 *            index position to which this entry has been added; only used
 *            for the debug log, where it is printed as {@code urlIndex + 1}
 */
protected void addUrlIntoSitemap(String urlStr, SiteMap siteMap, String lastMod, String changeFreq, String priority, int urlIndex) {
    try {
        URL url = new URL(urlStr); // Checking the URL
        boolean valid = urlIsValid(siteMap.getBaseUrl(), url.toString());

        if (valid || !strict) {
            SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, changeFreq, priority, valid);
            siteMap.addSiteMapUrl(sUrl);
            LOG.debug(" {}. {}", urlIndex + 1, sUrl);
        } else {
            LOG.warn("URL: {} is excluded from the sitemap as it is not a valid url = not under the base url: {}", url.toExternalForm(), siteMap.getBaseUrl());
        }
    } catch (MalformedURLException e) {
        // Cap what is written to the log: a single "line" of a broken or
        // binary text sitemap can be arbitrarily long. The null check keeps
        // the previous "Bad url: [null]" output for a null input instead of
        // failing inside the catch block.
        final int maxLoggedChars = 1024;
        String shown = urlStr;
        if (urlStr != null && urlStr.length() > maxLoggedChars) {
            shown = urlStr.substring(0, maxLoggedChars);
        }
        LOG.warn("Bad url: [{}]", shown);
        // NOTE(review): the exception message may still embed the full
        // input string; it is only emitted at trace level.
        LOG.trace("Can't create a sitemap entry with a bad URL", e);
    }
}

/**
* See if testUrl is under sitemapBaseUrl. Only URLs under sitemapBaseUrl
* are valid.
Expand Down

0 comments on commit af084f7

Please sign in to comment.