From af084f7ee4159612029fb2f0eec4e429e1751849 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 13 Apr 2018 11:16:47 +0200 Subject: [PATCH 1/2] Sitemaps limit on "bad url" log messages, fixes #145 - degrade log level to debug for lines which are not valid - only log first 1024 characters of line --- .../sitemaps/SiteMapParser.java | 57 ++++++------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java index 3720eb2e..2d706e9f 100644 --- a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java +++ b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java @@ -298,10 +298,23 @@ protected SiteMap processText(URL sitemapUrl, InputStream stream) throws IOExcep BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs, UTF_8)); String line; - int i = 1; - while ((line = reader.readLine()) != null) { - if (line.length() > 0 && i <= MAX_URLS) { - addUrlIntoSitemap(line, textSiteMap, null, null, null, i++); + int i = 0; + while ((line = reader.readLine()) != null && ++i <= MAX_URLS) { + line = line.trim(); + if (line.isEmpty()) + continue; + try { + URL url = new URL(line); + boolean valid = urlIsValid(textSiteMap.getBaseUrl(), url.toString()); + if (valid || !strict) { + SiteMapURL sUrl = new SiteMapURL(url, valid); + textSiteMap.addSiteMapUrl(sUrl); + LOG.debug(" {}. {}", i, sUrl); + } else { + LOG.debug("URL: {} is excluded from the sitemap as it is not a valid url = not under the base url: {}", url.toExternalForm(), textSiteMap.getBaseUrl()); + } + } catch (MalformedURLException e) { + LOG.debug("Bad url: [{}]", line.substring(0, Math.min(1024, line.length()))); } } textSiteMap.setProcessed(true); @@ -410,42 +423,6 @@ public InputSource resolveEntity(String publicId, String systemId) { } } - /** - * Adds the given URL to the given sitemap while showing the relevant logs - * - * @param urlStr - * an URL string to add to the - * {@link crawlercommons.sitemaps.SiteMap} - * @param siteMap - * the sitemap to add URL(s) to - * @param lastMod - * last time the {@link crawlercommons.sitemaps.SiteMapURL} was - * modified - * @param changeFreq - * the {@link crawlercommons.sitemaps.SiteMapURL} change frquency - * @param priority - * priority of this {@link crawlercommons.sitemaps.SiteMapURL} - * @param urlIndex - * index position to which this entry has been added - */ - protected void addUrlIntoSitemap(String urlStr, SiteMap siteMap, String lastMod, String changeFreq, String priority, int urlIndex) { - try { - URL url = new URL(urlStr); // Checking the URL - boolean valid = urlIsValid(siteMap.getBaseUrl(), url.toString()); - - if (valid || !strict) { - SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, changeFreq, priority, valid); - siteMap.addSiteMapUrl(sUrl); - LOG.debug(" {}. {}", urlIndex + 1, sUrl); - } else { - LOG.warn("URL: {} is excluded from the sitemap as it is not a valid url = not under the base url: {}", url.toExternalForm(), siteMap.getBaseUrl()); - } - } catch (MalformedURLException e) { - LOG.warn("Bad url: [{}]", urlStr); - LOG.trace("Can't create a sitemap entry with a bad URL", e); - } - } - /** * See if testUrl is under sitemapBaseUrl. Only URLs under sitemapBaseUrl * are valid. From 907be2343fce47fbeb1f5a424fe013e61d0a445d Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Mon, 16 Apr 2018 13:36:06 +0200 Subject: [PATCH 2/2] Format fix: add braces, complete CHANGES.txt --- CHANGES.txt | 1 + src/main/java/crawlercommons/sitemaps/SiteMapParser.java | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index 890baf6c..f66c7224 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,6 +1,7 @@ Crawler-Commons Change Log Current Development 0.10-SNAPSHOT (yyyy-mm-dd) +- [Sitemaps] Limit on "bad url" log messages (sebastian-nagel) #145 - EffectiveTldFinder to parse Internationalized Domain Names (sebastian-nagel) #179 - Add main() to EffectiveTldFinder (sebastian-nagel) #187 - Handle new suffixes in PaidLevelDomain (kkrugler) #183 diff --git a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java index 2d706e9f..d83b6708 100644 --- a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java +++ b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java @@ -301,8 +301,9 @@ protected SiteMap processText(URL sitemapUrl, InputStream stream) throws IOExcep int i = 0; while ((line = reader.readLine()) != null && ++i <= MAX_URLS) { line = line.trim(); - if (line.isEmpty()) + if (line.isEmpty()) { continue; + } try { URL url = new URL(line); boolean valid = urlIsValid(textSiteMap.getBaseUrl(), url.toString());