Merge pull request #4 from Chaiavi/master
Updating my fork
Chaiavi committed Apr 29, 2020
2 parents f2b005e + 0265b98 commit 95207d0
Showing 13 changed files with 161 additions and 52 deletions.
14 changes: 9 additions & 5 deletions CHANGES.txt
@@ -1,11 +1,15 @@
Crawler-Commons Change Log

Current Development 1.1-SNAPSHOT (yyyy-mm-dd)
- [Robots] Upgrade the toString() method of the BaseRobotRules (Avi Hayun) #264
- Upgrade GitIgnore (Avi Hayun) #260
- [Robots] robots rules unit tests should be upgraded using junit5 parameterizedtest (Avi Hayun) #255
- [Normalizer] Use ParameterizedTest in the normalizer unit tests (Avi Hayun) #253
- Upgrade Junit to v5.x (Avi Hayun) #249
- [Robots] Deduplicate sitemap links (sebastian-nagel) #261
- EffectiveTldFinder to log loading of public suffix list (sebastian-nagel) #284
- SiteMapParser getPublicationDate in VideoAttributes may throw NPE (panthony, sebastian-nagel) #283
- SimpleRobotRulesParser: Trim log messages (jnioche, sebastian-nagel) #281
- SimpleRobotRulesParser: counter _numWarnings not thread-safe (sebastian-nagel, kkrugler) #278
- ParameterizedTest not executed by mvn builds (sebastian-nagel) #273
- [BasicNormalizer] Empty path before query to be normalized to `/` (Chaiavi, sebastian-nagel) #247
- EffectiveTldFinder to validate returned domain names for length restrictions (sebastian-nagel, Chaiavi) #251
- Upgrade unit tests to use JUnit v5.x and parameterized tests (Chaiavi) #249, #253, #255
- [Robots] Robots parser to always handle absolute sitemap URL even without valid base URL (pr3mar, kkrugler, sebastian-nagel) #240

Release 1.0 (2019-03-19)
4 changes: 2 additions & 2 deletions pom.xml
@@ -327,7 +327,7 @@
<maven-compiler-plugin.version>2.3.2</maven-compiler-plugin.version>
<maven-resources-plugin.version>2.5</maven-resources-plugin.version>
<maven-jar-plugin.version>2.4</maven-jar-plugin.version>
<maven-surfire-plugin.version>2.12</maven-surfire-plugin.version>
<maven-surfire-plugin.version>2.22.2</maven-surfire-plugin.version>
<maven-release-plugin.version>2.5.1</maven-release-plugin.version>
<maven-source-plugin.version>2.1.2</maven-source-plugin.version>
<maven-javadoc-plugin.version>2.9.1</maven-javadoc-plugin.version>
@@ -367,7 +367,7 @@

<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<artifactId>junit-jupiter-engine</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
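
Note: Surefire 2.22.x runs JUnit 5 tests natively, but only if a test engine (junit-jupiter-engine, not just the api artifact) is on the test classpath; with the api alone, tests compile but are silently skipped (cf. #273, #249). A minimal sketch of the kind of test affected; the class and values are illustrative, not taken from this commit:

import static org.junit.jupiter.api.Assertions.assertTrue;

import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

class ExampleParameterizedTest {

    // Without junit-jupiter-engine at test runtime, Surefire < 2.22 (or
    // 2.22+ lacking an engine) discovers no JUnit 5 tests at all.
    @ParameterizedTest
    @ValueSource(strings = { "http://example.com/", "https://example.org/" })
    void urlStartsWithScheme(String url) {
        assertTrue(url.startsWith("http"));
    }
}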
64 changes: 47 additions & 17 deletions src/main/java/crawlercommons/domains/EffectiveTldFinder.java
@@ -23,6 +23,7 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.IDN;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -39,26 +40,30 @@
* of the various domain registrars and their assignment policies. The best
* publicly available knowledge base is the public suffix list maintained and
* available at <a href="https://publicsuffix.org/">publicsuffix.org</a>. This
* class implements the <a
* href="https://publicsuffix.org/list/">publicsuffix.org ruleset</a> and uses a
* copy of the public suffix list. data file format.
* class implements the
* <a href="https://publicsuffix.org/list/">publicsuffix.org ruleset</a> and
* uses a copy of the public suffix list.
*
* For more information, see
* <ul>
* <li><a href="http://www.publicsuffix.org">publicsuffix.org</a></li>
* <li><a href="https://www.publicsuffix.org/">publicsuffix.org</a></li>
* <li><a href="https://en.wikipedia.org/wiki/Public_Suffix_List">Wikipedia
* article about the public suffix list</a></li>
* <li>Mozilla's <a
* href="http://wiki.mozilla.org/Gecko:Effective_TLD_Service">Effective TLD
* <li>Mozilla's
* <a href="https://wiki.mozilla.org/Gecko:Effective_TLD_Service">Effective TLD
* Service</a>: for historic reasons the class name stems from the term
* &quot;effective top-level domain&quot; (eTLD)</li>
* </ul>
*
* This class just needs "effective_tld_names.dat" in the classpath. If you want
* to configure it with other data, call
* {@link EffectiveTldFinder#getInstance() EffectiveTldFinder.getInstance()}
* {@link EffectiveTldFinder#initialize(InputStream) .initialize(InputStream)}.
* Updates to the public suffix list can be found here:
* EffectiveTldFinder loads the public suffix list as file
* "effective_tld_names.dat" from the Java classpath. Make sure your classpath
 * does not contain any other file with the same name, e.g. an outdated list
* shipped with a third party library. To force EffectiveTldFinder to load an
* updated or modified public suffix list, call
* {@link EffectiveTldFinder#getInstance()
* EffectiveTldFinder.getInstance()}{@link EffectiveTldFinder#initialize(InputStream)
* .initialize(InputStream)}. Updates to the public suffix list can be found
* here:
* <ul>
* <li><a href= "https://publicsuffix.org/list/public_suffix_list.dat"
* >https://publicsuffix.org/list/public_suffix_list.dat</a></li>
@@ -94,16 +99,37 @@ public class EffectiveTldFinder {
public static final String WILD_CARD = "*.";
public static final char DOT = '.';

/**
* Max. length in ASCII characters of a dot-separated segment in host names
* (applies to domain names as well), cf.
* https://tools.ietf.org/html/rfc1034#section-3.1 and
* https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_hostnames
*
     * Note: We only have to validate domain names, not the host names passed
     * as input. For domain names, verifying the segment length also implies
     * that the entire domain name stays within the limit of 253 characters.
     * Wildcard suffixes only allow two additional segments (2*63+1 = 127
     * chars), and all wildcard suffixes are far from reaching the critical
     * length of 126 characters.
*/
public static final int MAX_DOMAIN_LENGTH_PART = 63;

private static EffectiveTldFinder instance = null;
private Map<String, EffectiveTLD> domains = null;
private SuffixTrie<EffectiveTLD> domainTrie = new SuffixTrie<>();
private boolean configured = false;

/**
* A singleton
* A singleton loading the public suffix list from the Java class path.
*/
private EffectiveTldFinder() {
initialize(this.getClass().getResourceAsStream(ETLD_DATA));
URL publicSuffixList = this.getClass().getResource(ETLD_DATA);
LOGGER.info("Loading public suffix list from class path: {}", publicSuffixList);
try (InputStream is = publicSuffixList.openStream()) {
initialize(is);
} catch (IOException e) {
LOGGER.error("Failed to load public suffix list {} from class path: {}", publicSuffixList, e);
}
}

/**
@@ -152,9 +178,7 @@ public boolean initialize(InputStream effectiveTldDataStream) {
}
configured = true;
} catch (IOException e) {
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("EffectiveTldFinder configuration failed: ", e);
}
LOGGER.error("EffectiveTldFinder configuration failed: ", e);
configured = false;
}
return configured;
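
Note: per the updated class javadoc, callers can swap in a newer list at runtime via getInstance().initialize(InputStream). A sketch under the assumption of a locally downloaded copy of the list; the file path is made up:

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import crawlercommons.domains.EffectiveTldFinder;

public class ReloadPublicSuffixList {
    public static void main(String[] args) throws IOException {
        // Hypothetical path to a freshly downloaded copy of
        // https://publicsuffix.org/list/public_suffix_list.dat
        try (InputStream in = Files.newInputStream(Paths.get("/tmp/public_suffix_list.dat"))) {
            // Replaces the list previously loaded from the classpath;
            // initialize(InputStream) returns false if parsing fails.
            boolean ok = EffectiveTldFinder.getInstance().initialize(in);
            System.out.println("public suffix list reloaded: " + ok);
        }
    }
}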
@@ -332,9 +356,15 @@ public static String getAssignedDomain(String hostname, boolean strict, boolean
try {
IDN.toASCII(domainSegment);
} catch (IllegalArgumentException e) {
// not a valid IDN segment
// not a valid IDN segment,
// includes check for max. length (63 chars)
return (strict ? null : hostname);
}
} else if (strict) {
// (strict mode) check for max. length of segment (63 chars)
if (domainSegment.length() > MAX_DOMAIN_LENGTH_PART) {
return null;
}
}
return hostname.substring(start);
}
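
Note: a sketch of the effect of the new length check, using a fabricated hostname; the two-argument getAssignedDomain(String, boolean) overload is assumed from the class's public API:

import crawlercommons.domains.EffectiveTldFinder;

public class DomainLengthDemo {
    public static void main(String[] args) {
        // Fabricated domain whose leftmost label exceeds the 63-character
        // limit of MAX_DOMAIN_LENGTH_PART (RFC 1034, section 3.1).
        StringBuilder label = new StringBuilder();
        for (int i = 0; i < 70; i++) {
            label.append('a');
        }
        String host = label + ".com";

        // strict mode: the overlong label is rejected with null
        System.out.println(EffectiveTldFinder.getAssignedDomain(host, true));
        // non-strict mode: the (invalid) domain is returned as-is
        System.out.println(EffectiveTldFinder.getAssignedDomain(host, false));
    }
}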
@@ -130,6 +130,9 @@ public String filter(String urlString) {
if (file == null || "".equals(file)) { // add a slash
file = "/";
changed = true;
} else if (!file.startsWith("/")) {
file = "/" + file;
changed = true;
}

if (url.getRef() != null) { // remove the ref
@@ -196,6 +199,8 @@ private String getFileWithNormalizedPath(URL url) throws MalformedURLException {
// if path is empty return a single slash
if (file.isEmpty()) {
file = "/";
} else if (!file.startsWith("/")) {
file = "/" + file;
}

return file;
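
Note: together the two new branches implement changelog entry #247, normalizing an empty path before a query to "/". A quick sketch; the class name crawlercommons.filters.basic.BasicURLNormalizer is inferred from the changelog's [BasicNormalizer] entry:

import crawlercommons.filters.basic.BasicURLNormalizer;

public class EmptyPathDemo {
    public static void main(String[] args) {
        BasicURLNormalizer normalizer = new BasicURLNormalizer();
        // The empty path before the query string gains a leading slash;
        // expected output: http://example.com/?q=1
        System.out.println(normalizer.filter("http://example.com?q=1"));
    }
}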
9 changes: 6 additions & 3 deletions src/main/java/crawlercommons/robots/BaseRobotRules.java
@@ -18,6 +18,7 @@

import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;

/**
@@ -37,10 +38,10 @@ public abstract class BaseRobotRules implements Serializable {

private long _crawlDelay = UNSET_CRAWL_DELAY;
private boolean _deferVisits = false;
private List<String> _sitemaps;
private LinkedHashSet<String> _sitemaps;

public BaseRobotRules() {
_sitemaps = new ArrayList<String>();
_sitemaps = new LinkedHashSet<String>();
}

public long getCrawlDelay() {
@@ -59,12 +60,14 @@ public void setDeferVisits(boolean deferVisits) {
_deferVisits = deferVisits;
}

/** Add sitemap URL to rules if not a duplicate */
public void addSitemap(String sitemap) {
_sitemaps.add(sitemap);
}

/** Get URLs of sitemap links found in robots.txt */
public List<String> getSitemaps() {
return _sitemaps;
return new ArrayList<>(_sitemaps);
}

@Override
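
Note: with the LinkedHashSet backing store, duplicate sitemap links are dropped while insertion order is preserved, and getSitemaps() now hands out a copy detached from the internal state. A short sketch using SimpleRobotRules, the concrete subclass shipped with the library:

import crawlercommons.robots.SimpleRobotRules;

public class SitemapDedupDemo {
    public static void main(String[] args) {
        SimpleRobotRules rules = new SimpleRobotRules();
        rules.addSitemap("https://example.com/sitemap.xml");
        rules.addSitemap("https://example.com/sitemap.xml"); // duplicate, ignored
        rules.addSitemap("https://example.com/news.xml");

        // Prints [https://example.com/sitemap.xml, https://example.com/news.xml];
        // mutating the returned list does not touch the rules' internal state.
        System.out.println(rules.getSitemaps());
    }
}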
57 changes: 40 additions & 17 deletions src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
@@ -170,6 +170,12 @@ private static class ParseState {
// skip all remaining user agent blocks.
private boolean _skipAgents;

/*
* Counter of warnings reporting invalid rules/lines in the robots.txt
* file. The counter is used to limit the number of logged warnings.
*/
private int _numWarnings;

private String _url;
private String _targetName;

@@ -351,7 +357,9 @@ private static RobotToken tokenize(String line) {
// greater than this, we'll skip all pages.
private static final long DEFAULT_MAX_CRAWL_DELAY = 300000;

private int _numWarnings;
// number of warnings found in the latest processed robots.txt file
private ThreadLocal<Integer> _numWarningsDuringLastParse = new ThreadLocal<>();

private int _maxWarnings;
private long _maxCrawlDelay;

@@ -398,7 +406,6 @@ public SimpleRobotRules failedFetch(int httpStatusCode) {
*/
@Override
public SimpleRobotRules parseContent(String url, byte[] content, String contentType, String robotNames) {
_numWarnings = 0;

// If there's nothing there, treat it like we have no restrictions.
if ((content == null) || (content.length == 0)) {
@@ -513,12 +520,12 @@
break;

case UNKNOWN:
reportWarning("Unknown directive in robots.txt file: " + line, url);
reportWarning(parseState, "Unknown directive in robots.txt file: {}", line);
parseState.setFinishedAgentFields(true);
break;

case MISSING:
reportWarning(String.format(Locale.ROOT, "Unknown line in robots.txt file (size %d): %s", content.length, line), url);
reportWarning(parseState, "Unknown line in robots.txt file (size {}): {}", content.length, line);
parseState.setFinishedAgentFields(true);
break;

@@ -532,6 +539,7 @@ else if ((bytesLen >= 2) && (content[0] == (byte) 0xFE) && (content[1] == (byte)
}
}

this._numWarningsDuringLastParse.set(parseState._numWarnings);
SimpleRobotRules result = parseState.getRobotRules();
if (result.getCrawlDelay() > _maxCrawlDelay) {
// Some evil sites use a value like 3600 (seconds) for the crawl
@@ -545,15 +553,21 @@ else if ((bytesLen >= 2) && (content[0] == (byte) 0xFE) && (content[1] == (byte)
}
}

private void reportWarning(String msg, String url) {
_numWarnings += 1;
private void reportWarning(ParseState state, String msg, Object... args) {
state._numWarnings += 1;

if (_numWarnings == 1) {
LOGGER.warn("Problem processing robots.txt for {}", url);
if (state._numWarnings == 1) {
LOGGER.warn("Problem processing robots.txt for {}", state._url);
}

if (_numWarnings < _maxWarnings) {
LOGGER.warn("\t {}", msg);
if (state._numWarnings < _maxWarnings) {
for (int i = 0; i < args.length; i++) {
if (args[i] instanceof String && ((String) args[i]).length() > 1024) {
// clip overlong strings to prevent overflows in log messages
args[i] = ((String) args[i]).substring(0, 1024) + " ...";
}
}
LOGGER.warn("\t " + msg, args);
}
}
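
Note: warnings are now passed as an SLF4J-style template plus arguments, which lets the parser clip oversized values before they reach the log (cf. #281). A standalone sketch of just the clipping step:

public class ClipDemo {
    public static void main(String[] args) {
        // A 4096-character argument stands in for an overlong robots.txt line.
        Object[] logArgs = { new String(new char[4096]).replace('\0', 'x') };
        for (int i = 0; i < logArgs.length; i++) {
            if (logArgs[i] instanceof String && ((String) logArgs[i]).length() > 1024) {
                // Truncate before the logger substitutes the value into "{}".
                logArgs[i] = ((String) logArgs[i]).substring(0, 1024) + " ...";
            }
        }
        System.out.println(((String) logArgs[0]).length()); // 1028
    }
}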

@@ -648,7 +662,7 @@ private void handleDisallow(ParseState state, RobotToken token) {
state.addRule(path, false);
}
} catch (Exception e) {
reportWarning("Error parsing robots rules - can't decode path: " + path, state.getUrl());
reportWarning(state, "Error parsing robots rules - can't decode path: {}", path);
}
}

Expand Down Expand Up @@ -676,7 +690,7 @@ private void handleAllow(ParseState state, RobotToken token) {
try {
path = URLDecoder.decode(path, "UTF-8");
} catch (Exception e) {
reportWarning("Error parsing robots rules - can't decode path: " + path, state.getUrl());
reportWarning(state, "Error parsing robots rules - can't decode path: {}", path);
}

if (path.length() == 0) {
@@ -720,7 +734,7 @@ private void handleCrawlDelay(ParseState state, RobotToken token) {
state.setCrawlDelay(delayValue);
}
} catch (Exception e) {
reportWarning("Error parsing robots rules - can't decode crawl delay: " + delayString, state.getUrl());
reportWarning(state, "Error parsing robots rules - can't decode crawl delay: {}", delayString);
}
}
}
@@ -754,7 +768,7 @@ private void handleSitemap(ParseState state, RobotToken token) {
state.addSitemap(sitemapUrl.toExternalForm());
}
} catch (Exception e) {
reportWarning("Invalid URL with sitemap directive: " + sitemap, state.getUrl());
reportWarning(state, "Invalid URL with sitemap directive: {}", sitemap);
}
}

Expand All @@ -773,13 +787,22 @@ private void handleHttp(ParseState state, RobotToken token) {
RobotToken fixedToken = new RobotToken(RobotDirective.SITEMAP, "http:" + token.getData());
handleSitemap(state, fixedToken);
} else {
reportWarning("Found raw non-sitemap URL: http:" + urlFragment, state.getUrl());
reportWarning(state, "Found raw non-sitemap URL: http:{}", urlFragment);
}
}

// For testing
/**
     * Get the number of warnings due to invalid rules/lines in the most
     * recently processed robots.txt file (see
     * {@link #parseContent(String, byte[], String, String)}).
     *
     * Note: an incorrect value may be returned if the robots.txt was
     * processed in a different thread than the current one.
*
* @return number of warnings
*/
public int getNumWarnings() {
return _numWarnings;
return _numWarningsDuringLastParse.get();
}

public int getMaxWarnings() {
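
Note: the warning counter now lives in the per-call ParseState and is published through a ThreadLocal (cf. #278), so concurrent parses no longer race on a shared field. A sketch of the intended usage; the URL and robots.txt content are made up:

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class WarningCounterDemo {
    public static void main(String[] args) {
        byte[] robotsTxt = ("User-agent: *\n"
                        + "Dissalow: /private/\n" // misspelled directive -> one warning
                        + "Disallow: /tmp/\n").getBytes(StandardCharsets.UTF_8);

        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        SimpleRobotRules rules = parser.parseContent("https://example.com/robots.txt",
                        robotsTxt, "text/plain", "mycrawler");

        // Must be read from the same thread that called parseContent(...):
        System.out.println(parser.getNumWarnings()); // 1
        System.out.println(rules.isAllowed("https://example.com/tmp/x")); // false
    }
}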
@@ -102,7 +102,10 @@ public void setGenres(NewsGenre[] genres) {
}

public Date getPublicationDate() {
return Date.from(publicationDate.toInstant());
if (publicationDate != null) {
return Date.from(publicationDate.toInstant());
}
return null;
}

public ZonedDateTime getPublicationDateTime() {
@@ -215,7 +215,10 @@ public void setDuration(Integer duration) {
}

public Date getExpirationDate() {
return Date.from(expirationDate.toInstant());
if (expirationDate != null) {
return Date.from(expirationDate.toInstant());
}
return null;
}

public ZonedDateTime getExpirationDateTime() {
@@ -243,7 +246,10 @@ public void setViewCount(Integer viewCount) {
}

public Date getPublicationDate() {
return Date.from(publicationDate.toInstant());
if (publicationDate != null) {
return Date.from(publicationDate.toInstant());
}
return null;
}

public ZonedDateTime getPublicationDateTime() {
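
Note: all three date accessors (news publication date, video expiration and publication dates) now share the same null guard, fixing the potential NPE from #283 when the sitemap lacks the corresponding tag. The pattern in isolation:

import java.time.ZonedDateTime;
import java.util.Date;

// Date.from(publicationDate.toInstant()) throws a NullPointerException
// when no date was parsed from the sitemap; the guarded accessor
// returns null instead.
class GuardedDateAccessor {
    private ZonedDateTime publicationDate; // may legitimately remain null

    public Date getPublicationDate() {
        return publicationDate == null ? null : Date.from(publicationDate.toInstant());
    }
}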
