Merge pull request #126 from lewismc/ISSUE-125
Upgrade to JDK 1.8
jnioche committed Sep 30, 2016
2 parents f4b76c7 + 18bbae9 commit d0c1221
Showing 13 changed files with 256 additions and 171 deletions.
6 changes: 6 additions & 0 deletions .travis.yml
@@ -1,5 +1,11 @@
 language: java
 
+jdk:
+- oraclejdk8
+
 script:
 - mvn install javadoc:aggregate
 
+notifications:
+  email:
+  - crawler-commons@googlegroups.com
6 changes: 3 additions & 3 deletions pom.xml
@@ -362,9 +362,9 @@

         <!-- General Properties -->
         <implementation.build>${scmBranch}@r${buildNumber}</implementation.build>
-        <javac.src.version>1.7</javac.src.version>
-        <javac.target.version>1.7</javac.target.version>
-        <maven.compiler.target>1.7</maven.compiler.target>
+        <javac.src.version>1.8</javac.src.version>
+        <javac.target.version>1.8</javac.target.version>
+        <maven.compiler.target>1.8</maven.compiler.target>
         <maven.build.timestamp.format>yyyy-MM-dd HH:mm:ssZ</maven.build.timestamp.format>
         <skipTests>false</skipTests>
         <assembly.finalName>${project.build.finalName}</assembly.finalName>
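With source and target raised to 1.8, the codebase can begin using Java 8 language features such as lambdas, method references, and streams. A minimal sketch of code that now compiles (illustrative only, not taken from the project):

    import java.util.Arrays;
    import java.util.List;

    public class Java8Smoke {
        public static void main(String[] args) {
            List<String> hosts = Arrays.asList("Example.COM", "example.org");
            // Streams and method references require -source/-target 1.8
            hosts.stream().map(String::toLowerCase).forEach(System.out::println);
        }
    }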
23 changes: 13 additions & 10 deletions src/main/java/crawlercommons/domains/EffectiveTldFinder.java
@@ -74,13 +74,13 @@ public static EffectiveTldFinder getInstance() {
         return instance;
     }
 
-    public boolean initialize(InputStream effective_tld_data_stream) {
-        domains = new HashMap<String, EffectiveTLD>();
+    public boolean initialize(InputStream effectiveTldDataStream) {
+        domains = new HashMap<>();
         try {
-            if (null == effective_tld_data_stream && null != this.getClass().getResource(ETLD_DATA)) {
-                effective_tld_data_stream = this.getClass().getResourceAsStream(ETLD_DATA);
+            if (null == effectiveTldDataStream && null != this.getClass().getResource(ETLD_DATA)) {
+                effectiveTldDataStream = this.getClass().getResourceAsStream(ETLD_DATA);
             }
-            BufferedReader input = new BufferedReader(new InputStreamReader(effective_tld_data_stream, Charset.defaultCharset()));
+            BufferedReader input = new BufferedReader(new InputStreamReader(effectiveTldDataStream, Charset.defaultCharset()));
             String line = null;
             while (null != (line = input.readLine())) {
                 if (line.length() == 0 || (line.length() > 1 && line.startsWith(COMMENT))) {
@@ -109,7 +109,10 @@ public static Map<String, EffectiveTLD> getEffectiveTLDs() {

     /**
      * @param hostname
-     * @return the Effective TLD
+     *            the hostname for which to find the
+     *            {@link crawlercommons.domains.EffectiveTldFinder.EffectiveTLD}
+     * @return the
+     *         {@link crawlercommons.domains.EffectiveTldFinder.EffectiveTLD}
      */
     public static EffectiveTLD getEffectiveTLD(String hostname) {
         if (getInstance().domains.containsKey(hostname)) {
@@ -145,15 +148,15 @@ public static EffectiveTLD getEffectiveTLD(String hostname) {
      * is the NIC-assigned domain name.
      *
      * @param hostname
+     *            a string for which to obtain a NIC-assigned domain name
      * @return the NIC-assigned domain name
      */
     public static String getAssignedDomain(String hostname) {
         EffectiveTLD etld = getEffectiveTLD(hostname);
         if (null == etld || etld.getDomain() == hostname.toLowerCase(Locale.getDefault())) {
             return hostname.toLowerCase(Locale.getDefault());
         }
-        String domain = hostname.replaceFirst(".*?([^.]+\\.)" + etld.getDomain() + "$", "$1" + etld.getDomain());
-        return domain;
+        return hostname.replaceFirst(".*?([^.]+\\.)" + etld.getDomain() + "$", "$1" + etld.getDomain());
     }
 
     public boolean isConfigured() {
@@ -185,10 +188,10 @@ public EffectiveTLD(String line) {
         } else {
             domain = line;
         }
-        domain = normalize_name(domain);
+        domain = normalizeName(domain);
     }
 
-    private String normalize_name(String name) {
+    private String normalizeName(String name) {
         String[] parts = name.split(DOT_REGEX);
         if (parts.length < 2) {
             return name;
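Together, getEffectiveTLD and getAssignedDomain implement public-suffix lookups: the former returns the matching suffix entry, the latter reduces a hostname to its NIC-assigned domain. A usage sketch, assuming the bundled public-suffix data file is on the classpath (hostnames are illustrative):

    import crawlercommons.domains.EffectiveTldFinder;

    public class EtldDemo {
        public static void main(String[] args) {
            // "co.uk" is an effective TLD, so one extra label is kept:
            System.out.println(EffectiveTldFinder.getAssignedDomain("news.bbc.co.uk"));  // bbc.co.uk
            System.out.println(EffectiveTldFinder.getAssignedDomain("www.example.com")); // example.com
        }
    }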
14 changes: 7 additions & 7 deletions src/main/java/crawlercommons/fetcher/BaseFetcher.java
@@ -36,9 +36,9 @@ public abstract class BaseFetcher implements Serializable {

     public static final int DEFAULT_MAX_CONTENT_SIZE = 64 * 1024;
 
-    protected Map<String, Integer> _maxContentSizes = new HashMap<String, Integer>();
+    protected Map<String, Integer> _maxContentSizes = new HashMap<>();
     protected int _defaultMaxContentSize = DEFAULT_MAX_CONTENT_SIZE;
-    protected Set<String> _validMimeTypes = new HashSet<String>();
+    protected Set<String> _validMimeTypes = new HashSet<>();
 
     public BaseFetcher() {
     }
@@ -95,12 +95,12 @@ protected static String getMimeTypeFromContentType(String contentType) {
     }
 
     /**
-     * Get the content stored in the resource referenced by <url>
+     * Get the content stored in the resource referenced by the 'url' parameter.
      *
-     * @param url
-     * @param payload
-     * @return
-     * @throws BaseFetchException
+     * @param url a string url for which to get content
+     * @param payload a populated {@link crawlercommons.fetcher.Payload}
+     * @return the {@link crawlercommons.fetcher.FetchedResult} associated with the URL
+     * @throws BaseFetchException if an error results from fetching the url.
      */
     public abstract FetchedResult get(String url, Payload payload) throws BaseFetchException;

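The abstract get method above is the contract that concrete fetchers fulfil. A hedged usage sketch; SimpleHttpFetcher is the stock HTTP implementation in crawler-commons, but the constructor and setter shown here are assumptions rather than something this diff confirms:

    // Sketch only, inside some crawling method:
    UserAgent ua = new UserAgent("mycrawler", "crawler@example.com", "http://example.com");
    BaseFetcher fetcher = new SimpleHttpFetcher(ua);   // assumed constructor
    fetcher.setDefaultMaxContentSize(128 * 1024);      // assumed setter for _defaultMaxContentSize
    FetchedResult result = fetcher.get("http://example.com/", new Payload()); // Payload no-arg ctor assumed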
36 changes: 24 additions & 12 deletions src/main/java/crawlercommons/fetcher/http/UserAgent.java
@@ -45,18 +45,21 @@ public class UserAgent implements Serializable {
     public static final String DEFAULT_BROWSER_VERSION = "Mozilla/5.0";
     public static final String DEFAULT_CRAWLER_VERSION = CrawlerCommons.getVersion();
 
-    private final String _agentName;
-    private final String _emailAddress;
-    private final String _webAddress;
-    private final String _browserVersion;
-    private final String _crawlerVersion;
+    private final String agentName;
+    private final String emailAddress;
+    private final String webAddress;
+    private final String browserVersion;
+    private final String crawlerConfiguration;
 
     /**
      * Set user agent characteristics
      *
      * @param agentName
+     *            an agent name string to associate with the crawler
      * @param emailAddress
+     *            an agent email address string to associate with the crawler
      * @param webAddress
+     *            a Web address string to associate with the crawler
      */
     public UserAgent(String agentName, String emailAddress, String webAddress) {
         this(agentName, emailAddress, webAddress, DEFAULT_BROWSER_VERSION);
@@ -66,9 +69,13 @@ public UserAgent(String agentName, String emailAddress, String webAddress) {
      * Set user agent characteristics
      *
      * @param agentName
+     *            an agent name string to associate with the crawler
      * @param emailAddress
+     *            an agent email address string to associate with the crawler
      * @param webAddress
+     *            a Web address string to associate with the crawler
      * @param browserVersion
+     *            a browser version to mimic
      */
     public UserAgent(String agentName, String emailAddress, String webAddress, String browserVersion) {
         this(agentName, emailAddress, webAddress, browserVersion, DEFAULT_CRAWLER_VERSION);
@@ -78,17 +85,22 @@ public UserAgent(String agentName, String emailAddress, String webAddress, Strin
      * Set user agent characteristics
      *
      * @param agentName
+     *            an agent name string to associate with the crawler
      * @param emailAddress
+     *            an agent email address string to associate with the crawler
      * @param webAddress
+     *            a Web address string to associate with the crawler
      * @param browserVersion
+     *            a browser version to mimic
      * @param crawlerVersion
+     *            the version of your crawler/crawl agent
      */
     public UserAgent(String agentName, String emailAddress, String webAddress, String browserVersion, String crawlerVersion) {
-        _agentName = agentName;
-        _emailAddress = emailAddress;
-        _webAddress = webAddress;
-        _browserVersion = browserVersion;
-        _crawlerVersion = (crawlerVersion == null ? "" : "/" + crawlerVersion);
+        this.agentName = agentName;
+        this.emailAddress = emailAddress;
+        this.webAddress = webAddress;
+        this.browserVersion = browserVersion;
+        this.crawlerConfiguration = crawlerVersion == null ? "" : "/" + crawlerVersion;
     }
 
     /**
@@ -97,7 +109,7 @@ public UserAgent(String agentName, String emailAddress, String webAddress, Strin
      * @return User Agent name (String)
      */
     public String getAgentName() {
-        return _agentName;
+        return agentName;
     }
 
     /**
@@ -108,6 +120,6 @@ public String getAgentName() {
     public String getUserAgentString() {
         // Mozilla/5.0 (compatible; mycrawler/1.0; +http://www.mydomain.com;
         // mycrawler@mydomain.com)
-        return String.format(Locale.getDefault(), "%s (compatible; %s%s; +%s; %s)", _browserVersion, getAgentName(), _crawlerVersion, _webAddress, _emailAddress);
+        return String.format(Locale.getDefault(), "%s (compatible; %s%s; +%s; %s)", browserVersion, getAgentName(), crawlerConfiguration, webAddress, emailAddress);
     }
 }
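The field renames are cosmetic; the produced string is unchanged. A quick sketch matching the inline comment above (inside a main method):

    UserAgent ua = new UserAgent("mycrawler", "mycrawler@mydomain.com", "http://www.mydomain.com");
    // Prints something like:
    // Mozilla/5.0 (compatible; mycrawler/<crawler-commons version>; +http://www.mydomain.com; mycrawler@mydomain.com)
    System.out.println(ua.getUserAgentString());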
4 changes: 4 additions & 0 deletions src/main/java/crawlercommons/filters/URLFilter.java
@@ -21,6 +21,10 @@ public abstract class URLFilter {
     /**
      * Returns a modified version of the input URL or null if the URL should be
      * removed
+     *
+     * @param urlString
+     *            a URL string to check against filter(s)
+     * @return a filtered URL
      **/
     public abstract String filter(String urlString);

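For illustration, a hypothetical URLFilter subclass honoring the contract documented above — return a possibly modified URL, or null to drop it (not part of this commit):

    public class HttpOnlyFilter extends URLFilter {
        @Override
        public String filter(String urlString) {
            // Drop anything that is not http(s)
            if (urlString == null || !(urlString.startsWith("http://") || urlString.startsWith("https://"))) {
                return null;
            }
            // Strip the fragment, which never reaches the server
            int hash = urlString.indexOf('#');
            return hash == -1 ? urlString : urlString.substring(0, hash);
        }
    }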
10 changes: 5 additions & 5 deletions src/main/java/crawlercommons/robots/BaseRobotsParser.java
@@ -22,11 +22,11 @@
 public abstract class BaseRobotsParser implements Serializable {
 
     /**
-     * Parse the robots.txt file in <content>, and return rules appropriate for
-     * processing paths by <userAgent>. Note that multiple agent names may be
-     * provided as comma-separated values; the order of these shouldn't matter,
-     * as the file is parsed in order, and each agent name found in the file
-     * will be compared to every agent name found in robotNames.
+     * Parse the robots.txt file in <i>content</i>, and return rules appropriate
+     * for processing paths by <i>userAgent</i>. Note that multiple agent names
+     * may be provided as comma-separated values; the order of these shouldn't
+     * matter, as the file is parsed in order, and each agent name found in the
+     * file will be compared to every agent name found in robotNames.
      *
      * Also note that names are lower-cased before comparison, and that any
      * robot name you pass shouldn't contain commas or spaces; if the name has
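A hedged sketch of the parse call this javadoc describes, using SimpleRobotRulesParser, the concrete implementation in crawler-commons; the parseContent argument order is inferred from the documentation above:

    import java.nio.charset.StandardCharsets;
    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class RobotsDemo {
        public static void main(String[] args) {
            byte[] robotsTxt = "User-agent: *\nDisallow: /private/".getBytes(StandardCharsets.UTF_8);
            SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
            // Assumed order: url, content, contentType, comma-separated robot names
            BaseRobotRules rules = parser.parseContent("http://www.example.com/robots.txt",
                            robotsTxt, "text/plain", "mycrawler");
            System.out.println(rules.isAllowed("http://www.example.com/private/x.html")); // false
        }
    }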
3 changes: 3 additions & 0 deletions src/main/java/crawlercommons/robots/RobotUtils.java
@@ -85,6 +85,9 @@ public static long getMaxFetchTime() {
      *
      * @param fetcher
      *            Fetcher for downloading robots.txt file
+     * @param parser
+     *            a {@link crawlercommons.robots.BaseRobotsParser} to use for
+     *            obtaining appropriate rules
      * @param robotsUrl
      *            URL to robots.txt file
      * @return Robot rules
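The documented parameters map onto a call like the following fragment (types inferred from the javadoc; fetcher and parser construction as in the earlier sketches):

    URL robotsUrl = new URL("http://www.example.com/robots.txt");
    BaseRobotRules rules = RobotUtils.getRobotRules(fetcher, parser, robotsUrl);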
8 changes: 4 additions & 4 deletions src/main/java/crawlercommons/robots/SimpleRobotRules.java
@@ -65,7 +65,7 @@ public int compareTo(RobotRule o) {

     /*
      * (non-Javadoc)
-     * 
+     *
      * @see java.lang.Object#hashCode()
      */
     @Override
@@ -79,7 +79,7 @@ public int hashCode() {

     /*
      * (non-Javadoc)
-     * 
+     *
      * @see java.lang.Object#equals(java.lang.Object)
      */
     @Override
@@ -275,7 +275,7 @@ public void sortRules() {

     /**
      * Is our ruleset set up to allow all access?
-     * 
+     *
      * @return true if all URLs are allowed.
      */
     @Override
@@ -285,7 +285,7 @@ public boolean isAllowAll() {

     /**
      * Is our ruleset set up to disallow all access?
-     * 
+     *
      * @return true if no URLs are allowed.
      */
     @Override
