Merge pull request #126 from lewismc/ISSUE-125
Upgrade to JDK 1.8
jnioche committed Sep 30, 2016
2 parents f4b76c7 + 18bbae9 commit d0c1221
Showing 13 changed files with 256 additions and 171 deletions.
6 changes: 6 additions & 0 deletions .travis.yml
@@ -1,5 +1,11 @@
 language: java
 
+jdk:
+- oraclejdk8
+
 script:
 - mvn install javadoc:aggregate
 
+notifications:
+  email:
+  - crawler-commons@googlegroups.com
6 changes: 3 additions & 3 deletions pom.xml
@@ -362,9 +362,9 @@

         <!-- General Properties -->
         <implementation.build>${scmBranch}@r${buildNumber}</implementation.build>
-        <javac.src.version>1.7</javac.src.version>
-        <javac.target.version>1.7</javac.target.version>
-        <maven.compiler.target>1.7</maven.compiler.target>
+        <javac.src.version>1.8</javac.src.version>
+        <javac.target.version>1.8</javac.target.version>
+        <maven.compiler.target>1.8</maven.compiler.target>
         <maven.build.timestamp.format>yyyy-MM-dd HH:mm:ssZ</maven.build.timestamp.format>
         <skipTests>false</skipTests>
         <assembly.finalName>${project.build.finalName}</assembly.finalName>
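With source and target raised to 1.8, the codebase can begin using Java 8 language features such as lambdas, method references, and streams. A minimal sketch of code that now compiles (illustrative only, not taken from the project):

    import java.util.Arrays;
    import java.util.List;

    public class Java8Smoke {
        public static void main(String[] args) {
            List<String> hosts = Arrays.asList("Example.COM", "example.org");
            // Streams and method references require -source/-target 1.8
            hosts.stream().map(String::toLowerCase).forEach(System.out::println);
        }
    }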
23 changes: 13 additions & 10 deletions src/main/java/crawlercommons/domains/EffectiveTldFinder.java
@@ -74,13 +74,13 @@ public static EffectiveTldFinder getInstance() {
         return instance;
     }
 
-    public boolean initialize(InputStream effective_tld_data_stream) {
-        domains = new HashMap<String, EffectiveTLD>();
+    public boolean initialize(InputStream effectiveTldDataStream) {
+        domains = new HashMap<>();
         try {
-            if (null == effective_tld_data_stream && null != this.getClass().getResource(ETLD_DATA)) {
-                effective_tld_data_stream = this.getClass().getResourceAsStream(ETLD_DATA);
+            if (null == effectiveTldDataStream && null != this.getClass().getResource(ETLD_DATA)) {
+                effectiveTldDataStream = this.getClass().getResourceAsStream(ETLD_DATA);
             }
-            BufferedReader input = new BufferedReader(new InputStreamReader(effective_tld_data_stream, Charset.defaultCharset()));
+            BufferedReader input = new BufferedReader(new InputStreamReader(effectiveTldDataStream, Charset.defaultCharset()));
             String line = null;
             while (null != (line = input.readLine())) {
                 if (line.length() == 0 || (line.length() > 1 && line.startsWith(COMMENT))) {
@@ -109,7 +109,10 @@ public static Map<String, EffectiveTLD> getEffectiveTLDs() {

     /**
      * @param hostname
-     * @return the Effective TLD
+     *            the hostname for which to find the
+     *            {@link crawlercommons.domains.EffectiveTldFinder.EffectiveTLD}
+     * @return the
+     *         {@link crawlercommons.domains.EffectiveTldFinder.EffectiveTLD}
      */
     public static EffectiveTLD getEffectiveTLD(String hostname) {
         if (getInstance().domains.containsKey(hostname)) {
@@ -145,15 +148,15 @@ public static EffectiveTLD getEffectiveTLD(String hostname) {
      * is the NIC-assigned domain name.
      *
      * @param hostname
+     *            a string for which to obtain a NIC-assigned domain name
      * @return the NIC-assigned domain name
      */
     public static String getAssignedDomain(String hostname) {
         EffectiveTLD etld = getEffectiveTLD(hostname);
         if (null == etld || etld.getDomain() == hostname.toLowerCase(Locale.getDefault())) {
             return hostname.toLowerCase(Locale.getDefault());
         }
-        String domain = hostname.replaceFirst(".*?([^.]+\\.)" + etld.getDomain() + "$", "$1" + etld.getDomain());
-        return domain;
+        return hostname.replaceFirst(".*?([^.]+\\.)" + etld.getDomain() + "$", "$1" + etld.getDomain());
     }
 
     public boolean isConfigured() {
@@ -185,10 +188,10 @@ public EffectiveTLD(String line) {
         } else {
             domain = line;
         }
-        domain = normalize_name(domain);
+        domain = normalizeName(domain);
     }
 
-    private String normalize_name(String name) {
+    private String normalizeName(String name) {
         String[] parts = name.split(DOT_REGEX);
         if (parts.length < 2) {
             return name;
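Together, getEffectiveTLD and getAssignedDomain implement public-suffix lookups: the former returns the matching suffix entry, the latter reduces a hostname to its NIC-assigned domain. A usage sketch, assuming the bundled public-suffix data file is on the classpath (hostnames are illustrative):

    import crawlercommons.domains.EffectiveTldFinder;

    public class EtldDemo {
        public static void main(String[] args) {
            // "co.uk" is an effective TLD, so one extra label is kept:
            System.out.println(EffectiveTldFinder.getAssignedDomain("news.bbc.co.uk"));  // bbc.co.uk
            System.out.println(EffectiveTldFinder.getAssignedDomain("www.example.com")); // example.com
        }
    }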
14 changes: 7 additions & 7 deletions src/main/java/crawlercommons/fetcher/BaseFetcher.java
@@ -36,9 +36,9 @@ public abstract class BaseFetcher implements Serializable {

     public static final int DEFAULT_MAX_CONTENT_SIZE = 64 * 1024;
 
-    protected Map<String, Integer> _maxContentSizes = new HashMap<String, Integer>();
+    protected Map<String, Integer> _maxContentSizes = new HashMap<>();
     protected int _defaultMaxContentSize = DEFAULT_MAX_CONTENT_SIZE;
-    protected Set<String> _validMimeTypes = new HashSet<String>();
+    protected Set<String> _validMimeTypes = new HashSet<>();
 
     public BaseFetcher() {
     }
@@ -95,12 +95,12 @@ protected static String getMimeTypeFromContentType(String contentType) {
     }
 
     /**
-     * Get the content stored in the resource referenced by <url>
+     * Get the content stored in the resource referenced by the 'url' parameter.
      *
-     * @param url
-     * @param payload
-     * @return
-     * @throws BaseFetchException
+     * @param url a string url for which to get content
+     * @param payload a populated {@link crawlercommons.fetcher.Payload}
+     * @return the {@link crawlercommons.fetcher.FetchedResult} associated with the URL
+     * @throws BaseFetchException if an error results from fetching the url.
      */
     public abstract FetchedResult get(String url, Payload payload) throws BaseFetchException;

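The abstract get method above is the contract that concrete fetchers fulfil. A hedged usage sketch; SimpleHttpFetcher is the stock HTTP implementation in crawler-commons, but the constructor and setter shown here are assumptions rather than something this diff confirms:

    // Sketch only, inside some crawling method:
    UserAgent ua = new UserAgent("mycrawler", "crawler@example.com", "http://example.com");
    BaseFetcher fetcher = new SimpleHttpFetcher(ua);   // assumed constructor
    fetcher.setDefaultMaxContentSize(128 * 1024);      // assumed setter for _defaultMaxContentSize
    FetchedResult result = fetcher.get("http://example.com/", new Payload()); // Payload no-arg ctor assumed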
36 changes: 24 additions & 12 deletions src/main/java/crawlercommons/fetcher/http/UserAgent.java
@@ -45,18 +45,21 @@ public class UserAgent implements Serializable {
     public static final String DEFAULT_BROWSER_VERSION = "Mozilla/5.0";
     public static final String DEFAULT_CRAWLER_VERSION = CrawlerCommons.getVersion();
 
-    private final String _agentName;
-    private final String _emailAddress;
-    private final String _webAddress;
-    private final String _browserVersion;
-    private final String _crawlerVersion;
+    private final String agentName;
+    private final String emailAddress;
+    private final String webAddress;
+    private final String browserVersion;
+    private final String crawlerConfiguration;
 
     /**
      * Set user agent characteristics
      *
      * @param agentName
+     *            an agent name string to associate with the crawler
      * @param emailAddress
+     *            an agent email address string to associate with the crawler
      * @param webAddress
+     *            a Web address string to associate with the crawler
      */
     public UserAgent(String agentName, String emailAddress, String webAddress) {
         this(agentName, emailAddress, webAddress, DEFAULT_BROWSER_VERSION);
@@ -66,9 +69,13 @@ public UserAgent(String agentName, String emailAddress, String webAddress) {
      * Set user agent characteristics
      *
      * @param agentName
+     *            an agent name string to associate with the crawler
      * @param emailAddress
+     *            an agent email address string to associate with the crawler
      * @param webAddress
+     *            a Web address string to associate with the crawler
      * @param browserVersion
+     *            a browser version to mimic
      */
     public UserAgent(String agentName, String emailAddress, String webAddress, String browserVersion) {
         this(agentName, emailAddress, webAddress, browserVersion, DEFAULT_CRAWLER_VERSION);
@@ -78,17 +85,22 @@ public UserAgent(String agentName, String emailAddress, String webAddress, Strin
      * Set user agent characteristics
      *
      * @param agentName
+     *            an agent name string to associate with the crawler
      * @param emailAddress
+     *            an agent email address string to associate with the crawler
      * @param webAddress
+     *            a Web address string to associate with the crawler
      * @param browserVersion
+     *            a browser version to mimic
      * @param crawlerVersion
+     *            the version of your crawler/crawl agent
      */
     public UserAgent(String agentName, String emailAddress, String webAddress, String browserVersion, String crawlerVersion) {
-        _agentName = agentName;
-        _emailAddress = emailAddress;
-        _webAddress = webAddress;
-        _browserVersion = browserVersion;
-        _crawlerVersion = (crawlerVersion == null ? "" : "/" + crawlerVersion);
+        this.agentName = agentName;
+        this.emailAddress = emailAddress;
+        this.webAddress = webAddress;
+        this.browserVersion = browserVersion;
+        this.crawlerConfiguration = crawlerVersion == null ? "" : "/" + crawlerVersion;
     }
 
     /**
@@ -97,7 +109,7 @@ public UserAgent(String agentName, String emailAddress, String webAddress, Strin
      * @return User Agent name (String)
      */
     public String getAgentName() {
-        return _agentName;
+        return agentName;
     }
 
     /**
@@ -108,6 +120,6 @@ public String getAgentName() {
     public String getUserAgentString() {
         // Mozilla/5.0 (compatible; mycrawler/1.0; +http://www.mydomain.com;
         // mycrawler@mydomain.com)
-        return String.format(Locale.getDefault(), "%s (compatible; %s%s; +%s; %s)", _browserVersion, getAgentName(), _crawlerVersion, _webAddress, _emailAddress);
+        return String.format(Locale.getDefault(), "%s (compatible; %s%s; +%s; %s)", browserVersion, getAgentName(), crawlerConfiguration, webAddress, emailAddress);
     }
 }
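The field renames are cosmetic; the produced string is unchanged. A quick sketch matching the inline comment above (inside a main method):

    UserAgent ua = new UserAgent("mycrawler", "mycrawler@mydomain.com", "http://www.mydomain.com");
    // Prints something like:
    // Mozilla/5.0 (compatible; mycrawler/<crawler-commons version>; +http://www.mydomain.com; mycrawler@mydomain.com)
    System.out.println(ua.getUserAgentString());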
4 changes: 4 additions & 0 deletions src/main/java/crawlercommons/filters/URLFilter.java
@@ -21,6 +21,10 @@ public abstract class URLFilter {
     /**
      * Returns a modified version of the input URL or null if the URL should be
      * removed
+     *
+     * @param urlString
+     *            a URL string to check against filter(s)
+     * @return a filtered URL
      **/
     public abstract String filter(String urlString);

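For illustration, a hypothetical URLFilter subclass honoring the contract documented above — return a possibly modified URL, or null to drop it (not part of this commit):

    public class HttpOnlyFilter extends URLFilter {
        @Override
        public String filter(String urlString) {
            // Drop anything that is not http(s)
            if (urlString == null || !(urlString.startsWith("http://") || urlString.startsWith("https://"))) {
                return null;
            }
            // Strip the fragment, which never reaches the server
            int hash = urlString.indexOf('#');
            return hash == -1 ? urlString : urlString.substring(0, hash);
        }
    }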
10 changes: 5 additions & 5 deletions src/main/java/crawlercommons/robots/BaseRobotsParser.java
@@ -22,11 +22,11 @@
 public abstract class BaseRobotsParser implements Serializable {
 
     /**
-     * Parse the robots.txt file in <content>, and return rules appropriate for
-     * processing paths by <userAgent>. Note that multiple agent names may be
-     * provided as comma-separated values; the order of these shouldn't matter,
-     * as the file is parsed in order, and each agent name found in the file
-     * will be compared to every agent name found in robotNames.
+     * Parse the robots.txt file in <i>content</i>, and return rules appropriate
+     * for processing paths by <i>userAgent</i>. Note that multiple agent names
+     * may be provided as comma-separated values; the order of these shouldn't
+     * matter, as the file is parsed in order, and each agent name found in the
+     * file will be compared to every agent name found in robotNames.
      *
      * Also note that names are lower-cased before comparison, and that any
      * robot name you pass shouldn't contain commas or spaces; if the name has
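A hedged sketch of the parse call this javadoc describes, using SimpleRobotRulesParser, the concrete implementation in crawler-commons; the parseContent argument order is inferred from the documentation above:

    import java.nio.charset.StandardCharsets;
    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class RobotsDemo {
        public static void main(String[] args) {
            byte[] robotsTxt = "User-agent: *\nDisallow: /private/".getBytes(StandardCharsets.UTF_8);
            SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
            // Assumed order: url, content, contentType, comma-separated robot names
            BaseRobotRules rules = parser.parseContent("http://www.example.com/robots.txt",
                            robotsTxt, "text/plain", "mycrawler");
            System.out.println(rules.isAllowed("http://www.example.com/private/x.html")); // false
        }
    }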
3 changes: 3 additions & 0 deletions src/main/java/crawlercommons/robots/RobotUtils.java
@@ -85,6 +85,9 @@ public static long getMaxFetchTime() {
      *
      * @param fetcher
      *            Fetcher for downloading robots.txt file
+     * @param parser
+     *            a {@link crawlercommons.robots.BaseRobotsParser} to use for
+     *            obtaining appropriate rules
      * @param robotsUrl
      *            URL to robots.txt file
      * @return Robot rules
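The documented parameters map onto a call like the following fragment (types inferred from the javadoc; fetcher and parser construction as in the earlier sketches):

    URL robotsUrl = new URL("http://www.example.com/robots.txt");
    BaseRobotRules rules = RobotUtils.getRobotRules(fetcher, parser, robotsUrl);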
8 changes: 4 additions & 4 deletions src/main/java/crawlercommons/robots/SimpleRobotRules.java
@@ -65,7 +65,7 @@ public int compareTo(RobotRule o) {

     /*
      * (non-Javadoc)
-     * 
+     *
      * @see java.lang.Object#hashCode()
      */
     @Override
@@ -79,7 +79,7 @@ public int hashCode() {

     /*
      * (non-Javadoc)
-     * 
+     *
      * @see java.lang.Object#equals(java.lang.Object)
      */
     @Override
@@ -275,7 +275,7 @@ public void sortRules() {

     /**
      * Is our ruleset set up to allow all access?
-     * 
+     *
      * @return true if all URLs are allowed.
      */
     @Override
@@ -285,7 +285,7 @@ public boolean isAllowAll() {

     /**
      * Is our ruleset set up to disallow all access?
-     * 
+     *
      * @return true if no URLs are allowed.
      */
     @Override
