Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow to include/exclude private domains / effective TLDs without re-initialization, fixes #185 #186

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 64 additions & 50 deletions src/main/java/crawlercommons/domains/EffectiveTldFinder.java
Original file line number Diff line number Diff line change
Expand Up @@ -72,24 +72,24 @@
* The <a href="https://publicsuffix.org/list/">public suffix list (see section
* &quot;divisions&quot;)</a> is subdivided into &quot;ICANN&quot; and
* &quot;PRIVATE&quot; domains. To restrict the EffectiveTldFinder to
* &quot;ICANN&quot; domains only, (re)initialize it by
* {@link EffectiveTldFinder#getInstance() EffectiveTldFinder.getInstance()}
* {@link EffectiveTldFinder#initialize(boolean) .initialize(true)} or
* {@link EffectiveTldFinder#initialize(InputStream,boolean)
* .initialize(InputStream, true)}. This will exclude the PRIVATE domain section
* from the public suffix list.
* &quot;ICANN&quot; domains only, pass &quot;true&quot; as flag
* <code>excludePrivate</code> to
* {@link EffectiveTldFinder#getAssignedDomain(String, boolean, boolean)} or
* {@link EffectiveTldFinder#getEffectiveTLD(String, boolean)}. This excludes
* the eTLDs in the PRIVATE domain section of the public suffix list when a
* domain or eTLD is matched.
*
*/
public class EffectiveTldFinder {
private static final Logger LOGGER = LoggerFactory.getLogger(EffectiveTldFinder.class);

public static final String ETLD_DATA = "/effective_tld_names.dat";
public static final String COMMENT = "//";
public static final String DOT_REGEX = "\\.";
public static final String EXCEPTION = "!";
public static final String WILD_CARD = "*.";
public static final char DOT = '.';

private static EffectiveTldFinder instance = null;
private Map<String, EffectiveTLD> domains = null;
private boolean configured = false;
Expand All @@ -113,18 +113,6 @@ public static EffectiveTldFinder getInstance() {
return instance;
}

/**
* (Re)initialize EffectiveTldFinder with built-in public suffix list.
*
* @param excludePrivateDomains
* whether to exclude the public suffixes listed in the PRIVATE
* domain section (opposed to &quot;ICANN&quot; domains)
* @return true if (re)initialization was successful
*/
public boolean initialize(boolean excludePrivateDomains) {
return initialize(this.getClass().getResourceAsStream(ETLD_DATA), excludePrivateDomains);
}

/**
* (Re)initialize EffectiveTldFinder with custom public suffix list.
*
Expand All @@ -133,20 +121,6 @@ public boolean initialize(boolean excludePrivateDomains) {
* @return true if (re)initialization was successful
*/
public boolean initialize(InputStream effectiveTldDataStream) {
return initialize(effectiveTldDataStream, false);
}

/**
* (Re)initialize EffectiveTldFinder with custom public suffix list.
*
* @param effectiveTldDataStream
* content of public suffix list as input stream
* @param excludePrivateDomains
* whether to exclude the public suffixes listed in the PRIVATE
* domain section (opposed to &quot;ICANN&quot; domains)
* @return true if (re)initialization was successful
*/
public boolean initialize(InputStream effectiveTldDataStream, boolean excludePrivateDomains) {
domains = new HashMap<>();
boolean inPrivateDomainSection = false;
try {
Expand All @@ -156,18 +130,14 @@ public boolean initialize(InputStream effectiveTldDataStream, boolean excludePri
if (line.trim().isEmpty()) {
continue;
} else if (line.startsWith(COMMENT)) {
if (excludePrivateDomains) {
if (line.contains("===BEGIN PRIVATE DOMAINS===")) {
inPrivateDomainSection = true;
} else if (line.contains("===END PRIVATE DOMAINS===")) {
inPrivateDomainSection = false;
}
if (line.contains("===BEGIN PRIVATE DOMAINS===")) {
inPrivateDomainSection = true;
} else if (line.contains("===END PRIVATE DOMAINS===")) {
inPrivateDomainSection = false;
}
continue;
} else if (excludePrivateDomains && inPrivateDomainSection) {
continue;
} else {
EffectiveTLD entry = new EffectiveTLD(line);
EffectiveTLD entry = new EffectiveTLD(line, inPrivateDomainSection);
domains.put(entry.getDomain(), entry);
}
}
Expand Down Expand Up @@ -197,8 +167,26 @@ public static Map<String, EffectiveTLD> getEffectiveTLDs() {
* @return the {@link EffectiveTLD}
*/
public static EffectiveTLD getEffectiveTLD(String hostname) {
return getEffectiveTLD(hostname, false);
}

/**
* Get EffectiveTLD for host name using the singleton instance of
* EffectiveTldFinder.
*
* @param hostname
* the hostname for which to find the {@link EffectiveTLD}
* @param excludePrivate
* do not return an effective TLD from the PRIVATE section;
* instead, return the shorter eTLD that is not in the PRIVATE section
* @return the {@link EffectiveTLD}
*/
public static EffectiveTLD getEffectiveTLD(String hostname, boolean excludePrivate) {
if (getInstance().domains.containsKey(hostname)) {
return getInstance().domains.get(hostname);
EffectiveTLD foundTld = getInstance().domains.get(hostname);
if (!(excludePrivate && foundTld.isPrivate)) {
return foundTld;
}
}
String[] parts = hostname.split(DOT_REGEX);
for (int i = 1; i < parts.length; i++) {
Expand All @@ -209,13 +197,16 @@ public static EffectiveTLD getEffectiveTLD(String hostname) {
}
if (getInstance().domains.containsKey(tryTld)) {
EffectiveTLD foundTld = getInstance().domains.get(tryTld);
if (excludePrivate && foundTld.isPrivate) {
continue;
}
if (foundTld.isException() || !foundTld.isWild()) {
return foundTld;
}
// wildcards create an open ETLD namespace
slice = Arrays.copyOfRange(parts, i - 1, parts.length);
String retryTld = join(slice);
foundTld = new EffectiveTLD(retryTld);
foundTld = new EffectiveTLD(retryTld, foundTld.isPrivate);
return foundTld;
}
}
Expand All @@ -232,7 +223,7 @@ public static EffectiveTLD getEffectiveTLD(String hostname) {
* FQDN with valid TLD is found
*/
public static String getAssignedDomain(String hostname) {
return getAssignedDomain(hostname, false);
return getAssignedDomain(hostname, false, false);
}

/**
Expand All @@ -248,8 +239,28 @@ public static String getAssignedDomain(String hostname) {
* valid TLD is found
*/
public static String getAssignedDomain(String hostname, boolean strict) {
return getAssignedDomain(hostname, strict, false);
}

/**
* This method uses the effective TLD to determine which component of a FQDN
* is the NIC-assigned domain name.
*
* @param hostname
* a string for which to obtain a NIC-assigned domain name
* @param strict
* do not return the hostname as fall-back if a FQDN with valid
* TLD cannot be determined
* @param excludePrivate
* do not return a domain which is below an eTLD from the PRIVATE
* section; instead, return the shorter domain which is below the
* &quot;ICANN&quot; registry suffix
* @return the NIC-assigned domain name, null if strict and no FQDN with
* valid TLD is found
*/
public static String getAssignedDomain(String hostname, boolean strict, boolean excludePrivate) {
hostname = hostname.toLowerCase(Locale.ROOT);
EffectiveTLD etld = getEffectiveTLD(hostname);
EffectiveTLD etld = getEffectiveTLD(hostname, excludePrivate);
if (null == etld) {
return (strict ? null : hostname);
}
Expand Down Expand Up @@ -305,9 +316,10 @@ public static class EffectiveTLD {

private boolean exception = false;
private boolean wild = false;
private boolean isPrivate = false;
private String domain = null;

public EffectiveTLD(String line) {
public EffectiveTLD(String line, boolean isPrivateDomain) {
if (line.startsWith(EXCEPTION)) {
exception = true;
domain = line.substring(EXCEPTION.length(), line.length());
Expand All @@ -317,8 +329,9 @@ public EffectiveTLD(String line) {
} else {
domain = line;
}

domain = normalizeName(domain);
isPrivate = isPrivateDomain;
}

private String normalizeName(String name) {
Expand Down Expand Up @@ -364,7 +377,8 @@ public String toString() {
StringBuffer sb = new StringBuffer("[");
sb.append("domain=").append(domain).append(",");
sb.append("wild=").append(wild).append(",");
sb.append("exception=").append(exception).append("]");
sb.append("exception=").append(exception).append(",");
sb.append("private=").append(isPrivate).append("]");
return sb.toString();
}
}
Expand Down
8 changes: 4 additions & 4 deletions src/main/java/crawlercommons/domains/PaidLevelDomain.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,12 @@ public static String getPLD(String hostname) {
}

// Now use support in EffectiveTldFinder
String result = EffectiveTldFinder.getAssignedDomain(hostname, true);
String result = EffectiveTldFinder.getAssignedDomain(hostname, true, true);
if (result == null) {
LOGGER.debug("Hostname {} isn't a valid FQDN", hostname);
return hostname;
LOGGER.debug("Hostname {} isn't a valid FQDN", hostname);
return hostname;
} else {
return result;
return result;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,4 +190,13 @@ public final void testStrictDomain() throws Exception {
assertNull(ad);
}

@Test
public final void testPrivateDomain() throws Exception {
String ad = null;
ad = EffectiveTldFinder.getAssignedDomain("myblog.blogspot.com", true, false);
assertEquals("myblog.blogspot.com", ad);
ad = EffectiveTldFinder.getAssignedDomain("myblog.blogspot.com", true, true);
assertEquals("blogspot.com", ad);
}

}
28 changes: 15 additions & 13 deletions src/test/java/crawlercommons/domains/PaidLevelDomainTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ public final void testIPv4() throws MalformedURLException {

@Test
public void testInvalidFQDN() {
assertEquals("blah", PaidLevelDomain.getPLD("blah"));
assertEquals("1.2.3", PaidLevelDomain.getPLD("1.2.3"));
assertEquals("me.i", PaidLevelDomain.getPLD("me.i"));
assertEquals("blah", PaidLevelDomain.getPLD("blah"));
assertEquals("1.2.3", PaidLevelDomain.getPLD("1.2.3"));
assertEquals("me.i", PaidLevelDomain.getPLD("me.i"));
}

@Test
public final void testIPv6() throws MalformedURLException, UnknownHostException {
InetAddress inet = InetAddress.getByName("1080:0:0:0:8:800:200c:417a");
Expand Down Expand Up @@ -74,11 +74,12 @@ public final void testJapaneseDomains() {
assertEquals("xxx.ne.jp", PaidLevelDomain.getPLD("www.xxx.ne.jp"));
}

// In Germany you can have xxx.de.com
// de.com (and com.de) are domains registered by CentralNic,
// xxx.de.com and xxx.com.de are private domains
@Test
public final void testGermanDomains() {
assertEquals("xxx.de.com", PaidLevelDomain.getPLD("xxx.de.com"));
assertEquals("xxx.de.com", PaidLevelDomain.getPLD("www.xxx.de.com"));
assertEquals("de.com", PaidLevelDomain.getPLD("xxx.de.com"));
assertEquals("de.com", PaidLevelDomain.getPLD("www.xxx.de.com"));
}

// Typical international domains look like xxx.it. So xxx.com.it is
Expand All @@ -89,18 +90,19 @@ public final void testItalianDomains() {
assertEquals("xxx.it", PaidLevelDomain.getPLD("www.xxx.it"));
assertEquals("com.it", PaidLevelDomain.getPLD("xxx.com.it"));
}

@Test
public final void testFinnishDomains() {
assertEquals("fi.com", PaidLevelDomain.getPLD("www.fi.com"));
}

// TODO enable this test when getPLD uses new TLD support to exclude
// private domains (See https://github.com/crawler-commons/crawler-commons/pull/186)
@Ignore

@Test
public final void testPrivateDomains() {
assertEquals("blogspot.com", PaidLevelDomain.getPLD("myblog.blogspot.com"));
/*
* do not match "private" domains (based on public suffixes from the
* private section of the public suffix list)
*/
assertEquals("blogspot.com", PaidLevelDomain.getPLD("myblog.blogspot.com"));
}

}