Skip to content

Commit

Permalink
Merge pull request #324 from aecio/issue-321-builder
Browse files Browse the repository at this point in the history
Add a builder API for configuring the BasicURLNormalizer
  • Loading branch information
sebastian-nagel committed Oct 5, 2021
2 parents 47ee966 + 4b45097 commit ec1f2e5
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 19 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
Crawler-Commons Change Log

Current Development 1.2-SNAPSHOT (yyyy-mm-dd)
- [URLs] Provide a builder class to configure the URL normalizer (aecio) #321, #324
- [URLs] Make normalization of IDNs configurable (to ASCII or Unicode) via builder (aecio, sebastian-nagel) #324
- [Sitemaps] Fix XXE vulnerability in Sitemap parser (kovyrin) #323
- [URLs] Sorting the Query Parameters (aecio) #246, #309
- [URLs] Allow optional removal of common irrelevant query parameters (aecio) #309
Expand Down
84 changes: 71 additions & 13 deletions src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,7 @@
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand Down Expand Up @@ -145,14 +139,17 @@ private static boolean isAscii(String str) {
return true;
}

final Set<String> queryElementsToRemove;
private final Set<String> queryParamsToRemove;
private final IdnNormalization idnNormalization;


public BasicURLNormalizer() {
this(new TreeSet<>());
this(new Builder());
}

public BasicURLNormalizer(Set<String> queryElementsToRemove) {
this.queryElementsToRemove = new TreeSet<>(queryElementsToRemove);
public BasicURLNormalizer(Builder builder) {
this.queryParamsToRemove = builder.queryParamsToRemove;
this.idnNormalization = builder.idnNormalization;
}

@Override
Expand Down Expand Up @@ -310,7 +307,7 @@ private String normalizeUrlFile(String file) {
file = escapePath(unescapePath(file));

List<NameValuePair> pairs =
parseQueryParameters(file, queryStartIdx, queryElementsToRemove);
parseQueryParameters(file, queryStartIdx, queryParamsToRemove);

StringBuilder normalizedFile = new StringBuilder();
String path = file.substring(0, endPathIdx);
Expand Down Expand Up @@ -612,7 +609,7 @@ private String normalizeHostName(String host) throws IllegalArgumentException, I
* 3. convert between Unicode and ASCII forms for Internationalized
* Domain Names (IDNs)
*/
if (!isAscii(host)) {
if (this.idnNormalization == IdnNormalization.PUNYCODE && !isAscii(host)) {
/*
* IllegalArgumentException: thrown if the input string contains
* non-convertible Unicode codepoints
Expand All @@ -622,6 +619,8 @@ private String normalizeHostName(String host) throws IllegalArgumentException, I
* cf. https://bugs.openjdk.java.net/browse/JDK-6806873
*/
host = IDN.toASCII(host);
} else if (this.idnNormalization == IdnNormalization.UNICODE && host.contains("xn--")) {
host = IDN.toUnicode(host);
}

/* 4. trim a trailing dot */
Expand All @@ -632,6 +631,65 @@ private String normalizeHostName(String host) throws IllegalArgumentException, I
return host;
}

/**
* Create a new builder object for creating a customized {@link BasicURLNormalizer} object.
*
* @return a freshly created {@link Builder} holding the builder's default settings
*/
public static Builder newBuilder() {
return new Builder();
}

public enum IdnNormalization {
NONE,
PUNYCODE,
UNICODE
}

/**
 * A builder class for the {@link BasicURLNormalizer}.
 *
 * <p>
 * A new builder starts with {@link IdnNormalization#PUNYCODE} as IDN
 * normalization and an empty set of query parameters to remove. Instances
 * are obtained from the enclosing class because the constructor is
 * private.
 */
public static class Builder {

    /**
     * How internationalized host names are normalized, initially
     * {@link IdnNormalization#PUNYCODE}.
     */
    public IdnNormalization idnNormalization = IdnNormalization.PUNYCODE;

    /** Names of the query parameters to remove, initially empty. */
    Set<String> queryParamsToRemove = new TreeSet<>();

    private Builder() {
    }

    /**
     * A collection of names of query parameters that should be removed
     * from the URL query.
     *
     * <p>
     * The names are copied into a new sorted set, replacing any names
     * configured by an earlier call. Later changes to the passed
     * collection do not affect this builder.
     *
     * @param queryParamsToRemove
     *            names of the query parameters to remove
     * @return this builder
     * @throws NullPointerException
     *             if the collection is {@code null}
     */
    public Builder queryParamsToRemove(Collection<String> queryParamsToRemove) {
        this.queryParamsToRemove = new TreeSet<>(queryParamsToRemove);
        return this;
    }

    /**
     * Configures whether internationalized domain names (IDNs) should be
     * converted to ASCII/Punycode or Unicode.
     *
     * @param idnNormalization
     *            the IDN normalization to apply, must not be {@code null};
     *            use {@link IdnNormalization#NONE} to disable the
     *            conversion
     * @return this builder
     * @throws NullPointerException
     *             if {@code idnNormalization} is {@code null}
     */
    public Builder idnNormalization(IdnNormalization idnNormalization) {
        // Fail fast: a null value would otherwise silently disable IDN
        // normalization, and the other setter rejects null as well.
        this.idnNormalization = Objects.requireNonNull(idnNormalization, "idnNormalization");
        return this;
    }

    /**
     * Constructs the custom URL normalizer instance from the current
     * settings of this builder.
     *
     * @return the constructed URL normalizer
     */
    public BasicURLNormalizer build() {
        return new BasicURLNormalizer(this);
    }
}

public static void main(String args[]) throws IOException {
BasicURLNormalizer normalizer = new BasicURLNormalizer();
String line, normUrl;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,14 @@

package crawlercommons.filters.basic;

import static java.util.Arrays.asList;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvFileSource;
import java.util.Arrays;
import java.util.List;
import java.util.TreeSet;

/** Unit tests for BasicURLNormalizer. */
public class BasicURLNormalizerTest {
Expand All @@ -50,14 +48,26 @@ void testBasicNormalizerExceptionCaught(String weirdUrl) {

/**
* Checks that the configured query parameters are dropped from the
* normalized URL while the remaining parameters are kept.
*/
@Test
public void testRemoveSessionQueryParameters() {
List<String> invalidParameters = Arrays
.asList("sid", "phpsessid", "sessionid", "jsessionid");
normalizer = new BasicURLNormalizer(new TreeSet<>(invalidParameters));
normalizer = BasicURLNormalizer.newBuilder().queryParamsToRemove(asList("sid", "phpsessid", "sessionid", "jsessionid")).build();
normalizeTest("http://foo.com/foo.php?phpsessid=2Aa3ASdfasfdadf&a=1", "http://foo.com/foo.php?a=1");
normalizeTest("http://foo.com/foo.php?phpsessid=2Aa3ASdfasfdadf&a=1&b", "http://foo.com/foo.php?a=1&b");
normalizeTest("http://foo.com/foo.php?phpsessid=2Aa3ASdfasfdadf", "http://foo.com/foo.php");
}

/**
 * With Unicode IDN normalization configured, Punycode host names are
 * expected in their Unicode form after normalization.
 */
@Test
public void testHostToUnicode() {
    BasicURLNormalizer.Builder unicodeConfig = BasicURLNormalizer.newBuilder();
    unicodeConfig.idnNormalization(BasicURLNormalizer.IdnNormalization.UNICODE);
    normalizer = unicodeConfig.build();

    normalizeTest("http://xn--schne-lua.xn--bcher-kva.de/", "http://schöne.bücher.de/");
    normalizeTest("https://xn--90ax2c.xn--p1ai/", "https://нэб.рф/");
}

/**
 * With IDN normalization switched off, a host name mixing Unicode and
 * Punycode labels is expected to come back unchanged.
 */
@Test
public void testNoIdnNormalization() {
    // a single URL serves as input and as expected output: the host name
    // must be left as is, even if it's mixed
    final String mixedHostUrl = "http://schöne.xn--bcher-kva.de/";

    BasicURLNormalizer.Builder noIdnConfig = BasicURLNormalizer.newBuilder();
    noIdnConfig.idnNormalization(BasicURLNormalizer.IdnNormalization.NONE);
    normalizer = noIdnConfig.build();

    normalizeTest(mixedHostUrl, mixedHostUrl);
}

/**
 * Runs the current normalizer on a URL and asserts the result.
 *
 * @param weird
 *            the URL passed to the normalizer
 * @param normal
 *            the expected result of the normalization
 */
private void normalizeTest(String weird, String normal) {
    String actual = normalizer.filter(weird);
    String failureMessage = "normalizing: " + weird;
    assertEquals(normal, actual, failureMessage);
}
Expand Down

0 comments on commit ec1f2e5

Please sign in to comment.