Skip to content

Commit

Permalink
Query parameters normalization
Browse files Browse the repository at this point in the history
- Sort query parameters (fix #246)
- Optionally remove common irrelevant query parameters
- Consistently encode query parameters with
'application/x-www-form-urlencoded'
  • Loading branch information
aecio committed Jan 4, 2021
1 parent 9630f4c commit ab610bf
Show file tree
Hide file tree
Showing 5 changed files with 393 additions and 27 deletions.
199 changes: 181 additions & 18 deletions src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import static java.nio.charset.StandardCharsets.UTF_8;

import crawlercommons.utils.Strings;
import crawlercommons.utils.URLEncoding;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
Expand All @@ -27,7 +29,14 @@
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand Down Expand Up @@ -137,6 +146,16 @@ private static boolean isAscii(String str) {
return true;
}

/**
 * Names of the query parameters that are dropped while normalizing a URL.
 * The set is a private copy made at construction time and is compared
 * against the decoded parameter names.
 */
final Set<String> queryElementsToRemove;

/**
 * Creates a normalizer that keeps every query parameter.
 */
public BasicURLNormalizer() {
    this(new TreeSet<>());
}

/**
 * Creates a normalizer that removes the given query parameters.
 *
 * @param queryElementsToRemove
 *            names of the query parameters to strip from normalized URLs;
 *            {@code null} is treated like an empty set (nothing is removed)
 */
public BasicURLNormalizer(Set<String> queryElementsToRemove) {
    if (queryElementsToRemove == null) {
        // TreeSet's copy constructor rejects null; fall back to "remove nothing"
        this.queryElementsToRemove = new TreeSet<>();
    } else {
        // defensive copy: later changes to the caller's set must not leak in
        this.queryElementsToRemove = new TreeSet<>(queryElementsToRemove);
    }
}

@Override
public String filter(String urlString) {

Expand All @@ -145,18 +164,7 @@ public String filter(String urlString) {

urlString = urlString.trim(); // remove extra spaces

URL url = null;
try {
url = new URL(urlString);
} catch (MalformedURLException e) {
if (!hasSchemePattern.matcher(urlString).find()) {
// no protocol/scheme : try to prefix http://
try {
url = new URL("http://" + urlString);
} catch (MalformedURLException e1) {
}
}
}
URL url = parseUrl(urlString);
if (url == null) {
LOG.debug("Malformed URL {}", urlString);
return null;
Expand Down Expand Up @@ -220,8 +228,7 @@ public String filter(String urlString) {
}

// properly encode characters in path/file using percent-encoding
String file2 = unescapePath(file);
file2 = escapePath(file2);
String file2 = normalizeUrlFile(file);
if (!file.equals(file2)) {
changed = true;
file = file2;
Expand Down Expand Up @@ -255,6 +262,162 @@ public String filter(String urlString) {
return urlString;
}

/**
 * Parses a string into a {@link URL}, retrying with an {@code http://}
 * prefix when the string does not match {@code hasSchemePattern}.
 *
 * @param urlString
 *            the URL text to parse
 * @return the parsed URL, or {@code null} if neither attempt succeeds
 */
private static URL parseUrl(String urlString) {
    try {
        return new URL(urlString);
    } catch (MalformedURLException notParseableAsIs) {
        // fall through to the scheme-less retry below
    }

    if (hasSchemePattern.matcher(urlString).find()) {
        // a scheme is present, so prefixing another one cannot help
        return null;
    }

    // no protocol/scheme : try to prefix http://
    try {
        return new URL("http://" + urlString);
    } catch (MalformedURLException stillNotParseable) {
        return null;
    }
}

/**
 * Normalizes the "file" part of a URL, i.e. the path followed by an
 * optional query string.
 *
 * <p>
 * The path is unescaped and then escaped again. Query parameters are parsed
 * as UTF-8, filtered by {@code queryElementsToRemove}, sorted by name and
 * joined with {@code '&'}. A trailing {@code '?'} without parameters is
 * dropped, as is the {@code '?'} when no parameter survives the filtering.
 *
 * @param file
 *            path and optional query string
 * @return the normalized path and query string
 */
private String normalizeUrlFile(String file) {
    final int endPathIdx = file.indexOf('?');
    if (endPathIdx == -1) {
        // no query parameters
        return escapePath(unescapePath(file));
    }

    final int queryStartIdx = endPathIdx + 1;
    if (queryStartIdx >= file.length()) {
        // question mark was the last char in file
        return escapePath(unescapePath(file.substring(0, endPathIdx)));
    }

    final List<NameValuePair> pairs =
            parseQueryParameters(file, queryStartIdx, UTF_8, queryElementsToRemove);

    final StringBuilder normalizedFile = new StringBuilder();

    final String path = unescapePath(file.substring(0, endPathIdx));
    if (Strings.isNotBlank(path)) {
        // re-escape like the query-less branches above do; otherwise the
        // same path is encoded differently depending on whether a query
        // string is present
        normalizedFile.append(escapePath(path));
    }
    if (!pairs.isEmpty()) {
        pairs.sort(NameValuePair.NAME_COMPARATOR);
        normalizedFile
                .append('?')
                .append(formatQueryParameters(pairs));
    }

    return normalizedFile.toString();
}

/**
 * Parses the query string embedded in {@code s}, starting at
 * {@code queryStartIdx}, into name/value pairs.
 *
 * <p>
 * Parameters are separated by {@code '&'} or {@code ';'}; name and value are
 * separated by the first {@code '='}. Parameters with an empty name are
 * skipped. A missing or empty value is represented as {@code null}. Names and
 * values are decoded and then encoded again with {@link URLEncoding}, so the
 * returned pairs hold encoded strings.
 *
 * @param s
 *            string containing the query (may be {@code null})
 * @param queryStartIdx
 *            index of the first character of the query inside {@code s}
 * @param charset
 *            charset passed to the decoding and encoding calls
 * @param queryElementsToRemove
 *            decoded names of parameters to leave out; {@code null} means
 *            that nothing is left out
 * @return the parsed pairs in their original order; an empty list if
 *         {@code s} is {@code null} or empty, or if {@code queryStartIdx}
 *         is at or past the end of {@code s}
 */
public static List<NameValuePair> parseQueryParameters(final String s, final int queryStartIdx,
        final Charset charset, final Set<String> queryElementsToRemove) {

    // a start index at or beyond the end means "no query"; previously this
    // threw StringIndexOutOfBoundsException
    if (s == null || s.isEmpty() || queryStartIdx >= s.length()) {
        return Collections.emptyList();
    }

    final List<NameValuePair> list = new ArrayList<>();
    final int len = s.length();

    int i = queryStartIdx;
    while (i < len) {

        // parse query parameter name
        final int nameBeginIdx = i;
        while (i < len && !isNameEnd(s.charAt(i))) {
            i++;
        }
        final String name = s.substring(nameBeginIdx, i);

        // parse query parameter value
        String value = null;
        if (i < len && s.charAt(i) == '=') {
            i++;
            final int valueBeginIdx = i;
            while (i < len && !isValueEnd(s.charAt(i))) {
                i++;
            }
            if (valueBeginIdx < i) {
                value = s.substring(valueBeginIdx, i);
            }
        }

        // step over the '&' or ';' that terminated this parameter
        i++;

        if (name.isEmpty()) {
            continue;
        }

        final String decodedName = URLEncoding.urlDecode(name, charset);
        // NOTE(review): value may be null here; URLEncoding is assumed to
        // pass null through unchanged -- confirm against URLEncoding
        final String decodedValue = URLEncoding.urlDecode(value, charset);

        // a null filter set means "keep everything"; the previous condition
        // silently dropped every parameter in that case
        final boolean keep = queryElementsToRemove == null
                || !queryElementsToRemove.contains(decodedName);
        if (keep) {
            list.add(new NameValuePair(
                    URLEncoding.urlEncode(decodedName, charset),
                    URLEncoding.urlEncode(decodedValue, charset)));
        }
    }
    return list;
}

/**
 * Tells whether a character terminates a query parameter value.
 *
 * @param c
 *            character to test
 * @return {@code true} for {@code '&'} and {@code ';'}
 */
private static boolean isValueEnd(final char c) {
    switch (c) {
        case '&':
        case ';':
            return true;
        default:
            return false;
    }
}

/**
 * Tells whether a character terminates a query parameter name.
 *
 * @param c
 *            character to test
 * @return {@code true} for {@code '='}, {@code '&'} and {@code ';'}
 */
private static boolean isNameEnd(final char c) {
    switch (c) {
        case '=':
        case '&':
        case ';':
            return true;
        default:
            return false;
    }
}

/**
 * Joins name/value pairs into a query string.
 *
 * <p>
 * Pairs are written in list order and separated by {@code '&'}. A pair
 * whose value is {@code null} is written as the bare name, without
 * {@code '='}. Names and values are appended as they are.
 *
 * @param parameters
 *            pairs to write
 * @return the query string, without a leading {@code '?'}
 */
public static String formatQueryParameters(final List<NameValuePair> parameters) {
    final StringBuilder joined = new StringBuilder();
    parameters.forEach(pair -> {
        // separator only once something has been written
        if (joined.length() != 0) {
            joined.append('&');
        }
        joined.append(pair.getName());
        final String pairValue = pair.getValue();
        if (pairValue != null) {
            joined.append('=').append(pairValue);
        }
    });
    return joined.toString();
}

/**
 * Immutable holder for one query parameter: a name and an optional value.
 */
public static class NameValuePair {

    /**
     * Orders pairs by the natural ordering of their names; values are not
     * taken into account.
     */
    public static final Comparator<NameValuePair> NAME_COMPARATOR =
            (left, right) -> left.getName().compareTo(right.getName());

    protected final String name;
    protected final String value;

    /**
     * @param name
     *            parameter name
     * @param value
     *            parameter value, stored as given
     */
    public NameValuePair(String name, String value) {
        this.name = name;
        this.value = value;
    }

    /** @return the parameter name */
    public String getName() {
        return this.name;
    }

    /** @return the parameter value */
    public String getValue() {
        return this.value;
    }

}

private String getFileWithNormalizedPath(URL url) throws MalformedURLException {
String file;

Expand Down Expand Up @@ -294,7 +457,7 @@ private String getFileWithNormalizedPath(URL url) throws MalformedURLException {
* unescaped according to <a
* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
*/
private String unescapePath(String path) {
public static String unescapePath(String path) {
StringBuilder sb = new StringBuilder();

Matcher matcher = unescapeRulePattern.matcher(path);
Expand All @@ -305,7 +468,7 @@ private String unescapePath(String path) {
// Traverse over all encoded groups
while (matcher.find()) {
// Append everything up to this group
sb.append(path.substring(end + 1, matcher.start()));
sb.append(path, end + 1, matcher.start());

// Get the integer representation of this hexadecimal encoded
// character
Expand All @@ -326,7 +489,7 @@ private String unescapePath(String path) {

// Append the rest if there's anything
if (end <= letter - 1) {
sb.append(path.substring(end + 1, letter));
sb.append(path, end + 1, letter);
}

return sb.toString();
Expand All @@ -337,7 +500,7 @@ private String unescapePath(String path) {
* characters which should be escaped according to <a
* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
*/
private String escapePath(String path) {
private static String escapePath(String path) {
StringBuilder sb = new StringBuilder(path.length());

// Traverse over all bytes in this URL
Expand Down
28 changes: 28 additions & 0 deletions src/main/java/crawlercommons/utils/Strings.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package crawlercommons.utils;

/**
 * Null-safe helper predicates for strings.
 */
public class Strings {

    /**
     * Checks whether a string has no visible content.
     *
     * @param cs
     *            string to test, may be {@code null}
     * @return {@code true} if {@code cs} is {@code null}, empty, or consists
     *         only of characters for which {@link Character#isWhitespace}
     *         holds
     */
    public static boolean isBlank(final String cs) {
        // allMatch on an empty string is true, which covers the empty case
        return cs == null || cs.chars().allMatch(Character::isWhitespace);
    }

    /**
     * Negation of {@link #isBlank(String)}.
     *
     * @param cs
     *            string to test, may be {@code null}
     * @return {@code true} if {@code cs} holds at least one non-whitespace
     *         character
     */
    public static boolean isNotBlank(final String cs) {
        return isBlank(cs) ? false : true;
    }

    /**
     * Checks whether a string is {@code null} or has length zero.
     *
     * @param cs
     *            string to test, may be {@code null}
     * @return {@code true} if {@code cs} is {@code null} or empty
     */
    public static boolean isEmpty(final String cs) {
        return cs == null || cs.isEmpty();
    }

}
Loading

0 comments on commit ab610bf

Please sign in to comment.