Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Query parameters normalization in BasicURLNormalizer #309

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ Current Development 1.2-SNAPSHOT (yyyy-mm-dd)
- [Robots] Maximum values (crawl-delay and warnings): document and make visible (sebastian-nagel, Avi Hayun) #276
- [sitemaps] Replace priority "NaN" by default value (sebastian-nagel) #296
- [Sitemaps] Adding duration to the map generated by VideoAttributes.asMap (evanhalley) #300
- [BasicNormalizer] Sorting the Query Parameters (aecio) #246
- [BasicNormalizer] Optionally remove common irrelevant query parameters (aecio) #309

Release 1.1 (2020-06-29)
- [sitemaps] Sitemaps to implement Serializable (cdalexndr, sebastian-nagel) #244
Expand Down
242 changes: 224 additions & 18 deletions src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import static java.nio.charset.StandardCharsets.UTF_8;

import crawlercommons.utils.Strings;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
Expand All @@ -27,7 +29,13 @@
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand Down Expand Up @@ -137,6 +145,16 @@ private static boolean isAscii(String str) {
return true;
}

/** Names of the query parameters which are dropped while normalizing a URL. */
final Set<String> queryElementsToRemove;

/**
 * Creates a normalizer which keeps all query parameters (none is removed).
 */
public BasicURLNormalizer() {
    this(new TreeSet<>());
}

/**
 * Creates a normalizer which removes the given query parameters from the
 * URLs it normalizes.
 *
 * @param queryElementsToRemove
 *            names of the query parameters to remove; {@code null} is
 *            treated like an empty set, i.e. nothing is removed
 */
public BasicURLNormalizer(Set<String> queryElementsToRemove) {
    // defensive copy: later changes to the caller's set must not affect
    // this normalizer
    this.queryElementsToRemove = new TreeSet<>();
    if (queryElementsToRemove != null) {
        this.queryElementsToRemove.addAll(queryElementsToRemove);
    }
}

@Override
public String filter(String urlString) {

Expand All @@ -145,18 +163,7 @@ public String filter(String urlString) {

urlString = urlString.trim(); // remove extra spaces

URL url = null;
try {
url = new URL(urlString);
} catch (MalformedURLException e) {
if (!hasSchemePattern.matcher(urlString).find()) {
// no protocol/scheme : try to prefix http://
try {
url = new URL("http://" + urlString);
} catch (MalformedURLException e1) {
}
}
}
URL url = parseStringToURL(urlString);
if (url == null) {
LOG.debug("Malformed URL {}", urlString);
return null;
Expand Down Expand Up @@ -220,8 +227,8 @@ public String filter(String urlString) {
}

// properly encode characters in path/file using percent-encoding
String file2 = unescapePath(file);
file2 = escapePath(file2);
String file2 = normalizeUrlFile(file);

if (!file.equals(file2)) {
changed = true;
file = file2;
Expand Down Expand Up @@ -255,6 +262,205 @@ public String filter(String urlString) {
return urlString;
}

/**
 * Converts a string into a {@link java.net.URL}. If the string cannot be
 * parsed as is and does not match the scheme pattern, a second attempt is
 * made with <code>http://</code> prepended.
 *
 * @param urlString
 *            a string which possibly contains a URL
 * @return the parsed URL, or null if the string could not be parsed
 */
private static URL parseStringToURL(String urlString) {
    try {
        return new URL(urlString);
    } catch (MalformedURLException e) {
        if (hasSchemePattern.matcher(urlString).find()) {
            // a scheme is present, prefixing another one cannot help
            return null;
        }
    }

    // no protocol/scheme : try to prefix http://
    try {
        return new URL("http://" + urlString);
    } catch (MalformedURLException ignored) {
        // still not a valid URL: report failure to the caller via null
        return null;
    }
}

/**
* Parses the URL file and applies normalizations to the path and query components.
*
* @param file the URL file (as in java.net.URL.getFile()).
* @return a normalized URL file
*/
private String normalizeUrlFile(String file) {
aecio marked this conversation as resolved.
Show resolved Hide resolved
// find the beginning of the query parameters
int endPathIdx = file.indexOf('?');
if (endPathIdx == -1) {
// no query parameters, just properly normalize the path
return escapePath(unescapePath(file));
}

int queryStartIdx = endPathIdx + 1;
if (queryStartIdx >= file.length()) {
// question mark was the last char in the file, so the query parameters
// string is empty. we can just remove the question mark and properly
// normalize the path.
final String path = file.substring(0, file.length() - 1);
return escapePath(unescapePath(path));
}

file = escapePath(unescapePath(file));

List<NameValuePair> pairs =
parseQueryParameters(file, queryStartIdx, queryElementsToRemove);

StringBuilder normalizedFile = new StringBuilder();
String path = file.substring(0, endPathIdx);
if (!Strings.isBlank(path)) {
normalizedFile.append(path);
}

// reconstruct query parameters in sorted order
if (!pairs.isEmpty()) {
pairs.sort(NameValuePair.NAME_COMPARATOR);
normalizedFile
.append('?')
.append(formatQueryParameters(pairs));
}

return normalizedFile.toString();
}

/**
* Receives the URL query string and parses it into a list of name-value pairs. Optionally,
* allows to remove query parameters.
*
* @param s a String containing the URL file (as per java.net.URL.getFile(), i.e., the path + query +
* fragment)
* @param queryStartIdx the index position of the query part in the string {@param s}.
* @param queryElementsToRemove a set of query parameter names to be ignored while parsing the
* query parameters.
*/
public static List<NameValuePair> parseQueryParameters(final String s, final int queryStartIdx,
aecio marked this conversation as resolved.
Show resolved Hide resolved
final Set<String> queryElementsToRemove) {

if (s == null || s.isEmpty()) {
return Collections.emptyList();
}

final List<NameValuePair> list = new ArrayList<>();

int nameBeginIdx;
String name;
int valueBeginIdx;
String value;

char c = s.charAt(queryStartIdx);
for (int i = queryStartIdx, len = s.length(); i < len; i++) {

// parse query parameter name
nameBeginIdx = i;
while (i < len) {
c = s.charAt(i);
if (isNameEnd(c)) {
break;
}
i++;
}
name = s.substring(nameBeginIdx, i);

// parse query parameter value
value = null;
if (i < len && c == '=') {
i++;
valueBeginIdx = i;
while (i < len) {
c = s.charAt(i);
if (isValueEnd(c)) {
break;
}
i++;
}
if (valueBeginIdx < i) {
value = s.substring(valueBeginIdx, i);
}
}

if (!name.isEmpty()) {
if (queryElementsToRemove != null && !queryElementsToRemove.contains(name)) {
list.add(new NameValuePair(name, value));
}
}
}
return list;
}

/**
 * Tells whether a char terminates the value of a query parameter.
 *
 * @param c
 *            the char to be checked
 * @return true if the char is the parameter separator <code>&amp;</code>,
 *         false otherwise.
 */
private static boolean isValueEnd(final char c) {
    final char parameterSeparator = '&';
    return parameterSeparator == c;
}

/**
 * Tells whether a char terminates the name of a query parameter.
 *
 * @param c
 *            the char to be checked
 * @return true if the char is <code>=</code> or <code>&amp;</code>, false
 *         otherwise.
 */
private static boolean isNameEnd(final char c) {
    switch (c) {
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}

/**
* Formats a list of query parameter name-value pairs into a query parameter string.
*
* @param parameters the query parameter name-value pairs
* @return a URL query string
*/
public static String formatQueryParameters(final List<NameValuePair> parameters) {
aecio marked this conversation as resolved.
Show resolved Hide resolved
final StringBuilder result = new StringBuilder();
for (final NameValuePair parameter : parameters) {
if (result.length() > 0) {
result.append('&');
}
result.append(parameter.getName());
final String value = parameter.getValue();
if (value != null) {
result.append('=');
result.append(value);
}
}
return result.toString();
}

/**
 * Immutable holder of one URL query parameter: its name and its value. The
 * value may be {@code null} for a parameter given without a value.
 */
private static class NameValuePair {

    /** Orders pairs by parameter name (natural string ordering). */
    public static final Comparator<NameValuePair> NAME_COMPARATOR =
            (left, right) -> left.getName().compareTo(right.getName());

    protected final String name;
    protected final String value;

    /**
     * @param name
     *            the parameter name
     * @param value
     *            the parameter value
     */
    public NameValuePair(String name, String value) {
        this.name = name;
        this.value = value;
    }

    /** @return the parameter name */
    public String getName() {
        return this.name;
    }

    /** @return the parameter value */
    public String getValue() {
        return this.value;
    }

}

private String getFileWithNormalizedPath(URL url) throws MalformedURLException {
String file;

Expand Down Expand Up @@ -294,7 +500,7 @@ private String getFileWithNormalizedPath(URL url) throws MalformedURLException {
* unescaped according to <a
* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
*/
private String unescapePath(String path) {
public static String unescapePath(String path) {
StringBuilder sb = new StringBuilder();

Matcher matcher = unescapeRulePattern.matcher(path);
Expand All @@ -305,7 +511,7 @@ private String unescapePath(String path) {
// Traverse over all encoded groups
while (matcher.find()) {
// Append everything up to this group
sb.append(path.substring(end + 1, matcher.start()));
sb.append(path, end + 1, matcher.start());
Chaiavi marked this conversation as resolved.
Show resolved Hide resolved

// Get the integer representation of this hexadecimal encoded
// character
Expand All @@ -326,7 +532,7 @@ private String unescapePath(String path) {

// Append the rest if there's anything
if (end <= letter - 1) {
sb.append(path.substring(end + 1, letter));
sb.append(path, end + 1, letter);
}

return sb.toString();
Expand All @@ -337,7 +543,7 @@ private String unescapePath(String path) {
* characters which should be escaped according to <a
* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
*/
private String escapePath(String path) {
private static String escapePath(String path) {
StringBuilder sb = new StringBuilder(path.length());

// Traverse over all bytes in this URL
Expand Down
36 changes: 36 additions & 0 deletions src/main/java/crawlercommons/utils/Strings.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/**
* Copyright 2016 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package crawlercommons.utils;

/**
 * Util functions for manipulating strings.
 */
public class Strings {

    /**
     * Checks whether a string is null, empty, or made up solely of
     * whitespace characters as defined by
     * {@link Character#isWhitespace(char)}.
     *
     * @param cs
     *            the string to check, may be null
     * @return true if the string is null, empty or whitespace only, false
     *         if it contains at least one non-whitespace character
     */
    public static boolean isBlank(final String cs) {
        if (cs == null) {
            return true;
        }
        // an empty string never enters the loop and is reported as blank
        for (int i = 0; i < cs.length(); i++) {
            if (!Character.isWhitespace(cs.charAt(i))) {
                return false;
            }
        }
        return true;
    }

}
Loading