Skip to content

Commit

Permalink
Merge branch 'aecio:aecio/query-params-normalization', fixes #246, cl…
Browse files Browse the repository at this point in the history
…oses #309

- rebase to master and squash commits
- fix failing sitemaps unit tests with URL filtering using BasicURLNormalizer
  (sort query params in test sitemap)
- CHANGES.txt: updated to follow style, added missing entry for preceding commit
  • Loading branch information
sebastian-nagel committed Sep 21, 2021
2 parents 7a8bbb6 + 94bac65 commit a10cf25
Show file tree
Hide file tree
Showing 6 changed files with 330 additions and 31 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
Crawler-Commons Change Log

Current Development 1.2-SNAPSHOT (yyyy-mm-dd)
- [URLs] Sorting the Query Parameters (aecio) #246, #309
- [URLs] Optionally remove common irrelevant query parameters (aecio) #309
- [Sitemaps] Allow to normalize URLs in sitemaps (murderinc, sebastian-nagel) #305
- Normalize CHANGES.txt (Avi Hayun) #270
- Readme.MD Overhaul of TOC, Installation, License (Avi Hayun) #311
- [URLs] Normalize URL without a scheme (Avi Hayun, sebastian-nagel) #271
Expand Down
242 changes: 224 additions & 18 deletions src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import static java.nio.charset.StandardCharsets.UTF_8;

import crawlercommons.utils.Strings;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
Expand All @@ -27,7 +29,13 @@
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand Down Expand Up @@ -137,6 +145,16 @@ private static boolean isAscii(String str) {
return true;
}

/**
 * Names of the query parameters which are dropped from a URL while its
 * query string is normalized.
 */
final Set<String> queryElementsToRemove;

/**
 * Creates a normalizer which does not remove any query parameters.
 */
public BasicURLNormalizer() {
    this(new TreeSet<>());
}

/**
 * Creates a normalizer which removes the given query parameters.
 *
 * @param queryElementsToRemove
 *            names of the query parameters to remove. The set is copied,
 *            later changes to the passed set have no effect on this
 *            normalizer. {@code null} is treated like an empty set (no
 *            parameter is removed).
 */
public BasicURLNormalizer(Set<String> queryElementsToRemove) {
    // new TreeSet<>(null) would throw a NullPointerException
    this.queryElementsToRemove = queryElementsToRemove == null
            ? new TreeSet<>()
            : new TreeSet<>(queryElementsToRemove);
}

@Override
public String filter(String urlString) {

Expand All @@ -145,18 +163,7 @@ public String filter(String urlString) {

urlString = urlString.trim(); // remove extra spaces

URL url = null;
try {
url = new URL(urlString);
} catch (MalformedURLException e) {
if (!hasSchemePattern.matcher(urlString).find()) {
// no protocol/scheme : try to prefix http://
try {
url = new URL("http://" + urlString);
} catch (MalformedURLException e1) {
}
}
}
URL url = parseStringToURL(urlString);
if (url == null) {
LOG.debug("Malformed URL {}", urlString);
return null;
Expand Down Expand Up @@ -220,8 +227,8 @@ public String filter(String urlString) {
}

// properly encode characters in path/file using percent-encoding
String file2 = unescapePath(file);
file2 = escapePath(file2);
String file2 = normalizeUrlFile(file);

if (!file.equals(file2)) {
changed = true;
file = file2;
Expand Down Expand Up @@ -255,6 +262,205 @@ public String filter(String urlString) {
return urlString;
}

/**
 * Converts a string into a {@link java.net.URL}. If the string cannot be
 * parsed as is and does not match the scheme pattern, a second attempt is
 * made with <code>http://</code> prepended.
 *
 * @param urlString
 *            a string which possibly contains a URL
 * @return the parsed URL, or null if the string could not be parsed
 */
private static URL parseStringToURL(String urlString) {
    try {
        return new URL(urlString);
    } catch (MalformedURLException e) {
        if (hasSchemePattern.matcher(urlString).find()) {
            // a scheme is present: prefixing http:// cannot help
            return null;
        }
    }

    // no protocol/scheme : try to prefix http://
    try {
        return new URL("http://" + urlString);
    } catch (MalformedURLException ignored) {
        // still not parseable: signal the failure by returning null
        return null;
    }
}

/**
 * Parses the URL file and applies normalizations to the path and query
 * components: percent-encoding is normalized in both, query parameters
 * listed in {@link #queryElementsToRemove} are dropped and the remaining
 * parameters are sorted by name. A query which is empty (or becomes empty
 * after removing parameters) is dropped together with the question mark.
 *
 * @param file
 *            the URL file (as in java.net.URL.getFile()).
 * @return a normalized URL file
 */
private String normalizeUrlFile(String file) {
    // the first question mark separates the path from the query
    int endPathIdx = file.indexOf('?');
    if (endPathIdx == -1) {
        // no query parameters, just properly normalize the path
        return escapePath(unescapePath(file));
    }

    // Split into path and query BEFORE normalizing the percent-encoding:
    // unescaping/escaping may change the length of the string, so an index
    // computed on the raw string is not valid on the normalized one.
    // NOTE(review): assumes escapePath/unescapePath transform every
    // character resp. %XX sequence independently of its neighbours, so that
    // normalizing both parts separately equals normalizing the whole file.
    String path = escapePath(unescapePath(file.substring(0, endPathIdx)));

    String rawQuery = file.substring(endPathIdx + 1);
    if (rawQuery.isEmpty()) {
        // question mark was the last char in the file, so the query
        // parameters string is empty: drop the question mark
        return path;
    }
    String query = escapePath(unescapePath(rawQuery));

    List<NameValuePair> pairs = parseQueryParameters(query, 0, queryElementsToRemove);

    StringBuilder normalizedFile = new StringBuilder();
    if (!Strings.isBlank(path)) {
        normalizedFile.append(path);
    }

    // reconstruct query parameters in sorted order (List.sort is stable:
    // parameters sharing the same name keep their relative order)
    if (!pairs.isEmpty()) {
        pairs.sort(NameValuePair.NAME_COMPARATOR);
        normalizedFile.append('?').append(formatQueryParameters(pairs));
    }

    return normalizedFile.toString();
}

/**
 * Receives the URL query string and parses it into a list of name-value
 * pairs. Optionally, allows to remove query parameters.
 *
 * <p>
 * Parameters are separated by <code>&amp;</code>, the name is separated
 * from the value by the first <code>=</code>. Parameters with an empty name
 * are skipped. A parameter without a value or with an empty value
 * (<code>a</code> resp. <code>a=</code>) is returned with a {@code null}
 * value.
 * </p>
 *
 * @param s
 *            a String containing the URL file (as per
 *            java.net.URL.getFile(), i.e., the path + query)
 * @param queryStartIdx
 *            the index position of the query part in the string {@code s}.
 *            If it is equal to or greater than the length of {@code s},
 *            the query is considered empty.
 * @param queryElementsToRemove
 *            a set of query parameter names to be ignored while parsing
 *            the query parameters. May be {@code null} or empty if no
 *            parameter should be removed.
 * @return the list of parsed name-value pairs in the order of appearance,
 *         an empty list if there is nothing to parse
 * @throws StringIndexOutOfBoundsException
 *             if {@code queryStartIdx} is negative
 */
public static List<NameValuePair> parseQueryParameters(final String s, final int queryStartIdx,
        final Set<String> queryElementsToRemove) {

    if (s == null || queryStartIdx >= s.length()) {
        // nothing to parse (this also covers the empty string)
        return Collections.emptyList();
    }

    final List<NameValuePair> list = new ArrayList<>();
    final int len = s.length();

    int i = queryStartIdx;
    while (i < len) {

        // parse query parameter name
        final int nameBeginIdx = i;
        while (i < len && !isNameEnd(s.charAt(i))) {
            i++;
        }
        final String name = s.substring(nameBeginIdx, i);

        // parse query parameter value
        String value = null;
        if (i < len && s.charAt(i) == '=') {
            i++;
            final int valueBeginIdx = i;
            while (i < len && !isValueEnd(s.charAt(i))) {
                i++;
            }
            if (valueBeginIdx < i) {
                value = s.substring(valueBeginIdx, i);
            }
        }

        // keep the parameter unless its name is empty or listed for
        // removal; a null set means that nothing is removed
        if (!name.isEmpty() && (queryElementsToRemove == null || !queryElementsToRemove.contains(name))) {
            list.add(new NameValuePair(name, value));
        }

        // skip the delimiter which terminated this parameter
        i++;
    }
    return list;
}

/**
 * Tells whether a char terminates the value of a query parameter. Only the
 * ampersand does.
 *
 * @param c
 *            the char to be checked
 * @return true if the char is a delimiter, false otherwise.
 */
private static boolean isValueEnd(final char c) {
    return '&' == c;
}

/**
 * Tells whether a char terminates the name of a query parameter. The name
 * ends at an equals sign or at an ampersand.
 *
 * @param c
 *            the char to be checked
 * @return true if the char is a delimiter, false otherwise.
 */
private static boolean isNameEnd(final char c) {
    switch (c) {
    case '=':
    case '&':
        return true;
    default:
        return false;
    }
}

/**
 * Joins query parameter name-value pairs into a query string. Pairs are
 * separated by <code>&amp;</code>, a pair whose value is null is written as
 * the bare name without an equals sign.
 *
 * @param parameters
 *            the query parameter name-value pairs
 * @return a URL query string
 */
public static String formatQueryParameters(final List<NameValuePair> parameters) {
    final StringBuilder query = new StringBuilder();
    for (final NameValuePair pair : parameters) {
        // separator only once something has been written
        if (query.length() != 0) {
            query.append('&');
        }
        query.append(pair.getName());

        final String pairValue = pair.getValue();
        if (pairValue == null) {
            continue;
        }
        query.append('=').append(pairValue);
    }
    return query.toString();
}

/**
 * Immutable holder for one URL query parameter: its name and its value.
 */
private static class NameValuePair {

    /** Orders pairs by the natural order of their names, values are ignored. */
    public static final Comparator<NameValuePair> NAME_COMPARATOR =
            Comparator.comparing(pair -> pair.getName());

    protected final String name;
    protected final String value;

    /**
     * @param name
     *            name of the query parameter
     * @param value
     *            value of the query parameter
     */
    public NameValuePair(String name, String value) {
        this.name = name;
        this.value = value;
    }

    /** @return the name of the query parameter */
    public String getName() {
        return this.name;
    }

    /** @return the value of the query parameter */
    public String getValue() {
        return this.value;
    }

}

private String getFileWithNormalizedPath(URL url) throws MalformedURLException {
String file;

Expand Down Expand Up @@ -294,7 +500,7 @@ private String getFileWithNormalizedPath(URL url) throws MalformedURLException {
* unescaped according to <a
* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
*/
private String unescapePath(String path) {
public static String unescapePath(String path) {
StringBuilder sb = new StringBuilder();

Matcher matcher = unescapeRulePattern.matcher(path);
Expand All @@ -305,7 +511,7 @@ private String unescapePath(String path) {
// Traverse over all encoded groups
while (matcher.find()) {
// Append everything up to this group
sb.append(path.substring(end + 1, matcher.start()));
sb.append(path, end + 1, matcher.start());

// Get the integer representation of this hexadecimal encoded
// character
Expand All @@ -326,7 +532,7 @@ private String unescapePath(String path) {

// Append the rest if there's anything
if (end <= letter - 1) {
sb.append(path.substring(end + 1, letter));
sb.append(path, end + 1, letter);
}

return sb.toString();
Expand All @@ -337,7 +543,7 @@ private String unescapePath(String path) {
* characters which should be escaped according to <a
* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
*/
private String escapePath(String path) {
private static String escapePath(String path) {
StringBuilder sb = new StringBuilder(path.length());

// Traverse over all bytes in this URL
Expand Down
36 changes: 36 additions & 0 deletions src/main/java/crawlercommons/utils/Strings.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/**
* Copyright 2016 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package crawlercommons.utils;

/**
 * Helper methods for working with strings.
 */
public class Strings {

    /**
     * Checks whether a string contains nothing but whitespace.
     *
     * @param cs
     *            the string to check, may be null
     * @return true if the string is null, empty or consists only of
     *         characters for which {@link Character#isWhitespace(char)}
     *         holds, false otherwise
     */
    public static boolean isBlank(final String cs) {
        if (cs == null) {
            return true;
        }
        for (final char ch : cs.toCharArray()) {
            if (!Character.isWhitespace(ch)) {
                return false;
            }
        }
        return true;
    }

}
Loading

0 comments on commit a10cf25

Please sign in to comment.