Merge branch 'aecio:aecio/query-params-normalization', fixes #246, cl…

…oses #309 - rebase to master and squash commits - fix failing sitemaps unit tests with URL filtering using BasicURLNormalizer (sort query params in test sitemap) - CHANGES.txt: updated to follow style, added missing entry for preceding commit
crawler-commons · Sep 21, 2021 · a10cf25 · a10cf25
2 parents 7a8bbb6 + 94bac65
commit a10cf25
Show file tree

Hide file tree

Showing 6 changed files with 330 additions and 31 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,6 +1,9 @@
 Crawler-Commons Change Log
 
 Current Development 1.2-SNAPSHOT (yyyy-mm-dd)
+- [URLs] Sorting the Query Parameters (aecio) #246, #309
+- [URLs] Allows to (optionally) remove common irrelevant query parameters (aecio) #309
+- [Sitemaps] Allow to normalize URLs in sitemaps (murderinc, sebastian-nagel) #305
 - Normalize CHANGES.txt (Avi Hayun) #270
 - Readme.MD Overhaul of TOC, Installation, License (Avi Hayun) #311
 - [URLs] Normalize URL without a scheme (Avi Hayun, sebastian-nagel) #271

diff --git a/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java b/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java
@@ -18,6 +18,8 @@
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 
+import crawlercommons.utils.Strings;
+
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
@@ -27,7 +29,13 @@
 import java.net.URISyntaxException;
 import java.net.URL;
 import java.net.URLDecoder;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
 import java.util.Locale;
+import java.util.Set;
+import java.util.TreeSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -137,6 +145,16 @@ private static boolean isAscii(String str) {
         return true;
     }
 
+    /**
+     * Names of the query parameters which are dropped when the query part of
+     * a URL is normalized (passed to parseQueryParameters by
+     * normalizeUrlFile).
+     */
+    final Set<String> queryElementsToRemove;
+
+    /**
+     * Creates a normalizer with an empty set of query parameters to remove,
+     * i.e. no query parameter is dropped because of its name.
+     */
+    public BasicURLNormalizer() {
+        this(new TreeSet<>());
+    }
+
+    /**
+     * Creates a normalizer which removes the given query parameters.
+     *
+     * @param queryElementsToRemove
+     *            names of the query parameters to remove. The elements are
+     *            copied into a new TreeSet, later changes of the passed set
+     *            are not seen by this normalizer. Must not be null (the
+     *            TreeSet copy constructor throws a NullPointerException).
+     */
+    public BasicURLNormalizer(Set<String> queryElementsToRemove) {
+        // defensive copy into a set owned by this instance
+        this.queryElementsToRemove = new TreeSet<>(queryElementsToRemove);
+    }
+
     @Override
     public String filter(String urlString) {
 
@@ -145,18 +163,7 @@ public String filter(String urlString) {
 
         urlString = urlString.trim(); // remove extra spaces
 
-        URL url = null;
-        try {
-            url = new URL(urlString);
-        } catch (MalformedURLException e) {
-            if (!hasSchemePattern.matcher(urlString).find()) {
-                // no protocol/scheme : try to prefix http://
-                try {
-                    url = new URL("http://" + urlString);
-                } catch (MalformedURLException e1) {
-                }
-            }
-        }
+        URL url = parseStringToURL(urlString);
         if (url == null) {
             LOG.debug("Malformed URL {}", urlString);
             return null;
@@ -220,8 +227,8 @@ public String filter(String urlString) {
         }
 
         // properly encode characters in path/file using percent-encoding
-        String file2 = unescapePath(file);
-        file2 = escapePath(file2);
+        String file2 = normalizeUrlFile(file);
+
         if (!file.equals(file2)) {
             changed = true;
             file = file2;
@@ -255,6 +262,205 @@ public String filter(String urlString) {
         return urlString;
     }
 
+    /**
+     * Converts the given string into a java.net.URL object. If the string
+     * cannot be parsed as is and hasSchemePattern finds no scheme in it,
+     * parsing is retried with the prefix <code>http://</code>.
+     *
+     * @param urlString a string which possibly contains a URL
+     * @return the parsed URL, or null if the string could not be parsed
+     *         (neither as is nor with the added scheme)
+     */
+    private static URL parseStringToURL(String urlString) {
+        try {
+            return new URL(urlString);
+        } catch (MalformedURLException e) {
+            // not parseable as is, continue with the fallback below
+        }
+
+        if (hasSchemePattern.matcher(urlString).find()) {
+            // a scheme is already present, prefixing another one is pointless
+            return null;
+        }
+
+        // no protocol/scheme : try to prefix http://
+        try {
+            return new URL("http://" + urlString);
+        } catch (MalformedURLException e) {
+            return null;
+        }
+    }
+
+    /**
+     * Parses the URL file and applies normalizations to the path and query
+     * components: percent-encoding is normalized (unescapePath followed by
+     * escapePath), an empty query (trailing question mark) is removed, query
+     * parameters listed in queryElementsToRemove are dropped and the remaining
+     * parameters are sorted by name.
+     *
+     * @param file the URL file (as in java.net.URL.getFile()).
+     * @return a normalized URL file
+     */
+    private String normalizeUrlFile(String file) {
+        // find the beginning of the query parameters
+        int endPathIdx = file.indexOf('?');
+        if (endPathIdx == -1) {
+            // no query parameters, just properly normalize the path
+            return escapePath(unescapePath(file));
+        }
+
+        if (endPathIdx + 1 >= file.length()) {
+            // question mark was the last char in the file, so the query
+            // parameters string is empty. we can just remove the question
+            // mark and properly normalize the path.
+            return escapePath(unescapePath(file.substring(0, endPathIdx)));
+        }
+
+        file = escapePath(unescapePath(file));
+
+        // The normalization above may have changed the length of the path
+        // (e.g. "%41" is unescaped to "A"), so the position of the question
+        // mark must be determined again on the normalized string. Using the
+        // index found in the raw string would split path and query at the
+        // wrong place.
+        // NOTE(review): assumes unescapePath never turns "%3F" into a literal
+        // '?' inside the path - confirm against unescapeRulePattern.
+        endPathIdx = file.indexOf('?');
+        if (endPathIdx == -1) {
+            // defensive: no query left after normalization
+            return file;
+        }
+        int queryStartIdx = endPathIdx + 1;
+
+        StringBuilder normalizedFile = new StringBuilder();
+        String path = file.substring(0, endPathIdx);
+        if (!Strings.isBlank(path)) {
+            normalizedFile.append(path);
+        }
+
+        if (queryStartIdx < file.length()) {
+            List<NameValuePair> pairs =
+                    parseQueryParameters(file, queryStartIdx, queryElementsToRemove);
+
+            // reconstruct query parameters in sorted order; the question mark
+            // is only written if at least one parameter is kept
+            if (!pairs.isEmpty()) {
+                pairs.sort(NameValuePair.NAME_COMPARATOR);
+                normalizedFile
+                        .append('?')
+                        .append(formatQueryParameters(pairs));
+            }
+        }
+
+        return normalizedFile.toString();
+    }
+
+    /**
+     * Receives the URL file and parses its query part into a list of
+     * name-value pairs. Optionally, allows to remove query parameters.
+     *
+     * Parameters are separated by <code>&amp;</code>, the name is separated
+     * from the value by the first <code>=</code>. Parameters with an empty
+     * name are skipped. A parameter without <code>=</code> or with an empty
+     * value gets the value null.
+     *
+     * @param s a String containing the URL file (as per
+     * java.net.URL.getFile(), i.e., the path + query)
+     * @param queryStartIdx the index position of the query part in the string
+     * {@code s}, i.e. the position following the question mark.
+     * @param queryElementsToRemove a set of query parameter names to be
+     * ignored while parsing the query parameters. If null, no parameter is
+     * removed.
+     * @return the list of name-value pairs in the order of their appearance
+     * in the query, or an empty list if {@code s} is null or empty or if
+     * {@code queryStartIdx} points behind the end of {@code s}
+     */
+    public static List<NameValuePair> parseQueryParameters(final String s, final int queryStartIdx,
+                                                           final Set<String> queryElementsToRemove) {
+
+        // nothing to parse, also protects against an out-of-bounds start index
+        if (s == null || s.isEmpty() || queryStartIdx >= s.length()) {
+            return Collections.emptyList();
+        }
+
+        final List<NameValuePair> list = new ArrayList<>();
+
+        int nameBeginIdx;
+        String name;
+        int valueBeginIdx;
+        String value;
+
+        // last character read, only evaluated if a delimiter stopped the scan
+        char c = 0;
+        // the increment of the loop skips the delimiter ('&') which
+        // terminated the preceding parameter
+        for (int i = queryStartIdx, len = s.length(); i < len; i++) {
+
+            // parse query parameter name
+            nameBeginIdx = i;
+            while (i < len) {
+                c = s.charAt(i);
+                if (isNameEnd(c)) {
+                    break;
+                }
+                i++;
+            }
+            name = s.substring(nameBeginIdx, i);
+
+            // parse query parameter value
+            value = null;
+            if (i < len && c == '=') {
+                i++;
+                valueBeginIdx = i;
+                while (i < len) {
+                    c = s.charAt(i);
+                    if (isValueEnd(c)) {
+                        break;
+                    }
+                    i++;
+                }
+                if (valueBeginIdx < i) {
+                    value = s.substring(valueBeginIdx, i);
+                }
+            }
+
+            // keep the parameter unless its name is empty or it is listed for
+            // removal (a null set means: nothing to remove)
+            if (!name.isEmpty()
+                    && (queryElementsToRemove == null || !queryElementsToRemove.contains(name))) {
+                list.add(new NameValuePair(name, value));
+            }
+        }
+        return list;
+    }
+
+    /**
+     * Tells whether a char terminates the value of a query parameter.
+     *
+     * @param c the char to be checked
+     * @return true if the char is the parameter separator (ampersand), false
+     *         otherwise.
+     */
+    private static boolean isValueEnd(final char c) {
+        // only the separator between two parameters ends a value, a further
+        // equals sign is part of the value
+        final char parameterSeparator = '&';
+        return c == parameterSeparator;
+    }
+
+    /**
+     * Tells whether a char terminates the name of a query parameter.
+     *
+     * @param c the char to be checked
+     * @return true if the char is an equals sign or an ampersand, false
+     *         otherwise.
+     */
+    private static boolean isNameEnd(final char c) {
+        switch (c) {
+        case '=': // start of the value
+        case '&': // start of the next parameter
+            return true;
+        default:
+            return false;
+        }
+    }
+
+    /**
+     * Builds the query string from a list of query parameter name-value
+     * pairs. The pairs are written in the order of the list and are joined by
+     * an ampersand. For pairs whose value is null only the name is written
+     * (without equals sign).
+     *
+     * @param parameters the query parameter name-value pairs
+     * @return a URL query string
+     */
+    public static String formatQueryParameters(final List<NameValuePair> parameters) {
+        final StringBuilder query = new StringBuilder();
+        for (final NameValuePair pair : parameters) {
+            // separate from the preceding output, if there is any
+            if (query.length() != 0) {
+                query.append('&');
+            }
+            query.append(pair.getName());
+            if (pair.getValue() == null) {
+                // name-only parameter
+                continue;
+            }
+            query.append('=').append(pair.getValue());
+        }
+        return query.toString();
+    }
+
+    /**
+     * Holder for a single URL query parameter: the parameter name and its
+     * value (null if the parameter has no value).
+     */
+    private static class NameValuePair {
+
+        /**
+         * Orders pairs by the natural (String) ordering of their names, the
+         * values are not taken into account.
+         */
+        public static final Comparator<NameValuePair> NAME_COMPARATOR =
+            Comparator.comparing(NameValuePair::getName);
+
+        protected final String name;
+        protected final String value;
+
+        public NameValuePair(String name, String value) {
+            this.name = name;
+            this.value = value;
+        }
+
+        /** @return the name of the query parameter */
+        public String getName() {
+            return this.name;
+        }
+
+        /** @return the value of the query parameter, may be null */
+        public String getValue() {
+            return this.value;
+        }
+
+    }
+
     private String getFileWithNormalizedPath(URL url) throws MalformedURLException {
         String file;
 
@@ -294,7 +500,7 @@ private String getFileWithNormalizedPath(URL url) throws MalformedURLException {
      * unescaped according to <a
      * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
      */
-    private String unescapePath(String path) {
+    public static String unescapePath(String path) {
         StringBuilder sb = new StringBuilder();
 
         Matcher matcher = unescapeRulePattern.matcher(path);
@@ -305,7 +511,7 @@ private String unescapePath(String path) {
         // Traverse over all encoded groups
         while (matcher.find()) {
             // Append everything up to this group
-            sb.append(path.substring(end + 1, matcher.start()));
+            sb.append(path, end + 1, matcher.start());
 
             // Get the integer representation of this hexadecimal encoded
             // character
@@ -326,7 +532,7 @@ private String unescapePath(String path) {
 
         // Append the rest if there's anything
         if (end <= letter - 1) {
-            sb.append(path.substring(end + 1, letter));
+            sb.append(path, end + 1, letter);
         }
 
         return sb.toString();
@@ -337,7 +543,7 @@ private String unescapePath(String path) {
      * characters which should be escaped according to <a
      * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
      */
-    private String escapePath(String path) {
+    private static String escapePath(String path) {
         StringBuilder sb = new StringBuilder(path.length());
 
         // Traverse over all bytes in this URL

diff --git a/src/main/java/crawlercommons/utils/Strings.java b/src/main/java/crawlercommons/utils/Strings.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2016 Crawler-Commons
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package crawlercommons.utils;
+
+/**
+ * Helper methods for working with strings.
+ */
+public class Strings {
+
+    /**
+     * Checks whether a string is null, empty or contains only whitespace (as
+     * defined by {@link Character#isWhitespace(char)}).
+     *
+     * @param cs
+     *            the string to check, may be null
+     * @return true if the string is null, empty or whitespace only
+     */
+    public static boolean isBlank(final String cs) {
+        // allMatch is true for the empty string, so no extra check is needed
+        return cs == null || cs.chars().allMatch(Character::isWhitespace);
+    }
+
+}