[Robots.txt] Handle allow/disallow directives containing unescaped Unicode characters #401

Merged
43 changes: 27 additions & 16 deletions src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
@@ -44,7 +44,9 @@
* <p>
* This implementation of {@link BaseRobotsParser} retrieves a set of
* {@link SimpleRobotRules rules} for an agent with the given name from the
* <code>robots.txt</code> file of a given domain.
* <code>robots.txt</code> file of a given domain. The implementation follows
* <a href="https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method">RFC
* 9309</a>.
* </p>
*
* <p>
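For orientation, here is a minimal usage sketch of the parser whose javadoc is updated above. It assumes the long-standing parseContent(String url, byte[] content, String contentType, String robotNames) overload declared by BaseRobotsParser; check the javadoc of the crawler-commons version you use for the exact signatures, as newer releases also accept a collection of agent names.

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotsTxtExample {
    public static void main(String[] args) {
        byte[] robotsTxt = ("User-agent: *\r\nDisallow: /private/\r\n").getBytes(StandardCharsets.UTF_8);

        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        // parse the fetched robots.txt bytes for the agent name "mybot"
        BaseRobotRules rules = parser.parseContent("https://www.example.com/robots.txt",
                        robotsTxt, "text/plain", "mybot");

        System.out.println(rules.isAllowed("https://www.example.com/private/page.html")); // false
        System.out.println(rules.isAllowed("https://www.example.com/index.html"));        // true
    }
}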
@@ -492,7 +494,17 @@ private SimpleRobotRules parseContent(String url, byte[] content, String content

int bytesLen = content.length;
int offset = 0;
Charset encoding = StandardCharsets.US_ASCII;

/*
* RFC 9309 requires that the robots.txt file is "UTF-8 encoded" (<a href=
* "https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method"> RFC
* 9309, section 2.3 Access Method</a>), but
* "Implementors MAY bridge encoding mismatches if they detect that the robots.txt file is not UTF-8 encoded."
* (<a href=
* "https://www.rfc-editor.org/rfc/rfc9309.html#name-the-allow-and-disallow-line"
* > RFC 9309, section 2.2.2. The "Allow" and "Disallow" Lines</a>)
*/
Charset encoding = StandardCharsets.UTF_8;

// Check for a UTF-8 BOM at the beginning (EF BB BF)
if ((bytesLen >= 3) && (content[0] == (byte) 0xEF) && (content[1] == (byte) 0xBB) && (content[2] == (byte) 0xBF)) {
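The new comment cites RFC 9309's requirement that robots.txt be UTF-8 encoded, while implementors "MAY bridge encoding mismatches". One common way to bridge a mismatch, shown here only as a sketch and not as the PR's actual decoding code, is to decode with a CharsetDecoder that replaces malformed byte sequences instead of aborting, so a handful of bad bytes cannot invalidate the remaining rules:

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;

class RobotsTxtDecoding {
    /** Decode robots.txt bytes as UTF-8, substituting U+FFFD for malformed sequences. */
    static String decodeRobotsTxt(byte[] content) {
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
                        .onMalformedInput(CodingErrorAction.REPLACE)
                        .onUnmappableCharacter(CodingErrorAction.REPLACE);
        try {
            CharBuffer chars = decoder.decode(ByteBuffer.wrap(content));
            return chars.toString();
        } catch (CharacterCodingException e) {
            // unreachable with REPLACE actions, but decode() declares the checked exception
            return new String(content, StandardCharsets.UTF_8);
        }
    }
}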
@@ -519,11 +531,11 @@ else if ((bytesLen >= 2) && (content[0] == (byte) 0xFE) && (content[1] == (byte)
// Decide if we need to do special HTML processing.
boolean isHtmlType = ((contentType != null) && contentType.toLowerCase(Locale.ROOT).startsWith("text/html"));

// If it looks like it contains HTML, but doesn't have a user agent
// field, then
// assume somebody messed up and returned back to us a random HTML page
// instead
// of a robots.txt file.
/*
* If it looks like it contains HTML, but doesn't have a user agent
* field, then assume somebody messed up and returned back to us a
* random HTML page instead of a robots.txt file.
*/
boolean hasHTML = false;
if (isHtmlType || SIMPLE_HTML_PATTERN.matcher(contentAsStr).find()) {
if (!USER_AGENT_PATTERN.matcher(contentAsStr).find()) {
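SIMPLE_HTML_PATTERN and USER_AGENT_PATTERN are existing fields of the parser and their definitions are not part of this diff. A self-contained sketch of the heuristic described in the rewrapped comment, using hypothetical stand-in patterns, might look like this:

import java.util.Locale;
import java.util.regex.Pattern;

class HtmlHeuristic {
    // hypothetical stand-ins for the parser's SIMPLE_HTML_PATTERN and USER_AGENT_PATTERN
    static final Pattern HTML_TAG = Pattern.compile("(?i)<(html|head|body)[\\s>]");
    static final Pattern USER_AGENT = Pattern.compile("(?i)user-agent:");

    /** True if the content looks like an HTML page served in place of a robots.txt file. */
    static boolean looksLikeMisservedHtml(String contentAsStr, String contentType) {
        boolean isHtmlType = (contentType != null)
                        && contentType.toLowerCase(Locale.ROOT).startsWith("text/html");
        if (isHtmlType || HTML_TAG.matcher(contentAsStr).find()) {
            // HTML markup but no user-agent line: assume it is not a robots.txt file
            return !USER_AGENT.matcher(contentAsStr).find();
        }
        return false;
    }
}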
@@ -550,12 +562,12 @@ else if ((bytesLen >= 2) && (content[0] == (byte) 0xFE) && (content[1] == (byte)
while (lineParser.hasMoreTokens()) {
String line = lineParser.nextToken();

// Get rid of HTML markup, in case some brain-dead webmaster has
// created an HTML
// page for robots.txt. We could do more sophisticated processing
// here to better
// handle bad HTML, but that's a very tiny percentage of all
// robots.txt files.
/*
* Get rid of HTML markup, in case some brain-dead webmaster has
* created an HTML page for robots.txt. We could do more
* sophisticated processing here to better handle bad HTML, but
* that's a very tiny percentage of all robots.txt files.
*/
if (hasHTML) {
line = line.replaceAll("<[^>]+>", "");
}
@@ -855,9 +867,8 @@ private void handleCrawlDelay(ParseState state, RobotToken token) {
double delayValue = Double.parseDouble(delayString) * 1000.0;
state.setCrawlDelay(Math.round(delayValue));
} else {
long delayValue = Integer.parseInt(delayString) * 1000L; // sec
// to
// millisec
// seconds to milliseconds
long delayValue = Integer.parseInt(delayString) * 1000L;
state.setCrawlDelay(delayValue);
}
} catch (Exception e) {
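The Crawl-delay handling in this hunk converts a value given in seconds (optionally fractional) to milliseconds. A standalone sketch of that conversion, assuming the branch is selected by the presence of a decimal point (the condition itself is above the visible hunk), and not the class's actual method:

class CrawlDelayConversion {
    /** Convert a Crawl-delay value in seconds (possibly fractional) to milliseconds. */
    static long crawlDelayToMillis(String delayString) {
        if (delayString.indexOf('.') != -1) {
            double delayValue = Double.parseDouble(delayString) * 1000.0; // fractional seconds
            return Math.round(delayValue);
        }
        return Integer.parseInt(delayString) * 1000L; // whole seconds to milliseconds
    }
}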
@@ -22,6 +22,7 @@

import java.io.InputStream;
import java.net.HttpURLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
@@ -226,6 +227,44 @@ void testNonAsciiEncoding() {
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}

@Test
void testUnicodeUnescapedPaths() {
final String simpleRobotsTxt = "User-agent: *" + CRLF //
+ "Disallow: /bücher/" + CRLF //
+ "Disallow: /k%C3%B6nyvek/" + CRLF //
+ CRLF //
+ "User-agent: GoodBot" + CRLF //
+ "Allow: /";

BaseRobotRules rules = createRobotRules("mybot", simpleRobotsTxt);
assertTrue(rules.isAllowed("https://www.example.com/"));

// test using escaped and unescaped URLs
assertFalse(rules.isAllowed("https://www.example.com/b%C3%BCcher/book1.html"));
assertFalse(rules.isAllowed("https://www.example.com/bücher/book2.html"));

// (for completeness) check also escaped path in robots.txt
assertFalse(rules.isAllowed("https://www.example.com/k%C3%B6nyvek/book1.html"));
assertFalse(rules.isAllowed("https://www.example.com/könyvek/book2.html"));

// test invalid encoding: invalid encoded characters should not break
// parsing of rules below
rules = createRobotRules("goodbot", simpleRobotsTxt.getBytes(StandardCharsets.ISO_8859_1));
assertTrue(rules.isAllowed("https://www.example.com/"));
assertTrue(rules.isAllowed("https://www.example.com/b%C3%BCcher/book1.html"));

// test invalid encoding: only rules with invalid characters should be
// ignored
rules = createRobotRules("mybot", simpleRobotsTxt.getBytes(StandardCharsets.ISO_8859_1));
assertTrue(rules.isAllowed("https://www.example.com/"));
assertFalse(rules.isAllowed("https://www.example.com/k%C3%B6nyvek/book1.html"));
assertFalse(rules.isAllowed("https://www.example.com/könyvek/book2.html"));
// if URL paths in disallow rules are not properly encoded, these two
// URLs are not matched:
// assertFalse(rules.isAllowed("https://www.example.com/b%C3%BCcher/book2.html"));
// assertFalse(rules.isAllowed("https://www.example.com/bücher/book1.html"));
}
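The commented-out assertions mark the remaining gap: a rule written as /bücher/ only matches both the escaped and unescaped form of a URL if rule paths and URL paths are normalized to the same representation. One way to achieve that, sketched here under the assumption that paths are compared in their percent-encoded UTF-8 form (this is not the parser's actual normalization code), is to percent-encode every non-ASCII byte of the path:

import java.nio.charset.StandardCharsets;

class PathNormalization {
    /** Percent-encode the non-ASCII bytes of a path using its UTF-8 representation. */
    static String percentEncodeNonAscii(String path) {
        StringBuilder sb = new StringBuilder();
        for (byte b : path.getBytes(StandardCharsets.UTF_8)) {
            int v = b & 0xFF;
            if (v > 0x7F) {
                sb.append('%').append(String.format("%02X", v));
            } else {
                sb.append((char) v);
            }
        }
        return sb.toString();
    }
}

With such a helper, percentEncodeNonAscii("/bücher/") yields "/b%C3%BCcher/", so both spellings of a rule or URL path normalize to the same string, which is the equivalence this test exercises.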

@Test
void testSimplestAllowAll() {
final String simpleRobotsTxt = "User-agent: *" + CRLF //