Skip to content

Commit

Permalink
RFC compliance: matching user-agent names when selecting rule blocks
Browse files Browse the repository at this point in the history
- match user-agent product tokens followed by ignored characters
  also in legacy prefix matching mode, e.g. match "butterfly" in
  "User-agent: Butterfly/1.0"
- refactor prefix matching: switch inner and outer loop, handle the
  check for the (common) wildcard user-agent outside of the loop
  • Loading branch information
sebastian-nagel committed Apr 22, 2023
1 parent 1099206 commit c57d716
Showing 1 changed file with 31 additions and 21 deletions.
52 changes: 31 additions & 21 deletions src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -626,8 +626,7 @@ else if ((bytesLen >= 2) && (content[0] == (byte) 0xFE) && (content[1] == (byte)
SimpleRobotRules result = parseState.getRobotRules();
if (result.getCrawlDelay() > _maxCrawlDelay) {
// Some evil sites use a value like 3600 (seconds) for the crawl
// delay, which would
// cause lots of problems for us.
// delay, which would cause lots of problems for us.
LOGGER.debug("Crawl delay exceeds max value - so disallowing all URLs: {}", url);
return new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
} else {
Expand Down Expand Up @@ -720,35 +719,46 @@ private void handleUserAgent(ParseState state, RobotToken token) {
}
} else {
/*
* prefix matching on user-agent words - backward-compatibility to
* prefix matching on user-agent words - backward-compatibility with
* the old and deprecated API if "User-Agent" HTTP request header
* strings are passed as param instead of single-word/token
user-agent names, e.g. if the robot name is "mybot/1.0" and is
expected to match the robots.txt directive "User-agent: mybot"
or also "User-agent: my"
* or also "User-agent: my".
*/
// TODO: even this should be fixed: the robot name butterfly should match "Butterfly/1.0" in the user-agent line
for (String targetName : targetNames) {
LOGGER.debug(targetName);
// TODO KKr - catch case of multiple names, log as non-standard.
String[] agentNames = ROBOT_NAMES_SPLIT.split(token.getData().trim().toLowerCase(Locale.ROOT));
String agentNameFull = token.getData().trim().toLowerCase(Locale.ROOT);
boolean matched = false;
if (agentNameFull.equals("*") && !state.isMatchedRealName()) {
state.setMatchedWildcard(true);
state.setAddingRules(true);
} else if (userAgentProductTokenPrefixMatch(agentNameFull, targetNames)) {
// match "butterfly" in the line "User-agent: Butterfly/1.0"
matched = true;
} else {
String[] agentNames = ROBOT_NAMES_SPLIT.split(agentNameFull);
if (agentNames.length > 1) {
LOGGER.debug("Multiple agent names in user-agent line: {}", token.getData());
}
for (String agentName : agentNames) {
if (agentName.equals("*") && !state.isMatchedRealName()) {
state.setMatchedWildcard(true);
state.setAddingRules(true);
} else if (targetName.startsWith(agentName)) {
if (state.isMatchedWildcard()) {
// Clear rules of the wildcard user-agent found
// before the non-wildcard user-agent match.
state.clearRules();
for (String targetName : targetNames) {
LOGGER.debug(targetName);
if (targetName.startsWith(agentName)) {
matched = true;
break;
}
state.setMatchedRealName(true);
state.setAddingRules(true);
state.setMatchedWildcard(false);
break;
}
}
}
if (matched) {
if (state.isMatchedWildcard()) {
// Clear rules of the wildcard user-agent found
// before the non-wildcard user-agent match.
state.clearRules();
}
state.setMatchedRealName(true);
state.setAddingRules(true);
state.setMatchedWildcard(false);
}
}
}

Expand Down

0 comments on commit c57d716

Please sign in to comment.