// Copyright 1999 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// -----------------------------------------------------------------------------
// File: robots.cc
// -----------------------------------------------------------------------------
//
// Implements expired internet draft
// http://www.robotstxt.org/norobots-rfc.txt
// with Google-specific optimizations detailed at
// https://developers.google.com/search/reference/robots_txt
#include "robots.h"
#include <assert.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <cstddef>
#include <vector>
#include "absl/base/macros.h"
#include "absl/container/fixed_array.h"
#include "absl/strings/ascii.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/string_view.h"
// Allow for typos such as DISALOW in robots.txt.
static bool kAllowFrequentTypos = true;
namespace googlebot {
// A RobotsMatchStrategy defines a strategy for matching individual lines in a
// robots.txt file. Each Match* method should return a match priority, which is
// interpreted as:
//
// match priority < 0:
// No match.
//
// match priority == 0:
// Match, but treat it as if it matched an empty pattern.
//
// match priority > 0:
// Match.
class RobotsMatchStrategy {
public:
virtual ~RobotsMatchStrategy() {}
virtual int MatchAllow(absl::string_view path,
absl::string_view pattern) = 0;
virtual int MatchDisallow(absl::string_view path,
absl::string_view pattern) = 0;
protected:
// Implements robots.txt pattern matching.
static bool Matches(absl::string_view path, absl::string_view pattern);
};
// Returns true if URI path matches the specified pattern. Pattern is anchored
// at the beginning of path. '$' is special only at the end of pattern.
//
// Since 'path' and 'pattern' are both externally determined (by the webmaster),
// we make sure to have acceptable worst-case performance.
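//
// Some illustrative outcomes (examples added for clarity; not exhaustive):
//   Matches("/fish/salmon.html", "/fish")       -> true  (prefix match)
//   Matches("/folder/page.php", "/*.php$")      -> true  ('$' anchors the end)
//   Matches("/folder/page.php?x=1", "/*.php$")  -> false (query follows ".php")
//   Matches("/Fish.asp", "/fish")               -> false (matching is case-sensitive)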
/* static */ bool RobotsMatchStrategy::Matches(
absl::string_view path, absl::string_view pattern) {
const size_t pathlen = path.length();
absl::FixedArray<size_t> pos(pathlen + 1);
int numpos;
// The pos[] array holds a sorted list of indexes of 'path', with length
// 'numpos'. At the start and end of each iteration of the main loop below,
// the pos[] array will hold a list of the prefixes of the 'path' which can
// match the current prefix of 'pattern'. If this list is ever empty,
// return false. If we reach the end of 'pattern' with at least one element
// in pos[], return true.
pos[0] = 0;
numpos = 1;
for (auto pat = pattern.begin(); pat != pattern.end(); ++pat) {
if (*pat == '$' && pat + 1 == pattern.end()) {
return (pos[numpos-1] == pathlen);
}
if (*pat == '*') {
numpos = pathlen - pos[0] + 1;
for (int i = 1; i < numpos; i++) {
pos[i] = pos[i-1] + 1;
}
} else {
// Includes '$' when not at end of pattern.
int newnumpos = 0;
const size_t len = path.length();
for (int i = 0; i < numpos; i++) {
if (pos[i] < len && path[pos[i]] == *pat) {
pos[newnumpos++] = pos[i] + 1;
}
}
numpos = newnumpos;
if (numpos == 0) return false;
}
}
return true;
}
static const char* kHexDigits = "0123456789ABCDEF";
// GetPathParamsQuery is not in anonymous namespace to allow testing.
//
// Extracts path (with params) and query part from URL. Removes scheme,
// authority, and fragment. Result always starts with "/".
// Returns "/" if the url doesn't have a path or is not valid.
std::string GetPathParamsQuery(const std::string& url) {
std::string path;
// Initial two slashes are ignored.
size_t search_start = 0;
if (url.size() >= 2 && url[0] == '/' && url[1] == '/') search_start = 2;
size_t early_path = url.find_first_of("/?;", search_start);
size_t protocol_end = url.find("://", search_start);
if (early_path < protocol_end) {
// If path, param or query starts before ://, :// doesn't indicate protocol.
protocol_end = std::string::npos;
}
if (protocol_end == std::string::npos) {
protocol_end = search_start;
} else {
protocol_end += 3;
}
size_t path_start = url.find_first_of("/?;", protocol_end);
if (path_start != std::string::npos) {
size_t hash_pos = url.find('#', search_start);
if (hash_pos < path_start) return "/";
size_t path_end = (hash_pos == std::string::npos) ? url.size() : hash_pos;
if (url[path_start] != '/') {
// Prepend a slash if the result would start e.g. with '?'.
return "/" + url.substr(path_start, path_end - path_start);
}
return url.substr(path_start, path_end - path_start);
}
return "/";
}
// MaybeEscapePattern is not in anonymous namespace to allow testing.
//
// Canonicalize the allowed/disallowed paths. For example:
// /SanJoséSellers ==> /SanJos%C3%A9Sellers
// %aa ==> %AA
// When the function returns, (*dst) either points to src, or is newly
// allocated.
// Returns true if dst was newly allocated.
bool MaybeEscapePattern(const char* src, char** dst) {
int num_to_escape = 0;
bool need_capitalize = false;
// First, scan the buffer to see if changes are needed. Most don't.
for (int i = 0; src[i] != 0; i++) {
// (a) % escape sequence.
if (src[i] == '%' &&
absl::ascii_isxdigit(src[i+1]) && absl::ascii_isxdigit(src[i+2])) {
if (absl::ascii_islower(src[i+1]) || absl::ascii_islower(src[i+2])) {
need_capitalize = true;
}
i += 2;
// (b) needs escaping.
} else if (src[i] & 0x80) {
num_to_escape++;
}
// (c) Already escaped and escape-characters normalized (eg. %2f -> %2F).
}
// Return if no changes needed.
if (!num_to_escape && !need_capitalize) {
(*dst) = const_cast<char*>(src);
return false;
}
(*dst) = new char[num_to_escape * 2 + strlen(src) + 1];
int j = 0;
for (int i = 0; src[i] != 0; i++) {
// (a) Normalize %-escaped sequence (eg. %2f -> %2F).
if (src[i] == '%' &&
absl::ascii_isxdigit(src[i+1]) && absl::ascii_isxdigit(src[i+2])) {
(*dst)[j++] = src[i++];
(*dst)[j++] = absl::ascii_toupper(src[i++]);
(*dst)[j++] = absl::ascii_toupper(src[i]);
// (b) %-escape octets whose highest bit is set. These are outside the
// ASCII range.
} else if (src[i] & 0x80) {
(*dst)[j++] = '%';
(*dst)[j++] = kHexDigits[(src[i] >> 4) & 0xf];
(*dst)[j++] = kHexDigits[src[i] & 0xf];
// (c) Normal character, no modification needed.
} else {
(*dst)[j++] = src[i];
}
}
(*dst)[j] = 0;
return true;
}
// Internal helper classes and functions.
namespace {
// A robots.txt has lines of key/value pairs. A ParsedRobotsKey represents
// a key. This class can parse a text representation of a key (including
// common typos) and convert it to an enumeration, which allows for faster
// processing afterwards.
// For unparsable keys, the original string representation is kept.
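//
// Illustrative examples (added for clarity; typo handling depends on
// kAllowFrequentTypos above):
//   "user-agent"  -> USER_AGENT
//   "disalow"     -> DISALLOW   (common typo, accepted)
//   "crawl-delay" -> UNKNOWN    (text available via GetUnknownText())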
class ParsedRobotsKey {
public:
enum KeyType {
// Generic highlevel fields.
USER_AGENT,
SITEMAP,
// Fields within a user-agent.
ALLOW,
DISALLOW,
// Unrecognized field; kept as-is. High number so that additions to the
// enumeration above do not change the serialization.
UNKNOWN = 128
};
ParsedRobotsKey() : type_(UNKNOWN) {}
// Disallow copying and assignment.
ParsedRobotsKey(const ParsedRobotsKey&) = delete;
ParsedRobotsKey& operator=(const ParsedRobotsKey&) = delete;
// Parse given key text. Does not copy the text, so the caller must keep it
// valid for the object's lifetime or until the next Parse() call.
void Parse(absl::string_view key);
// Returns the type of key.
KeyType type() const { return type_; }
// If this is an unknown key, get the text.
absl::string_view GetUnknownText() const;
private:
static bool KeyIsUserAgent(absl::string_view key);
static bool KeyIsAllow(absl::string_view key);
static bool KeyIsDisallow(absl::string_view key);
static bool KeyIsSitemap(absl::string_view key);
KeyType type_;
absl::string_view key_text_;
};
void EmitKeyValueToHandler(int line, const ParsedRobotsKey& key,
absl::string_view value,
RobotsParseHandler* handler) {
typedef ParsedRobotsKey Key;
switch (key.type()) {
case Key::USER_AGENT: handler->HandleUserAgent(line, value); break;
case Key::ALLOW: handler->HandleAllow(line, value); break;
case Key::DISALLOW: handler->HandleDisallow(line, value); break;
case Key::SITEMAP: handler->HandleSitemap(line, value); break;
case Key::UNKNOWN:
handler->HandleUnknownAction(line, key.GetUnknownText(), value);
break;
// No default case, so the compiler warns about unhandled new Key:: values.
}
}
class RobotsTxtParser {
public:
typedef ParsedRobotsKey Key;
RobotsTxtParser(absl::string_view robots_body,
RobotsParseHandler* handler)
: robots_body_(robots_body), handler_(handler) {
}
void Parse();
private:
static bool GetKeyAndValueFrom(char ** key, char **value, char *line);
static void StripWhitespaceSlowly(char ** s);
void ParseAndEmitLine(int current_line, char* line);
bool NeedEscapeValueForKey(const Key& key);
absl::string_view robots_body_;
RobotsParseHandler* const handler_;
};
bool RobotsTxtParser::NeedEscapeValueForKey(const Key& key) {
switch (key.type()) {
case RobotsTxtParser::Key::USER_AGENT:
case RobotsTxtParser::Key::SITEMAP:
return false;
default:
return true;
}
}
// Removes leading and trailing whitespace from null-terminated string s.
/* static */ void RobotsTxtParser::StripWhitespaceSlowly(char ** s) {
absl::string_view stripped = absl::StripAsciiWhitespace(*s);
*s = const_cast<char*>(stripped.data());
(*s)[stripped.size()] = '\0';
}
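// Extracts the key and value from a robots.txt line, e.g. (illustrative):
//   "Disallow: /private  # comment"  -> key "Disallow", value "/private"
//   "user-agent googlebot"           -> key "user-agent", value "googlebot"
//     (Google-specific leniency: whitespace accepted in place of the colon)
// Returns false if no key/value pair could be found.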
bool RobotsTxtParser::GetKeyAndValueFrom(char ** key, char ** value,
char * line) {
// Remove comments from the current robots.txt line.
char* const comment = strchr(line, '#');
if (nullptr != comment) {
*comment = '\0';
}
StripWhitespaceSlowly(&line);
// Rules must match the following pattern:
// <key>[ \t]*:[ \t]*<value>
char* sep = strchr(line, ':');
if (nullptr == sep) {
// Google-specific optimization: some people forget the colon, so we need to
// accept whitespace in its stead.
static const char * const kWhite = " \t";
sep = strpbrk(line, kWhite);
if (nullptr != sep) {
const char* const val = sep + strspn(sep, kWhite);
assert(*val); // since we dropped trailing whitespace above.
if (nullptr != strpbrk(val, kWhite)) {
// We only accept whitespace as a separator if there are exactly two
// sequences of non-whitespace characters. If we get here, there were
// more than 2 such sequences since we stripped trailing whitespace
// above.
return false;
}
}
}
if (nullptr == sep) {
return false; // Couldn't find a separator.
}
*key = line; // Key starts at beginning of line.
*sep = '\0'; // And stops at the separator.
StripWhitespaceSlowly(key); // Get rid of any trailing whitespace.
if (strlen(*key) > 0) {
*value = 1 + sep; // Value starts after the separator.
StripWhitespaceSlowly(value); // Get rid of any leading whitespace.
return true;
}
return false;
}
void RobotsTxtParser::ParseAndEmitLine(int current_line, char* line) {
char* string_key;
char* value;
if (!GetKeyAndValueFrom(&string_key, &value, line)) {
return;
}
Key key;
key.Parse(string_key);
if (NeedEscapeValueForKey(key)) {
char* escaped_value = nullptr;
const bool is_escaped = MaybeEscapePattern(value, &escaped_value);
EmitKeyValueToHandler(current_line, key, escaped_value, handler_);
if (is_escaped) delete[] escaped_value;
} else {
EmitKeyValueToHandler(current_line, key, value, handler_);
}
}
void RobotsTxtParser::Parse() {
// UTF-8 byte order marks.
static const unsigned char utf_bom[3] = {0xEF, 0xBB, 0xBF};
// Certain browsers limit the URL length to 2083 bytes. In a robots.txt, it's
// fairly safe to assume that any valid line won't be longer than many times
// that maximum URL length of ~2KB. We add some padding for UTF-8 encoding,
// trailing nulls, etc., though a much smaller bound would also work.
// Characters on a line beyond this limit are ignored.
const int kMaxLineLen = 2083 * 8;
// Allocate a buffer used to process the current line.
char* const line_buffer = new char[kMaxLineLen];
// line_buffer_end is the last writable position within the line buffer
// (only a final '\0' may go there).
const char* const line_buffer_end = line_buffer + kMaxLineLen - 1;
char* line_pos = line_buffer;
int line_num = 0;
size_t bom_pos = 0;
bool last_was_carriage_return = false;
handler_->HandleRobotsStart();
{
for (const unsigned char ch : robots_body_) {
ABSL_ASSERT(line_pos <= line_buffer_end);
// Google-specific optimization: UTF-8 byte order marks should never
// appear in a robots.txt file, but they do nevertheless. Skip a possible
// BOM prefix in the first bytes of the input.
if (bom_pos < sizeof(utf_bom) && ch == utf_bom[bom_pos++]) {
continue;
}
bom_pos = sizeof(utf_bom);
if (ch != 0x0A && ch != 0x0D) { // Non-line-ending char case.
// Put in next spot on current line, as long as there's room.
if (line_pos < line_buffer_end) {
*(line_pos++) = ch;
}
} else { // Line-ending char case.
*line_pos = '\0';
// Only emit an empty line if it was not due to the second character of the
// DOS line ending "\r\n".
const bool is_CRLF_continuation =
(line_pos == line_buffer) && last_was_carriage_return && ch == 0x0A;
if (!is_CRLF_continuation) {
ParseAndEmitLine(++line_num, line_buffer);
}
line_pos = line_buffer;
last_was_carriage_return = (ch == 0x0D);
}
}
}
*line_pos = '\0';
ParseAndEmitLine(++line_num, line_buffer);
handler_->HandleRobotsEnd();
delete [] line_buffer;
}
// Implements the default robots.txt matching strategy. The maximum number of
// characters matched by a pattern is returned as its match priority.
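// For example (illustrative): for path "/folder/page", the pattern "/folder/"
// matches with priority 8 and "/folder/page" with priority 12; RobotsMatcher
// later picks the rule with the higher priority.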
class LongestMatchRobotsMatchStrategy : public RobotsMatchStrategy {
public:
LongestMatchRobotsMatchStrategy() { }
// Disallow copying and assignment.
LongestMatchRobotsMatchStrategy(const LongestMatchRobotsMatchStrategy&) =
delete;
LongestMatchRobotsMatchStrategy& operator=(
const LongestMatchRobotsMatchStrategy&) = delete;
int MatchAllow(absl::string_view path, absl::string_view pattern) override;
int MatchDisallow(absl::string_view path, absl::string_view pattern) override;
};
} // end anonymous namespace
void ParseRobotsTxt(absl::string_view robots_body,
RobotsParseHandler* parse_callback) {
RobotsTxtParser parser(robots_body, parse_callback);
parser.Parse();
}
RobotsMatcher::RobotsMatcher()
: seen_global_agent_(false),
seen_specific_agent_(false),
ever_seen_specific_agent_(false),
seen_separator_(false),
path_(nullptr),
user_agents_(nullptr) {
match_strategy_ = new LongestMatchRobotsMatchStrategy();
}
RobotsMatcher::~RobotsMatcher() {
delete match_strategy_;
}
bool RobotsMatcher::ever_seen_specific_agent() const {
return ever_seen_specific_agent_;
}
void RobotsMatcher::InitUserAgentsAndPath(
const std::vector<std::string>* user_agents, const char* path) {
// The RobotsMatcher object doesn't own path_ or user_agents_, so overwriting
// these pointers doesn't cause a memory leak.
path_ = path;
ABSL_ASSERT('/' == *path_);
user_agents_ = user_agents;
}
bool RobotsMatcher::AllowedByRobots(absl::string_view robots_body,
const std::vector<std::string>* user_agents,
const std::string& url) {
// The url is not normalized (escaped, percent encoded) here because the user
// is asked to provide it in escaped form already.
std::string path = GetPathParamsQuery(url);
InitUserAgentsAndPath(user_agents, path.c_str());
ParseRobotsTxt(robots_body, this);
return !disallow();
}
bool RobotsMatcher::OneAgentAllowedByRobots(absl::string_view robots_txt,
const std::string& user_agent,
const std::string& url) {
std::vector<std::string> v;
v.push_back(user_agent);
return AllowedByRobots(robots_txt, &v, url);
}
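// Illustrative usage of AllowedByRobots / OneAgentAllowedByRobots above
// (a minimal sketch added for clarity, not part of the original source):
//   googlebot::RobotsMatcher matcher;
//   bool allowed = matcher.OneAgentAllowedByRobots(
//       "user-agent: FooBot\ndisallow: /x/\n",  // robots.txt body
//       "FooBot",                               // user-agent to check
//       "https://example.com/x/y.html");        // URL; yields false here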
bool RobotsMatcher::disallow() const {
if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0) {
return (disallow_.specific.priority() > allow_.specific.priority());
}
if (ever_seen_specific_agent_) {
// There is a matching group for the user-agent, but it either has no
// disallow rules or only empty ones, i.e. priority == 0.
return false;
}
if (disallow_.global.priority() > 0 || allow_.global.priority() > 0) {
return disallow_.global.priority() > allow_.global.priority();
}
return false;
}
bool RobotsMatcher::disallow_ignore_global() const {
if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0) {
return disallow_.specific.priority() > allow_.specific.priority();
}
return false;
}
const int RobotsMatcher::matching_line() const {
if (ever_seen_specific_agent_) {
return Match::HigherPriorityMatch(disallow_.specific, allow_.specific)
.line();
}
return Match::HigherPriorityMatch(disallow_.global, allow_.global).line();
}
void RobotsMatcher::HandleRobotsStart() {
// This is a new robots.txt file, so we need to reset all the instance member
// variables. We do it in the same order the instance member variables are
// declared, so it's easier to keep track of which ones we have (or maybe
// haven't!) done.
allow_.Clear();
disallow_.Clear();
seen_global_agent_ = false;
seen_specific_agent_ = false;
ever_seen_specific_agent_ = false;
seen_separator_ = false;
}
/*static*/ absl::string_view RobotsMatcher::ExtractUserAgent(
absl::string_view user_agent) {
// Allowed characters in user-agent are [a-zA-Z_-].
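// For example (illustrative), "Googlebot/2.1" yields "Googlebot" and
// "Mozilla/5.0" yields "Mozilla"; digits and '/' end the extraction.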
const char* end = user_agent.data();
while (absl::ascii_isalpha(*end) || *end == '-' || *end == '_') {
++end;
}
return user_agent.substr(0, end - user_agent.data());
}
/*static*/ bool RobotsMatcher::IsValidUserAgentToObey(
absl::string_view user_agent) {
return user_agent.length() > 0 && ExtractUserAgent(user_agent) == user_agent;
}
void RobotsMatcher::HandleUserAgent(int line_num,
absl::string_view user_agent) {
if (seen_separator_) {
seen_specific_agent_ = seen_global_agent_ = seen_separator_ = false;
}
// Google-specific optimization: a '*' followed by space and more characters
// in a user-agent record is still regarded as a global rule.
if (user_agent.length() >= 1 && user_agent[0] == '*' &&
(user_agent.length() == 1 || isspace(user_agent[1]))) {
seen_global_agent_ = true;
} else {
user_agent = ExtractUserAgent(user_agent);
for (const auto& agent : *user_agents_) {
if (absl::EqualsIgnoreCase(user_agent, agent)) {
ever_seen_specific_agent_ = seen_specific_agent_ = true;
break;
}
}
}
}
void RobotsMatcher::HandleAllow(int line_num, absl::string_view value) {
if (!seen_any_agent()) return;
seen_separator_ = true;
const int priority = match_strategy_->MatchAllow(path_, value);
if (priority >= 0) {
if (seen_specific_agent_) {
if (allow_.specific.priority() < priority) {
allow_.specific.Set(priority, line_num);
}
} else {
assert(seen_global_agent_);
if (allow_.global.priority() < priority) {
allow_.global.Set(priority, line_num);
}
}
} else {
// Google-specific optimization: 'index.htm' and 'index.html' are normalized
// to '/'.
const size_t slash_pos = value.find_last_of('/');
if (slash_pos != absl::string_view::npos &&
absl::StartsWith(absl::ClippedSubstr(value, slash_pos),
"/index.htm")) {
const int len = slash_pos + 1;
absl::FixedArray<char> newpattern(len + 1);
strncpy(newpattern.data(), value.data(), len);
newpattern[len] = '$';
HandleAllow(line_num,
absl::string_view(newpattern.data(), newpattern.size()));
}
}
}
void RobotsMatcher::HandleDisallow(int line_num, absl::string_view value) {
if (!seen_any_agent()) return;
seen_separator_ = true;
const int priority = match_strategy_->MatchDisallow(path_, value);
if (priority >= 0) {
if (seen_specific_agent_) {
if (disallow_.specific.priority() < priority) {
disallow_.specific.Set(priority, line_num);
}
} else {
assert(seen_global_agent_);
if (disallow_.global.priority() < priority) {
disallow_.global.Set(priority, line_num);
}
}
}
}
int LongestMatchRobotsMatchStrategy::MatchAllow(absl::string_view path,
absl::string_view pattern) {
return Matches(path, pattern) ? pattern.length() : -1;
}
int LongestMatchRobotsMatchStrategy::MatchDisallow(absl::string_view path,
absl::string_view pattern) {
return Matches(path, pattern) ? pattern.length() : -1;
}
void RobotsMatcher::HandleSitemap(int line_num, absl::string_view value) {
seen_separator_ = true;
}
void RobotsMatcher::HandleUnknownAction(int line_num, absl::string_view action,
absl::string_view value) {
seen_separator_ = true;
}
void ParsedRobotsKey::Parse(absl::string_view key) {
key_text_ = absl::string_view();
if (KeyIsUserAgent(key)) {
type_ = USER_AGENT;
} else if (KeyIsAllow(key)) {
type_ = ALLOW;
} else if (KeyIsDisallow(key)) {
type_ = DISALLOW;
} else if (KeyIsSitemap(key)) {
type_ = SITEMAP;
} else {
type_ = UNKNOWN;
key_text_ = key;
}
}
absl::string_view ParsedRobotsKey::GetUnknownText() const {
ABSL_ASSERT(type_ == UNKNOWN && !key_text_.empty());
return key_text_;
}
bool ParsedRobotsKey::KeyIsUserAgent(absl::string_view key) {
return (
absl::StartsWithIgnoreCase(key, "user-agent") ||
(kAllowFrequentTypos && (absl::StartsWithIgnoreCase(key, "useragent") ||
absl::StartsWithIgnoreCase(key, "user agent"))));
}
bool ParsedRobotsKey::KeyIsAllow(absl::string_view key) {
return absl::StartsWithIgnoreCase(key, "allow");
}
bool ParsedRobotsKey::KeyIsDisallow(absl::string_view key) {
return (
absl::StartsWithIgnoreCase(key, "disallow") ||
(kAllowFrequentTypos && ((absl::StartsWithIgnoreCase(key, "dissallow")) ||
(absl::StartsWithIgnoreCase(key, "dissalow")) ||
(absl::StartsWithIgnoreCase(key, "disalow")) ||
(absl::StartsWithIgnoreCase(key, "diasllow")) ||
(absl::StartsWithIgnoreCase(key, "disallaw")))));
}
bool ParsedRobotsKey::KeyIsSitemap(absl::string_view key) {
return ((absl::StartsWithIgnoreCase(key, "sitemap")) ||
(absl::StartsWithIgnoreCase(key, "site-map")));
}
} // namespace googlebot