Skip to content
Browse files

Fix the matching rule for known tokens, which performed prefix matching.
  • Loading branch information...
1 parent 84148f0 commit d350aec2c5102c230d4c668ead97339394d74473 @knu knu committed Jan 30, 2011
Showing with 18 additions and 13 deletions.
  1. +7 −6 lib/webrobots/robotstxt.rb
  2. +7 −6 lib/webrobots/robotstxt.ry
  3. +4 −1 test/test_webrobots.rb
View
13 lib/webrobots/robotstxt.rb
@@ -34,7 +34,7 @@ def parse!(input, site)
end
KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
- RE_KNOWN_TOKENS = /#{KNOWN_TOKENS.join('|')}/i
+ RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i
def parse(input, site)
@q = []
@@ -71,14 +71,15 @@ def parse(input, site)
parse_error @lineno, "unexpected characters: %s" % s.check(/.*/)
end
value_expected = false
- else
- if t = s.scan(RE_KNOWN_TOKENS)
+ elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
+ case t
+ when RE_KNOWN_TOKENS
@q << [t.downcase, t]
- elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
- @q << [:TOKEN, t]
else
- parse_error "unexpected characters: %s" % s.check(/.*/)
+ @q << [:TOKEN, t]
end
+ else
+ parse_error "unexpected characters: %s" % s.check(/.*/)
end
end
end
View
13 lib/webrobots/robotstxt.ry
@@ -174,7 +174,7 @@ class WebRobots
end
KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
- RE_KNOWN_TOKENS = /#{KNOWN_TOKENS.join('|')}/i
+ RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i
def parse(input, site)
@q = []
@@ -211,14 +211,15 @@ class WebRobots
parse_error @lineno, "unexpected characters: %s" % s.check(/.*/)
end
value_expected = false
- else
- if t = s.scan(RE_KNOWN_TOKENS)
+ elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
+ case t
+ when RE_KNOWN_TOKENS
@q << [t.downcase, t]
- elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
- @q << [:TOKEN, t]
else
- parse_error "unexpected characters: %s" % s.check(/.*/)
+ @q << [:TOKEN, t]
end
+ else
+ parse_error "unexpected characters: %s" % s.check(/.*/)
end
end
end
View
5 test/test_webrobots.rb
@@ -89,6 +89,7 @@ class TestWebRobots < Test::Unit::TestCase
# Punish evil bots
User-Agent: evil
Disallow: /
+Disallow-Not: / # parser teaser
User-Agent: good
# Be generous to good bots
@@ -172,7 +173,9 @@ class TestWebRobots < Test::Unit::TestCase
end
should "properly restrict access" do
- assert @robots_good.allowed?('http://www.example.org/index.html')
+ assert_nothing_raised {
+ assert @robots_good.allowed?('http://www.example.org/index.html')
+ }
assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
assert @robots_good.allowed?('http://www.example.org/2HEAVY/index.php')
assert !@robots_good.allowed?(URI('http://www.example.org/2heavy/index.php'))

0 comments on commit d350aec

Please sign in to comment.
Something went wrong with that request. Please try again.