Skip to content

Commit

Permalink
extend css selectors with ":text" and :attribute(<name>) scrapy#176
Browse files Browse the repository at this point in the history
  • Loading branch information
dangra committed Oct 10, 2013
1 parent b38ac27 commit 4e6967b
Show file tree
Hide file tree
Showing 2 changed files with 189 additions and 21 deletions.
93 changes: 72 additions & 21 deletions scrapy/selector/csssel.py
@@ -1,37 +1,88 @@
from cssselect import GenericTranslator, HTMLTranslator
from scrapy.utils.python import flatten
from scrapy.selector import HtmlXPathSelector, XmlXPathSelector
from .list import SelectorList
from cssselect.xpath import XPathExpr, ExpressionError
from scrapy.selector import XPathSelector, HtmlXPathSelector, XmlXPathSelector


class CSSSelectorList(SelectorList):
def xpath(self, xpath):
return self.__class__(flatten([x.xpath(xpath) for x in self]))
class ScrapyXPathExpr(XPathExpr):

def get(self, attr):
return self.__class__(flatten([x.get(attr) for x in self]))
textnode = False
attribute = None

def text(self, all=False):
return self.__class__(flatten([x.text(all) for x in self]))
@classmethod
def from_xpath(cls, xpath, textnode=False, attribute=None):
x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
x.textnode = textnode
x.attribute = attribute
return x

def __str__(self):
path = super(ScrapyXPathExpr, self).__str__()
if self.textnode:
if path == '*':
path = 'text()'
elif path.endswith('::*/*'):
path = path[:-3] + 'text()'
else:
path += '/text()'

if self.attribute is not None:
if path.endswith('::*/*'):
path = path[:-2]
path += '/@%s' % self.attribute

return path

def join(self, combiner, other):
super(ScrapyXPathExpr, self).join(combiner, other)
self.textnode = other.textnode
self.attribute = other.attribute
return self


class TranslatorMixin(object):

def xpath_element(self, selector):
xpath = super(TranslatorMixin, self).xpath_element(selector)
return ScrapyXPathExpr.from_xpath(xpath)

def xpath_text_pseudo(self, xpath):
"""Support selecting text nodes using :text pseudo-element"""
return ScrapyXPathExpr.from_xpath(xpath, textnode=True)

def xpath_attribute_function(self, xpath, function):
if function.argument_types() not in (['STRING'], ['IDENT']):
raise ExpressionError(
"Expected a single string or ident for :contains(), got %r"
% function.arguments)
value = function.arguments[0].value
return ScrapyXPathExpr.from_xpath(xpath, attribute=value)


class ScrapyGenericTranslator(TranslatorMixin, GenericTranslator):
pass


class ScrapyHTMLTranslator(TranslatorMixin, HTMLTranslator):
pass


class CSSSelectorMixin(object):

def select(self, css):
return CSSSelectorList(super(CSSSelectorMixin, self).select(self.translator.css_to_xpath(css)))
xpath = self._css2xpath(css)
return super(CSSSelectorMixin, self).select(xpath)

def xpath(self, xpath):
return CSSSelectorList(super(CSSSelectorMixin, self).select(xpath))
def _css2xpath(self, css):
return self.translator.css_to_xpath(css)

def text(self, all=False):
return self.xpath('string()') if all else self.xpath('text()')

def get(self, attr):
return self.xpath('@' + attr)
class CSSSelector(CSSSelectorMixin, XPathSelector):
translator = ScrapyHTMLTranslator()


class XmlCSSSelector(CSSSelectorMixin, XmlXPathSelector):
translator = GenericTranslator()
class HtmlCSSSelector(CSSSelectorMixin, HtmlXPathSelector):
translator = ScrapyHTMLTranslator()


class HtmlCSSSelector(CSSSelectorMixin, HtmlXPathSelector):
translator = HTMLTranslator()
class XmlCSSSelector(CSSSelectorMixin, XmlXPathSelector):
translator = ScrapyGenericTranslator()
117 changes: 117 additions & 0 deletions scrapy/tests/test_selector_cssselect.py
@@ -0,0 +1,117 @@
"""
Selector tests for cssselect backend
"""
from twisted.trial import unittest
from scrapy.http import TextResponse, HtmlResponse, XmlResponse
from scrapy.selector import CSSSelector, XmlCSSSelector, HtmlCSSSelector
from scrapy.selector.csssel import ScrapyHTMLTranslator

HTMLBODY = '''
<html>
<body>
<div>
<a id="name-anchor" name="foo"></a>
<a id="tag-anchor" rel="tag" href="http://localhost/foo">link</a>
<a id="nofollow-anchor" rel="nofollow" href="https://example.org"> link</a>
<p id="paragraph">
lorem ipsum text
<b id="p-b">hi</b> <em id="p-em">there</em>
<b id="p-b2">guy</b>
<input type="checkbox" id="checkbox-unchecked" />
<input type="checkbox" id="checkbox-disabled" disabled="" />
<input type="text" id="text-checked" checked="checked" />
<input type="hidden" />
<input type="hidden" disabled="disabled" />
<input type="checkbox" id="checkbox-checked" checked="checked" />
<input type="checkbox" id="checkbox-disabled-checked"
disabled="disabled" checked="checked" />
<fieldset id="fieldset" disabled="disabled">
<input type="checkbox" id="checkbox-fieldset-disabled" />
<input type="hidden" />
</fieldset>
</p>
<map name="dummymap">
<area shape="circle" coords="200,250,25" href="foo.html" id="area-href" />
<area shape="default" id="area-nohref" />
</map>
</div>
<div class="cool-footer" id="foobar-div" foobar="ab bc cde">
<span id="foobar-span">foo ter</span>
</div>
</body></html>
'''


class TranslatorMixinTest(unittest.TestCase):

tr_cls = ScrapyHTMLTranslator

def setUp(self):
self.tr = self.tr_cls()
self.c2x = self.tr.css_to_xpath

def test_attribute_function(self):
cases = [
(':attribute(name)', u'descendant-or-self::*/@name'),
('a:attribute(name)', u'descendant-or-self::a/@name'),
('a :attribute(name)', u'descendant-or-self::a/descendant-or-self::*/@name'),
('a > :attribute(name)', u'descendant-or-self::a/*/@name'),
]
for css, xpath in cases:
self.assertEqual(self.c2x(css), xpath, css)

def test_text_pseudo_element(self):
cases = [
(':text', u'descendant-or-self::text()'),
('p:text', u'descendant-or-self::p/text()'),
('p :text', u'descendant-or-self::p/descendant-or-self::text()'),
('#id:text', u"descendant-or-self::*[@id = 'id']/text()"),
('p#id:text', u"descendant-or-self::p[@id = 'id']/text()"),
('p#id :text', u"descendant-or-self::p[@id = 'id']/descendant-or-self::text()"),
('p#id > :text', u"descendant-or-self::p[@id = 'id']/*/text()"),
('p#id ~ :text', u"descendant-or-self::p[@id = 'id']/following-sibling::*/text()"),
('a[href]:text', u'descendant-or-self::a[@href]/text()'),
('a[href] :text', u'descendant-or-self::a[@href]/descendant-or-self::text()'),
('p:text, a:text', u"descendant-or-self::p/text() | descendant-or-self::a/text()"),
]
for css, xpath in cases:
self.assertEqual(self.c2x(css), xpath, css)


class HTMLCSSSelectorTest(unittest.TestCase):

hcs_cls = HtmlCSSSelector

def setUp(self):
self.htmlresponse = HtmlResponse('http://example.com', body=HTMLBODY)
self.hcs = self.hcs_cls(self.htmlresponse)

def x(self, *a, **kw):
return [v.strip() for v in self.hcs.select(*a, **kw).extract() if v.strip()]

def test_selector_simple(self):
for x in self.hcs.select('input'):
self.assertTrue(isinstance(x, self.hcs.__class__), x)
self.assertEqual(self.hcs.select('input').extract(),
[x.extract() for x in self.hcs.select('input')])

def test_text_pseudo_element(self):
self.assertEqual(self.x('#p-b2'), [u'<b id="p-b2">guy</b>'])
self.assertEqual(self.x('#p-b2:text'), [u'guy'])
self.assertEqual(self.x('#p-b2 :text'), [u'guy'])
self.assertEqual(self.x('#paragraph:text'), [u'lorem ipsum text'])
self.assertEqual(self.x('#paragraph :text'), [u'lorem ipsum text', u'hi', u'there', u'guy'])
self.assertEqual(self.x('p:text'), [u'lorem ipsum text'])
self.assertEqual(self.x('p :text'), [u'lorem ipsum text', u'hi', u'there', u'guy'])

def test_attribute_function(self):
self.assertEqual(self.x('#p-b2:attribute(id)'), [u'p-b2'])
self.assertEqual(self.x('.cool-footer:attribute(class)'), [u'cool-footer'])
self.assertEqual(self.x('.cool-footer :attribute(id)'), [u'foobar-div', u'foobar-span'])
self.assertEqual(self.x('map[name="dummymap"] :attribute(shape)'), [u'circle', u'default'])

def test_nested_selector(self):
self.assertEqual(self.hcs.select('p').select('b:text').extract(),
[u'hi', u'guy'])
self.assertEqual(self.hcs.select('div').select('area:last-child').extract(),
[u'<area shape="default" id="area-nohref">'])

0 comments on commit 4e6967b

Please sign in to comment.