From 4e6967b854644d527c0eb5ace61e80a590fbd520 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Gra=C3=B1a?= Date: Wed, 9 Jan 2013 17:48:20 -0200 Subject: [PATCH] extend css selectors with ":text" and :attribute() #176 --- scrapy/selector/csssel.py | 93 ++++++++++++++----- scrapy/tests/test_selector_cssselect.py | 117 ++++++++++++++++++++++++ 2 files changed, 189 insertions(+), 21 deletions(-) create mode 100644 scrapy/tests/test_selector_cssselect.py diff --git a/scrapy/selector/csssel.py b/scrapy/selector/csssel.py index 13da8df955f..d5b571bae71 100644 --- a/scrapy/selector/csssel.py +++ b/scrapy/selector/csssel.py @@ -1,37 +1,88 @@ from cssselect import GenericTranslator, HTMLTranslator -from scrapy.utils.python import flatten -from scrapy.selector import HtmlXPathSelector, XmlXPathSelector -from .list import SelectorList +from cssselect.xpath import XPathExpr, ExpressionError +from scrapy.selector import XPathSelector, HtmlXPathSelector, XmlXPathSelector -class CSSSelectorList(SelectorList): - def xpath(self, xpath): - return self.__class__(flatten([x.xpath(xpath) for x in self])) +class ScrapyXPathExpr(XPathExpr): - def get(self, attr): - return self.__class__(flatten([x.get(attr) for x in self])) + textnode = False + attribute = None - def text(self, all=False): - return self.__class__(flatten([x.text(all) for x in self])) + @classmethod + def from_xpath(cls, xpath, textnode=False, attribute=None): + x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition) + x.textnode = textnode + x.attribute = attribute + return x + + def __str__(self): + path = super(ScrapyXPathExpr, self).__str__() + if self.textnode: + if path == '*': + path = 'text()' + elif path.endswith('::*/*'): + path = path[:-3] + 'text()' + else: + path += '/text()' + + if self.attribute is not None: + if path.endswith('::*/*'): + path = path[:-2] + path += '/@%s' % self.attribute + + return path + + def join(self, combiner, other): + super(ScrapyXPathExpr, self).join(combiner, 
other) + self.textnode = other.textnode + self.attribute = other.attribute + return self + + +class TranslatorMixin(object): + + def xpath_element(self, selector): + xpath = super(TranslatorMixin, self).xpath_element(selector) + return ScrapyXPathExpr.from_xpath(xpath) + + def xpath_text_pseudo(self, xpath): + """Support selecting text nodes using :text pseudo-element""" + return ScrapyXPathExpr.from_xpath(xpath, textnode=True) + + def xpath_attribute_function(self, xpath, function): + if function.argument_types() not in (['STRING'], ['IDENT']): + raise ExpressionError( + "Expected a single string or ident for :attribute(), got %r" + % function.arguments) + value = function.arguments[0].value + return ScrapyXPathExpr.from_xpath(xpath, attribute=value) + + +class ScrapyGenericTranslator(TranslatorMixin, GenericTranslator): + pass + + +class ScrapyHTMLTranslator(TranslatorMixin, HTMLTranslator): + pass class CSSSelectorMixin(object): + def select(self, css): - return CSSSelectorList(super(CSSSelectorMixin, self).select(self.translator.css_to_xpath(css))) + xpath = self._css2xpath(css) + return super(CSSSelectorMixin, self).select(xpath) - def xpath(self, xpath): - return CSSSelectorList(super(CSSSelectorMixin, self).select(xpath)) + def _css2xpath(self, css): + return self.translator.css_to_xpath(css) - def text(self, all=False): - return self.xpath('string()') if all else self.xpath('text()') - def get(self, attr): - return self.xpath('@' + attr) +class CSSSelector(CSSSelectorMixin, XPathSelector): + translator = ScrapyHTMLTranslator() -class XmlCSSSelector(CSSSelectorMixin, XmlXPathSelector): - translator = GenericTranslator() +class HtmlCSSSelector(CSSSelectorMixin, HtmlXPathSelector): + translator = ScrapyHTMLTranslator() -class HtmlCSSSelector(CSSSelectorMixin, HtmlXPathSelector): - translator = HTMLTranslator() +class XmlCSSSelector(CSSSelectorMixin, XmlXPathSelector): + translator = ScrapyGenericTranslator() diff --git a/scrapy/tests/test_selector_cssselect.py 
b/scrapy/tests/test_selector_cssselect.py new file mode 100644 index 00000000000..f406a80eec8 --- /dev/null +++ b/scrapy/tests/test_selector_cssselect.py @@ -0,0 +1,117 @@ +""" +Selector tests for cssselect backend +""" +from twisted.trial import unittest +from scrapy.http import TextResponse, HtmlResponse, XmlResponse +from scrapy.selector import CSSSelector, XmlCSSSelector, HtmlCSSSelector +from scrapy.selector.csssel import ScrapyHTMLTranslator + +HTMLBODY = ''' + + +
+ + + link +

+ lorem ipsum text + hi there + guy + + + + + + + +

+ + +
+

+ + + + +
+ + +''' + + +class TranslatorMixinTest(unittest.TestCase): + + tr_cls = ScrapyHTMLTranslator + + def setUp(self): + self.tr = self.tr_cls() + self.c2x = self.tr.css_to_xpath + + def test_attribute_function(self): + cases = [ + (':attribute(name)', u'descendant-or-self::*/@name'), + ('a:attribute(name)', u'descendant-or-self::a/@name'), + ('a :attribute(name)', u'descendant-or-self::a/descendant-or-self::*/@name'), + ('a > :attribute(name)', u'descendant-or-self::a/*/@name'), + ] + for css, xpath in cases: + self.assertEqual(self.c2x(css), xpath, css) + + def test_text_pseudo_element(self): + cases = [ + (':text', u'descendant-or-self::text()'), + ('p:text', u'descendant-or-self::p/text()'), + ('p :text', u'descendant-or-self::p/descendant-or-self::text()'), + ('#id:text', u"descendant-or-self::*[@id = 'id']/text()"), + ('p#id:text', u"descendant-or-self::p[@id = 'id']/text()"), + ('p#id :text', u"descendant-or-self::p[@id = 'id']/descendant-or-self::text()"), + ('p#id > :text', u"descendant-or-self::p[@id = 'id']/*/text()"), + ('p#id ~ :text', u"descendant-or-self::p[@id = 'id']/following-sibling::*/text()"), + ('a[href]:text', u'descendant-or-self::a[@href]/text()'), + ('a[href] :text', u'descendant-or-self::a[@href]/descendant-or-self::text()'), + ('p:text, a:text', u"descendant-or-self::p/text() | descendant-or-self::a/text()"), + ] + for css, xpath in cases: + self.assertEqual(self.c2x(css), xpath, css) + + +class HTMLCSSSelectorTest(unittest.TestCase): + + hcs_cls = HtmlCSSSelector + + def setUp(self): + self.htmlresponse = HtmlResponse('http://example.com', body=HTMLBODY) + self.hcs = self.hcs_cls(self.htmlresponse) + + def x(self, *a, **kw): + return [v.strip() for v in self.hcs.select(*a, **kw).extract() if v.strip()] + + def test_selector_simple(self): + for x in self.hcs.select('input'): + self.assertTrue(isinstance(x, self.hcs.__class__), x) + self.assertEqual(self.hcs.select('input').extract(), + [x.extract() for x in self.hcs.select('input')]) + + def 
test_text_pseudo_element(self): + self.assertEqual(self.x('#p-b2'), [u'guy']) + self.assertEqual(self.x('#p-b2:text'), [u'guy']) + self.assertEqual(self.x('#p-b2 :text'), [u'guy']) + self.assertEqual(self.x('#paragraph:text'), [u'lorem ipsum text']) + self.assertEqual(self.x('#paragraph :text'), [u'lorem ipsum text', u'hi', u'there', u'guy']) + self.assertEqual(self.x('p:text'), [u'lorem ipsum text']) + self.assertEqual(self.x('p :text'), [u'lorem ipsum text', u'hi', u'there', u'guy']) + + def test_attribute_function(self): + self.assertEqual(self.x('#p-b2:attribute(id)'), [u'p-b2']) + self.assertEqual(self.x('.cool-footer:attribute(class)'), [u'cool-footer']) + self.assertEqual(self.x('.cool-footer :attribute(id)'), [u'foobar-div', u'foobar-span']) + self.assertEqual(self.x('map[name="dummymap"] :attribute(shape)'), [u'circle', u'default']) + + def test_nested_selector(self): + self.assertEqual(self.hcs.select('p').select('b:text').extract(), + [u'hi', u'guy']) + self.assertEqual(self.hcs.select('div').select('area:last-child').extract(), + [u''])