Added more rules for ebay.com; keep working host-specific rules.
divout committed Mar 27, 2015
1 parent dc6f988 commit 4bd5b8b
Showing 4 changed files with 25 additions and 12 deletions.

goose/cleaners.py (10 changes: 5 additions & 5 deletions)

@@ -25,9 +25,10 @@
 from urlparse import urlsplit
 from goose.text import innerTrim
 from configuration import Configuration
+from host_utils import HostUtils

 KNOWN_HOST_REMOVE_SELECTORS = {
-    'www.ebay.com': ['#desc_div', '[class *= "drpdwn"]']
+    'www.ebay.com': '#desc_div, [class *= "drpdwn"], .dropdownmenu, #PaginationAndExpansionsContainer, #ConstraintCaptionContainer, .noImage div, .yesImage div, .yesImage img[src *= "://ir"], .yesVideo, [class ^= addCaption], .removeModalLayer',
 }

 class OutputFormatterCleaner(clean.Cleaner):

@@ -221,10 +222,9 @@ def clean_bad_tags(self, doc):
         return doc

     def remove_host_specific_nodes(self, doc):
-        remove_selectors = KNOWN_HOST_REMOVE_SELECTORS[self.article.domain]
-        for selector in remove_selectors:
-            nodes = self.parser.css_select(doc, selector)
-            for node in nodes:
+        remove_selectors = HostUtils.host_selectors(KNOWN_HOST_REMOVE_SELECTORS, self.article.domain)
+        nodes = self.parser.css_select(doc, remove_selectors)
+        for node in nodes:
                 self.parser.remove(node)

         return doc
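
The cleaner now builds one grouped CSS selector per host instead of looping over a list: a comma-separated selector matches the union of its parts, so a single css_select call pulls every node the old per-selector loop found. A minimal standalone sketch of that idea with plain lxml (the HTML snippet and its ids/classes are invented for illustration, mirroring two of the ebay.com rules above):

import lxml.html

# Hypothetical eBay-like fragment; '#desc_div' and a 'drpdwn' class mirror two rules above.
html = ('<div><div id="desc_div">seller boilerplate</div>'
        '<span class="drpdwn-menu">options</span><p>real content</p></div>')
doc = lxml.html.fromstring(html)

# One grouped selector removes every matching node in a single pass,
# which is what remove_host_specific_nodes now does via HostUtils.
for node in doc.cssselect('#desc_div, [class *= "drpdwn"]'):
    node.getparent().remove(node)

print(lxml.html.tostring(doc))  # only <p>real content</p> is left inside the outer div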

goose/extractors.py (15 changes: 10 additions & 5 deletions)

@@ -26,6 +26,7 @@
 from goose.utils import StringSplitter
 from goose.utils import StringReplacement
 from goose.utils import ReplaceSequence
+from host_utils import HostUtils

 MOTLEY_REPLACEMENT = StringReplacement("�", "")
 ESCAPED_FRAGMENT_REPLACEMENT = StringReplacement(u"#!", u"?_escaped_fragment_=")

@@ -59,7 +60,7 @@
 ]

 KNOWN_HOST_CONTENT_TAGS = {
-    'www.ebay.com': ['#vi-desc-maincntr']
+    'www.ebay.com': '.vi-price, noscript [itemprop="image"], #vi-desc-maincntr, #Results, [itemprop="articleBody"]',
 }

@@ -344,10 +345,8 @@ def calculate_best_node(self):

     def get_top_host_node_from_known_tags(self):
         if self.article.domain in KNOWN_HOST_CONTENT_TAGS:
-            selectors = KNOWN_HOST_CONTENT_TAGS[self.article.domain]
-            content_tags = []
-            for selector in selectors:
-                content_tags += self.parser.css_select(self.article.doc, selector)
+            selectors = HostUtils.host_selectors(KNOWN_HOST_CONTENT_TAGS, self.article.domain)
+            content_tags = self.parser.css_select(self.article.doc, selectors)

             return self.parser.combine_nodes(content_tags)

@@ -605,6 +604,12 @@ def post_cleanup(self):
             old_attribute_name = 'src',
             new_attribute_name = 'data-src')

+        # fixing ebay images
+        self.replace_attributes(node,
+            tag_name = 'img',
+            old_attribute_name = 'src',
+            new_attribute_name = 'imgurl')
+
         self.build_tag_paths(node, 'img', 'src')
         self.build_tag_paths(node, 'a', 'href')
         allowed_tags = ['p', 'img', 'ul', 'ol', 'h2', 'h3', 'h4', 'h5', 'h6',
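
The block commented "# fixing ebay images" reuses the extractor's existing replace_attributes helper; its exact copy direction is internal to this fork, but the apparent intent is that eBay's gallery markup keeps the usable image URL in a non-standard imgurl attribute, which has to end up in src for the <img> tags to survive later cleanup. A standalone sketch of that intent, not of goose's helper, with the attribute handling treated purely as an assumption:

import lxml.html

def promote_image_attribute(doc, fallback_attr='imgurl'):
    # Hypothetical helper: when an <img> carries its real URL in a non-standard
    # attribute (assumed here to be 'imgurl'), copy that value into 'src'.
    # This illustrates the idea only; it is not goose's replace_attributes.
    for img in doc.cssselect('img[%s]' % fallback_attr):
        img.set('src', img.get(fallback_attr))
    return doc

doc = lxml.html.fromstring('<div><img imgurl="http://example.com/p.jpg"></div>')
print(lxml.html.tostring(promote_image_attribute(doc)))
# <div><img imgurl="http://example.com/p.jpg" src="http://example.com/p.jpg"></div>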

goose/host_utils.py (8 changes: 8 additions & 0 deletions)

@@ -0,0 +1,8 @@
+class HostUtils(object):
+
+    @classmethod
+    def host_selectors(self, all_selectors, host):
+        selectors = all_selectors[host]
+        if type(selectors) is dict:
+            selectors = all_selectors[selectors['reference']]
+        return selectors
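
The dict branch in host_selectors lets one host entry point at another host's rules instead of duplicating them: a value of the form {'reference': 'other.host'} is resolved to that host's selector string. A quick usage sketch (the www.ebay.de entry and the goose.host_utils import path are assumptions for illustration; the commit itself only defines www.ebay.com):

from goose.host_utils import HostUtils

SELECTORS = {
    'www.ebay.com': '#desc_div, [class *= "drpdwn"]',
    'www.ebay.de': {'reference': 'www.ebay.com'},   # hypothetical alias entry
}

print(HostUtils.host_selectors(SELECTORS, 'www.ebay.com'))  # the selector string itself
print(HostUtils.host_selectors(SELECTORS, 'www.ebay.de'))   # resolved through the reference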

goose/parsers.py (4 changes: 2 additions & 2 deletions)

@@ -247,9 +247,9 @@ def decode_html(self, html_string):
     def combine_nodes(self, nodes):
         if len(nodes):
             if len(nodes) > 1:
-                root = self.parser.createElement('div')
+                root = self.createElement('div')
                 for node in nodes:
-                    self.parser.appendChild(root, node)
+                    self.appendChild(root, node)
                 return root
             else:
                 return nodes[0]
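
The parsers.py change just drops a stray self.parser prefix: combine_nodes lives on the parser itself, so the calls go directly to its own createElement and appendChild. What the method accomplishes, re-parenting several matched nodes under one synthetic div so the extractor can treat them as a single top node, can be sketched with plain lxml (the helper and sample markup below are illustrative, not goose's code):

import lxml.html
from lxml import etree

def combine(nodes):
    # Mirrors the shape of Parser.combine_nodes after the fix: an empty list
    # yields None, a single node is returned as-is, and several nodes are
    # re-parented under a fresh <div>.
    if not nodes:
        return None
    if len(nodes) > 1:
        root = etree.Element('div')
        for node in nodes:
            root.append(node)
        return root
    return nodes[0]

doc = lxml.html.fromstring('<div><p id="a">x</p><p id="b">y</p></div>')
top = combine(doc.cssselect('p'))
print(lxml.html.tostring(top))  # <div><p id="a">x</p><p id="b">y</p></div>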
