From f51b71f8a11cc27df9f14b87e4e976f939c63453 Mon Sep 17 00:00:00 2001
From: Ronald Schmidt
Date: Tue, 21 Feb 2017 16:33:43 +0100
Subject: [PATCH] Fixes #149 plus some styles

---
 GoogleScraper/core.py                     | 11 +++---
 GoogleScraper/parsing.py                  | 48 +++++++++++------------
 GoogleScraper/search_engine_parameters.py | 18 ++++-----
 GoogleScraper/selenium_mode.py            | 25 +++++-------
 4 files changed, 48 insertions(+), 54 deletions(-)

diff --git a/GoogleScraper/core.py b/GoogleScraper/core.py
index 9d2bf094..dd84909c 100755
--- a/GoogleScraper/core.py
+++ b/GoogleScraper/core.py
@@ -27,6 +27,7 @@
 class WrongConfigurationError(Exception):
     pass
 
+
 def id_for_keywords(keywords):
     """Determine a unique id for the keywords.
 
@@ -97,7 +98,8 @@ def start_python_console(namespace=None, noipython=False, banner=''):
         except ImportError:
             pass
         else:
-            import rlcompleter
+            pass
+            # import rlcompleter
             readline.parse_and_bind("tab:complete")
     code.interact(banner=banner, local=namespace)
 
@@ -202,7 +204,7 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
     proxy_db = config.get('mysql_proxy_db', '')
 
     # when no search engine is specified, use google
-    search_engines = config.get('search_engines', ['google',])
+    search_engines = config.get('search_engines', ['google'])
     if not isinstance(search_engines, list):
         if search_engines == '*':
             search_engines = config.get('supported_search_engines')
@@ -238,8 +240,7 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
     if not (keyword or keywords) and not kwfile:
         # Just print the help.
         get_command_line(True)
-        print('No keywords to scrape for. Please provide either an keyword file (Option: --keyword-file) or specify and '
-              'keyword with --keyword.')
+        print('No keywords to scrape for. Please provide either a keyword file (Option: --keyword-file) or specify a keyword with --keyword.')
         return
 
     cache_manager = CacheManager(config)
@@ -456,4 +457,4 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
         session.commit()
 
     if return_results:
-        return scraper_search
+        return session
diff --git a/GoogleScraper/parsing.py b/GoogleScraper/parsing.py
index 09fd4b41..6d5e6aeb 100644
--- a/GoogleScraper/parsing.py
+++ b/GoogleScraper/parsing.py
@@ -30,12 +30,12 @@
 class Parser():
     """Parses SERP pages.
 
     Each search engine results page (SERP) has a similar layout:
-    
+
     The main search results are usually in a html container element (#main, .results, #leftSide).
-    There might be separate columns for other search results (like ads for example). Then each 
+    There might be separate columns for other search results (like ads for example). Then each
     result contains basically a link, a snippet and a description (usually some text on the target site).
 
     It's really astonishing how similar other search engines are to Google.
-    
+
     Each child class (that can actual parse a concrete search engine results page) needs
     to specify css selectors for the different search types (Like normal search, news search, video search, ...).
@@ -73,10 +73,10 @@ def __init__(self, config={}, html='', query=''):
         """Create new Parser instance and parse all information.
 
         Args:
-            html: The raw html from the search engine search. If not provided, you can parse 
+            html: The raw html from the search engine search. If not provided, you can parse
                 the data later by calling parse(html) directly.
             searchtype: The search type. By default "normal"
-            
+
         Raises: Assertion error if the subclassed specific parser cannot handle the the settings.
@@ -109,8 +109,8 @@ def __init__(self, config={}, html='', query=''):
 
     def parse(self, html=None):
         """Public function to start parsing the search engine results.
-        
-        Args: 
+
+        Args:
             html: The raw html data to extract the SERP entries from.
         """
         if html:
@@ -137,7 +137,7 @@ def _parse_lxml(self, cleaner=None):
 
     def _parse(self, cleaner=None):
         """Internal parse the dom according to the provided css selectors.
-        
+
         Raises: InvalidSearchTypeException if no css selectors for the searchtype could be found.
         """
         self.num_results = 0
@@ -152,8 +152,7 @@ def _parse(self, cleaner=None):
             self.num_results_for_query = self.first_match(num_results_selector, self.dom)
             if not self.num_results_for_query:
-                logger.debug('{}: Cannot parse num_results from serp page with selectors {}'.format(self.__class__.__name__,
-                                                                                                    num_results_selector))
+                logger.debug('{}: Cannot parse num_results from serp page with selectors {}'.format(self.__class__.__name__, num_results_selector))
 
         # get the current page we are at. Sometimes we search engines don't show this.
         try:
@@ -180,7 +179,7 @@ def _parse(self, cleaner=None):
 
             self.search_results[result_type] = []
 
-            for selector_specific, selectors in selector_class.items():
+            for _, selectors in selector_class.items():
 
                 if 'result_container' in selectors and selectors['result_container']:
                     css = '{container} {result_container}'.format(**selectors)
@@ -272,14 +271,14 @@
                 match = self.advanced_css(selector, element=element)
                 if match:
                     return match
-            except IndexError as e:
+            except IndexError:
                 pass
 
         return False
 
     def after_parsing(self):
         """Subclass specific behaviour after parsing happened.
-        
+
         Override in subclass to add search engine specific behaviour.
         Commonly used to clean the results.
         """
@@ -312,7 +311,7 @@ def iter_serp_items(self):
 
 """
-Here follow the different classes that provide CSS selectors 
+Here follow the different classes that provide CSS selectors
 for different types of SERP pages of several common search engines.
 
 Just look at them and add your own selectors in a new class if you
@@ -404,7 +403,7 @@ class GoogleParser(Parser):
     image_search_selectors = {
         'results': {
             'de_ip': {
-                'container': 'li#isr_mc',
+                'container': '#isr_mc',
                 'result_container': 'div.rg_di',
                 'link': 'a.rg_l::attr(href)'
             },
@@ -422,12 +421,12 @@ def __init__(self, *args, **kwargs):
 
     def after_parsing(self):
         """Clean the urls.
-        
+
         A typical scraped results looks like the following:
-        
+
         '/url?q=http://www.youtube.com/user/Apple&sa=U&ei=\
         lntiVN7JDsTfPZCMgKAO&ved=0CFQQFjAO&usg=AFQjCNGkX65O-hKLmyq1FX9HQqbb9iYn9A'
-        
+
         Clean with a short regex.
         """
         super().after_parsing()
@@ -543,11 +542,10 @@ def after_parsing(self):
         try:
             i = self.html.index(substr)
             if i:
-                self.num_results_for_query = re.search(r'— (.)*?"', self.html[i:i+len(self.query) + 150]).group()
+                self.num_results_for_query = re.search(r'— (.)*?"', self.html[i:i + len(self.query) + 150]).group()
         except Exception as e:
             logger.debug(str(e))
-
         if self.searchtype == 'image':
             for key, i in self.iter_serp_items():
                 for regex in (
@@ -626,7 +624,7 @@ class BingParser(Parser):
             'ch_ip': {
                 'container': '#dg_c .imgres',
                 'result_container': '.dg_u',
-                'link': 'a.dv_i::attr(m)'
+                'link': 'a::attr(m)'
             },
         }
     }
@@ -1049,12 +1047,12 @@ def parse_serp(config, html=None, parser=None, scraper=None, search_engine=None,
 
 if __name__ == '__main__':
     """Originally part of https://github.com/NikolaiT/GoogleScraper.
-    
-    Only for testing purposes: May be called directly with an search engine 
+
+    Only for testing purposes: May be called directly with a search engine
     search url. For example:
-    
+
     python3 parsing.py 'http://yandex.ru/yandsearch?text=GoogleScraper&lr=178&csg=82%2C4317%2C20%2C20%2C0%2C0%2C0'
-    
+
     Please note: Using this module directly makes little sense, because requesting
     such urls directly without imitating a real browser (which is done in my GoogleScraper module)
     makes the search engines return crippled html, which makes it impossible to parse.
diff --git a/GoogleScraper/search_engine_parameters.py b/GoogleScraper/search_engine_parameters.py
index a3db0505..e9d31281 100644
--- a/GoogleScraper/search_engine_parameters.py
+++ b/GoogleScraper/search_engine_parameters.py
@@ -73,7 +73,7 @@
     # current geographic location.
     'safe': 'off',  # Turns the adult content filter on or off
     'rls': None,
-    #Source of query with version of the client and language set. With firefox set to 'org.mozilla:en-US:official'
+    # Source of query with version of the client and language set. With firefox set to 'org.mozilla:en-US:official'
     'sa': None,
     # User search behavior parameter sa=N: User searched, sa=X: User clicked on related searches in the SERP
     'source': None,  # Google navigational parameter specifying where you came from, univ: universal search
@@ -117,8 +117,8 @@
     'oe': 'UTF-8',  # Sets the character encoding that is used to encode the results.
     'ip': None,
     # When queries are made using the HTTP protocol, the ip parameter contains the IP address of the user
-    #who submitted the search query. You do not supply this parameter with the search request. The ip
-    #parameter is returned in the XML search results. For example:
+    # who submitted the search query. You do not supply this parameter with the search request. The ip
+    # parameter is returned in the XML search results. For example:
     'sitesearch': None,
     # Limits search results to documents in the specified domain, host, or web directory. Has no effect if the q
     # parameter is empty. This parameter has the same effect as the site special query term.
@@ -147,19 +147,19 @@
     # ft are: 'i': filetype and 'e': -filetype
     'as_lq': None,
     # Specifies a URL, and causes search results to show pages that link to the that URL. This parameter has
-    #the same effect as the link special query term (see “Back Links” on page 20). No other query terms can
-    #be used when using this parameter.
+    # the same effect as the link special query term (see “Back Links” on page 20). No other query terms can
+    # be used when using this parameter.
     'as_occt': None,
     # Specifies where the search engine is to look for the query terms on the page: anywhere on the page, in
-    #the title, or in the URL.
+    # the title, or in the URL.
     'as_oq': None,
     # Combines the specified terms to the search query in parameter q, with an OR operation. This parameter
     # has the same effect as the OR special query term (see “Boolean OR Search” on page 20).
     'as_q': None,
     # Adds the specified query terms to the query terms in parameter q.
     'as_sitesearch': None,
     # Limits search results to documents in the specified domain, host or web directory, or excludes results
-    #from the specified location, depending on the value of as_dt. This parameter has the same effect as the
-    #site or -site special query terms. It has no effect if the q parameter is empty.
+    # from the specified location, depending on the value of as_dt. This parameter has the same effect as the
+    # site or -site special query terms. It has no effect if the q parameter is empty.
     'entqr': None,
     # This parameter sets the query expansion policy according to the following valid values:
     # 0: None
     # 1: Standard Uses only the search appliance’s synonym file.
@@ -182,7 +182,7 @@
 """
 
 bing_search_params = {
-    
+
     'adlt': 'off'
 }
 """
diff --git a/GoogleScraper/selenium_mode.py b/GoogleScraper/selenium_mode.py
index 2be5b383..ce4af6bd 100644
--- a/GoogleScraper/selenium_mode.py
+++ b/GoogleScraper/selenium_mode.py
@@ -58,7 +58,7 @@ class SelScrape(SearchEngineScrape, threading.Thread):
         'google': '#pnnext',
         'yandex': '.pager__button_kind_next',
         'bing': '.sb_pagN',
-        'yahoo': '#pg-next',
+        'yahoo': '.compPagination .next',
         'baidu': '.n',
         'ask': '#paging div a.txt3.l_nu',
         'blekko': '',
@@ -301,7 +301,7 @@ def handle_request_denied(self, status_code):
 
             if self.config.get('manual_captcha_solving', False):
                 with self.captcha_lock:
-                    import tempfile
+                    # import tempfile
                     tf = tempfile.NamedTemporaryFile('wb')
                     tf.write(self.webdriver.get_screenshot_as_png())
@@ -450,15 +450,18 @@ def _find_next_page_element(self):
             try:
                 # wait until the next page link is clickable
                 WebDriverWait(self.webdriver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
-            except (WebDriverException, TimeoutException) as e:
+            except (WebDriverException, TimeoutException):
                 self._save_debug_screenshot()
-                raise Exception('{}: Cannot locate next page element: {}'.format(self.name, str(e)))
+                # raise Exception('{}: Cannot locate next page element: {}'.format(self.name, str(e)))
 
             return self.webdriver.find_element_by_css_selector(selector)
 
         elif self.search_type == 'image':
             self.page_down()
-            return True
+            if self.search_engine_name == 'google':
+                return self.webdriver.find_element_by_css_selector('input._kvc')
+            else:
+                return True
 
     def wait_until_serp_loaded(self):
         """
@@ -595,17 +598,9 @@ def page_down(self):
 
         Used for next page in image search mode or when the next results
         are obtained by scrolling down a page.
         """
-        js = '''
-        var w = window,
-            d = document,
-            e = d.documentElement,
-            g = d.getElementsByTagName('body')[0],
-            y = w.innerHeight|| e.clientHeight|| g.clientHeight;
-
-        window.scrollBy(0,y);
-        return y;
-        '''
+        js = 'window.scrollTo(0,document.body.scrollHeight);'
+        time.sleep(5)
         self.webdriver.execute_script(js)
 
     def run(self):
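
Reviewer note, not part of the patch: the new image-search pagination above (page_down() scrolling to the bottom of the page, then _find_next_page_element() returning Google's "Show more results" button via the input._kvc selector) can be exercised in isolation with plain Selenium. The following is a minimal sketch under the patch's own assumptions (Firefox, the selenium-2/3-style find_element_by_css_selector API that selenium_mode.py uses, Google image search via tbm=isch); the query and the loop count are illustrative only:

    import time

    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException

    driver = webdriver.Firefox()
    # tbm=isch switches Google to image search; the query is arbitrary
    driver.get('https://www.google.com/search?q=GoogleScraper&tbm=isch')

    for _ in range(3):
        # let the current batch of thumbnails finish loading, then scroll to
        # the bottom so the next batch is lazy-loaded (same JS as page_down())
        time.sleep(5)
        driver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
        try:
            # the selector the patch uses for the "Show more results" button
            driver.find_element_by_css_selector('input._kvc').click()
        except NoSuchElementException:
            # button only appears once scrolling alone stops loading results
            pass

    driver.quit()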