Permalink
Browse files

Fixed buggy time search where it only worked for http search mode, not for async mode.
  • Loading branch information...
ethanchewy committed Nov 21, 2017
1 parent 9bd8c5d commit 4356901540ea0f3ad4a639e8b5d084656895098f
Showing with 7 additions and 8 deletions.
  1. +1 −1 GoogleScraper/http_mode.py
  2. +3 −3 GoogleScraper/parsing.py
  3. +3 −4 GoogleScraper/scraping.py
@@ -175,7 +175,7 @@ def __init__(self, config, *args, time_offset=0.0, **kwargs):

# get the base search url based on the search engine.
#+ "&source=lnt&tbs=cdr%3A1%2Ccd_min%3A2015%2Ccd_max%3A2016&tbm=
self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method) + "&source=lnt&tbs=cdr%3A1%2Ccd_min%3A2015%2Ccd_max%3A2016&tbm="
self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method)

super().instance_creation_info(self.__class__.__name__)

@@ -359,7 +359,7 @@ class GoogleParser(Parser):
'result_container': 'div.g ',
'link': 'h3.r > a:first-child::attr(href)',
'snippet': 'div.s span.st::text',
'time_stamp' : 'div.slp::text',
'time_stamp' : 'div.s div.slp::text',
'title': 'h3.r > a:first-child::text',
'visible_link': 'cite::text'
},
@@ -368,15 +368,15 @@ class GoogleParser(Parser):
'result_container': 'li.g ',
'link': 'h3.r > a:first-child::attr(href)',
'snippet': 'div.s span.st::text',
'time_stamp' : 'div.slp::text',
'time_stamp' : 'div.s div.slp::text',
'title': 'h3.r > a:first-child::text',
'visible_link': 'cite::text'
},
'de_ip_news_items': {
'container': 'li.card-section',
'link': 'a._Dk::attr(href)',
'snippet': 'span._dwd::text',
'time_stamp' : 'div.slp::text',
'time_stamp' : 'div.s div.slp::text',
'title': 'a._Dk::text',
'visible_link': 'cite::text'
},
@@ -78,7 +78,7 @@ def get_base_search_url_by_search_engine(config, search_engine_name, search_mode
"""
assert search_mode in SEARCH_MODES, 'search mode "{}" is not available'.format(search_mode)

specific_base_url = config.get('{}_{}_search_url'.format(search_mode, search_engine_name), None)
specific_base_url = config.get('{}_{}_search_url'.format(search_mode, search_engine_name), None)

if not specific_base_url:
specific_base_url = config.get('{}_search_url'.format(search_engine_name), None)
@@ -90,9 +90,8 @@ def get_base_search_url_by_search_engine(config, search_engine_name, search_mode
ips = file.read().split('\n')
random_ip = random.choice(ips)
return random_ip

return specific_base_url

specific_base_url += "&source=lnt&tbs=cdr%3A1%2Ccd_min%3A3%2F1%2F2015%2Ccd_max%3A11%2F1%2F2015&tbm=&"
return specific_base_url

class SearchEngineScrape(metaclass=abc.ABCMeta):
"""Abstract base class that represents a search engine scrape.

0 comments on commit 4356901

Please sign in to comment.