From 5c278700caf8dcda40a049958e33df1f930d494c Mon Sep 17 00:00:00 2001
From: Manish Devgan
Date: Sat, 12 Aug 2017 18:00:09 +0100
Subject: [PATCH 1/2] Custom count with up to 100 search results

Added a dropdown for the number of results and updated scraper.py and
server.py accordingly.

Closes fossasia/query-server#124 and fossasia/query-server#59
---
 app/scraper.py           | 110 ++++++++++++++++++---------------------
 app/server.py            |   7 +--
 app/templates/index.html |  37 +++++++++----
 3 files changed, 83 insertions(+), 71 deletions(-)

diff --git a/app/scraper.py b/app/scraper.py
index 8f172ced..9da690fa 100644
--- a/app/scraper.py
+++ b/app/scraper.py
@@ -12,36 +12,37 @@
 query = ''
 
 
-def get_bing_page(query):
+def get_bing_page(query, index):
     """ Fetches search response from bing.com
         returns : result page in html
     """
     header = {
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}
-    payload = {'q': query}
+    payload = {'q': query, 'first': index}
     response = requests.get('http://www.bing.com/search', params=payload, headers=header)
     return response
 
 
-def bing_search(query):
+def bing_search(query, count):
     """ Search bing for the query and return set of urls
         Returns: urls (list)
                 [[Title1, url1], [Title2, url2], ...]
     """
     urls = []
-    response = get_bing_page(query)
-    soup = BeautifulSoup(response.text, 'html.parser')
-    for li in soup.findAll('li', {'class': 'b_algo'}):
-        title = li.h2.text.replace('\n', '').replace(' ', '')
-        url = li.h2.a['href']
-        desc = li.find('p').text
-        url_entry = {'title': title,
-                     'link': url,
-                     'desc': desc}
-        urls.append(url_entry)
-
-    return urls
+    # Bing's 'first' parameter is the 1-based index of the first result,
+    # so successive pages start at 1, 11, 21, ...
+    for index in range(0, count, 10):
+        response = get_bing_page(query, index + 1)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        for li in soup.findAll('li', {'class': 'b_algo'}):
+            title = li.h2.text.replace('\n', '').replace(' ', '')
+            url = li.h2.a['href']
+            desc = li.find('p').text
+            url_entry = {'title': title,
+                         'link': url,
+                         'desc': desc}
+            urls.append(url_entry)
+            if len(urls) == count:
+                return urls
+    return urls
 
 
 def get_duckduckgo_page(query):
@@ -73,35 +74,25 @@ def duckduckgo_search(query):
     return urls
 
 
-def get_google_page(query):
+def get_google_page(query, index):
     """ Fetch the google search results page
         Returns : Results Page
     """
     header = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}
-    payload = {'q': query}
-    response = requests.get('https://www.google.com/search', headers=header, params=payload)
-    return response
-
-def get_google_page(query,startIndex):
-    """ Fetch the google search results page
-        Returns : Results Page
-    """
-    header = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}
-    payload = {'q': query,'start':startIndex}
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}
+    payload = {'q': query, 'start': index}
     response = requests.get('https://www.google.com/search', headers=header, params=payload)
     return response
 
 
-def google_search(query):
+def google_search(query, count):
     """ Search google for the query and return set of urls
         Returns: urls (list)
                 [[Title1, url1], [Title2, url2], ...]
     """
     urls = []
-    for count in range(0,10):
-        response = get_google_page(query,count*10)
+    for index in range(0, count, 10):
+        response = get_google_page(query, index)
         soup = BeautifulSoup(response.text, 'html.parser')
         for h3 in soup.findAll('h3', {'class': 'r'}):
             links = h3.find('a')
@@ -109,48 +100,51 @@ def google_search(query):
             urls.append({'title': links.getText(),
                          'link': links.get('href'),
                          'desc': desc.getText()})
+            if len(urls) == count:
+                return urls
 
     return urls
 
 
-def get_yahoo_page(query):
+def get_yahoo_page(query, index):
     """ Fetch the yahoo search results
         Returns : Results Page
     """
     header = {
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}
-    payload = {'q': query}
+    payload = {'p': query, 'b': index}
     response = requests.get('https://search.yahoo.com/search', headers=header, params=payload)
     return response
 
 
-def yahoo_search(query):
+def yahoo_search(query, count):
     """ Gives search query to yahoo and returns the urls
         Returns: urls (list)
                 [[Title1, url1], [Title2, url2], ...]
     """
     urls = []
-    response = get_yahoo_page(query)
-    soup = BeautifulSoup(response.content, 'lxml')
-    for h in soup.findAll('h3', attrs={'class': 'title'}):
-        t = h.findAll('a', attrs={'class': ' ac-algo fz-l ac-21th lh-24'})
-        for y in t:
-            r = y.get('href')
-            f = r.split('RU=')
-            e = f[-1].split('/RK=0')
-            g = e[-1].split('/RK=1')
-            u = g[0].replace('%3a', ':').replace('%2f', '/').replace('%28', '(').replace('%29', ')').replace('%3f',
-                                                                                                             '?').replace(
-                '%3d', '=').replace('%26', '&').replace('%29', ')').replace('%26', "'").replace('%21', '!').replace(
-                '%23', '$').replace('%40', '[').replace('%5b', ']')
-            d = y.find_next('p')
-            print(d)
-            urls.append({'title': y.getText(),
-                         'link': u,
-                         'desc': d.getText()})
+    # Yahoo's 'b' parameter is the 1-based index of the first result on a
+    # page, so pages start at 1, 11, 21, ...
+    for index in range(0, count, 10):
+        response = get_yahoo_page(query, index + 1)
+        soup = BeautifulSoup(response.content, 'lxml')
+        for h in soup.findAll('h3', attrs={'class': 'title'}):
+            t = h.findAll('a', attrs={'class': ' ac-algo fz-l ac-21th lh-24'})
+            for y in t:
+                r = y.get('href')
+                f = r.split('RU=')
+                e = f[-1].split('/RK=0')
+                g = e[-1].split('/RK=1')
+                # Decode the percent-escaped target url extracted from
+                # Yahoo's redirect link.
+                u = g[0].replace('%3a', ':').replace('%2f', '/').replace('%28', '(') \
+                        .replace('%29', ')').replace('%3f', '?').replace('%3d', '=') \
+                        .replace('%26', '&').replace('%21', '!').replace('%23', '#') \
+                        .replace('%40', '@').replace('%5b', '[').replace('%5d', ']')
+                d = y.find_next('p')
+                urls.append({'title': y.getText(),
+                             'link': u,
+                             'desc': d.getText()})
+                if len(urls) == count:
+                    return urls
 
     return urls
 
 
 def read_in():
@@ -162,15 +156,15 @@ def small_test():
     assert type(google_search('fossasia')) is list
 
 
-def feedgen(query, engine):
+def feedgen(query, engine, count):
     if engine == 'g':
-        urls = google_search(query)
+        urls = google_search(query, count)
     elif engine == 'd':
         urls = duckduckgo_search(query)
     elif engine == 'y':
-        urls = yahoo_search(query)
+        urls = yahoo_search(query, count)
     else:
-        urls = bing_search(query)
+        urls = bing_search(query, count)
     result = urls
     print(result)
     print(len(result))
diff --git a/app/server.py b/app/server.py
index a36f9288..dfc9bcda 100644
--- a/app/server.py
+++ b/app/server.py
@@ -28,7 +28,8 @@ def bad_request(err):
 def search(search_engine):
     try:
         if request.method == 'GET':
-
+            # Default to 10 results when 'num' is absent, so int() never sees None.
+            count = int(request.args.get('num') or 10)
             qformat = request.args.get('format') or 'json'
             if qformat not in ['json', 'xml']:
                 abort(400, 'Not Found - undefined format')
@@ -43,7 +44,7 @@ def search(search_engine):
                 err = [400,
                        'Not Found - missing query', qformat]
                 return bad_request(err)
 
-            result = feedgen(query,engine[0])
+            result = feedgen(query, engine[0], count)
             if not result:
                 err = [404, 'No response', qformat]
                 return bad_request(err)
@@ -75,4 +76,4 @@ def set_header(r):
 
 if __name__ == '__main__':
     app.debug = True
-    app.run(host='0.0.0.0', port=(int)(os.environ.get('PORT', 7001)), debug=True)
\ No newline at end of file
+    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7001)), debug=True)
diff --git a/app/templates/index.html b/app/templates/index.html
index 08ac0391..6459aa9e 100644
--- a/app/templates/index.html
+++ b/app/templates/index.html
@@ -50,11 +50,29 @@
 [The HTML markup in this hunk did not survive extraction; only bare diff
 markers remain. What is recoverable from the commit message and the script
 change below: the hunk rearranges the search controls into a wider layout
 and adds a <select id="resp"> dropdown for the number of results, offering
 values from 10 up to 100.]
@@ -84,7 +102,8 @@
             var sengine = $(this).val();
             var squery = $('#query').val();
             var sformat = $(' #format label.active input').val();
-            var urlloc = window.location.href.split(/\?|#/)[0] + "api/v1/search/" + sengine + "?query=" + squery + "&format=" + sformat;
+            var count = $('#resp').val();
+            var urlloc = window.location.href.split(/\?|#/)[0] + "api/v1/search/" + sengine + "?query=" + squery + "&format=" + sformat + "&num=" + count;
             $.ajax({
                 url: urlloc,
                 type: 'GET',
@@ -110,12 +129,10 @@
             });
             $('.formatButton').click(function(e){
                 e.preventDefault();
-                if($(this).hasClass('active')){
-                    $(this).removeClass('active')
-                } else {
+                if(!$(this).hasClass('active')){
                     $(".active").removeClass("active");
                     $(this).addClass('active')
-                }
+                }
             });
             $(window).keydown(function(event){
                 if(event.keyCode == 13) {

From c4f40a7e576f98fb68eef5aadc6b1b03864c21d6 Mon Sep 17 00:00:00 2001
From: Manish Devgan
Date: Thu, 14 Sep 2017 18:12:10 +0100
Subject: [PATCH 2/2] Updated README and handled TypeError on query-server

README.md now documents the `num` parameter; the sample query did not
mention `&num=10` before. query-server now returns an error object in
JSON format whenever a 500 error code is encountered, so that the page
is not left hanging.

Fixes fossasia/query-server#142 and fossasia/query-server#140
---
 README.md     | 3 ++-
 app/server.py | 8 ++++++--
 package.json  | 7 +++----
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 6ff218d5..19f153c0 100644
--- a/README.md
+++ b/README.md
@@ -29,11 +29,12 @@ The API(s) provided by query-server are as follows:
 
 > *format* : [ `json`, `xml` ]
 
-A sample query : `/api/v1/search/bing?query=fossasia&format=xml`
+A sample query : `/api/v1/search/bing?query=fossasia&format=xml&num=10`
 
 ## Error Codes
 404 Not Found : Incorrect Search Engine, Zero Response
 400 Bad Request : query and/or format is not in the correct format
+500 Internal Server Error : Server Error from Search Engine
 
 ## Dependencies
diff --git a/app/server.py b/app/server.py
index dfc9bcda..422d6d81 100644
--- a/app/server.py
+++ b/app/server.py
@@ -10,6 +10,11 @@
 client = MongoClient(os.environ.get('MONGO_URI', 'mongodb://localhost:27017/'))
 db = client['query-server-v2']
 
+errorObj = {
+    'type': 'Internal Server Error',
+    'status_code': 500,
+    'error': 'Could not parse the page due to Internal Server Error'
+}
 
 @app.route('/')
 def index():
@@ -67,8 +72,7 @@ def search(search_engine):
             return Response(xmlfeed, mimetype='application/xml')
 
     except Exception as e:
-        return (e)
-
+        return Response(json.dumps(errorObj).encode('utf-8'), status=500, mimetype='application/json')
 @app.after_request
 def set_header(r):
     r.headers["Cache-Control"] = "no-cache"
diff --git a/package.json b/package.json
index 94edc743..a626bd07 100644
--- a/package.json
+++ b/package.json
@@ -4,8 +4,7 @@
   "dependencies": {
     "bower": "^1.8.0"
   },
-  "scripts":
-  {
-    "postinstall": "bower install"
-  }
+  "scripts": {
+    "postinstall": "bower install"
+  }
 }
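
A minimal end-to-end sketch of the new `num` parameter, assuming a server
running locally on the default port 7001 from server.py; the engine, query
and count below are illustrative, and the response shape follows what
feedgen() returns:

    import requests

    # Ask the google scraper for up to 30 results, serialized as JSON.
    resp = requests.get(
        'http://localhost:7001/api/v1/search/google',
        params={'query': 'fossasia', 'format': 'json', 'num': 30},
    )

    if resp.ok:
        results = resp.json()  # list of {'title': ..., 'link': ..., 'desc': ...}
        print(len(results))    # at most 30 entries
    else:
        # With PATCH 2/2 applied, scraper failures come back as the errorObj
        # JSON body rather than an unparseable 500 page.
        print(resp.status_code, resp.json())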