From bb9ce50fd45f6005c79ca735124ff8f7d4c907dd Mon Sep 17 00:00:00 2001
From: enigmaeth
Date: Sat, 10 Dec 2016 17:40:38 +0530
Subject: [PATCH 1/2] Delete README.md

---
 README.md | 5 -----
 1 file changed, 5 deletions(-)
 delete mode 100644 README.md

diff --git a/README.md b/README.md
deleted file mode 100644
index b2887ebb..00000000
--- a/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Query-Server
-A simple Query Server that stores a query string on a server
-
-## Description
-A simple web server application that can be used to process a query string. The string calls the Google search result scraper, and the scraper's output is written to a file named after the query string.

From cbdd223a39aceaee97edc93e1d7b431bf22d7e78 Mon Sep 17 00:00:00 2001
From: enigmaeth
Date: Sat, 10 Dec 2016 17:41:26 +0530
Subject: [PATCH 2/2] initial commit

---
 README.md           | 35 ++++++++++++++++++++++++
 data/query_list.txt |  1 +
 package.json        | 30 +++++++++++++++++++++
 requirements.txt    |  3 +++
 rss-generator.py    | 66 +++++++++++++++++++++++++++++++++++++++++++++
 server.js           | 46 +++++++++++++++++++++++++++++++
 6 files changed, 181 insertions(+)
 create mode 100644 README.md
 create mode 100644 data/query_list.txt
 create mode 100644 package.json
 create mode 100644 requirements.txt
 create mode 100644 rss-generator.py
 create mode 100644 server.js

diff --git a/README.md b/README.md
new file mode 100644
index 00000000..8ede5f39
--- /dev/null
+++ b/README.md
@@ -0,0 +1,35 @@
+# query-server
+> Query Server that stores a query string on a server.
+
+This mini-tool can be used to process a query string. The string is passed to the Google search result scraper at [searss](https://github.com/fossasia/searss), and the scraper's output is written to a file named after the query string.
+
+
+
+# Requirements
+* Python 2
+* [Node.js](https://nodejs.org/en/)
+* [PIP](https://pip.pypa.io/en/stable/installing/)
+* [Mechanize](http://wwwsearch.sourceforge.net/mechanize/)
+* [Feedgen](https://github.com/lkiesow/python-feedgen)
+* [BeautifulSoup4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
+
+# Installing
+Make sure you have [Node.js](https://nodejs.org/en/) installed.
+Running this tool requires installing both the Node.js and the Python dependencies:
+```
+$ git clone https://github.com/enigmaeth/query-server.git
+$ cd query-server
+$ npm install
+$ pip install -r requirements.txt
+```
+
+# Running
+To run the query server:
+```
+$ npm start
+```
+You are then prompted for a search:
+```
+Search for >>
+```
+Type the query after the `>>` and hit enter.
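The README added above implies a simple naming contract: each processed query produces `data/<query>.xml` next to the scripts (and, per the server code further down, a line in `data/query_list.txt`). A minimal Python 2 sketch of that path mapping; `'fossasia'` is a stand-in query and the snippet is illustrative, not part of the patch:

```
import os

# 'fossasia' is a hypothetical stand-in; any query string maps the same way.
query = 'fossasia'

# The scraper writes the feed into data/, named after the query string.
base = os.path.dirname(os.path.abspath(__file__))
feed_path = os.path.join(base, 'data', query + '.xml')
print feed_path  # e.g. /path/to/query-server/data/fossasia.xml
```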
diff --git a/data/query_list.txt b/data/query_list.txt
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/data/query_list.txt
@@ -0,0 +1 @@
+
diff --git a/package.json b/package.json
new file mode 100644
index 00000000..c3bc36f9
--- /dev/null
+++ b/package.json
@@ -0,0 +1,30 @@
+{
+  "name": "query-server",
+  "version": "1.0.1",
+  "description": "Query Server that stores a query string on a server",
+  "main": "server.js",
+  "dependencies": {
+    "express": "^4.14.0",
+    "json3": "^3.3.2"
+  },
+  "devDependencies": {},
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1",
+    "start": "node server.js"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/enigmaeth/query-server.git"
+  },
+  "keywords": [
+    "query-server",
+    "rss",
+    "nodejs"
+  ],
+  "author": "enigmaeth",
+  "license": "ISC",
+  "bugs": {
+    "url": "https://github.com/enigmaeth/query-server/issues"
+  },
+  "homepage": "https://github.com/enigmaeth/query-server#readme"
+}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..b48a0811
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+beautifulsoup4>=4.5.1
+feedgen>=0.4.0
+mechanize>=0.2.5
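The scraper script added next builds its output with python-feedgen. As a standalone reference, here is a minimal feedgen round trip using only calls the script itself makes (the feed title comes from the patch; the entry title and URLs are dummy values):

```
from feedgen.feed import FeedGenerator

# Build a one-entry feed, mirroring the calls used in rss-generator.py.
fg = FeedGenerator()
fg.title('Google Search Results')
fg.link(href='http://google.com', rel='alternate')
fg.description('Example description')

fe = fg.add_entry()
fe.title('Example result')
fe.link(href='http://example.com', rel='alternate')

print fg.rss_str(pretty=True)  # dump the RSS XML to stdout
fg.rss_file('example.xml')     # or write it straight to a file
```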
diff --git a/rss-generator.py b/rss-generator.py
new file mode 100644
index 00000000..20e586af
--- /dev/null
+++ b/rss-generator.py
@@ -0,0 +1,66 @@
+import mechanize
+from feedgen.feed import FeedGenerator
+import urlparse
+from bs4 import BeautifulSoup
+import sys
+import json
+import os
+
+query = ''
+
+def generateFeed(urls):
+    ''' Generates an RSS feed from the given urls '''
+    fg = FeedGenerator()
+    fg.title('Google Search Results')
+    fg.link(href='http://google.com', rel='alternate')
+    fg.description('Google Search Results')
+    for url in urls:
+        fe = fg.add_entry()
+        fe.title(url[0])
+        fe.link(href=url[1], rel='alternate')
+    print fg.rss_str(pretty=True)
+    # Write the feed to data/<query>.xml, named after the query string
+    file_name = os.path.dirname(os.path.abspath(__file__)) + '/data/' + query + '.xml'
+    fg.rss_file(file_name)
+
+
+def google_search(query):
+    ''' Search Google for the query and return a list of result urls
+        Returns: urls (list)
+                 [[Title1, url1], [Title2, url2], ...]
+    '''
+    urls = []
+    response = get_results_page(query)
+    soup = BeautifulSoup(response.read(), 'html.parser')
+    # Collect all relevant 'a' tags from the result blocks
+    for a in soup.select('.r a'):
+        parsed_url = urlparse.urlparse(a['href'])
+        # Keep only redirect links that carry the target url in the 'q' parameter
+        if 'url' in parsed_url.path:
+            urls.append([a.text, str(urlparse.parse_qs(parsed_url.query)['q'][0])])
+    return urls
+
+def get_results_page(query):
+    ''' Fetch the Google search results page
+        Returns: results page (mechanize response)
+    '''
+    br = mechanize.Browser()
+    br.set_handle_robots(False)  # Google's robots.txt disallows scraping
+    br.addheaders = [('User-agent', 'Mozilla/5.0')]
+    br.open('http://www.google.com/')
+    br.select_form(name='f')
+    br.form['q'] = query
+    return br.submit()
+
+def read_in():
+    ''' Read the JSON-encoded query that server.js writes to stdin '''
+    lines = sys.stdin.readlines()
+    return json.loads(lines[0])
+
+def main():
+    global query
+    query = read_in()
+    urls = google_search(query)
+    generateFeed(urls)
+
+if __name__ == "__main__":
+    main()
diff --git a/server.js b/server.js
new file mode 100644
index 00000000..da999fd9
--- /dev/null
+++ b/server.js
@@ -0,0 +1,46 @@
+var express = require('express');
+var path = require('path');
+var fs = require('fs');
+var util = require('util');
+var spawn = require('child_process').spawn;
+var readline = require('readline');
+
+var query = '';
+var dataString = '';
+
+// Load previously processed queries, one per line
+var queries = fs.readFileSync('data/query_list.txt').toString().split('\n');
+var rl = readline.createInterface(process.stdin, process.stdout);
+
+rl.setPrompt('\nSearch for >> ');
+rl.prompt();
+rl.on('line', function(line) {
+    var data = line;
+    if (queries.indexOf(data) === -1) {
+        dataString = '';
+        queries.push(data);
+        console.log(' querying -> ' + data);
+
+        // Hand the query to the Python scraper as JSON on its stdin
+        var py = spawn('python', ['rss-generator.py']);
+        py.stdout.on('data', function(chunk) {
+            dataString += chunk.toString();
+        });
+        py.stdout.on('end', function() {
+            console.log(dataString);
+            console.log(' saved to file : ' + __dirname + '/data/' + data + '.xml');
+            rl.prompt();
+        });
+
+        py.stdin.write(JSON.stringify(data));
+        py.stdin.end();
+
+        // Persist only the new query; appending the whole list again
+        // would duplicate earlier entries in query_list.txt
+        fs.appendFile('data/query_list.txt', data + '\n', function(err) {
+            if (err) console.log(err);
+        });
+    } else {
+        console.log(' already queried -> ' + data);
+        console.log(' saved to file : ' + __dirname + '/data/' + data + '.xml');
+        rl.prompt();
+    }
+});
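server.js and rss-generator.py communicate over a one-line protocol: the server writes the JSON-encoded query to the child's stdin and echoes whatever the child prints on stdout. That makes the Python side testable without Node at all; a sketch, assuming both files sit in the working directory and `data/` exists:

```
import json
import subprocess

# Mimic server.js: spawn the generator, hand it the JSON-encoded query
# on stdin, then collect the RSS it prints on stdout.
py = subprocess.Popen(['python', 'rss-generator.py'],
                      stdin=subprocess.PIPE, stdout=subprocess.PIPE)
out, _ = py.communicate(json.dumps('fossasia'))
print out  # the same feed is also saved to data/fossasia.xml
```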