Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .flake8.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[flake8]
max-line-length = 150
ignore = W191
exclude =
.git,
__pycache__
3 changes: 2 additions & 1 deletion .hound.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
python:
enabled: true
config_file: .flake8.ini


fail_on_violations: true
6 changes: 3 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
language: python
python:
- "2.7"
- "3.6"
- 2.7
- 3.6
install:
- pip install flake8 pytest
- pip install -r requirements.txt
Expand All @@ -10,4 +10,4 @@ before_script:
script:
- pytest --capture=sys
after_success:
- coveralls
- codecov
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[![Dependency Status](https://david-dm.org/fossasia/query-server.svg)](https://david-dm.org/fossasia/query-server)
[![Join the chat at https://gitter.im/fossasia/query-server](https://badges.gitter.im/fossasia/query-server.svg)](https://gitter.im/fossasia/query-server?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

> The query server can be used to search a keyword/phrase on a search engine (Google, Yahoo, Bing, Ask, DuckDuckGo, Yandex, Baidu and Exalead) and get the results as `json` or `xml`. The tool also stores the searched query string in a MongoDB database for analytical purposes. (The search engine scrapper is based on the scraper at [fossasia/searss](https://github.com/fossasia/searss).)
> The query server can be used to search a keyword/phrase on a search engine (Google, Yahoo, Bing, Ask, DuckDuckGo, Yandex, Baidu, Exalead, Quora, YouTube and Mojeek) and get the results as `json` or `xml`. The tool also stores the searched query string in a MongoDB database for analytical purposes. (The search engine scraper is based on the scraper at [fossasia/searss](https://github.com/fossasia/searss).)

[![Deploy to Docker Cloud](https://files.cloud.docker.com/images/deploy-to-dockercloud.svg)](https://cloud.docker.com/stack/deploy/?repo=https://github.com/fossasia/query-server) [![Deploy](https://www.herokucdn.com/deploy/button.svg)](https://heroku.com/deploy?template=https://github.com/fossasia/query-server) [![Deploy on Scalingo](https://cdn.scalingo.com/deploy/button.svg)](https://my.scalingo.com/deploy?source=https://github.com/fossasia/query-server#master) [![Deploy to Bluemix](https://bluemix.net/deploy/button.png)](https://bluemix.net/deploy?repository=https://github.com/fossasia/query-server&branch=master)

Expand All @@ -23,7 +23,7 @@ The API(s) provided by query-server are as follows:

` GET /api/v1/search/<search-engine>?query=query&format=format `

> *search-engine* : [`google`, `ask`, `bing`, `duckduckgo`, `yahoo`, `yandex`, `baidu`, `exalead`]
> *search-engine* : [`google`, `ask`, `bing`, `duckduckgo`, `yahoo`, `yandex`, `ubaidu`, `exalead`, `quora`, `tyoutube`, `mojeek`]

> *query* : query can be any string

Expand Down
9 changes: 7 additions & 2 deletions app/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
from baidu import Baidu
from exalead import Exalead
from quora import Quora
from youtube import Youtube
from mojeek import Mojeek


scrapers = {
'g': Google(),
Expand All @@ -20,7 +23,9 @@
'yd': Yandex(),
'u': Baidu(),
'e': Exalead(),
'q': Quora()
'q': Quora(),
't': Youtube(),
'm': Mojeek()
}


Expand All @@ -34,7 +39,7 @@ def small_test():


def feedgen(query, engine, count=10):
if engine == 'q':
if engine in ['q', 't']:
urls = scrapers[engine].search_without_count(query)
else:
urls = scrapers[engine].search(query, count)
Expand Down
3 changes: 3 additions & 0 deletions app/scrapers/ask.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@ def parseResponse(self, soup):
[[Tile1,url1], [Title2, url2],..]
"""
urls = []
if soup.find('div', {'class': 'PartialSearchResults-noresults'}):
return None
for div in soup.findAll('div', {'class': 'PartialSearchResults-item'}):
title = div.div.a.text
url = div.div.a['href']
p = div.find('p', {'class': 'PartialSearchResults-item-abstract'})
desc = p.text.replace('\n', '')
urls.append({'title': title, 'link': url, 'desc': desc})
print('Ask parsed: ' + str(urls))
return urls
2 changes: 1 addition & 1 deletion app/scrapers/baidu.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,6 @@ def parseResponse(self, soup):
url = div.h3.a['href']
urls.append({'title': title, 'link': url})

print('parsed' + str(urls))
print('Baidu parsed: ' + str(urls))

return urls
2 changes: 1 addition & 1 deletion app/scrapers/bing.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,6 @@ def parseResponse(self, soup):
'desc': desc}
urls.append(url_entry)

print('parsed' + str(urls))
print('Bing parsed: ' + str(urls))

return urls
2 changes: 1 addition & 1 deletion app/scrapers/duckduckgo.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@ def parseResponse(self, soup):
urls.append({'title': links.getText(),
'link': links.get('href')})

print('parsed' + str(urls))
print('DuckDuckGo parsed: ' + str(urls))

return urls
3 changes: 2 additions & 1 deletion app/scrapers/generalized.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ def search(self, query, numResults):
response = self.get_page(query, currentStart)
soup = BeautifulSoup(response.text, 'html.parser')
newResults = self.parseResponse(soup)

if newResults is None:
break
urls.extend(newResults)
currentStart = self.nextStart(currentStart, newResults)
return urls[: numResults]
Expand Down
2 changes: 1 addition & 1 deletion app/scrapers/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ def parseResponse(self, soup):
links = h3.find('a')
urls.append({'title': links.getText(), 'link': links.get('href')})

print('parsed' + str(urls))
print('Google parsed: ' + str(urls))

return urls
26 changes: 26 additions & 0 deletions app/scrapers/mojeek.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from __future__ import print_function
from generalized import Scraper


class Mojeek(Scraper):
    """Scraper class for Mojeek"""

    def __init__(self):
        # Endpoint and paging settings stored on the instance.
        # NOTE(review): presumably consumed by the Scraper base class when
        # building requests -- confirm against generalized.py.
        self.url = 'https://www.mojeek.co.uk/search'
        self.defaultStart = 1
        self.startKey = 's'

    def parseResponse(self, soup):
        """ Collect the result links found in a parsed Mojeek page

        Every ``<a class="ob">`` element in ``soup`` contributes one entry
        holding its text and its href attribute.

        Returns: urls (list)
                [{'title': Title1, 'link': url1}, ...]
        """
        urls = [
            {'title': anchor.getText(), 'link': anchor.get('href')}
            for anchor in soup.findAll('a', {'class': 'ob'})
        ]

        print('Mojeek parsed: ' + str(urls))

        return urls
2 changes: 1 addition & 1 deletion app/scrapers/yahoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,6 @@ def parseResponse(self, soup):
'link': u
})

print('parsed' + str(urls))
print('Yahoo parsed: ' + str(urls))

return urls
2 changes: 1 addition & 1 deletion app/scrapers/yandex.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ def parseResponse(self, soup):
for a in soup.findAll('a', {'class': 'link link_theme_normal'}):
urls.append({'title': a.getText(), 'link': a.get('href')})

print('parsed' + str(urls))
print('Yandex parsed: ' + str(urls))

return urls
28 changes: 28 additions & 0 deletions app/scrapers/youtube.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from __future__ import print_function
from generalized import Scraper


class Youtube(Scraper):
    """Scraper class for Youtube"""

    def __init__(self):
        # Search endpoint and the name of the query-string parameter.
        # NOTE(review): presumably consumed by the Scraper base class when
        # building requests -- confirm against generalized.py.
        self.url = 'https://www.youtube.com/results'
        self.queryKey = 'search_query'

    def parseResponse(self, soup):
        """ Parse the response and return the list of video links

        Only anchors whose href starts with '/watch?' are kept; the href is
        made absolute by prefixing 'https://www.youtube.com'. Anchors whose
        text starts with two newlines are dropped.

        Returns: urls (list)
                [{'title': Title1, 'link': url1}, ...]
        """
        urls = []
        for a in soup.findAll('a'):
            href = a.get('href')
            # An <a> without an href attribute yields None here; skip it
            # rather than crash on None.startswith().
            if not href or not href.startswith('/watch?'):
                continue

            title = a.getText()
            # NOTE(review): anchors whose text begins with blank lines look
            # like non-title (e.g. thumbnail) links -- confirm.
            if title.startswith('\n\n'):
                continue

            link = 'https://www.youtube.com' + str(href)
            urls.append({'title': title, 'link': link})

        print('Youtube parsed: ' + str(urls))

        return urls
8 changes: 3 additions & 5 deletions app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ def search(search_engine):

engine = search_engine
if engine not in ('google', 'bing', 'duckduckgo', 'yahoo', 'ask',
'yandex', 'ubaidu', 'exalead', 'quora'):
'yandex', 'ubaidu', 'exalead', 'quora', 'tyoutube',
'mojeek'):
err = [404, 'Incorrect search engine', qformat]
return bad_request(err)

Expand All @@ -49,10 +50,7 @@ def search(search_engine):
err = [400, 'Not Found - missing query', qformat]
return bad_request(err)

if engine[0] == 'q':
result = feedgen(query, engine[0])
else:
result = feedgen(query, engine[0], count)
result = feedgen(query, engine[0], count)
if not result:
err = [404, 'No response', qformat]
return bad_request(err)
Expand Down
Binary file added app/static/images/mojeek_icon.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added app/static/images/youtube_icon.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 2 additions & 0 deletions app/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ <h1><code>query-server</code></h1>
<button type="submit" value="ubaidu" class="btn btn-lg search btn-outline"><img src="{{ url_for('static', filename='images/baidu_icon.ico') }}" width="30px" alt="Baidu Icon"> Baidu</button>
<button type="submit" value="exalead" class="btn btn-lg search btn-outline"><img src="{{ url_for('static', filename='images/exalead_icon.png') }}" width="30px" alt="Exalead Icon"> Exalead</button>
<button type="submit" value="quora" class="btn btn-lg search btn-outline"><img src="{{ url_for('static', filename='images/quora_icon.png') }}" width="30px" alt="Quora Icon"> Quora</button>
<button type="submit" value="tyoutube" class="btn btn-lg search btn-outline"><img src="{{ url_for('static', filename='images/youtube_icon.png') }}" width="30px" alt="YouTube Icon"> YouTube</button>
<button type="submit" value="mojeek" class="btn btn-lg search btn-outline"><img src="{{ url_for('static', filename='images/mojeek_icon.png') }}" width="30px" alt="Mojeek Icon"> Mojeek</button>
</div>
</div>
<div class="col-sm-2">
Expand Down