Merge branch 'release/v0.3.0'
fated committed Nov 17, 2014
2 parents 74b1800 + 0566778 commit 04c343f
Showing 3 changed files with 65 additions and 36 deletions.
7 changes: 5 additions & 2 deletions README.md
@@ -1,9 +1,9 @@
Calibre Metadata Source Plugin for Amazon.cn
=========

This plugin allows Calibre to read book information from Amazon.cn when you choose to download/fetch metadata. Calibre currently comes with plugins for a number of information sources such as Amazon and Googlebooks. Adding this plugin can potentially increase both the success rate and quality of information retrieved for some of your chinese books.
This plugin allows Calibre to read book information from Amazon.cn when you choose to download/fetch metadata. Calibre currently comes with plugins for a number of information sources such as Amazon and Googlebooks. Adding this plugin can potentially increase both the success rate and quality of information retrieved for some of your Chinese books.

### Main Features of v0.2.0
### Main Features of v0.3.0
This plugin can retrieve amazon_cn id, title, author, comments, rating, publisher, publication date, language, tags and covers from Amazon.cn. The amazon_cn id will also be displayed in the book details panel as "Amazon.cn" to be clicked on and taken directly to the website for that book.

### Special Notes:
@@ -22,6 +22,9 @@ If you find this plugin useful please feel free to show your appreciation. I hav
</a>

### Version History:
* __Version 0.3.0__ - 17 Nov 2014
Fix parsing issue caused by new Amazon style.
Fix extra CSS style info in title when parsing.
* __Version 0.2.0__ - 04 Nov 2013
Add support for parsing tags.
* __Version 0.1.0__ - 30 Oct 2013
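
The README above says the amazon_cn identifier is rendered in the book details panel as a clickable "Amazon.cn" link. In calibre, a metadata source usually provides that link through the `Source.get_book_url` hook. The sketch below is illustrative only and is not code from this commit; the class name and the `/dp/` URL pattern are assumptions.

```python
# Illustrative sketch only -- not part of this commit. Assumes the standard
# calibre Source API; the class name and /dp/ URL pattern are assumptions.
from calibre.ebooks.metadata.sources.base import Source

class ExampleAmazonCN(Source):
    name = 'Example Amazon.cn'

    def get_book_url(self, identifiers):
        # calibre expects an (id_type, id_value, url) tuple; the identifier
        # is then shown as a clickable link in the book details panel.
        asin = identifiers.get('amazon_cn', None)
        if asin:
            return ('amazon_cn', asin, 'http://www.amazon.cn/dp/%s' % asin)

    def get_book_url_name(self, idtype, idval, url):
        # Label used for the link; by default calibre falls back to the
        # plugin's name attribute.
        return 'Amazon.cn'
```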
89 changes: 57 additions & 32 deletions __init__.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
from __future__ import (unicode_literals, division, absolute_import,
print_function)

__license__ = 'GPL v3'
@@ -11,9 +11,9 @@
from threading import Thread
from Queue import Queue, Empty

from calibre import as_unicode
from calibre import as_unicode, random_user_agent
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
fixauthors)
from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.localization import canonicalize_lang
@@ -24,15 +24,16 @@ class Amazon_CN(Source):
description = _('Downloads metadata and covers from Amazon.cn')

author = 'Bruce Chou'
version = (0, 2, 0)
version = (0, 3, 0)
minimum_calibre_version = (0, 8, 0)

capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'identifier:amazon_cn',
'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate',
'rating', 'comments', 'publisher', 'pubdate',
'languages', 'series'])
has_html_comments = True
supports_gzip_transfer_encoding = True
prefer_results_with_isbn = False

MAX_EDITIONS = 5

@@ -51,6 +52,12 @@ def test_fields(self, mi):
elif mi.is_null(key):
return key

@property
def user_agent(self):
# Pass in an index to random_user_agent() to test with a particular
# user agent
return random_user_agent()

def get_asin(self, identifiers):
for key, val in identifiers.iteritems():
key = key.lower()
@@ -86,7 +93,7 @@ def create_query(self, log, title=None, authors=None, identifiers={}): # {{{

# See the amazon detailed search page to get all options
q = {'search-alias': 'aps',
'unfiltered': '1',
'unfiltered': '1',
}
q['sort'] = 'relevance_rank'

@@ -117,9 +124,9 @@ def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
# magic parameter to enable Chinese GBK encoding.
q['__mk_zh_CN'] = u'亚马逊网站'

encode_to = 'utf8'
encode_to = 'utf-8'
encoded_q = dict([(x.encode(encode_to, 'ignore'), y.encode(encode_to,
'ignore')) for x, y in
'ignore')) for x, y in
q.iteritems()])
url = 'http://www.amazon.cn/s/?' + urlencode(encoded_q)
return url
@@ -146,22 +153,35 @@ def parse_results_page(self, root): # {{{

def title_ok(title):
title = title.lower()
bad = ['[Kindle版]']
bad = [u'套装', u'[有声书]', u'[音频cd]']
for x in bad:
if x in title:
return False
return True

for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
links = div.xpath(r'descendant::a[@class="title" and @href]')
if not links:
# New amazon markup
links = div.xpath('descendant::h3/a[@href]')
for a in links:
title = tostring(a, method='text', encoding=unicode)
if title_ok(title):
matches.append(a.get('href'))
break
for a in root.xpath(r'//li[starts-with(@id, "result_")]//a[@href and contains(@class, "s-access-detail-page")]'):
title = tostring(a, method='text', encoding=unicode)
if title_ok(title):
url = a.get('href')
if url.startswith('/'):
url = 'http://www.amazon.cn%s' % (url)
matches.append(url)

if not matches:
# Previous generation of results page markup
for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
links = div.xpath(r'descendant::a[@class="title" and @href]')
if not links:
# New amazon markup
links = div.xpath('descendant::h3/a[@href]')
for a in links:
title = tostring(a, method='text', encoding=unicode)
if title_ok(title):
url = a.get('href')
if url.startswith('/'):
url = 'http://www.amazon.cn%s' % (url)
matches.append(url)
break

if not matches:
# This can happen for some user agents that Amazon thinks are
@@ -171,17 +191,21 @@ def title_ok(title):
for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'):
title = tostring(a, method='text', encoding=unicode)
if title_ok(title):
matches.append(a.get('href'))
url = a.get('href')
if url.startswith('/'):
url = 'http://www.amazon.cn%s' % (url)
matches.append(url)
break

# Keep only the top MAX_EDITIONS matches as the matches are sorted by relevance by Amazon so lower matches are not likely to be very relevant
# Keep only the top MAX_EDITIONS matches as the matches are sorted by relevance
# by Amazon so lower matches are not likely to be very relevant
return matches[:self.MAX_EDITIONS]
# }}}

def identify(self, log, result_queue, abort, title=None, authors=None,
def identify(self, log, result_queue, abort, title=None, authors=None,
identifiers={}, timeout=30): # {{{
'''
Note this method will retry without identifiers automatically if no
Note this method will retry without identifiers automatically if no
match is found with identifiers.
'''
from calibre.utils.cleantext import clean_ascii_chars
@@ -197,7 +221,8 @@ def identify(self, log, result_queue, abort, title=None, authors=None,
log.error('Insufficient metadata to construct query')
return
br = self.browser

if testing:
print ('Using user agent for amazon.cn: %s'%self.user_agent)
try:
raw = br.open_novisit(query, timeout=timeout).read().strip()
except Exception as e:
Expand All @@ -208,7 +233,7 @@ def identify(self, log, result_queue, abort, title=None, authors=None,
attr = getattr(e, 'args', [None])
attr = attr if attr else [None]
if isinstance(attr[0], socket.timeout):
msg = _('Amazon timed out. Try again later.')
msg = _('Amazon.cn timed out. Try again later.')
log.error(msg)
else:
msg = 'Failed to make identify query: %r'%query
@@ -259,7 +284,7 @@ def identify(self, log, result_queue, abort, title=None, authors=None,
log.error('No matches found with query: %r'%query)
return

from calibre_plugins.Amazon_CN.worker import Worker
from calibre_plugins.AMAZON_CN.worker import Worker
workers = [Worker(url, result_queue, br, log, i, self,
testing=testing) for i, url in enumerate(matches)]

@@ -282,14 +307,14 @@ def identify(self, log, result_queue, abort, title=None, authors=None,
return None
# }}}

def download_cover(self, log, result_queue, abort,
title=None, authors=None, identifiers={}, timeout=30,
get_best_cover=False):
def download_cover(self, log, result_queue, abort,
title=None, authors=None, identifiers={}, timeout=30,
get_best_cover=False): # {{{
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')
rq = Queue()
self.identify(log, rq, abort, title=title, authors=authors,
self.identify(log, rq, abort, title=title, authors=authors,
identifiers=identifiers)
if abort.is_set():
return
@@ -319,11 +344,11 @@ def download_cover(self, log, result_queue, abort,
result_queue.put((self, cdata))
except:
log.exception('Failed to download cover from:', cached_url)
# }}}
# }}}

if __name__ == '__main__': # tests {{{
# To run these tests use: calibre-debug -e __init__.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
title_test, authors_test)

test_identify_plugin(Amazon_CN.name,
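
The `test_identify_plugin(...)` call above is truncated in this view. In calibre metadata source plugins it is normally completed with a list of (query, checks) pairs built from `title_test` and `authors_test`, which this file already imports. The sketch below only shows the usual shape; the ISBN, title and author are hypothetical placeholders, not the test cases from this commit.

```python
# Hypothetical sketch of the usual shape of the truncated test call above.
# The ISBN, title and author are placeholders, not this plugin's real tests.
test_identify_plugin(Amazon_CN.name, [
    (
        {'identifiers': {'isbn': '9787500000000'}},      # placeholder ISBN
        [title_test('Placeholder Title', exact=False),   # expected title check
         authors_test(['Placeholder Author'])],          # expected authors check
    ),
])
```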
5 changes: 3 additions & 2 deletions worker.py
@@ -29,7 +29,7 @@ class Worker(Thread): # Get details {{{
Get book details from Amazon's book page in a separate thread
'''

def __init__(self, url, result_queue, browser, log, relevance, plugin,
def __init__(self, url, result_queue, browser, log, relevance, plugin,
timeout=20, testing=False):
Thread.__init__(self)
self.daemon = True
@@ -301,7 +301,8 @@ def parse_title(self, root):
h1.remove(child)
return self.totext(h1)
tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0]
actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
ttdiv = tdiv.xpath('descendant::*[@id="btAsinTitle"]')[0]
actual_title = ttdiv.xpath('descendant::*[@style="padding-left: 0"]')
if actual_title:
title = self.tostring(actual_title[0], encoding=unicode,
method='text').strip()
