diff --git a/CHANGES.txt b/CHANGES.txt index c49d8ef..c8b03f3 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -62,3 +62,18 @@ v0.2.0, 2014-12-22 -- v0.2.1, 2014-12-22 -- - fixed an oopsy in setup.py and some other minor tweaks to docs + +v0.3, 2015-07-03 -- + - Added plugin support + - Created plugin directory + - Removed the ImgurPageResolver class, replaced with the imgur.py plugin. + - Added instagram, flickr plugins + - Changed WebpageResolver defaults: load_images and use_js_ruleset now default to true, use_adblock_filters now defaults to false. + - Added another rule to the js_ruleset + - Added several command-line options to resolveimg.py to help with debugging and + performance testing. + - Fixed some bugs + - Changed data files installation directory after installing + from setup.py + - Added opengraph plugin + diff --git a/README.rst b/README.rst index c2bdb3d..00cfb38 100644 --- a/README.rst +++ b/README.rst @@ -16,7 +16,6 @@ USAGE try: i = imageresolver.ImageResolver() i.register(imageresolver.FileExtensionResolver()) - i.register(imageresolver.ImgurPageResolver()) i.register(imageresolver.WebpageResolver(load_images=True, parser='lxml',blacklist='easylist.txt')) url = sys.argv[1] @@ -84,13 +83,6 @@ FileExtensionResolver() METHODS Returns the url if the extention matches a possible image -ImgurPageResolver() METHODS ---------------------------- - -**resolve** *(string url)* - -Returns an Imgur image url if `url` matches the pattern of an Imgur page - WebpageResolver() METHODS ------------------------- @@ -137,18 +129,6 @@ By default this exception is skipped and logged but can be enabled with "skip_fe TODO ----------------- -Still missing the following resolvers: - -* ImgurAlbumResolver() - -* FlickrResolver() - -* OpengraphResolver() - -* InstagramResolver() - -I have no plans to implement a 9gag resolver. - Need to implement better caching. 
Future plan is to include a configurable cache method so images seen across sessions can be cached for better performance diff --git a/bin/resolveimg.py b/bin/resolveimg.py index c9e3f1f..1622f20 100644 --- a/bin/resolveimg.py +++ b/bin/resolveimg.py @@ -3,6 +3,7 @@ import sys import imageresolver import logging +import time from optparse import OptionParser logger = logging.getLogger('ImageResolver') @@ -14,10 +15,16 @@ opts.add_option("-r","--max-read", dest="max_read",help="Set the max read size") opts.add_option("-c","--chunk-size",dest="chunk_size",help="Chunk size to read on each pass") opts.add_option("-a","--read-all",dest="read_all",help="Read the entire image before checking size. Useful for some JPGs. Overrides --max-read") +opts.add_option("-b","--adblock", action="store_true",dest="use_adblock_filters",help="Use adblock filters.") +opts.add_option("-s","--no-ruleset", action="store_true",dest="use_js_ruleset",help="Use a custom ruleset for scoring.") +opts.add_option("--benchmark", action="store_true",dest="benchmark",help="Benchmark the total time it takes for the script to return an image") +opts.add_option("-n","--no-load-images", action="store_true",dest="load_images",help="Do not load images") +opts.add_option("-p","--parser", dest="parser",help="Choose a parser to use") (options,args) = opts.parse_args() kw_options = {} + if options.read_all: kw_options['read_all'] = True elif options.max_read: @@ -26,6 +33,16 @@ if options.chunk_size: kw_options['chunk_size'] = int(options.chunk_size) +if options.use_js_ruleset: + kw_options['use_js_ruleset'] = False + +if options.parser: + kw_options['parser'] = options.parser + +if options.load_images: + kw_options['load_images'] = False + +kw_options['use_adblock_filters'] = options.use_adblock_filters kw_options['debug'] = options.debug try: @@ -39,10 +56,16 @@ print "URL required. 
Please use the url option or pass a url as the first argument" sys.exit(-1) + +if options.benchmark: + t1 = time.time() + i = imageresolver.ImageResolver(**kw_options) i.register(imageresolver.FileExtensionResolver()) -i.register(imageresolver.ImgurPageResolver()) -i.register(imageresolver.WebpageResolver(load_images=True, parser='lxml')) +i.register(imageresolver.WebpageResolver(**kw_options)) print i.resolve(url) +if options.benchmark: + print 'TOTAL TIME', time.time() - t1 + diff --git a/docs/README.txt b/docs/README.txt index bfc4b30..4126245 100644 --- a/docs/README.txt +++ b/docs/README.txt @@ -16,7 +16,6 @@ USAGE try: i = imageresolver.ImageResolver() i.register(imageresolver.FileExtensionResolver()) - i.register(imageresolver.ImgurPageResolver()) i.register(imageresolver.WebpageResolver(load_images=True, parser='lxml',blacklist='easylist.txt')) url = sys.argv[1] @@ -84,13 +83,6 @@ FileExtensionResolver() METHODS Returns the url if the extention matches a possible image -ImgurPageResolver() METHODS ---------------------------- - -**resolve** *(string url)* - -Returns an Imgur image url if `url` matches the pattern of an Imgur page - WebpageResolver() METHODS ------------------------- @@ -137,18 +129,6 @@ By default this exception is skipped and logged but can be enabled with "skip_fe TODO ----------------- -Still missing the following resolvers: - -* ImgurAlbumResolver() - -* FlickrResolver() - -* OpengraphResolver() - -* InstagramResolver() - -I have no plans to implement a 9gag resolver. - Need to implement better caching. Future plan is to include a configurable cache method so images seen across sessions can be cached for better performance diff --git a/imageresolver/__init__.py b/imageresolver/__init__.py index edae329..42bcaa6 100644 --- a/imageresolver/__init__.py +++ b/imageresolver/__init__.py @@ -149,23 +149,11 @@ def resolve(self,url,**kwargs): return None -class ImgurPageResolver(object): - # works a little different than the JS version. 
- # it should drop references to galleries and find the image - # could be buggy! - def resolve(self,url,**kwargs): - logger.debug('Resolving using Imgur ' + str(url)) - parsed = urlparse(url) - if re.search( 'imgur.com(:80)*', parsed.netloc) and os.path.basename(parsed.path): - return 'http://i.imgur.com/' + os.path.basename(parsed.path) + '.jpg' - - return None - class WebpageResolver(object): def __init__(self,**kwargs): - self.load_images = kwargs.get('load_images',False) - self.use_js_ruleset = kwargs.get('use_js_ruleset',False) - self.use_adblock_filters = kwargs.get('use_adblock_filters',True) + self.load_images = kwargs.get('load_images',True) + self.use_js_ruleset = kwargs.get('use_js_ruleset',True) + self.use_adblock_filters = kwargs.get('use_adblock_filters',False) self.significant_surface = kwargs.get('significant_surface', 100*100) cwd = os.path.dirname(__file__) @@ -218,6 +206,7 @@ def _score(self,image): {'pattern':'1x1','score':-1}, {'pattern':'pixel','score':-1}, {'pattern':'ads','score':-1}, + {'pattern':'transparent','score':-1} ] for r in rules: @@ -254,12 +243,35 @@ def _score(self,image): return score + def plugin_resolve(self,url,soup,**kwargs): + plugins = {} + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'plugins') + sys.path.insert(0, path) + for plugin_file in os.listdir(path): + filename, extension = os.path.splitext(plugin_file) + if extension == '.py' and filename != '__init__': + mod = __import__(filename) + plugins[filename] = mod.Plugin() + sys.path.pop(0) + + for plugin in plugins.values(): + image = plugin.get_image(url,soup) + if image: + return image + return None + + def resolve(self,url,**kwargs): logger.debug('Resolving as a webpage ' + str(url)) - ir = ImageResolver() content = ir.fetch(url) soup = BeautifulSoup(content,self.parser) + + plugin_image = self.plugin_resolve(url,soup) + + if plugin_image: + return plugin_image + images = soup.find_all('img') candidates = [] diff --git a/imageresolver/abpy 
b/imageresolver/abpy index a8ff334..6177472 160000 --- a/imageresolver/abpy +++ b/imageresolver/abpy @@ -1 +1 @@ -Subproject commit a8ff334b6795cfa07a68ff607221aa513c493987 +Subproject commit 61774722b8b58cfb3bbae70e26addb89d654958b diff --git a/imageresolver/plugins/__init__.py b/imageresolver/plugins/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/imageresolver/plugins/flickr.py b/imageresolver/plugins/flickr.py new file mode 100644 index 0000000..a810643 --- /dev/null +++ b/imageresolver/plugins/flickr.py @@ -0,0 +1,15 @@ +import re +import os +import requests +import logging +from bs4 import BeautifulSoup + +class Plugin: + def get_image(self, url, soup): + if re.search('http(s*):\/\/www.flickr.com\/photos\/([^\/]*)\/([^\/]*)\/(.*)', url): + logger = logging.getLogger('ImageResolver') + logger.debug('Resolving using plugin ' + str(os.path.basename(__file__)) + ' ' + str(url)) + tag = soup.find('img', {'class':'main-photo'}) + if tag: + return 'https:' + tag['src'] + return None diff --git a/imageresolver/plugins/imgur.py b/imageresolver/plugins/imgur.py new file mode 100644 index 0000000..69046d0 --- /dev/null +++ b/imageresolver/plugins/imgur.py @@ -0,0 +1,33 @@ +import re +import os +import requests +from bs4 import BeautifulSoup +from urlparse import urlparse +import logging + +class Plugin: + def get_image(self, url, soup): + if re.search('http(s*):\/\/(i\.|m\.)*imgur.com\/(gallery\/){0,1}(.*)', url): + logger = logging.getLogger('ImageResolver') + logger.debug('Resolving using plugin ' + str(os.path.basename(__file__)) + ' ' + str(url)) + parsed = urlparse(url) + + if parsed.path[1:8] == 'gallery': + logger.debug('Detected imgur gallery.') + tag = soup.find('div', {'id':'1','class':'album-image'}) + image = re.findall('i\.imgur.com\/.*\.\w+', str(tag)) + if len(image) >= 1: + return 'http://' + image[0] + + elif parsed.path[0:3] == '/a/': + logger.debug('Detected imgur album.') + tag = soup.find('meta',{'name':'twitter:image0:src'}) + 
if tag: + return tag['content'] + + else: + parsed = urlparse(url) + if re.search('imgur.com(:80)*', parsed.netloc) and os.path.basename(parsed.path): + return 'http://i.imgur.com/' + os.path.basename(parsed.path) + '.jpg' + return None + diff --git a/imageresolver/plugins/opengraph.py b/imageresolver/plugins/opengraph.py new file mode 100644 index 0000000..6fc1c0e --- /dev/null +++ b/imageresolver/plugins/opengraph.py @@ -0,0 +1,54 @@ +import re +import os +import logging +from bs4 import BeautifulSoup +from operator import itemgetter + +class Plugin: + def get_image(self, url, soup): + + ogtags = [{'type':'facebook','attribute':'property', 'name':'og:image', 'value':'content'}, + {'type':'facebook','attribute':'rel', 'name':'image_src', 'value':'href'}, + {'type':'twitter','attribute':'name', 'name':'twitter:image', 'value':'value'}, + {'type':'twitter','attribute':'name', 'name':'twitter:image', 'value':'content'}, + {'type':'twitter','attribute':'property', 'name':'twitter:image', 'value':'content'}, + {'type':'image','attribute':'itemprop', 'name':'image', 'value':'content'}] + + ogimages = [] + + for ogtag in ogtags: + tags = soup.find_all('meta', {ogtag['attribute']:ogtag['name']}) + if tags != []: + for image in tags: + try: + ogimages = ogimages + [{'url':image[ogtag['value']], 'type':ogtag['type'], 'score':0} for image in tags] + except KeyError as e: + pass + + ogimages_len = len(ogimages) + + # if more than 1 image, score and return the best one + if ogimages_len >= 1: + if ogimages_len == 1: + logger = logging.getLogger('ImageResolver') + logger.debug('Resolving using plugin ' + str(os.path.basename(__file__)) + ' ' + str(url)) + resolved_image = ogimages[0]['url'] + else: + for image in ogimages: + if re.search('(large|big)', image['url'], re.IGNORECASE): + image['score'] += 1 + if image['type'] == 'twitter': + image['score'] += 1 + + ogimages.sort(key=itemgetter('score'), reverse=True) + resolved_image = ogimages[0]['url'] + + if not 
re.search('^https?:', resolved_image): + if resolved_image.startswith('//'): + return 'http:' + resolved_image + else: + return resolved_image + + + return None + diff --git a/imageresolver/plugins/twitter.py b/imageresolver/plugins/twitter.py new file mode 100644 index 0000000..56e8fce --- /dev/null +++ b/imageresolver/plugins/twitter.py @@ -0,0 +1,21 @@ +import re +import os +import logging +from urlparse import urlparse +from bs4 import BeautifulSoup + +class Plugin: + def get_image(self, url, soup): + if re.search('http(s*):\/\/(mobile\.|m\.)*twitter.com\/[a-zA-z0-9]*\/status\/\d+', url): + logger = logging.getLogger('ImageResolver') + logger.debug('Resolving using plugin ' + str(os.path.basename(__file__)) + ' ' + str(url)) + parsed = urlparse(url) + if parsed.netloc.split('.')[0] == 'mobile': + tag = soup.find('img',{'class':'CroppedPhoto-img u-block'}) + if tag: + return tag['src'] + + else: + tag = soup.find('meta',{'property':'og:image'}) + if tag: + return tag['content'] diff --git a/setup.py b/setup.py index ef0bd8e..fd7bbd4 100644 --- a/setup.py +++ b/setup.py @@ -12,8 +12,8 @@ version=__version__, author='Chris Brown', author_email='chris.brown@nwyc.com', - packages=['imageresolver','imageresolver.abpy'], - data_files=[(os.path.join( get_python_lib(),'imageresolver','data'),[ os.path.join('imageresolver','data','whitelist.txt') , os.path.join('imageresolver','data','blacklist.txt')])], + packages=['imageresolver','imageresolver.abpy','imageresolver.plugins'], + package_data={'imageresolver': ['data/*.txt']}, scripts=['bin/resolveimg.py'], url='https://github.com/constituentvoice/ImageResolverPython', license='BSD', diff --git a/tests/test_imageresolver.py b/tests/test_imageresolver.py index 14cdc75..814fc26 100644 --- a/tests/test_imageresolver.py +++ b/tests/test_imageresolver.py @@ -3,7 +3,7 @@ import requests from os.path import dirname,abspath sys.path.append( dirname( dirname( dirname( abspath(__file__)) ) ) ) -from imageresolver import 
ImageResolver, FileExtensionResolver, ImgurPageResolver, WebpageResolver +from imageresolver import ImageResolver, FileExtensionResolver, PluginResolver, WebpageResolver class TestImageResolver(unittest.TestCase): def setUp(self): @@ -27,13 +27,13 @@ def test_fetch_image_info(self): self.assertEquals(ext,'.png') self.assertEquals(width,518) self.assertEquals(height,588) - - def test_resolve_imgur(self): + + def test_resolve_plugin(self): i = ImageResolver() - i.register(ImgurPageResolver()) + i.register(PluginResolver()) src = i.resolve(self.imgur_page) self.assertEquals(src,self.imgur_result) - + def test_resolve_fileext(self): i = ImageResolver() i.register(FileExtensionResolver())