Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,18 @@ v0.2.0, 2014-12-22 --

v0.2.1, 2014-12-22 --
- fixed an oopsy in setup.py and some other minor tweaks to docs

v0.3, 2015-07-03 --
- Added plugin support
- Created plugin directory
- Removed the ImgurPageResolver class, replaced with the imgur.py plugin.
- Added instagram, flickr plugins
- Changed WebResolver defaults, load_images and use_js_ruleset now default to true.
- Added another rule to the js_ruleset
- Added several command options to resolveimg.py to help with debugging and
performance testing.
- Fixed some bugs
- changed data files installation directory after installing
from setup.py
- Added opengraph plugin

20 changes: 0 additions & 20 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ USAGE
try:
i = imageresolver.ImageResolver()
i.register(imageresolver.FileExtensionResolver())
i.register(imageresolver.ImgurPageResolver())
i.register(imageresolver.WebpageResolver(load_images=True, parser='lxml',blacklist='easylist.txt'))
url = sys.argv[1]

Expand Down Expand Up @@ -84,13 +83,6 @@ FileExtensionResolver() METHODS

Returns the url if the extension matches a possible image

ImgurPageResolver() METHODS
---------------------------

**resolve** *(string url)*

Returns an Imgur image url if `url` matches the pattern of an Imgur page

WebpageResolver() METHODS
-------------------------

Expand Down Expand Up @@ -137,18 +129,6 @@ By default this exception is skipped and logged but can be enabled with "skip_fe
TODO
-----------------

Still missing the following resolvers:

* ImgurAlbumResolver()

* FlickrResolver()

* OpengraphResolver()

* InstagramResolver()

I have no plans to implement a 9gag resolver.

Need to implement better caching. The future plan is to include a configurable cache method so that images seen across sessions can be cached for better performance.


Expand Down
27 changes: 25 additions & 2 deletions bin/resolveimg.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
import imageresolver
import logging
import time
from optparse import OptionParser

logger = logging.getLogger('ImageResolver')
Expand All @@ -14,10 +15,16 @@
opts.add_option("-r","--max-read", dest="max_read",help="Set the max read size")
opts.add_option("-c","--chunk-size",dest="chunk_size",help="Chunk size to read on each pass")
opts.add_option("-a","--read-all",dest="read_all",help="Read the entire image before checking size. Useful for some JPGs. Overrides --max-read")
opts.add_option("-b","--adblock", action="store_true",dest="use_adblock_filters",help="Use adblock filters.")
opts.add_option("-s","--no-ruleset", action="store_true",dest="use_js_ruleset",help="Use a custom ruleset for scoring.")
opts.add_option("--benchmark", action="store_true",dest="benchmark",help="Benchmark the total time it takes for the script to return an image")
opts.add_option("-n","--no-load-images", action="store_true",dest="load_images",help="Do not load images")
opts.add_option("-p","--parser", dest="parser",help="Choose a parser to use")

(options,args) = opts.parse_args()

kw_options = {}

if options.read_all:
kw_options['read_all'] = True
elif options.max_read:
Expand All @@ -26,6 +33,16 @@
if options.chunk_size:
kw_options['chunk_size'] = int(options.chunk_size)

if options.use_js_ruleset:
kw_options['use_js_ruleset'] = False

if options.parser:
kw_options['parser'] = options.parser

if options.load_images:
kw_options['load_images'] = False

kw_options['use_adblock_filters'] = options.use_adblock_filters
kw_options['debug'] = options.debug

try:
Expand All @@ -39,10 +56,16 @@
print "URL required. Please use the url option or pass a url as the first argument"
sys.exit(-1)


if options.benchmark:
t1 = time.time()

i = imageresolver.ImageResolver(**kw_options)
i.register(imageresolver.FileExtensionResolver())
i.register(imageresolver.ImgurPageResolver())
i.register(imageresolver.WebpageResolver(load_images=True, parser='lxml'))
i.register(imageresolver.WebpageResolver(**kw_options))

print i.resolve(url)

if options.benchmark:
print 'TOTAL TIME', time.time() - t1

20 changes: 0 additions & 20 deletions docs/README.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ USAGE
try:
i = imageresolver.ImageResolver()
i.register(imageresolver.FileExtensionResolver())
i.register(imageresolver.ImgurPageResolver())
i.register(imageresolver.WebpageResolver(load_images=True, parser='lxml',blacklist='easylist.txt'))
url = sys.argv[1]

Expand Down Expand Up @@ -84,13 +83,6 @@ FileExtensionResolver() METHODS

Returns the url if the extension matches a possible image

ImgurPageResolver() METHODS
---------------------------

**resolve** *(string url)*

Returns an Imgur image url if `url` matches the pattern of an Imgur page

WebpageResolver() METHODS
-------------------------

Expand Down Expand Up @@ -137,18 +129,6 @@ By default this exception is skipped and logged but can be enabled with "skip_fe
TODO
-----------------

Still missing the following resolvers:

* ImgurAlbumResolver()

* FlickrResolver()

* OpengraphResolver()

* InstagramResolver()

I have no plans to implement a 9gag resolver.

Need to implement better caching. The future plan is to include a configurable cache method so that images seen across sessions can be cached for better performance.


Expand Down
44 changes: 28 additions & 16 deletions imageresolver/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,23 +149,11 @@ def resolve(self,url,**kwargs):

return None

class ImgurPageResolver(object):
    """Guess a direct ``i.imgur.com`` image URL from an Imgur page URL.

    Works a little differently than the JS version: only the last path
    component of the URL is used, so gallery prefixes are dropped.
    Could be buggy!
    """

    def resolve(self, url, **kwargs):
        """Return the guessed image URL for ``url``, or ``None``.

        The URL is accepted when its network location contains
        ``imgur.com`` (optionally followed by ``:80``) and its path ends in
        a non-empty component.  The result is always
        ``http://i.imgur.com/<last path component>.jpg``.  ``kwargs`` is
        accepted but not used.
        """
        logger.debug('Resolving using Imgur ' + str(url))
        pieces = urlparse(url)
        image_id = os.path.basename(pieces.path)

        # Guard clause: bail out unless this is an Imgur host with an id.
        if not re.search('imgur.com(:80)*', pieces.netloc) or not image_id:
            return None

        return 'http://i.imgur.com/' + image_id + '.jpg'

class WebpageResolver(object):
def __init__(self,**kwargs):
self.load_images = kwargs.get('load_images',False)
self.use_js_ruleset = kwargs.get('use_js_ruleset',False)
self.use_adblock_filters = kwargs.get('use_adblock_filters',True)
self.load_images = kwargs.get('load_images',True)
self.use_js_ruleset = kwargs.get('use_js_ruleset',True)
self.use_adblock_filters = kwargs.get('use_adblock_filters',False)
self.significant_surface = kwargs.get('significant_surface', 100*100)

cwd = os.path.dirname(__file__)
Expand Down Expand Up @@ -218,6 +206,7 @@ def _score(self,image):
{'pattern':'1x1','score':-1},
{'pattern':'pixel','score':-1},
{'pattern':'ads','score':-1},
{'pattern':'transparent','score':-1}
]

for r in rules:
Expand Down Expand Up @@ -254,12 +243,35 @@ def _score(self,image):

return score

def plugin_resolve(self, url, soup, **kwargs):
    """Ask every plugin in the ``plugins`` directory for an image.

    Each ``*.py`` file (except ``__init__.py``) found in the ``plugins``
    directory next to this module is imported by its bare file name and
    its ``Plugin`` class is instantiated.  The plugins are then queried in
    alphabetical order of their file names and the first truthy result of
    ``plugin.get_image(url, soup)`` is returned.

    :param url: URL of the page being resolved.
    :param soup: parsed document handed to each plugin unchanged.
    :param kwargs: accepted for interface compatibility; not used.
    :returns: the first image reported by a plugin, or ``None``.
    :raises: whatever importing or instantiating a plugin raises; the
        temporary ``sys.path`` entry is removed even in that case.
    """
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'plugins')

    plugins = []
    # The plugins directory is only put on sys.path while importing.
    sys.path.insert(0, path)
    try:
        # sorted() makes plugin priority deterministic instead of relying
        # on directory listing / dict ordering.
        for plugin_file in sorted(os.listdir(path)):
            filename, extension = os.path.splitext(plugin_file)
            if extension == '.py' and filename != '__init__':
                mod = __import__(filename)
                plugins.append(mod.Plugin())
    finally:
        # Remove exactly the entry added above, even if an import failed
        # or a plugin touched sys.path itself.
        try:
            sys.path.remove(path)
        except ValueError:
            pass

    for plugin in plugins:
        image = plugin.get_image(url, soup)
        if image:
            return image
    return None


def resolve(self,url,**kwargs):
logger.debug('Resolving as a webpage ' + str(url))

ir = ImageResolver()
content = ir.fetch(url)
soup = BeautifulSoup(content,self.parser)

plugin_image = self.plugin_resolve(url,soup)

if plugin_image:
return plugin_image

images = soup.find_all('img')

candidates = []
Expand Down
2 changes: 1 addition & 1 deletion imageresolver/abpy
Submodule abpy updated from a8ff33 to 617747
Empty file.
15 changes: 15 additions & 0 deletions imageresolver/plugins/flickr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import re
import os
import requests
import logging
from bs4 import BeautifulSoup

class Plugin:
    """Image resolver plugin for Flickr photo pages."""

    def get_image(self, url, soup):
        """Return the main photo URL of a Flickr photo page, or ``None``.

        :param url: page URL; only URLs of the form
            ``http(s)://www.flickr.com/photos/<a>/<b>/...`` are handled.
        :param soup: parsed page; must offer ``find(name, attrs)``.
        :returns: the ``src`` of the ``img`` tag with class ``main-photo``.
            A protocol-relative ``src`` (``//host/...``) is prefixed with
            ``https:``; any other value is returned unchanged.  ``None``
            when the URL does not match, the tag is absent or it has no
            ``src``.
        """
        # Raw string with the dots escaped so only the real host matches.
        if not re.search(r'https?://www\.flickr\.com/photos/([^/]*)/([^/]*)/(.*)', url):
            return None

        logger = logging.getLogger('ImageResolver')
        logger.debug('Resolving using plugin %s %s', os.path.basename(__file__), url)

        tag = soup.find('img', {'class': 'main-photo'})
        if tag is None:
            return None

        src = tag.get('src')
        if not src:
            return None

        # Only protocol-relative URLs need a scheme; blindly prefixing an
        # absolute URL would yield "https:https://...".
        if src.startswith('//'):
            return 'https:' + src
        return src
33 changes: 33 additions & 0 deletions imageresolver/plugins/imgur.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import re
import os
import requests
from bs4 import BeautifulSoup
from urlparse import urlparse
import logging

class Plugin:
    """Image resolver plugin for Imgur pages, galleries and albums."""

    def get_image(self, url, soup):
        """Return a direct image URL for an Imgur ``url``, or ``None``.

        Three kinds of URL are distinguished by their path:

        * ``/gallery/...`` - the first ``i.imgur.com/...`` reference found
          inside the ``div`` with id ``1`` and class ``album-image``.
        * ``/a/...`` - the ``content`` of the ``twitter:image0:src`` meta
          tag.
        * anything else - ``http://i.imgur.com/<last path component>``,
          with ``.jpg`` appended when that component has no extension.

        :param url: page URL.
        :param soup: parsed page; must offer ``find(name, attrs)``.
        :returns: image URL string, or ``None`` when nothing was found or
            the URL is not an Imgur URL.
        """
        # NOTE(review): the indentation of the original was lost; the final
        # "else" is taken to be the third arm of the gallery/album chain.
        if not re.search(r'https?://(i\.|m\.)*imgur\.com/(gallery/)?(.*)', url):
            return None

        logger = logging.getLogger('ImageResolver')
        logger.debug('Resolving using plugin %s %s', os.path.basename(__file__), url)
        parsed = urlparse(url)

        if parsed.path[1:8] == 'gallery':
            logger.debug('Detected imgur gallery.')
            tag = soup.find('div', {'id': '1', 'class': 'album-image'})
            image = re.findall(r'i\.imgur\.com/.*\.\w+', str(tag))
            if image:
                return 'http://' + image[0]

        elif parsed.path[0:3] == '/a/':
            logger.debug('Detected imgur album.')
            tag = soup.find('meta', {'name': 'twitter:image0:src'})
            if tag is not None and tag.get('content'):
                return tag['content']

        else:
            image_id = os.path.basename(parsed.path)
            # The first regex is unanchored, so re-check the real host.
            if image_id and re.search(r'imgur\.com(:80)*', parsed.netloc):
                # A direct image link already carries its extension;
                # appending ".jpg" would produce e.g. "abc.png.jpg".
                if os.path.splitext(image_id)[1]:
                    return 'http://i.imgur.com/' + image_id
                return 'http://i.imgur.com/' + image_id + '.jpg'

        return None

54 changes: 54 additions & 0 deletions imageresolver/plugins/opengraph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import re
import os
import logging
from bs4 import BeautifulSoup
from operator import itemgetter

class Plugin:
    """Image resolver plugin based on Open Graph style ``meta`` tags."""

    def get_image(self, url, soup):
        """Return the best image advertised in the page's meta tags.

        Every ``meta`` tag matching one of the known attribute/name pairs
        contributes one candidate.  With a single candidate it is used
        directly; with several, a candidate gains one point when its URL
        contains ``large`` or ``big`` (case-insensitive) and one point when
        it came from a twitter tag, and the highest score wins (ties keep
        discovery order).

        :param url: page URL, used for logging only.
        :param soup: parsed page; must offer ``find_all(name, attrs)``.
        :returns: the chosen URL when it starts with ``http:``/``https:``,
            ``'http:' + url`` when it is protocol-relative, else ``None``.
        """
        # NOTE(review): "image_src" is conventionally a <link> element, but
        # only <meta> tags are searched here, as in the original - confirm.
        ogtags = [{'type':'facebook','attribute':'property', 'name':'og:image', 'value':'content'},
                  {'type':'facebook','attribute':'rel', 'name':'image_src', 'value':'href'},
                  {'type':'twitter','attribute':'name', 'name':'twitter:image', 'value':'value'},
                  {'type':'twitter','attribute':'name', 'name':'twitter:image', 'value':'content'},
                  {'type':'twitter','attribute':'property', 'name':'twitter:image', 'value':'content'},
                  {'type':'image','attribute':'itemprop', 'name':'image', 'value':'content'}]

        ogimages = []
        for ogtag in ogtags:
            for tag in soup.find_all('meta', {ogtag['attribute']: ogtag['name']}):
                # Handle each tag on its own: one tag lacking the value
                # attribute must not discard its siblings, and each tag is
                # recorded exactly once.
                try:
                    image_url = tag[ogtag['value']]
                except KeyError:
                    continue
                ogimages.append({'url': image_url, 'type': ogtag['type'], 'score': 0})

        if not ogimages:
            return None

        logger = logging.getLogger('ImageResolver')
        logger.debug('Resolving using plugin %s %s', os.path.basename(__file__), url)

        # If more than one image, score and pick the best one.
        if len(ogimages) > 1:
            for image in ogimages:
                if re.search('(large|big)', image['url'], re.IGNORECASE):
                    image['score'] += 1
                if image['type'] == 'twitter':
                    image['score'] += 1
            ogimages.sort(key=itemgetter('score'), reverse=True)

        resolved_image = ogimages[0]['url']

        if re.search('^https?:', resolved_image):
            return resolved_image
        if resolved_image.startswith('//'):
            return 'http:' + resolved_image
        # Other relative references are not resolved by this plugin.
        return None

21 changes: 21 additions & 0 deletions imageresolver/plugins/twitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import re
import os
import logging
from urlparse import urlparse
from bs4 import BeautifulSoup

class Plugin:
    """Image resolver plugin for Twitter status pages."""

    def get_image(self, url, soup):
        """Return the image attached to a Twitter status page, or ``None``.

        When the first label of the host name is ``mobile`` the ``src`` of
        the ``img`` tag with class ``CroppedPhoto-img u-block`` is used;
        otherwise the ``content`` of the ``og:image`` meta tag.

        :param url: page URL; must look like
            ``http(s)://[mobile.|m.]twitter.com/<name>/status/<digits>``.
        :param soup: parsed page; must offer ``find(name, attrs)``.
        :returns: image URL string, or ``None`` when the URL does not
            match, the tag is missing or it lacks the attribute.
        """
        # "A-z" in the original class also matched [ \ ] ^ and backtick;
        # letters, digits and underscore are spelled out instead.
        if not re.search(r'https?://(mobile\.|m\.)*twitter\.com/[A-Za-z0-9_]*/status/\d+', url):
            return None

        logger = logging.getLogger('ImageResolver')
        logger.debug('Resolving using plugin %s %s', os.path.basename(__file__), url)

        parsed = urlparse(url)
        if parsed.netloc.split('.')[0] == 'mobile':
            tag = soup.find('img', {'class': 'CroppedPhoto-img u-block'})
            attribute = 'src'
        else:
            tag = soup.find('meta', {'property': 'og:image'})
            attribute = 'content'

        if tag is None:
            return None
        # .get() avoids a KeyError when the tag lacks the attribute.
        return tag.get(attribute) or None
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
version=__version__,
author='Chris Brown',
author_email='chris.brown@nwyc.com',
packages=['imageresolver','imageresolver.abpy'],
data_files=[(os.path.join( get_python_lib(),'imageresolver','data'),[ os.path.join('imageresolver','data','whitelist.txt') , os.path.join('imageresolver','data','blacklist.txt')])],
packages=['imageresolver','imageresolver.abpy','imageresolver.plugins'],
package_data={'imageresolver': ['data/*.txt']},
scripts=['bin/resolveimg.py'],
url='https://github.com/constituentvoice/ImageResolverPython',
license='BSD',
Expand Down
Loading