Skip to content

Commit

Permalink
pollers deal with spider queues too
Browse files Browse the repository at this point in the history
  • Loading branch information
dmclain committed Nov 16, 2012
1 parent c687904 commit 98a014a
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 48 deletions.
2 changes: 1 addition & 1 deletion scrapy_heroku/__init__.py
@@ -1,2 +1,2 @@


- __version__ = (0, 1, 0, 'dev', 0)
+ __version__ = (0, 1, 0)
25 changes: 6 additions & 19 deletions scrapy_heroku/app.py
@@ -1,5 +1,4 @@
from os import environ from os import environ
import urlparse


from twisted.application.service import Application from twisted.application.service import Application
from twisted.application.internet import TimerService, TCPServer from twisted.application.internet import TimerService, TCPServer
Expand All @@ -10,33 +9,21 @@
IEnvironment) IEnvironment)
from scrapyd.launcher import Launcher from scrapyd.launcher import Launcher
from scrapyd.eggstorage import FilesystemEggStorage from scrapyd.eggstorage import FilesystemEggStorage
from scrapyd.poller import QueuePoller
from scrapyd.environ import Environment from scrapyd.environ import Environment
from scrapyd.website import Root from scrapyd.website import Root


from .scheduler import Psycopg2SpiderScheduler from .scheduler import Psycopg2SpiderScheduler
from .poller import Psycopg2QueuePoller




def application(config): def application(config):
app = Application("Scrapyd") app = Application("Scrapyd")
http_port = environ.get('PORT', config.getint('http_port', 6800)) http_port = int(environ.get('PORT', config.getint('http_port', 6800)))
url = urlparse.urlparse(environ.get('DATABASE_URL')) config.cp.set('scrapyd', 'database_url', environ.get('DATABASE_URL'))


# Remove query strings. poller = Psycopg2QueuePoller(config)
path = url.path[1:]
path = path.split('?', 2)[0]

args = {
'dbname': path,
'user': url.username,
'password': url.password,
'host': url.hostname,
'port': url.port,
}

poller = QueuePoller(config)
eggstorage = FilesystemEggStorage(config) eggstorage = FilesystemEggStorage(config)
scheduler = Psycopg2SpiderScheduler(config, **args) scheduler = Psycopg2SpiderScheduler(config)
environment = Environment(config) environment = Environment(config)


app.setComponent(IPoller, poller) app.setComponent(IPoller, poller)
Expand Down
8 changes: 8 additions & 0 deletions scrapy_heroku/poller.py
@@ -0,0 +1,8 @@
from scrapyd.poller import QueuePoller

from .utils import get_spider_queues


class Psycopg2QueuePoller(QueuePoller):
    """Queue poller whose project queues come from this package's helper.

    The only behaviour overridden from ``QueuePoller`` is
    ``update_projects``: the queue mapping is rebuilt by calling
    ``get_spider_queues`` (imported from ``.utils``) with the poller's
    config instead of whatever the base class would use.
    """

    def update_projects(self):
        """Rebuild ``self.queues`` from the current configuration."""
        refreshed_queues = get_spider_queues(self.config)
        self.queues = refreshed_queues
11 changes: 3 additions & 8 deletions scrapy_heroku/scheduler.py
Expand Up @@ -2,16 +2,14 @@


from scrapyd.interfaces import ISpiderScheduler from scrapyd.interfaces import ISpiderScheduler


from .spiderqueue import Psycopg2SpiderQueue from .utils import get_spider_queues
from .utils import get_project_list




class Psycopg2SpiderScheduler(object): class Psycopg2SpiderScheduler(object):
implements(ISpiderScheduler) implements(ISpiderScheduler)


def __init__(self, config, **pg_args): def __init__(self, config):
self.config = config self.config = config
self.pg_args = pg_args
self.update_projects() self.update_projects()


def schedule(self, project, spider_name, **spider_args): def schedule(self, project, spider_name, **spider_args):
Expand All @@ -22,7 +20,4 @@ def list_projects(self):
return self.queues.keys() return self.queues.keys()


def update_projects(self): def update_projects(self):
self.queues = {} self.queues = get_spider_queues(self.config)
for project in get_project_list(self.config):
table = 'scrapy_%s_queue' % project
self.queues[project] = Psycopg2SpiderQueue(table, **self.pg_args)
21 changes: 17 additions & 4 deletions scrapy_heroku/spiderqueue.py
@@ -1,14 +1,27 @@
import psycopg2 import psycopg2
import cPickle import cPickle
import json import json
import urlparse
from zope.interface import implements from zope.interface import implements


from scrapyd.interfaces import ISpiderQueue from scrapyd.interfaces import ISpiderQueue




class Psycopg2PriorityQueue(object): class Psycopg2PriorityQueue(object):
def __init__(self, table='scrapy_queue', **kwargs): def __init__(self, config, table='scrapy_queue'):
conn_string = ' '.join('%s=%s' % item for item in kwargs.items()) url = urlparse.urlparse(config.get('database_url'))
# Remove query strings.
path = url.path[1:]
path = path.split('?', 2)[0]

args = {
'dbname': path,
'user': url.username,
'password': url.password,
'host': url.hostname,
'port': url.port,
}
conn_string = ' '.join('%s=%s' % item for item in args.items())
self.table = table self.table = table
self.conn = psycopg2.connect(conn_string) self.conn = psycopg2.connect(conn_string)
q = "create table if not exists %s " \ q = "create table if not exists %s " \
Expand Down Expand Up @@ -98,8 +111,8 @@ def decode(self, text):
class Psycopg2SpiderQueue(object): class Psycopg2SpiderQueue(object):
implements(ISpiderQueue) implements(ISpiderQueue)


def __init__(self, table='spider_queue', **kwargs): def __init__(self, config, table='spider_queue'):
self.q = JsonPsycopg2PriorityQueue(table, **kwargs) self.q = JsonPsycopg2PriorityQueue(config, table)


def add(self, name, **spider_args): def add(self, name, **spider_args):
d = spider_args.copy() d = spider_args.copy()
Expand Down
25 changes: 9 additions & 16 deletions scrapy_heroku/utils.py
@@ -1,18 +1,11 @@
import os from scrapyd.utils import get_project_list
from ConfigParser import NoSectionError


from spiderqueue import Psycopg2SpiderQueue


def get_project_list(config):
"""Get list of projects by inspecting the eggs dir and the ones defined in def get_spider_queues(config):
the scrapyd.conf [settings] section queues = {}
""" for project in get_project_list(config):
eggs_dir = config.get('eggs_dir', 'eggs') table = 'scrapy_%s_queue' % project
if os.path.exists(eggs_dir): queues[project] = Psycopg2SpiderQueue(config, table=table)
projects = os.listdir(eggs_dir) return queues
else:
projects = []
try:
projects += [x[0] for x in config.cp.items('settings')]
except NoSectionError:
pass
return projects

0 comments on commit 98a014a

Please sign in to comment.