first version of a simple bill id filter

commit 78fc6a926d382acde0c85242a288dd9ecc01c6b1 (1 parent: ae84657)
@h4ck3rm1k3 authored
Showing with 54 additions and 18 deletions.
  1. +40 −11 billy/bin/update.py
  2. +14 −1 billy/scrape/__init__.py
  3. +0 −6 billy/utils.py
51 billy/bin/update.py
@@ -1,4 +1,9 @@
#!/usr/bin/env python
+from billy.utils import configure_logging, term_for_session
+configure_logging("startup")  # needed before the command-line args are parsed
+import logging
+_log = logging.getLogger('billy')
+
import bson.binary
from bson.binary import ALL_UUID_SUBTYPES
from bson.binary import OLD_UUID_SUBTYPE
@@ -7,7 +12,7 @@
import sys
import json
import glob
-import logging
+
import inspect
import argparse
import traceback
@@ -19,7 +24,6 @@
# code snippet, to be included in 'sitecustomize.py'
import sys
-
from pymongo.errors import OperationFailure
def info(type, value, tb):
@@ -47,10 +51,10 @@ def info(type, value, tb):
from billy.conf import settings, base_arg_parser
from billy.scrape import (ScrapeError, JSONDateEncoder, get_scraper,
                          check_sessions)
-from billy.utils import configure_logging, term_for_session
+
from billy.scrape.validator import DatetimeValidator
-_log = logging.getLogger('billy')
+
def _clear_scraped_data(output_dir, scraper_type=''):
# make or clear directory for this type
@@ -118,13 +122,18 @@ def _run_scraper(scraper_type, options, metadata):
    for time in times:
        scraper.validate_term(time, scraper.latest_only)
+    # apply the optional single-bill filter before running the scraper
+    if options.billid is not False:
+        scraper.set_filter_bill_id(options.billid)
+
    # run scraper against year/session/term
    for time in times:
        # old style
        if _is_old_scrape(scraper.scrape):
            for chamber in options.chambers:
                scraper.scrape(chamber, time)
-        else:
+        else:
            scraper.scrape(time, chambers=options.chambers)
    if scraper_type == 'events' and len(options.chambers) == 2:
@@ -173,7 +182,7 @@ def _do_imports(abbrev, args):
dist['_id'] = '%(abbr)s-%(chamber)s-%(name)s' % dist
dist['boundary_id'] = dist['boundary_id'] % dist
dist['num_seats'] = int(dist['num_seats'])
- _log.debug(dist)
+# _log.debug(dist)
db.districts.save(dist, safe=True)
else:
_log.warning("%s not found, continuing without "
@@ -242,21 +251,31 @@ def main(old_scrape_compat=False):
    for arg in ('upper', 'lower'):
        what.add_argument('--' + arg, action='append_const',
                          dest='chambers', const=arg)
+
    for arg in ('bills', 'legislators', 'committees', 'votes', 'events'):
        what.add_argument('--' + arg, action='append_const', dest='types',
                          const=arg)
+
    for arg in ('scrape', 'import', 'report'):
        parser.add_argument('--' + arg, dest='actions',
                            action="append_const", const=arg,
                            help='only run %s step' % arg)
+
    # special modes for debugging
    scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                        default=True, help="don't fail immediately when"
                        " encountering validation warning")
+
    scrape.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
+
+    scrape.add_argument('--billid', help="scrape only a single bill",
+                        action="store", default=False)
+
    # scrapelib overrides
    scrape.add_argument('-r', '--rpm', action='store', type=int,
                        dest='SCRAPELIB_RPM')
@@ -339,9 +358,13 @@ def main(old_scrape_compat=False):
terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
','.join(args.sessions), ','.join(args.terms))
_log.info(plan)
-
scrape_data = {}
+ if args.billid is False :
+ _log.debug("No billid filter.")
+ else:
+ _log.debug("Search for billid: %s" % args.billid)
+
if 'scrape' in args.actions:
_clear_scraped_data(args.output_dir)
@@ -387,17 +410,23 @@ def main(old_scrape_compat=False):
    exec_start = dt.datetime.utcnow()
    # scraper order matters
-    order = ('legislators', 'committees', 'votes', 'bills', 'events')
+    if args.billid is False:
+        order = ('legislators', 'committees', 'votes', 'bills', 'events')
+    else:
+        _log.debug("going to process bills")
+        order = ('bills',)  # only process the bills
+
    _traceback = None
    try:
        for stype in order:
+            _log.debug("considering %s" % stype)
            if stype in args.types:
-
+                _log.debug("processing %s" % stype)
                scraper_results = _run_scraper(stype, args, metadata)
-                _log.debug(scraper_results)
-
                run_record += scraper_results
+            else:
+                _log.debug("skipping %s" % stype)
    except Exception as e:
        _traceback = _, _, exc_traceback = sys.exc_info()
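
Taken together, the update.py changes add a --billid option that defaults to False, hand any supplied value to the scraper via set_filter_bill_id(), and collapse the scrape order to bills only when a filter is present. A minimal, self-contained sketch of that control flow (the 'HB 2' value is only an illustrative example, not part of the commit):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--billid', help="scrape only a single bill",
                        action="store", default=False)
    args = parser.parse_args(['--billid', 'HB 2'])   # example value

    # same decision update.py now makes: a filter restricts the run to bills
    if args.billid is False:
        order = ('legislators', 'committees', 'votes', 'bills', 'events')
    else:
        order = ('bills',)

    print(order)   # -> ('bills',)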
15 billy/scrape/__init__.py
@@ -76,6 +76,19 @@ class Scraper(scrapelib.Scraper):
    latest_only = False
+
+    def set_filter_bill_id(self, billid):
+        """Filter only this bill id."""
+        self.filter_bill_id = billid
+
+    def get_filter_bill_id(self):
+        """Return the bill id filter (False when no filter is set)."""
+        return self.filter_bill_id
+
    def __init__(self, metadata, output_dir=None, strict_validation=None,
                 fastmode=False, **kwargs):
        """
@@ -98,7 +111,7 @@ def __init__(self, metadata, output_dir=None, strict_validation=None,
        kwargs['cache_write_only'] = False
        super(Scraper, self).__init__(**kwargs)
-
+        self.filter_bill_id = False
        self.metadata = metadata
        self.output_dir = output_dir
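
The commit stores the filter on the Scraper base class but does not show any scraper consuming it yet. A hypothetical sketch of how a bill scraper could honour the filter (ExampleBillScraper, list_bill_ids and scrape_bill are illustrative names, not billy APIs):

    from billy.scrape import Scraper

    class ExampleBillScraper(Scraper):
        def scrape(self, session, chambers):
            wanted = self.get_filter_bill_id()
            for bill_id in self.list_bill_ids(session):     # hypothetical helper
                if wanted is not False and bill_id != wanted:
                    continue                                 # skip bills outside the filter
                self.scrape_bill(session, bill_id)           # hypothetical helper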
6 billy/utils.py
@@ -77,12 +77,6 @@ def extract_fields(d, fields, delimiter='|'):
def configure_logging(module=None):
-#logging.basicConfig(level=logging.DEBUG)
-#h = logging.StreamHandler()
-#f = logging.Formatter("%(levelname)s %(asctime)s %(funcName)s %(lineno)d %(message)s")
-#h.setFormatter(f)
-#x.addHandler(h)
-
    if module:
        format = ("BILLY:%(pathname)s %(asctime)s %(name)s %(levelname)s " + module + " %(funcName)s %(lineno)d %(message)s")
    else:
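
The configure_logging("startup") call added at the top of update.py relies on this function. Its full body is not shown in the diff; based on the format string above, it presumably amounts to something like the following sketch (the basicConfig call and the log level are assumptions, not confirmed by the diff):

    import logging

    def configure_logging(module=None):
        if module:
            fmt = ("BILLY:%(pathname)s %(asctime)s %(name)s %(levelname)s "
                   + module + " %(funcName)s %(lineno)d %(message)s")
        else:
            fmt = ("BILLY:%(pathname)s %(asctime)s %(name)s %(levelname)s "
                   "%(funcName)s %(lineno)d %(message)s")
        logging.basicConfig(level=logging.INFO, format=fmt)   # level is an assumption

    configure_logging("startup")
    logging.getLogger('billy').info("logging configured")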