Skip to content
This repository has been archived by the owner on Jul 22, 2023. It is now read-only.

Commit

Permalink
Added --dry-run.
Browse files (browse the repository at this point in the history)
Dry-run mode fetches subject.txt and calculates how many threads and posts would need to be fetched to bring the database up to date, but doesn't actually fetch them.
Incidentally, the (approximate) number of posts that will be fetched is now displayed prior to scraping in regular mode as well.
  • Loading branch information
Cairnarvon committed Oct 1, 2010
1 parent e16fda6 commit 34127e6
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 4 deletions.
8 changes: 7 additions & 1 deletion progscrape.1
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.TH progscrape 1 "July 2010"
.TH progscrape 1 "September 2010"
.SH NAME
progscrape \- \fBXarn\fR's Shiichan webscraper
.SH SYNOPSIS
Expand Down Expand Up @@ -53,6 +53,12 @@ Read a list of thread IDs on standard input and only scrape those (provided they
\fB\-\-threads\fR=\fITHREADS\fR
How many scraper threads to use. If this is set to \fBauto\fR, progscrape will try to determine a sensible number based on the number of threads it has to scrape. (default \fBauto\fR)
.TP
\fB\-\-dry\-run\fR
Calculate how many threads and posts would need to be fetched to bring the database up to date, but don't actually fetch the posts.
.TP
\fB\-\-no\-dry\-run\fR
Turn off dry-run mode. (default)
.TP
\fB\-h\fR, \fB\-\-help\fR
Display help message and exit.
.SH "REPORTING BUGS"
Expand Down
25 changes: 22 additions & 3 deletions progscrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
progress_bar = False
threads = -1

dry_run = False


# Make sure we're using a compatible version

Expand Down Expand Up @@ -81,6 +83,11 @@
print "\t\033[1m--threads\033[0m"
print "\t\tHow many scraper threads to use. (default: %s)" % ('auto' if threads == -1 else str(threads))
print
print "\t\033[1m--dry-run\033[0m"
print "\t\033[1m--no-dry-run\033[0m"
print "\t\tJust figure out how many threads would have to be retrieved,"
print "\t\tdon't actually retrieve them. (default: %s)" % ("no", "yes")[dry_run]
print
print "\t\033[1m--help\033[0m"
print "\t\033[1m-h\033[0m"
print "\t\tdisplay this message and exit"
Expand All @@ -94,6 +101,7 @@
'progress-bar', 'no-progress-bar',
'base-url=', 'port=', 'board=',
'partial', 'aborn', 'no-aborn',
'dry-run', 'no-dry-run',
'charset=', 'threads=', 'help'])
except:
print "Invalid argument! Use \033[1m--help\033[0m for help."
Expand Down Expand Up @@ -151,6 +159,10 @@
threads = 1
except ValueError:
print "Not a number: \033[1m%s\033[0m" % arg
elif opt == '--dry-run':
dry_run = True
elif opt == '--no-dry-run':
dry_run = False

if len(args) > 0:
db_name = args[0]
Expand Down Expand Up @@ -254,7 +266,7 @@ def urlopen(url, connection=None):
<>
(\d*) # Time of last post
\\n$""", re.VERBOSE)
to_update = []
to_update, tot_posts = [], 0

for line in subjecttxt.read().splitlines(True):
line = unicode(line, "latin-1")
Expand All @@ -272,11 +284,14 @@ def urlopen(url, connection=None):
db.execute('INSERT INTO threads VALUES (?, ?, ?)',
(data[3], data[0], 0))
to_update.append((data[3], data[6], 1))
tot_posts += int(data[4])

elif int(result[0]) < int(data[6]):
i = db.execute('select max(id) from posts where thread = ?',
(data[3],)).fetchone()
to_update.append((data[3], data[6], i[0] + 1 if i[0] else 1))
i = i[0] if i[0] else 0
to_update.append((data[3], data[6], i + 1))
tot_posts += int(data[4]) - i

except:
# Failed to parse line; skip it
Expand All @@ -303,7 +318,11 @@ def urlopen(url, connection=None):
if tot < threads:
threads = tot

print "%d threads to update." % tot
print "%d threads to update (approx. %d posts)." % (tot, tot_posts)

if dry_run:
print "Dry run; exiting."
sys.exit(0)

if threads < 1:
threads = min(tot, 1000) * 31 / 1000 + 1
Expand Down
1 change: 1 addition & 0 deletions progscrape.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ __progscrape()
--verify-trips --no-verify-trips \
--base-url --port --board --charset \
--aborn --no-aborn --partial --threads \
--dry-run --no-dry-run \
--help -h' -- $cur ) )
elif [ ! -z $(type -t _filedir) ]; then
_filedir db
Expand Down

0 comments on commit 34127e6

Please sign in to comment.