This repository has been archived by the owner on Jul 22, 2023. It is now read-only.
forked from Cairnarvon/progscrape
/
postcount.py
executable file
·158 lines (122 loc) · 4.22 KB
/
postcount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/python
"""
This script examines subject.txt and figures out how many posts or threads or
both there should be in total based on that.
If you're using progscrape with the HTML interface or the JSON interface with
--no-aborn, the difference between the number this script finds and the number
of posts in your database should be equal to the number of deleted posts.
If you're using the JSON interface with --aborn (default), the two numbers should
be the same (and deleted posts will show up as SILENT!ABORN &c. as per the
README).
You don't need to run this script yourself, but it's a nice first step to
verify database integrity.
"""
# Default settings; each can be overridden from the command line.

# Scheme and host of the server to query (--base-url).
base_url = "http://dis.4chan.org"
# Board path, kept with a leading and a trailing slash (--board).
board = '/prog/'
mode = 0 # 0 = postcount, 1 = threadcount, 2 = both
# When True the result is printed as a sentence rather than bare numbers
# (--verbose / --no-verbose).
verbose = False
import gzip
import re
import sys
import urllib2
from getopt import GetoptError, getopt
from StringIO import StringIO
def usage():
    """Write the command-line help text to standard output.

    The defaults shown in the text are the current values of the module-level
    ``base_url``, ``board``, ``mode`` and ``verbose`` settings.  The strings
    embed ANSI escape sequences for terminal styling.
    """
    mode_name = ('posts',
                 'threads',
                 'both')[mode]
    verbosity = ('no', 'yes')[verbose]

    # One entry per output line; an empty string is a blank separator line.
    help_lines = [
        "\033[1mUSAGE\033[0m",
        "\t%s [\033[4mOPTION\033[0m...]" % sys.argv[0],
        "",
        "\033[1mOPTIONS\033[0m",
        "\t\033[1m--base-url\033[0m \033[4murl\033[0m",
        "\t\tSpecify base URL. (default: \033[7m%s\033[0m)" % base_url,
        "",
        "\t\033[1m--board\033[0m \033[4mboard\033[0m",
        "\t\tSpecify board to examine. (default: \033[7m%s\033[0m)" % board,
        "",
        "\t\033[1m--mode posts\033[0m|\033[1mthreads\033[0m|\033[1mboth\033[0m",
        "\t\tSpecify which we should count (default: %s)" % mode_name,
        "",
        "\t\033[1m--verbose\033[0m",
        "\t\033[1m--no-verbose\033[0m",
        "\t\tControl verbosity. (default: %s)" % verbosity,
        "",
        "\t\033[1m-h\033[0m",
        "\t\033[1m--help\033[0m",
        "\t\tDisplay this message and exit.",
        "",
    ]
    sys.stdout.write("\n".join(help_lines) + "\n")
# Parse command line arguments
try:
    optlist, args = getopt(sys.argv[1:], 'h', ('base-url=', 'board=',
                                               'mode=', 'verbose', 'no-verbose',
                                               'help'))
except GetoptError:
    # Only option-parsing failures are a usage error.  A bare ``except`` here
    # would also swallow KeyboardInterrupt, SystemExit and genuine bugs.
    print("Invalid argument!")
    usage()
    sys.exit(1)

# Accepted spellings of the --mode value, mapped to the numeric mode.
_MODE_NAMES = {'post': 0, 'posts': 0,
               'thread': 1, 'threads': 1,
               'both': 2}

for (opt, arg) in optlist:
    if opt == '--base-url':
        # endswith() rather than arg[-1]: an empty value must not raise
        # IndexError.  A single trailing slash is dropped because ``board``
        # always supplies the leading one.
        if arg.endswith('/'):
            arg = arg[:-1]
        base_url = arg
    elif opt == '--board':
        # Normalise to "/name/"; startswith/endswith are safe on "".
        if not arg.endswith('/'):
            arg += '/'
        if not arg.startswith('/'):
            arg = '/' + arg
        board = arg
    elif opt == '--mode':
        if arg not in _MODE_NAMES:
            sys.stderr.write("Invalid option: --mode=\033[1m%s\033[0m\n" % arg)
            sys.exit(1)
        mode = _MODE_NAMES[arg]
    elif opt == '--verbose':
        verbose = True
    elif opt == '--no-verbose':
        verbose = False
    elif opt in ('-h', '--help'):
        usage()
        sys.exit(0)

# Default to plain HTTP only when the URL carries no scheme at all, so an
# explicit "https://host" is no longer mangled into "http://https://host".
if '://' not in base_url:
    base_url = 'http://' + base_url
board_url = base_url + board
# Fetch subject.txt
# Ask for a gzip-compressed response; the server may or may not honour it.
_request = urllib2.Request(board_url + "subject.txt")
_request.add_header('Accept-Encoding', 'gzip')
_response = urllib2.build_opener().open(_request)
try:
    _is_gzipped = _response.headers.get('Content-Encoding') == 'gzip'
    _payload = _response.read()
finally:
    # Release the connection even if reading the body fails.
    _response.close()
if _is_gzipped:
    _payload = gzip.GzipFile(fileobj=StringIO(_payload)).read()
# Keep the line endings (keepends=True) on each line.
subjecttxt = _payload.splitlines(True)
# Counting functions
# One subject.txt line: "<>"-separated fields, with the post count captured as
# group 1.  The count must contain at least one digit (an empty capture would
# make int() raise), and the line ending is optional so that a final line
# without a newline, or a CRLF-terminated line, still matches.
_SUBJECT_LINE = re.compile(r"^.*<>.*?<>.*?<>-?\d*<>(\d+)<>.*?<>\d*\r?\n?$")


def postcount(lines=None):
    """Return the total number of posts listed in subject.txt.

    lines -- iterable of subject.txt lines (byte strings or text).  When
             omitted, the module-level ``subjecttxt`` list is used.

    Lines that do not match the expected layout are reported on stderr and
    contribute nothing to the total.
    """
    if lines is None:
        lines = subjecttxt
    posts = 0
    for raw in lines:
        # Byte strings are decoded as latin-1, which maps every byte to a
        # character and therefore cannot fail; text is used as-is.
        text = raw.decode('latin-1', 'ignore') if isinstance(raw, bytes) else raw
        m = _SUBJECT_LINE.search(text)
        if m is None:
            # Report the line exactly as received.
            sys.stderr.write("subject.txt fail: %s\n" % (raw,))
            continue
        posts += int(m.group(1))
    return posts
def threadcount(lines=None):
    """Return the number of threads, i.e. the number of subject.txt lines.

    lines -- sequence of subject.txt lines.  When omitted, the module-level
             ``subjecttxt`` list is used.
    """
    if lines is None:
        # Reading a module-level name needs no ``global`` declaration.
        lines = subjecttxt
    return len(lines)
# Show results
if verbose:
    # The lead-in is written before any counting starts, so it precedes
    # whatever the counting functions may report while they run.
    sys.stdout.write("%s on %s has" % (board, base_url))
    if mode == 0:
        parts = [postcount(), "posts."]
    elif mode == 1:
        parts = [threadcount(), "threads."]
    else:
        parts = [postcount(), "posts in", threadcount(), "threads."]
    sys.stdout.write(" " + " ".join(str(part) for part in parts) + "\n")
else:
    # Bare numbers only: posts, threads, or "posts threads".
    if mode == 0:
        parts = [postcount()]
    elif mode == 1:
        parts = [threadcount()]
    else:
        parts = [postcount(), threadcount()]
    sys.stdout.write(" ".join(str(part) for part in parts) + "\n")