malcrawler.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import sys
import time
import codecs
import gevent
import logging
import urlnorm
import datetime
import urllib
import urlparse
import requests
import tldextract
from gsb import client
from pprint import pprint
from gsb import datastore
from bs4 import BeautifulSoup
from spam.surbl import SurblChecker
from spam.spamhaus import SpamHausChecker
# Unicode fixup
UTF8Writer = codecs.getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
urlsseen = set()
urlschecked = dict()
cookiejar = None
ds = None
sbc = None
safebrowse_apikey = 'YourAPIKeyHere'
debug = False
want_safebrowse = True
want_spamhaus = False
def RateLimited(maxPerSecond):
"""
Decorator for rate limiting
"""
minInterval = 1.0 / float(maxPerSecond)
def decorate(func):
lastTimeCalled = [0.0]
def rateLimitedFunction(*args,**kargs):
elapsed = time.clock() - lastTimeCalled[0]
leftToWait = minInterval - elapsed
if leftToWait>0:
time.sleep(leftToWait)
ret = func(*args,**kargs)
lastTimeCalled[0] = time.clock()
return ret
return rateLimitedFunction
return decorate
def safebrowse_init(apikey, storename):
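    """Initialise the Google Safe Browsing datastore and client (globals ds and sbc)."""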
global ds, sbc
chunk_range_str = None
num_expressions = None
num_addchunks = None
num_subchunks = None
ds = datastore.DataStore(storename)
sbc = client.Client(ds,
apikey=apikey,
use_mac=True)
def find_url(txt):
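    """Return a list of absolute http(s) URLs found in a block of text."""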
    urlfinder = re.compile(  # adapted from django's URL validator
        r'https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:[/?]\S*)?', re.IGNORECASE)  # optional path/query; no ^/$ anchors so URLs embedded in text still match
    # without capturing groups, findall() already returns whole matches
    urllist = urlfinder.findall(txt)
    return urllist
def fix_urls(urls, hostinfo):
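    """Turn raw link targets (relative links, protocol-relative links, fragments) into absolute URLs."""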
    ret = []
    for url in urls:
        if not url:
            continue
        # skip pseudo-links we cannot fetch
        if url.startswith('javascript:'):
            continue
        if not urlparse.urlparse(url).scheme:
            if not url.startswith('//'):
                # resolve relative links against the page that referenced them
                url = url.encode('utf8', 'ignore')
                url = urlnorm.norm(urlparse.urljoin(hostinfo['fullurl'], url))
            else:
                # protocol-relative link: borrow the page's scheme
                url = hostinfo['scheme'] + ':' + url
        # strip empty trailing fragments
        if url.endswith('#'):
            url = url[:-1]
        #print "fixed up url on %s: %s" % (hostinfo['hostname'], url)
        ret.append(url)
    return ret
def get_domain(url):
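    """Return the registered domain (domain + TLD) for a URL."""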
domain = tldextract.extract(url)
result = domain.domain + "." + domain.tld
return result
def check_surbl(url):
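    """Check a URL's domain against SURBL, caching the verdict per domain."""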
global urlschecked
domain = get_domain(url)
    # only handle http(s) links
    if url.startswith('http'):
        # short circuit (caching is good!)
        if "surbl-" + domain in urlschecked:
            return urlschecked["surbl-" + domain]
checker = SurblChecker()
try:
ret = checker.is_spam(url)
except IndexError as e:
print "Whoops, trying again later."
return False
urlschecked["surbl-" + domain] = ret
return ret
else:
return False
def check_spamhaus(url):
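    """Check a URL's domain against Spamhaus, caching the verdict per domain (no-op unless want_spamhaus is set)."""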
global urlschecked, want_spamhaus
domain = get_domain(url)
if not want_spamhaus:
return False
    # only handle http(s) links
    if url.startswith('http'):
        # short circuit (caching is good!)
        if "sh-" + domain in urlschecked:
            return urlschecked["sh-" + domain]
checker = SpamHausChecker()
try:
ret = checker.is_spam(url)
except Exception as e:
print "Whoops, trying again later: %s" % e
return False
urlschecked["sh-" + domain] = ret
return ret
else:
return False
def check_safebrowse(url):
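    """Check a URL's domain against Google Safe Browsing; returns False or a string describing the matching lists (cached per domain)."""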
global urlschecked, want_safebrowse, cookiejar, sbc
ret = False
if not want_safebrowse:
return False
    # javascript: pseudo-links cannot be looked up
    if url.startswith('javascript:'):
        return False
    try:
        url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]").encode('utf-8')
        url = get_domain(url)
        # short circuit (caching is good!)
        if 'sb-' + url in urlschecked:
            return urlschecked['sb-' + url]
## Lookup API (slow!)
# checkurl = "https://sb-ssl.google.com/safebrowsing/api/lookup?client=firefox&apikey=%s&appver=1.5.2&pver=3.0" % safebrowse_apikey
# payload = {'1': url}
# ret = requests.post(checkurl, data=payload)
matches = sbc.CheckUrl(url, debug_info=True)
if len(matches) == 0:
ret = False
else:
for listname, match, addchunknum in matches:
if ret:
ret += '%s: addchunk number: %d: %s\n' % (listname, addchunknum, match)
else:
ret = '%s: addchunk number: %d: %s\n' % (listname, addchunknum, match)
except Exception as ex:
print "SBC: Skipped this url: %s\nReason: %s" % (url, ex)
ret = False
urlschecked['sb-' + url] = ret
return ret
def extract_urls(r, hostinfo):
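    """Extract link targets from a fetched response, run each through the blocklist checks, and return the new URLs."""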
global urlsseen
    # Make sure r actually contains something, otherwise
    # we would throw exceptions below
    if r is None:
        return []
urls = []
    # check mime type and act accordingly; a missing content-type header should not crash us
    content_type = r.headers.get('content-type', '')
    if content_type.startswith('text/html'):
soup = BeautifulSoup(r.content)
urls = [link.get('src') for link in soup.find_all('script')]
urls += [link.get('href') for link in soup.find_all('a')]
urls += [link.get('src') for link in soup.find_all('iframe')]
urls += [link.get('href') for link in soup.find_all('link')]
urls += [link.get('url') for link in soup.find_all('applet')]
urls += [link.get('data') for link in soup.find_all('object')]
print "Found %d references in markup" % len(urls)
    elif content_type.startswith('application/javascript'):
# just look for stuff that looks like a URI
urls = find_url(r.text)
pprint(urls)
    elif content_type.startswith('text/plain'):
# just look for stuff that looks like a URI
urls = find_url(r.text)
pprint(urls)
else:
# anything else?
return []
if urls:
# fix up b0rked urls (e.g. relative links)
urls = fix_urls(urls, hostinfo)
        # preventively strip out urls we have already seen
        # (rebuild the list; removing while iterating would skip entries)
        urls = [url for url in urls if url not in urlsseen]
        for url in urls:
            if check_surbl(url):
                print "Malicious domain found on %s:\n\t %s" % (hostinfo['fullurl'], url)
                # use a context manager so the file is actually closed
                # (the bare f.close without parentheses was a no-op)
                with open('assets.txt', 'a') as f:
                    f.write('SURBL :' + str(hostinfo['fullurl']) + '\t=>\t' + url + '\n')
            if check_spamhaus(url):
                print "Spamhaus domain found on %s:\n\t %s" % (hostinfo['fullurl'], url)
                with open('assets.txt', 'a') as f:
                    f.write('SPAMHAUS:' + str(hostinfo['fullurl']) + '\t=>\t' + url + '\n')
            ret = check_safebrowse(url)
            if ret:
                print "SAFEBROWSE: %s -> %s" % (hostinfo['fullurl'], ret)
                with open('assets.txt', 'a') as f:
                    f.write('SAFEBROWSE: %s -> %s\n' % (hostinfo['fullurl'], ret))
print "Saw %d new links on this page." % len(urls)
return urls
else:
return []
def print_url(r, *args, **kwargs):
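    """requests response hook: record every fetched URL in the urlsseen set."""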
global urlsseen
    if r is None:
        return
urlsseen.add(r.url)
def recurse_url(urls, domain):
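    """Crawl outward from the given start urls, staying on the start domain and checking every discovered link."""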
global urlsseen, cookiejar
domain = get_domain(domain)
while True:
if len(urls) == 0:
return
        # prune urls we have already visited (rebuild the list rather than
        # removing elements while iterating, which would skip entries)
        urls = [url for url in urls if url not in urlsseen]
print "urls contains %d elements" % len(urls)
# remove None values from urls
urls = [x for x in urls if x is not None]
hooks = {'response': print_url}
rs = []
urlindex = 0
for url in urls:
            # don't investigate a link if we have already seen it
            # (leave urls alone here; mutating it mid-iteration skips elements)
            if url in urlsseen:
                #print "Not fetching %s. (%d in cache, %d pending)" % (url, len(urlsseen), len(urls))
                continue
            urlsseen.add(url)
if get_domain(url) != domain:
#print "%s != %s, not fetching" % (get_domain(url), domain)
continue
if url.startswith('javascript:'):
continue
if url.startswith('mailto:'):
continue
if url:
url_lists = []
print "Fetching %s. (%d in cache, %d pending)" % (url, len(urlsseen), len(urls))
headers = { # Let's pretend we're internet explorer, because we can
'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
}
try:
response = requests.get(url, hooks=hooks, headers=headers, cookies=cookiejar)
except Exception as ex:
print "Whoops... %s" % ex
                    continue # skip URLs we failed to fetch
cookiejar = response.cookies
pprint(cookiejar.get_dict())
                parsed = urlparse.urlparse(url)
                hostinfo = {'hostname': parsed.hostname.encode('utf8'),
                            'scheme': parsed.scheme.encode('utf8'),
                            'fullurl': url.encode('utf8')}
items = extract_urls(response, hostinfo)
url_lists.append(items)
url_lists = [x for x in url_lists if x is not None]
urls += sum(url_lists, []) # flatten
urlindex += 1
def main():
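    """Read start URLs from the file named on the command line and crawl each of them."""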
global debug, safebrowse_apikey
if debug:
logging.basicConfig(level=logging.DEBUG)
if want_safebrowse:
print "Checking datastore for SBC"
safebrowse_init(safebrowse_apikey, 'sbcstore')
if len(sys.argv) < 2:
sys.exit('Need list of urls to crawl')
urllist = []
for line in open(sys.argv[1]):
url = line.strip()
if not url.startswith('http'):
url = 'http://' + url
print "added %s" % url
urllist.append(url)
for url in urllist:
recurse_url([url], url)
if __name__ == '__main__':
sys.exit(main())