malcrawler.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import sys
import time
import codecs
import gevent
import logging
import urlnorm
import datetime
import urllib
import urlparse
import requests
import tldextract
from gsb import client
from pprint import pprint
from gsb import datastore
from bs4 import BeautifulSoup
from spam.surbl import SurblChecker
from spam.spamhaus import SpamHausChecker
# Unicode fixup
UTF8Writer = codecs.getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
urlsseen = set()
urlschecked = dict()
cookiejar = None
ds = None
sbc = None
safebrowse_apikey = 'YourAPIKeyHere'
debug = False
want_safebrowse = True
want_spamhaus = False
def RateLimited(maxPerSecond):
"""
Decorator for rate limiting
"""
minInterval = 1.0 / float(maxPerSecond)
def decorate(func):
lastTimeCalled = [0.0]
def rateLimitedFunction(*args,**kargs):
elapsed = time.clock() - lastTimeCalled[0]
leftToWait = minInterval - elapsed
if leftToWait>0:
time.sleep(leftToWait)
ret = func(*args,**kargs)
lastTimeCalled[0] = time.clock()
return ret
return rateLimitedFunction
return decorate
def safebrowse_init(apikey, storename):
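    """Initialise the Google Safe Browsing datastore and client (globals ds and sbc)."""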
global ds, sbc
chunk_range_str = None
num_expressions = None
num_addchunks = None
num_subchunks = None
ds = datastore.DataStore(storename)
sbc = client.Client(ds,
apikey=apikey,
use_mac=True)
def find_url(txt):
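    """Return a list of absolute http(s) URLs found in a block of text."""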
    urlfinder = re.compile(  # adapted from django's URL validator
        r'https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:[/?]\S*)?', re.IGNORECASE)  # optional path/query; no ^/$ anchors so URLs embedded in text still match
    # without capturing groups, findall() already returns whole matches
    urllist = urlfinder.findall(txt)
    return urllist
def fix_urls(urls, hostinfo):
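    """Turn raw link targets (relative links, protocol-relative links, fragments) into absolute URLs."""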
    ret = []
    for url in urls:
        if not url:
            continue
        # skip pseudo-links we cannot fetch
        if url.startswith('javascript:'):
            continue
        if not urlparse.urlparse(url).scheme:
            if not url.startswith('//'):
                # resolve relative links against the page that referenced them
                url = url.encode('utf8', 'ignore')
                url = urlnorm.norm(urlparse.urljoin(hostinfo['fullurl'], url))
            else:
                # protocol-relative link: borrow the page's scheme
                url = hostinfo['scheme'] + ':' + url
        # strip empty trailing fragments
        if url.endswith('#'):
            url = url[:-1]
        #print "fixed up url on %s: %s" % (hostinfo['hostname'], url)
        ret.append(url)
    return ret
def get_domain(url):
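    """Return the registered domain (domain + TLD) for a URL."""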
domain = tldextract.extract(url)
result = domain.domain + "." + domain.tld
return result
def check_surbl(url):
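    """Check a URL's domain against SURBL, caching the verdict per domain."""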
global urlschecked
domain = get_domain(url)
    # only handle http(s) links
    if url.startswith('http'):
        # short circuit (caching is good!)
        if "surbl-" + domain in urlschecked:
            return urlschecked["surbl-" + domain]
checker = SurblChecker()
try:
ret = checker.is_spam(url)
except IndexError as e:
print "Whoops, trying again later."
return False
urlschecked["surbl-" + domain] = ret
return ret
else:
return False
def check_spamhaus(url):
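    """Check a URL's domain against Spamhaus, caching the verdict per domain (no-op unless want_spamhaus is set)."""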
global urlschecked, want_spamhaus
domain = get_domain(url)
if not want_spamhaus:
return False
    # only handle http(s) links
    if url.startswith('http'):
        # short circuit (caching is good!)
        if "sh-" + domain in urlschecked:
            return urlschecked["sh-" + domain]
checker = SpamHausChecker()
try:
ret = checker.is_spam(url)
except Exception as e:
print "Whoops, trying again later: %s" % e
return False
urlschecked["sh-" + domain] = ret
return ret
else:
return False
def check_safebrowse(url):
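    """Check a URL's domain against Google Safe Browsing; returns False or a string describing the matching lists (cached per domain)."""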
global urlschecked, want_safebrowse, cookiejar, sbc
ret = False
if not want_safebrowse:
return False
    # javascript: pseudo-links cannot be looked up
    if url.startswith('javascript:'):
        return False
    try:
        url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]").encode('utf-8')
        url = get_domain(url)
        # short circuit (caching is good!)
        if 'sb-' + url in urlschecked:
            return urlschecked['sb-' + url]
## Lookup API (slow!)
# checkurl = "https://sb-ssl.google.com/safebrowsing/api/lookup?client=firefox&apikey=%s&appver=1.5.2&pver=3.0" % safebrowse_apikey
# payload = {'1': url}
# ret = requests.post(checkurl, data=payload)
matches = sbc.CheckUrl(url, debug_info=True)
if len(matches) == 0:
ret = False
else:
for listname, match, addchunknum in matches:
if ret:
ret += '%s: addchunk number: %d: %s\n' % (listname, addchunknum, match)
else:
ret = '%s: addchunk number: %d: %s\n' % (listname, addchunknum, match)
except Exception as ex:
print "SBC: Skipped this url: %s\nReason: %s" % (url, ex)
ret = False
urlschecked['sb-' + url] = ret
return ret
def extract_urls(r, hostinfo):
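    """Extract link targets from a fetched response, run each through the blocklist checks, and return the new URLs."""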
global urlsseen
    # Make sure r actually contains something, otherwise
    # we would throw exceptions below
    if r is None:
        return []
urls = []
    # check mime type and act accordingly; a missing content-type header should not crash us
    content_type = r.headers.get('content-type', '')
    if content_type.startswith('text/html'):
soup = BeautifulSoup(r.content)
urls = [link.get('src') for link in soup.find_all('script')]
urls += [link.get('href') for link in soup.find_all('a')]
urls += [link.get('src') for link in soup.find_all('iframe')]
urls += [link.get('href') for link in soup.find_all('link')]
urls += [link.get('url') for link in soup.find_all('applet')]
urls += [link.get('data') for link in soup.find_all('object')]
print "Found %d references in markup" % len(urls)
    elif content_type.startswith('application/javascript'):
# just look for stuff that looks like a URI
urls = find_url(r.text)
pprint(urls)
    elif content_type.startswith('text/plain'):
# just look for stuff that looks like a URI
urls = find_url(r.text)
pprint(urls)
else:
# anything else?
return []
if urls:
# fix up b0rked urls (e.g. relative links)
urls = fix_urls(urls, hostinfo)
        # preventively strip out urls we have already seen
        # (rebuild the list; removing while iterating would skip entries)
        urls = [url for url in urls if url not in urlsseen]
        for url in urls:
            if check_surbl(url):
                print "Malicious domain found on %s:\n\t %s" % (hostinfo['fullurl'], url)
                # use a context manager so the file is actually closed
                # (the bare f.close without parentheses was a no-op)
                with open('assets.txt', 'a') as f:
                    f.write('SURBL :' + str(hostinfo['fullurl']) + '\t=>\t' + url + '\n')
            if check_spamhaus(url):
                print "Spamhaus domain found on %s:\n\t %s" % (hostinfo['fullurl'], url)
                with open('assets.txt', 'a') as f:
                    f.write('SPAMHAUS:' + str(hostinfo['fullurl']) + '\t=>\t' + url + '\n')
            ret = check_safebrowse(url)
            if ret:
                print "SAFEBROWSE: %s -> %s" % (hostinfo['fullurl'], ret)
                with open('assets.txt', 'a') as f:
                    f.write('SAFEBROWSE: %s -> %s\n' % (hostinfo['fullurl'], ret))
print "Saw %d new links on this page." % len(urls)
return urls
else:
return []
def print_url(r, *args, **kwargs):
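    """requests response hook: record every fetched URL in the urlsseen set."""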
global urlsseen
    if r is None:
        return
urlsseen.add(r.url)
def recurse_url(urls, domain):
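    """Crawl outward from the given start urls, staying on the start domain and checking every discovered link."""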
global urlsseen, cookiejar
domain = get_domain(domain)
while True:
if len(urls) == 0:
return
        # prune urls we have already visited (rebuild the list rather than
        # removing elements while iterating, which would skip entries)
        urls = [url for url in urls if url not in urlsseen]
print "urls contains %d elements" % len(urls)
# remove None values from urls
urls = [x for x in urls if x is not None]
hooks = {'response': print_url}
rs = []
urlindex = 0
for url in urls:
            # don't investigate a link if we have already seen it
            # (leave urls alone here; mutating it mid-iteration skips elements)
            if url in urlsseen:
                #print "Not fetching %s. (%d in cache, %d pending)" % (url, len(urlsseen), len(urls))
                continue
            urlsseen.add(url)
if get_domain(url) != domain:
#print "%s != %s, not fetching" % (get_domain(url), domain)
continue
if url.startswith('javascript:'):
continue
if url.startswith('mailto:'):
continue
if url:
url_lists = []
print "Fetching %s. (%d in cache, %d pending)" % (url, len(urlsseen), len(urls))
headers = { # Let's pretend we're internet explorer, because we can
'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
}
try:
response = requests.get(url, hooks=hooks, headers=headers, cookies=cookiejar)
except Exception as ex:
print "Whoops... %s" % ex
                    continue # skip URLs we failed to fetch
cookiejar = response.cookies
pprint(cookiejar.get_dict())
                parsed = urlparse.urlparse(url)
                hostinfo = {'hostname': parsed.hostname.encode('utf8'),
                            'scheme': parsed.scheme.encode('utf8'),
                            'fullurl': url.encode('utf8')}
items = extract_urls(response, hostinfo)
url_lists.append(items)
url_lists = [x for x in url_lists if x is not None]
urls += sum(url_lists, []) # flatten
urlindex += 1
def main():
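    """Read start URLs from the file named on the command line and crawl each of them."""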
global debug, safebrowse_apikey
if debug:
logging.basicConfig(level=logging.DEBUG)
if want_safebrowse:
print "Checking datastore for SBC"
safebrowse_init(safebrowse_apikey, 'sbcstore')
if len(sys.argv) < 2:
sys.exit('Need list of urls to crawl')
urllist = []
for line in open(sys.argv[1]):
url = line.strip()
if not url.startswith('http'):
url = 'http://' + url
print "added %s" % url
urllist.append(url)
for url in urllist:
recurse_url([url], url)
if __name__ == '__main__':
sys.exit(main())