In [1]:
# Widen the notebook container so wide DataFrame outputs stay readable.
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated in newer IPython releases.
from IPython.display import display, HTML, Markdown
display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:
%matplotlib inline

In [3]:
import re
import os
import sys
import json
import collections
import geoip2.database
import geoip2.errors
from cachetools import cached
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tldextract
import requests
import netaddr
import datetime

In [4]:
# Wall-clock start of the run; the last cell prints it next to the end time.
start_time = datetime.datetime.now()

In [5]:
# Google Cloud address ranges, published as JSON. Only records that carry an
# `ipv4Prefix` are kept. The list is fetched over HTTPS with a timeout, and an
# HTTP error raises here instead of surfacing later as a JSON/KeyError.
gcp_response = requests.get('https://www.gstatic.com/ipranges/cloud.json', timeout=60)
gcp_response.raise_for_status()
gcp_ips = netaddr.IPSet([
    rec['ipv4Prefix']
    for rec in gcp_response.json()['prefixes']
    if 'ipv4Prefix' in rec
])

In [6]:
# AWS address ranges, published as JSON. Only records that carry an
# `ip_prefix` are kept. A timeout and an explicit status check make a failed
# download raise here instead of surfacing later as a JSON/KeyError.
aws_response = requests.get('https://ip-ranges.amazonaws.com/ip-ranges.json', timeout=60)
aws_response.raise_for_status()
aws_ips = netaddr.IPSet([
    rec['ip_prefix']
    for rec in aws_response.json()['prefixes']
    if 'ip_prefix' in rec
])

In [7]:
# Azure ranges are read from a locally saved copy of the "Service Tags" JSON:
# https://www.microsoft.com/en-us/download/confirmation.aspx?id=56519
# The file is opened in a `with` block so the handle is closed deterministically.
with open('data/ServiceTags_Public_20200601.json') as service_tags_file:
    azure_service_tags = json.load(service_tags_file)

# Flatten every service tag's `addressPrefixes` list into one address set.
azure_ips = netaddr.IPSet([
    prefix
    for rec in azure_service_tags['values']
    for prefix in rec['properties']['addressPrefixes']
])

In [8]:
@cached(cache={})
def is_azure(ip):
    """Return True when `ip` is set and lies inside the Azure address set."""
    if ip is None:
        return False
    return ip in azure_ips

@cached(cache={})
def is_aws(ip):
    """Return True when `ip` is set and lies inside the AWS address set."""
    if ip is None:
        return False
    return ip in aws_ips

@cached(cache={})
def is_gcp(ip):
    """Return True when `ip` is set and lies inside the Google Cloud address set."""
    if ip is None:
        return False
    return ip in gcp_ips

In [9]:
# Matches one MX result line (case-insensitively, verbose syntax). Named groups,
# in order: the queried domain, the MX preference, the mail server host, three
# status fields (status word, numeric code, reason word) and a double-quoted
# message. The line must end with either a parenthesised blob, captured as
# `ip_resolutions`, or a bare "?" (in which case `ip_resolutions` is None).
# NOTE(review): the line layout presumably mirrors adnshost output; confirm.
PATTERN = re.compile(r'''
    ^(?P<domain>\S+)\s
    MX\s
    (?P<preference>\d+)\s
    (?P<mailserver>\S+)\s
    (?P<adns_status>\S+)\s
    (?P<adns_code>\d+)\s
    (?P<adns_reason>\S+)\s
    "(?P<fail_message>[^"]+)"\s
    (\(\s*(?P<ip_resolutions>.*?)\s*\)|\?)''',
    re.VERBOSE|re.IGNORECASE
)

# Matches one NS result line (case-insensitively, verbose syntax). Same layout
# as the MX pattern except that there is no preference field and the host group
# is named `nameserver`. The line must end with either a parenthesised blob,
# captured as `ip_resolutions`, or a bare "?" (`ip_resolutions` is then None).
NS_PATTERN = re.compile(r'''
    ^(?P<domain>\S+)\s
    NS\s
    (?P<nameserver>\S+)\s
    (?P<adns_status>\S+)\s
    (?P<adns_code>\d+)\s
    (?P<adns_reason>\S+)\s
    "(?P<fail_message>[^"]+)"\s
    (\(\s*(?P<ip_resolutions>.*?)\s*\)|\?)''',
    re.VERBOSE|re.IGNORECASE
)

# MaxMind GeoLite2 readers (ASN and City databases) queried by maxmind().
# They are opened once here and are not closed anywhere in the visible cells.
maxmind_asn = geoip2.database.Reader('GeoLite2-ASN_20200616/GeoLite2-ASN.mmdb')
maxmind_city = geoip2.database.Reader('GeoLite2-City_20200616/GeoLite2-City.mmdb')

# Domain -> rank lookup built from a "rank,domain" CSV (one pair per line).
# The file is closed deterministically, and blank lines are skipped so they
# cannot break the two-way unpacking.
with open('data/top-1m.csv') as alexa_file:
    alexa = {
        domain: int(rank)
        for rank, domain in (
            line.strip().split(',', 1) for line in alexa_file if line.strip()
        )
    }

@cached(cache={})
def maxmind(ip):
    """Look up `ip` in the MaxMind ASN and City databases.

    Returns a dict with the keys ip, asn, asname, cc, country and city. When a
    database has no entry for the address, that database's fields are None.
    """
    try:
        asn_record = maxmind_asn.asn(ip)
    except geoip2.errors.AddressNotFoundError:
        asn_number, asn_org = None, None
    else:
        asn_number = asn_record.autonomous_system_number
        asn_org = asn_record.autonomous_system_organization

    try:
        city_record = maxmind_city.city(ip)
    except geoip2.errors.AddressNotFoundError:
        country_code, country_name, city_name = None, None, None
    else:
        country_code = city_record.country.iso_code
        country_name = city_record.country.name
        city_name = city_record.city.name

    return {
        'ip': ip,
        'asn': asn_number,
        'asname': asn_org,
        'cc': country_code,
        'country': country_name,
        'city': city_name,
    }


def _parse_dns_line(pattern, host_field, line):
    """Shared implementation behind parse_line() and parse_NS_line().

    Matches `line` against `pattern`, lower-cases the `domain` and `host_field`
    groups, and turns `ip_resolutions` into a list of IPv4 strings.
    Returns the group dict, or None when the line does not match.
    """
    mat = pattern.match(line)
    if not mat:
        return None

    record = mat.groupdict()
    record['domain'] = record['domain'].lower()
    record[host_field] = record[host_field].lower()

    # The group is None for the "?" form and '' for empty parentheses. Both
    # become an empty list, so the field is always a list (an empty string used
    # to leak through unchanged).
    record['ip_resolutions'] = re.findall(
        r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
        record['ip_resolutions'] or '',
    )
    return record


def parse_line(line):
    """Parse one MX result line into a dict, or return None if it does not match PATTERN."""
    return _parse_dns_line(PATTERN, 'mailserver', line)


def parse_NS_line(line):
    """Parse one NS result line into a dict, or return None if it does not match NS_PATTERN."""
    return _parse_dns_line(NS_PATTERN, 'nameserver', line)

def tldextract_enrich(host):
    """Return (registered_domain, suffix) for `host`; ('', '') when `host` is empty or None."""
    if not host:
        return '', ''
    parts = tldextract.extract(host)
    return parts.registered_domain, parts.suffix

def enrich_record(record):
    """Add MaxMind, registered-domain and Alexa rank fields to a parsed MX record.

    The record dict is updated in place and also returned.
    """
    geo_records = [maxmind(ip) for ip in record['ip_resolutions']]
    registered_domain, suffix = tldextract_enrich(record['mailserver'])

    record.update({
        'maxmind': geo_records,
        'mailserver_registered_domain': registered_domain,
        'mailserver_suffix': suffix,
        'alexa_domain': alexa.get(record['domain']),
        'alexa_mailserver_registered_domain': alexa.get(registered_domain),
    })
    return record

def bulk_dns(hosts, file_name, rrtype='a', ignore_cache=False):
    """Resolve many names with the `adnshost` command and parse its output.

    Parameters
    ----------
    hosts : iterable of str
        Host names for rrtype 'a', dotted IPv4 addresses for 'ptr' and 'soa'.
        Falsy entries are skipped.
    file_name : str
        Stem used for the query file under /tmp and for the results file in
        the current working directory.
    rrtype : str
        One of 'a', 'ptr' or 'soa'.
    ignore_cache : bool
        When False, an existing results file is parsed without re-running
        adnshost. When True, the lookup is always re-run.

    Returns
    -------
    collections.defaultdict of set
        'a'   : host -> IPv4 addresses
        'ptr' : IPv4 address -> host names
        'soa' : first three octets of the address -> "<nameserver> <hostmaster>"

    Raises
    ------
    ValueError
        If `rrtype` is not supported.
    """
    if rrtype == 'a':
        regex = re.compile(r'^(?P<host>\S+)\s(?P<rrtype>\S+)\s(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})')
        table_key = 'host'
        table_val = 'ip'
        key_transform = lambda x:x
        input_transform = lambda x:x
    elif rrtype == 'ptr':
        regex = re.compile(r'^(?P<rev_ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\.in-addr\.arpa\s(?P<rrtype>\S+)\s(?P<host>\S+)')
        table_key = 'rev_ip'
        table_val = 'host'
        # Results are keyed by the reversed address; flip it back to the forward IP.
        key_transform = lambda rev_ip: '.'.join(reversed(rev_ip.split('.')))
        input_transform = lambda ip: ('.'.join(reversed(ip.split('.'))))+'.in-addr.arpa'
    elif rrtype == 'soa':
        # 128.211.130.in-addr.arpa SOA ns-gce-public1.googledomains.com cloud-dns-hostmaster@google.com 1 21600 3600 259200 300
        regex = re.compile(r'^(?P<rev_ip>\d{1,3}\.\d{1,3}\.\d{1,3})\.in-addr\.arpa\s(?P<rrtype>\S+)\s(?P<host_hostmaster>\S+\s\S+)')
        table_key = 'rev_ip'
        table_val = 'host_hostmaster'
        key_transform = lambda rev_ip: '.'.join(reversed(rev_ip.split('.')))
        # SOA is queried once per /24, so only the first three octets are used.
        input_transform = lambda ip: ('.'.join(reversed(ip.split('.')[:3])))+'.in-addr.arpa.'
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError(f"unsupported rrtype: {rrtype}")

    results_file = f'{file_name}-{rrtype}-adnshost-results.txt'
    adns_input = f'/tmp/{file_name}-{rrtype}-adnshost-input.txt'

    if ignore_cache or not os.path.exists(results_file):
        # dict.fromkeys() drops repeated queries (many IPs share a /24 for SOA)
        # while keeping first-seen order. Results land in sets, so the returned
        # table is unaffected.
        queries = dict.fromkeys(input_transform(d) for d in hosts if d)
        with open(adns_input, 'w') as o:
            print('\n'.join(queries), file=o)

        # NOTE(review): `----addr-ipv4-only` below has four leading dashes. The
        # intended adnshost option is presumably `--addr-ipv4-only`; confirm
        # against `adnshost --help` before changing it. The string is passed to
        # the shell verbatim.
        command = f'''
        cat {adns_input} | adnshost \
            --asynch \
            --config "nameserver 8.8.8.8" \
            --type {rrtype} \
            --pipe \
            --cname-loose \
            ----addr-ipv4-only > {results_file}
        '''
        print(re.sub(r'\s+', ' ', command))
        status = os.system(command)
        if status != 0:
            # Keep going (best effort, as before), but do not fail silently.
            print(f'warning: adnshost command exited with status {status}', file=sys.stderr)
        #os.unlink(adns_input)

    table = collections.defaultdict(set)
    with open(results_file) as results:
        for line in results:
            mat = regex.search(line.strip())
            if mat:
                d = mat.groupdict()
                table[key_transform(d[table_key])].add(d[table_val])
    return table

In [10]:
# Parse every MX result line, drop lines the pattern does not match, and enrich
# the remaining records. The input file is closed once the frame is built.
with open('all-popular-domains-MX-20200620.txt.unique') as mx_file:
    parsed_records = (parse_line(line) for line in mx_file)
    df = pd.DataFrame([enrich_record(rec) for rec in parsed_records if rec])

In [11]:
# First resolved IPv4 address of each mail server, or None when there is none.
df['mx_ip'] = df.ip_resolutions.map(lambda l: l[0] if l else None)

In [12]:
def get_maxmind_field(mm, name):
    """Return field `name` of the first MaxMind record in `mm`, or None when `mm` is empty."""
    if len(mm) == 0:
        return None
    first_record = mm[0]
    return first_record.get(name)

# One flat column per MaxMind field, taken from the first record of each row.
for field in ('ip', 'cc', 'asn', 'asname', 'country', 'city'):
    # `field=field` binds the current value; a plain closure would see the last one.
    df[f'maxmind_{field}'] = df.maxmind.map(lambda mm, field=field: get_maxmind_field(mm, field))

In [13]:
# Flag rows whose first MX address falls inside a cloud provider's ranges.
for flag_column, in_provider_ranges in (
    ('is_azure', is_azure),
    ('is_aws', is_aws),
    ('is_gcp', is_gcp),
):
    df[flag_column] = df.mx_ip.map(in_provider_ranges)

In [14]:
# Every distinct address seen in any row's resolution list.
# Series.iteritems() was removed in pandas 2.0; iterating the Series directly
# yields the per-row lists on every pandas version.
all_ips = {ip for ips in df.ip_resolutions for ip in ips}
print('Found {} unique IPs'.format(len(all_ips)))

Found 759103 unqiue IPs


In [15]:
# Reverse-resolve every MX address. `ignore_cache=True` re-runs the lookup even
# when a results file already exists.
table = bulk_dns(all_ips, 'mx_ips', 'ptr', ignore_cache=True)

# An address may have several PTR names, held in a set. min() picks one
# deterministically; list(set)[0] depended on string-hash ordering and could
# change between kernel restarts.
df['mx_ip_ptrs'] = df.ip_resolutions.map(lambda ips: [min(table.get(ip)) for ip in ips if table.get(ip)])
df['mx_ip_ptr'] = df.mx_ip_ptrs.map(lambda ptrs: ptrs[0] if ptrs else '')

 cat /tmp/mx_ips-ptr-adnshost-input.txt | adnshost --asynch --config "nameserver 8.8.8.8" --type ptr --pipe --cname-loose ----addr-ipv4-only > mx_ips-ptr-adnshost-results.txt 


In [16]:
def three_octects(ip):
    """Return `ip` without its last dot-separated component (e.g. '1.2.3.4' -> '1.2.3').

    Raises ValueError when `ip` contains no dot.
    """
    last_dot = ip.rindex('.')
    return ip[:last_dot]

# SOA lookup for the /24 reverse zone of every MX address; the result table is
# keyed by the first three octets.
table = bulk_dns(all_ips, 'mx_ips', 'soa', ignore_cache=True)

# A /24 may yield several SOA answers, held in a set. min() picks one
# deterministically; list(set)[0] depended on string-hash ordering and could
# change between kernel restarts.
df['mx_ip_soas'] = df.ip_resolutions.map(lambda ips: [min(table.get(three_octects(ip))) for ip in ips if table.get(three_octects(ip))])

# Each SOA value is "<nameserver> <hostmaster>"; split it into two columns.
df['mx_ip_soa'] = df.mx_ip_soas.map(lambda soas: soas[0] if soas else '')
df['mx_ip_soa_nameserver'] = df.mx_ip_soa.map(lambda val: val.split(' ')[0])
df['mx_ip_soa_hostmaster'] = df.mx_ip_soa.map(lambda val: val.split(' ')[1] if len(val.split(' ')) > 1 else '')
df['mx_ip_ptr_registered_domain'], df['mx_ip_ptr_suffix'] = zip(*df.mx_ip_ptr.map(tldextract_enrich))

 cat /tmp/mx_ips-soa-adnshost-input.txt | adnshost --asynch --config "nameserver 8.8.8.8" --type soa --pipe --cname-loose ----addr-ipv4-only > mx_ips-soa-adnshost-results.txt 


In [17]:
# Registered domain (of the MX host or of its PTR name) -> provider label.
# Every provider uses exactly one spelling in both dictionaries, so grouping on
# the label does not split a vendor ('Symmantec' is now 'Symantec', 'Edgewave'
# is now 'EdgeWave'). Each key appears once; a repeated key silently overrides
# the earlier entry.
email_provider_domains = {
    'activegate-ss.jp': 'Activegate SS',
    'antispameurope.com': 'hornetsecurity',
    'appriver.com': 'AppRiver',
    'arsmtp.com': 'AppRiver',
    'avgcloud.net': 'AVG',
    'baesystems.com': 'BAE Systems',
    'barracuda.de': 'Barracuda',
    'barracuda.net': 'Barracuda',
    'barracudamoto.co.uk': 'Barracuda',
    'barracudanetworks.com': 'Barracuda',
    'borderware.com': 'Watchguard',
    'canit.ca': 'Can-It Pro',
    'clearswift.com':'Clearswift',
    'ctmail.com': 'Cyren',
    'deteque.com': 'Deteque',
    'everycloudtech.com': 'hornetsecurity',
    'everycloudtech.us': 'hornetsecurity',
    'exchangedefender.com': 'Exchange Defender',
    'fireeyecloud.com': 'FireEye',
    'fireeyegov.com': 'FireEye',
    'fortimail.com': 'Fortinet',
    'frontbridge.com': 'Microsoft Frontbridge',
    'fusemail.net': 'fusemail',
    'futurespam.com': 'hornetsecurity',
    'ibarracuda.nl': 'iBarracuda',
    'ik2.com': 'MXGuarddog',
    'ik2.eu': 'MXGuarddog',
    'iphmx.com': 'Cisco Ironport',
    'kaspersky-labs.com': 'Kaspersky',
    'mailanyone.net': 'Protonmail',
    'mailchannels.net': 'mailchannels',
    'mailcontrol.com': 'Forcepoint',
    'mailhop.co': 'DuoCircle',
    'mailhop.co.uk': 'DuoCircle',
    'mailhop.org': 'DuoCircle',
    'mailhop.us': 'DuoCircle',
    'mbox.net': 'BAE Systems',
    'mcafee.com': 'Mcafee',
    'messagelabs.com': 'Symantec',
    'mimecast-offshore.com': 'Mimecast',
    'mimecast.co.za': 'Mimecast',
    'mimecast.com': 'Mimecast',
    # This key used to be listed twice ('hornetsecurity', then 'Manage Protect').
    # The later entry always won at runtime, so that label is kept.
    # NOTE(review): confirm which provider is actually intended.
    'mpmailmx.com': 'Manage Protect',
    'mpmailmx.net': 'hornetsecurity',
    'mx-relay.com': 'MX Relay',
    'mxlogic.net': 'Mcafee',
    # not really a security provider
#     'mxproc.com': 'bounce.io / betterbounces',
#     'h-email.net':'bounce.io / betterbounces', 
#     'b-io.co': 'bounce.io / betterbounces',
    'mxrecord.io': 'Area1',
    'mxsmtp.com': 'Trustwave',
    'mxthunder.com': 'SpamHero',
    'mxthunder.net': 'SpamHero',
    'pandasecurity.com': 'Panda Security',
    'perimeterusa.com': 'BAE Systems',
    'postoffice.net': 'BAE Systems',
    'ppe-hosted.com': 'Proofpoint',
    'pphosted.com': 'Proofpoint',
    'protonmail.ch': 'Protonmail',
    'rcimx.com': 'EdgeWave',
    'rcimx.net': 'EdgeWave',
    'reflexion.net': 'Reflexion',
    'retarus.com': 'Retarus',
    'rmx.de': 'Retarus',
    'securemx.jp': 'SecureMX',
    'securence.com': 'Securence.com',
    'sendio.com': 'Sendio',
    'snwlhosted.com': 'Sonic Wall',
    'snwlhostedeu.com': 'Sonic Wall',
    'sonicwall.com': 'Sonic Wall',
    'sophos.com': 'Sophos',
    'spamexperts.com': 'Solarwinds',
    'antispamcloud.com': 'Solarwinds',
    'spamhero.com': 'SpamHero',
    'spamhero.net': 'SpamHero',
    'spamina.com': 'hornetsecurity',
    'spamtitan.com': 'SpamTitan',
    'symantec.com': 'Symantec',
    'theemaillaundry.net': 'the email laundry',
    'trendmicro.com': 'Trend Micro',
    'trendmicro.eu': 'Trend Micro',
    'trendmicro.tw': 'Trend Micro',
    'trustwave.com': 'Trustwave',
    'ual.com': 'BAE Systems',
    'usa.net': 'BAE Systems',
    'vadesecure.com': 'vadesecure',
    'watchguard.com': 'Watchguard',
    'zixmail.net': 'ZixMail',
    'zixsmbhosted.com': 'ZixMail',
    'psmtp.com': 'Postini',
    'mailinblack.com': 'Mail in Black',
    'scanscope.net': 'Censornet',
    'emailservice.io': 'Mailprotector',
}

# MaxMind AS organisation name -> provider label (same label spellings as above).
email_provider_asns = {
    'AppRiver AG': 'AppRiver',
    'APPRIVER LLC': 'AppRiver',
    'AS-APPRIVER': 'AppRiver',
    'AS-CISCOHPS-APAC': 'Cisco Ironport',
    'AS-IRONP-VEGA': 'Cisco Ironport',
    'AS-TRENDMICRO-COM': 'Trend Micro',
    'AS2-TRENDMICRO-COM': 'Trend Micro',
    'ASN-REFLEXION': 'Reflexion',
    'BAE Systems Applied Intelligence US Corp.': 'BAE Systems',
    'BAE Systems Inc.': 'BAE Systems',
    'BAE-NET-ASN': 'BAE Systems',
    'Barracuda Networks, Inc.': 'Barracuda',
    'BARRACUDA-NETWORKS-INC': 'Barracuda',
    'Cisco Systems Ironport Division': 'Cisco Ironport',
    'Cisco Systems, Inc.': 'Cisco Ironport',
    'CISCOSYSTEMS': 'Cisco Ironport',
    'CSC-IGN-EMEA': 'BAE Systems', 
    'Edgewave, Inc.': 'EdgeWave',
    'FireEye, Inc.': 'FireEye',
    'Forcepoint Cloud Ltd': 'Forcepoint',
    'Fortinet Inc.': 'Fortinet',
    'FORTINET': 'Fortinet',
    'HKN GmbH': 'hornetsecurity',
    'IRONPORT-SYSTEMS-INC': 'Cisco Ironport',
    'Kaspersky Lab AO': 'Kaspersky',
    'MCAFEE': 'Mcafee',
    'McAfee, Inc.': 'Mcafee',
    'MessageLabs Inc.': 'Symantec',
    'Messagelabs Limited': 'Symantec',
    'Messagelabs-AS': 'Symantec',
    'Mimecast Australia Pty Ltd': 'Mimecast',
    'Mimecast North America Inc': 'Mimecast',
    'Mimecast Services Limited': 'Mimecast',
    'MIMECAST': 'Mimecast',
    'MimecastSA': 'Mimecast',
    'MX Logic, Inc.': 'Mcafee',
    'PERIMETER-ESECURITY': 'BAE Systems',
    'Proofpoint, Inc.': 'Proofpoint',
    'PROOFPOINT-ASN-US-EAST': 'Proofpoint',
    'PROOFPOINT-ASN-US-WEST': 'Proofpoint',
    'PROOFPOINT-UT7': 'Proofpoint',
    'Proton Technologies AG': 'Protonmail',
    'Reflexion Networks, Inc.': 'Reflexion',
    'retarus GmbH': 'Retarus',
    'SNWL-COLO-SJL': 'Sonic Wall',
    'SonicWALL, Inc.': 'Sonic Wall',
    'Symantec Corporation': 'Symantec',
    'SYMANTEC': 'Symantec',
    'SYMANTEC-CORPORATION': 'Symantec',
    'TREND MICRO INCORPORATED': 'Trend Micro',
    'Trustwave Holdings, Inc.': 'Trustwave',
    'TRUSTWAVE-ASN': 'Trustwave',
    'WatchGuard Technologies, Inc.': 'Watchguard',
    'WATCHGUARD-TECHNOLOGIES-INC': 'Watchguard',
    'Zix Corporation': 'ZixMail',
    'ZIXI': 'ZixMail',
    'DETEQUE': 'Deteque',
}

In [18]:
def email_provider_normalized(tup):
    """Return the first entry of `tup` that is not the empty string, or '' when there is none."""
    return next((candidate for candidate in tup if candidate != ''), '')

# Provider guess from three separate signals; '' means "no match".
provider_signals = (
    ('email_provider_from_asnname', df.maxmind_asname, email_provider_asns),
    ('email_provider_from_mailserver', df.mailserver_registered_domain, email_provider_domains),
    ('email_provider_from_ptr', df.mx_ip_ptr_registered_domain, email_provider_domains),
)
for provider_column, signal, lookup in provider_signals:
    df[provider_column] = signal.map(lookup).replace(np.nan, '')

# Precedence for the final label: mail server domain, then AS name, then PTR domain.
provider_candidates = pd.Series(zip(df.email_provider_from_mailserver, df.email_provider_from_asnname, df.email_provider_from_ptr))
df['email_provider'] = provider_candidates.map(email_provider_normalized)

In [19]:
# Export the distinct registered domains of all mail servers as a one-column CSV.
pd.DataFrame(df.mailserver_registered_domain.unique(), columns=['mailserver_registered_domain']).to_csv('mailserver_registered_domain.csv', index=False)

`mailserver_registered_domain.csv` is then used as the input for bulk NS lookups, which are run offline with adns.

# Enrich with NS records (of the MX registered domains)

In [20]:
# Load the NS results for the mail servers' registered domains.
# Lines that NS_PATTERN does not match come back as None and are dropped, as in
# the MX cell, so a stray line cannot break the DataFrame construction. The
# input file is closed as soon as it has been read.
with open('mailserver_registered_domain-NS-20200620.txt') as ns_file:
    ns_records = [rec for rec in (parse_NS_line(line) for line in ns_file) if rec]

mailserver_ns = (
    pd.DataFrame(ns_records)
    .rename(columns={'domain': 'mailserver_registered_domain', 'nameserver': 'mailserver_registered_domain_nameserver'})
    .fillna('')
)

# One row per registered domain, holding its distinct name servers sorted and
# comma-joined.
mailserver_ns = (
    mailserver_ns[['mailserver_registered_domain', 'mailserver_registered_domain_nameserver']]
    .drop_duplicates()
    .groupby('mailserver_registered_domain')
    .aggregate(lambda s: ','.join(sorted(s)))
    .reset_index()
)

In [23]:
# Lookup: registered domain -> its comma-joined name server string.
mx2ns = dict(zip(mailserver_ns.mailserver_registered_domain, mailserver_ns.mailserver_registered_domain_nameserver))

In [25]:
def _nth_nameserver(ns, position):
    """Return the comma-separated entry at `position` in `ns`, or '' when there are too few."""
    entries = ns.split(',')
    return entries[position] if len(entries) > position else ''

# Attach the joined name server string ('' when unknown), then split the first
# three entries into numbered columns.
df['mailserver_registered_domain_nameserver'] = df.mailserver_registered_domain.map(mx2ns).fillna('')
for position in range(3):
    df[f'mailserver_registered_domain_nameserver{position + 1}'] = df.mailserver_registered_domain_nameserver.map(
        lambda ns, position=position: _nth_nameserver(ns, position)
    )

In [26]:
# Preview the first three rows, transposed so each column reads as a line.
df.head(3).T

Unnamed: 0,0,1,2
domain,clothes2order.com,famima.vn,brandofsacrifice.com
preference,10,1,5
mailserver,alt4.aspmx.l.google.com,mail.famima.vn,alt2.aspmx.l.google.com
adns_status,ok,ok,ok
adns_code,0,0,0
adns_reason,ok,ok,ok
fail_message,OK,OK,OK
ip_resolutions,[209.85.233.26],[103.252.255.41],[142.250.13.26]
maxmind,"[{'ip': '209.85.233.26', 'asn': 15169, 'asname...","[{'ip': '103.252.255.41', 'asn': 45544, 'asnam...","[{'ip': '142.250.13.26', 'asn': 15169, 'asname..."
mailserver_registered_domain,google.com,famima.vn,google.com


In [27]:
# Write the enriched frame to CSV without the index column.
df.to_csv('mx-intel-enriched.csv', index=False)

In [28]:
# Print the start and end timestamps of the run.
end_time = datetime.datetime.now()
print(start_time)
print(end_time)

2020-06-26 19:44:57.526145
2020-06-26 21:04:12.404703
