In [1]:
from IPython.core.display import display, HTML, Markdown
display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:
%matplotlib inline

In [3]:
import re
import os
import sys
import json
import collections
import geoip2.database
import geoip2.errors
from cachetools import cached
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tldextract
import requests
import netaddr
import datetime
from email_security_providers import *

In [4]:
start_time = datetime.datetime.now()

In [5]:
df = pd.read_csv('mx-intel-enriched.csv')

In [6]:
df.head(3).T

Unnamed: 0,0,1,2
domain,clothes2order.com,famima.vn,brandofsacrifice.com
preference,10,1,5
mailserver,alt4.aspmx.l.google.com,mail.famima.vn,alt2.aspmx.l.google.com
adns_status,ok,ok,ok
adns_code,0,0,0
adns_reason,ok,ok,ok
fail_message,OK,OK,OK
ip_resolutions,['209.85.233.26'],['103.252.255.41'],['142.250.13.26']
maxmind,"[{'ip': '209.85.233.26', 'asn': 15169, 'asname...","[{'ip': '103.252.255.41', 'asn': 45544, 'asnam...","[{'ip': '142.250.13.26', 'asn': 15169, 'asname..."
mailserver_registered_domain,google.com,famima.vn,google.com


In [7]:
display(Markdown('## Top PTR Base Domains'))
pd.DataFrame(df.mx_ip_ptr_registered_domain.value_counts())[:30]

## Top PTR Base Domains

Unnamed: 0,mx_ip_ptr_registered_domain
1e100.net,235037
mailspamprotection.com,123626
amazonaws.com,66536
mimecast.com,39035
deteque.com,36682
aruba.it,34335
outlook.com,29368
loopia.se,13559
messagelabs.com,7654
stackmail.com,6930


In [8]:
display(Markdown('## Top MX Base Domains'))
pd.DataFrame(df.mailserver_registered_domain.value_counts())[:30]

## Top MX Base Domains

Unnamed: 0,mailserver_registered_domain
google.com,4018222
googlemail.com,1245901
secureserver.net,638784
outlook.com,539918
ovh.net,256850
registrar-servers.com,246956
mailspamprotection.com,197132
one.com,119823
zoho.com,108058
emailsrvr.com,94131


In [9]:
display(Markdown('## Top MX Countries'))
pd.DataFrame(df.maxmind_country.value_counts())[:30]

## Top MX Countries

Unnamed: 0,maxmind_country
United States,8435220
Germany,719477
France,482903
United Kingdom,313638
Japan,267244
Russia,220472
Netherlands,181612
Canada,177305
Denmark,152091
Bulgaria,131683


In [10]:
display(Markdown('## Top MX Cities'))
df[['maxmind_city', 'maxmind_country']].groupby(['maxmind_city', 'maxmind_country']).size().to_frame().reset_index().sort_values(0, ascending=False)[:30]

## Top MX Cities

Unnamed: 0,maxmind_city,maxmind_country,0
511,Althornbach,Germany,107517
2309,Boardman,United States,89798
21028,Vienna,Austria,66573
2594,Boydton,United States,60932
967,Ashburn,United States,59790
606,Amsterdam,Netherlands,56521
8215,Helsinki,Finland,54116
5375,Dublin,Ireland,53313
13125,Moscow,Russia,52179
5027,Des Moines,United States,51147


In [11]:
display(Markdown('## Top MX ASNs'))
pd.DataFrame(df.maxmind_asname.value_counts())[:30]

## Top MX ASNs

Unnamed: 0,maxmind_asname
GOOGLE,5249464
AS-26496-GO-DADDY-COM-LLC,657744
MICROSOFT-CORP-MSN-AS-BLOCK,502882
OVH SAS,358379
NAMECHEAP-NET,335341
1&1 Ionos Se,323412
UNIFIEDLAYER-AS-1,199197
AMAZON-02,191057
GOOGLE-PRIVATE-CLOUD,126473
RACKSPACE,119195


In [12]:
display(Markdown('## Top MX ASN and MX IP PTRs'))
df[df.mx_ip_ptr_registered_domain != ''][['domain', 'maxmind_asname', 'mx_ip_ptr_registered_domain']].\
    groupby(['maxmind_asname', 'mx_ip_ptr_registered_domain']).\
        count().reset_index().sort_values('domain', ascending=False)[:10]

## Top MX ASN and MX IP PTRs

Unnamed: 0,maxmind_asname,mx_ip_ptr_registered_domain,domain
3479,GOOGLE,1e100.net,235037
3496,GOOGLE-PRIVATE-CLOUD,mailspamprotection.com,77391
338,AMAZON-02,amazonaws.com,59405
6322,MIMECAST,mimecast.com,39035
2332,DETEQUE,deteque.com,36682
1017,Aruba S.p.A.,aruba.it,34335
6311,MICROSOFT-CORP-MSN-AS-BLOCK,outlook.com,29368
3485,GOOGLE,mailspamprotection.com,25477
3494,GOOGLE-2,mailspamprotection.com,20758
6159,Loopia AB,loopia.se,13559


In [13]:
display(Markdown('## Top MX ASN and MX registered domains'))
df[['domain', 'maxmind_asname', 'mailserver_registered_domain']].\
    groupby(['maxmind_asname', 'mailserver_registered_domain']).\
        count().reset_index().sort_values('domain', ascending=False)[:10]

## Top MX ASN and MX registered domains

Unnamed: 0,maxmind_asname,mailserver_registered_domain,domain
539669,GOOGLE,google.com,3980844
539674,GOOGLE,googlemail.com,1230764
114844,AS-26496-GO-DADDY-COM-LLC,secureserver.net,627438
933872,MICROSOFT-CORP-MSN-AS-BLOCK,outlook.com,492562
1100217,OVH SAS,ovh.net,256228
967475,NAMECHEAP-NET,registrar-servers.com,246046
551221,GOOGLE-PRIVATE-CLOUD,mailspamprotection.com,126416
1124673,One.com A/S,one.com,118895
1752400,ZOHO-AS,zoho.com,107503
1208030,RACKSPACE,emailsrvr.com,93884


In [14]:
display(Markdown('## Top MX registered domains and MX IP PTRs'))
domain_ptr = df[['domain', 'mailserver_registered_domain', 'mx_ip_ptr_registered_domain']].\
    groupby(['mailserver_registered_domain', 'mx_ip_ptr_registered_domain']).\
        count().reset_index()
domain_ptr[
    (domain_ptr.mailserver_registered_domain != domain_ptr.mx_ip_ptr_registered_domain) & 
    (domain_ptr.mx_ip_ptr_registered_domain != '')
].sort_values('domain', ascending=False)

## Top MX registered domains and MX IP PTRs

Unnamed: 0,mailserver_registered_domain,mx_ip_ptr_registered_domain,domain
29993,google.com,1e100.net,148056
29995,googlemail.com,1e100.net,86960
31051,h-email.net,amazonaws.com,27200
55121,pickelhost.com,amazonaws.com,17665
32596,hostedmxserver.com,deteque.com,16695
...,...,...,...
26421,fondazioneicsa.it,aruba.it,1
26422,fondazioneisal.it,aruba.it,1
26423,fondazioneitaliainsalute.org,aruba.it,1
26424,fondazioneitaliani.it,aruba.it,1


In [15]:
domain_ptr[domain_ptr.mx_ip_ptr_registered_domain == 'amazonaws.com'].sort_values('domain', ascending=False)

Unnamed: 0,mailserver_registered_domain,mx_ip_ptr_registered_domain,domain
31051,h-email.net,amazonaws.com,27200
55121,pickelhost.com,amazonaws.com,17665
7298,b-io.co,amazonaws.com,6448
45766,messagelabs.com,amazonaws.com,6356
43414,mailerhost.net,amazonaws.com,2526
...,...,...,...
29496,gkfriend.com,amazonaws.com,1
29089,gimbo.net,amazonaws.com,1
28699,germantowntreetrimming.com,amazonaws.com,1
28066,gamingpost.net,amazonaws.com,1


In [16]:
domain_ptr[domain_ptr.mx_ip_ptr_registered_domain == 'amazonaws.com'].domain.sum()

66536

In [17]:
domain_ptr[domain_ptr.mx_ip_ptr_registered_domain == 'googleusercontent.com'].sort_values('domain', ascending=False)

Unnamed: 0,mailserver_registered_domain,mx_ip_ptr_registered_domain,domain
49465,neen.it,googleusercontent.com,11
49686,netmar.com,googleusercontent.com,9
63938,sexual.toys,googleusercontent.com,4
21670,ecs.co.uk,googleusercontent.com,2
8020,bayviewvillageshops.com,googleusercontent.com,2
...,...,...,...
25760,finduxevents.com,googleusercontent.com,1
25408,festivalmedianetwork.com,googleusercontent.com,1
24977,fayetteflyers.com,googleusercontent.com,1
24843,farost.net,googleusercontent.com,1


In [18]:
domain_ptr[domain_ptr.mx_ip_ptr_registered_domain == 'googleusercontent.com'].domain.sum()

380

In [19]:
df[df.mx_ip_soa_nameserver != ''][['domain', 'maxmind_asname', 'mx_ip_soa_nameserver']].\
    groupby(['maxmind_asname', 'mx_ip_soa_nameserver']).\
        count().reset_index().sort_values('domain', ascending=False)[:10]

Unnamed: 0,maxmind_asname,mx_ip_soa_nameserver,domain
16,PROOFPOINT-ASN-US-EAST,ns1.proofpoint.com,2705
17,PROOFPOINT-ASN-US-WEST,ns1.proofpoint.com,1664
21,UNIFIEDLAYER-AS-1,ns1.unifiedlayer.com,1343
9,INMOTI-1,ns.inmotionhosting.com,149
5,CYBERCON,rdns1.ezhostingserver.com,66
11,LIQUIDWEB,ns.sourcedns.com,47
0,ASMALLORANGE1,rdns1.asonoc.com,39
15,PRIVATESYSTEMS,ptr01.privatesystems.net,24
12,"Linode, LLC",ns1.linode.com,18
10,INTUIT-QCY-DC,dns1.p06.nsone.net,17


In [20]:
pd.DataFrame(df.mx_ip_soa_nameserver.value_counts())[:10]

Unnamed: 0,mx_ip_soa_nameserver
ns1.proofpoint.com,4369
ns1.unifiedlayer.com,1343
ns.inmotionhosting.com,149
rdns1.ezhostingserver.com,66
ns.sourcedns.com,47
rdns1.asonoc.com,39
ns1.myhostcenter.com,37
ptr01.privatesystems.net,24
ns1.linode.com,18
dns1.p06.nsone.net,17


In [21]:
pd.DataFrame(df.mx_ip_soa_hostmaster.value_counts())[:10]

Unnamed: 0,mx_ip_soa_hostmaster
ops@proofpoint.com,3713
abuse@unifiedlayer.com,1343
x-ops@proofpoint.com,656
root@ns.inmotionhosting.com,149
admin@ezhostingserver.com,66
admin@sourcedns.com,47
servers@asonoc.com,39
hostmaster@myhostcenter.com,37
dns@privatesystems.net,24
dns@linode.com,18


In [22]:
display(Markdown('### MX Domains whose IPs are hosted in Azure'))
pd.DataFrame(df[df.is_azure].groupby(['mailserver_registered_domain']).size()).reset_index().sort_values(0, ascending=False)[:20]

### MX Domains whose IPs are hosted in Azure

Unnamed: 0,mailserver_registered_domain,0
743,icoremail.net,364
1350,scanscope.net,310
955,mailinblack.com,259
236,cali.co.uk,141
209,brightberri.net,141
454,dotmailer.co.uk,130
997,menufy.com,120
467,dsmail.es,91
694,helionmail.com,89
147,azure.com,44


In [23]:
display(Markdown('### MX Domains whose IPs are hosted in AWS'))
pd.DataFrame(df[df.is_aws].groupby(['mailserver_registered_domain']).size()).reset_index().sort_values(0, ascending=False)[:20]

### MX Domains whose IPs are hosted in AWS

Unnamed: 0,mailserver_registered_domain,0
8300,h-email.net,27200
11971,mailgun.org,25229
15314,pickelhost.com,17665
12569,messagelabs.com,13035
960,amazonaws.com,11657
1902,b-io.co,6448
2051,barracudanetworks.com,4394
18116,sophos.com,4197
20226,trendmicro.com,4136
11970,mailguard.com.au,4106


In [24]:
display(Markdown('### MX Domains whose IPs are hosted in GCP'))
pd.DataFrame(df[df.is_gcp].groupby(['mailserver_registered_domain']).size()).reset_index().sort_values(0, ascending=False)[:20]

### MX Domains whose IPs are hosted in GCP

Unnamed: 0,mailserver_registered_domain,0
6239,mailspamprotection.com,62642
7132,neen.it,171
5261,jouwweb.nl,143
1828,ccnotifier.nl,59
9228,siteground.biz,44
4004,getontheweb.com,40
10923,uservers.net,37
9053,sgvps.net,33
1696,capnova.com,24
6234,mailcannon.net,18


In [25]:
def domain_count_by_key(p_df, keyname='email_provider'):
    '''
    Conceptually similar to this SQL query:
        SELECT 
            keyname, 
            COUNT(DISTINCT(domain)) as count
        FROM 
            p_df
        WHERE
            keyname!=''
        GROUP BY
            keyname
        ORDER BY
            count DESC
    '''
    return p_df[p_df[keyname] != ''][[keyname, 'domain']].\
            drop_duplicates().\
            groupby([keyname]).\
            size().\
            sort_values(0, ascending=False).\
            to_frame().\
            reset_index().\
            rename(columns={0: 'count'})

In [26]:
display(Markdown('### Top Email Security Providers'))
domain_count_by_key(df)

### Top Email Security Providers

Unnamed: 0,email_provider,count
0,Proofpoint,42310
1,Mimecast,36064
2,Deteque,34759
3,Barracuda,22897
4,Solarwinds,13961
5,Symmantec,13832
6,AppRiver,11149
7,Protonmail,8158
8,Trend Micro,8054
9,Cisco Ironport,7504


In [27]:
display(Markdown('### Top Email Providers (as determined by ASN Name)'))
domain_count_by_key(df, 'email_provider_from_asnname')

### Top Email Providers (as determined by ASN Name)

Unnamed: 0,email_provider_from_asnname,count
0,Proofpoint,40571
1,Mimecast,35385
2,Deteque,34759
3,AppRiver,10081
4,Symantec,7320
5,Cisco Ironport,6982
6,Symmantec,6887
7,Protonmail,4557
8,Forcepoint,3680
9,hornetsecurity,2009


In [28]:
display(Markdown('### Top Email Security Providers (as determined by MX Domain)'))
domain_count_by_key(df, 'email_provider_from_mailserver')

### Top Email Security Providers (as determined by MX Domain)

Unnamed: 0,email_provider_from_mailserver,count
0,Proofpoint,41735
1,Mimecast,36055
2,Barracuda,22885
3,Symmantec,13813
4,Solarwinds,11973
5,AppRiver,11143
6,Protonmail,8156
7,Cisco Ironport,7124
8,Trend Micro,7047
9,Forcepoint,3924


In [29]:
display(Markdown('### Top Email Security Providers (as determined by PTR Domain)'))
domain_count_by_key(df, 'email_provider_from_ptr')

### Top Email Security Providers (as determined by PTR Domain)

Unnamed: 0,email_provider_from_ptr,count
0,Deteque,27047
1,Mimecast,20158
2,Symmantec,7361
3,Solarwinds,1989
4,SecureMX,1213
5,Proofpoint,1015
6,AppRiver,495
7,Panda Security,334
8,SpamHero,333
9,Mail in Black,44


In [30]:
display(Markdown('### Top Email Security Providers Hosted in AWS'))
domain_count_by_key(df[df.is_aws])

### Top Email Security Providers Hosted in AWS

Unnamed: 0,email_provider,count
0,Symmantec,12864
1,Trend Micro,6493
2,Barracuda,2253
3,Sophos,2114
4,vadesecure,1583
5,Mailprotector,1234
6,FireEye,963
7,DuoCircle,921
8,SpamTitan,662
9,AVG,393


In [31]:
display(Markdown('### Top Email Providers Hosted in Azure'))
domain_count_by_key(df[df.is_azure])

### Top Email Providers Hosted in Azure

Unnamed: 0,email_provider,count
0,Mail in Black,244
1,Censornet,200


In [32]:
display(Markdown('### Top Email Providers Hosted in GCP'))
domain_count_by_key(df[df.is_gcp | df.maxmind_asname.str.contains('GOOGLE')])

### Top Email Providers Hosted in GCP

Unnamed: 0,email_provider,count


In [33]:
display(Markdown('### Top Email Security Providers (self-hosted)'))
domain_count_by_key(df[df.email_provider_from_asnname != ''])

### Top Email Security Providers (self-hosted)

Unnamed: 0,email_provider,count
0,Proofpoint,42310
1,Mimecast,36064
2,Deteque,34759
3,Barracuda,22897
4,Solarwinds,13961
5,Symmantec,13832
6,AppRiver,11149
7,Protonmail,8158
8,Trend Micro,8054
9,Cisco Ironport,7504


In [34]:
display(Markdown('### Top Email Security Providers hosted in Linode'))
domain_count_by_key(df[df.maxmind_asname == 'Linode, LLC'])

### Top Email Security Providers hosted in Linode

Unnamed: 0,email_provider,count


In [35]:
display(Markdown('### Top Email Security Providers hosted in Digital Ocean'))
domain_count_by_key(df[df.maxmind_asname == 'DIGITALOCEAN-ASN'])

### Top Email Security Providers hosted in Digital Ocean

Unnamed: 0,email_provider,count


In [36]:
display(Markdown('### Top Email Security Providers hosted in Rackspace'))
domain_count_by_key(df[df.maxmind_asname == 'RACKSPACE'])

### Top Email Security Providers hosted in Rackspace

Unnamed: 0,email_provider,count


In [37]:
display(Markdown('### Top Non-self hosted ASNs of Email Security Providers'))
df[(df.email_provider != '') & (~df.maxmind_asname.isin(email_provider_asns.keys()))].maxmind_asname.value_counts().to_frame().reset_index()[:30]

### Top Non-self hosted ASNs of Email Security Providers

Unnamed: 0,index,maxmind_asname
0,GOOGLE,5249464
1,AS-26496-GO-DADDY-COM-LLC,657744
2,MICROSOFT-CORP-MSN-AS-BLOCK,502882
3,OVH SAS,358379
4,NAMECHEAP-NET,335341
5,1&1 Ionos Se,323412
6,UNIFIEDLAYER-AS-1,199197
7,AMAZON-02,191057
8,GOOGLE-PRIVATE-CLOUD,126473
9,RACKSPACE,119195


In [38]:
display(Markdown('### Top unlabeled MX registered domains'))
df[(~df.mailserver_registered_domain.isin(email_provider_domains.keys()))].mailserver_registered_domain.value_counts().to_frame().reset_index()[:30]

### Top unlabeled MX registered domains

Unnamed: 0,index,mailserver_registered_domain
0,google.com,4018222
1,googlemail.com,1245901
2,secureserver.net,638784
3,outlook.com,539918
4,ovh.net,256850
5,registrar-servers.com,246956
6,mailspamprotection.com,197132
7,one.com,119823
8,zoho.com,108058
9,emailsrvr.com,94131


In [39]:
f1000 = [domain.strip() for domain in open('data/f1000-domains.txt')]
f100 = f1000[:100]
f50 = f1000[:100]
f10 = f1000[:10]

In [40]:
pd.options.display.max_rows = 100
display(Markdown('### Fortune 1000 Email Security Providers'))
domain_count_by_key(df[df.domain.isin(f1000)])

### Fortune 1000 Email Security Providers

Unnamed: 0,email_provider,count
0,Proofpoint,340
1,Cisco Ironport,75
2,Mimecast,65
3,Symmantec,54
4,FireEye,14
5,Trend Micro,4
6,Forcepoint,4
7,Barracuda,4
8,Postini,2
9,Fortinet,1


In [41]:
display(Markdown('### Fortune 100 Email Security Providers'))
domain_count_by_key(df[df.domain.isin(f100)])

### Fortune 100 Email Security Providers

Unnamed: 0,email_provider,count
0,Proofpoint,35
1,Cisco Ironport,8
2,Symmantec,5
3,Mimecast,1
4,FireEye,1


In [42]:
display(Markdown('### Fortune 50 Email Security Providers'))
domain_count_by_key(df[df.domain.isin(f50)])

### Fortune 50 Email Security Providers

Unnamed: 0,email_provider,count
0,Proofpoint,35
1,Cisco Ironport,8
2,Symmantec,5
3,Mimecast,1
4,FireEye,1


In [43]:
display(Markdown('### Fortune 10 Email Security Providers'))
domain_count_by_key(df[df.domain.isin(f10)])

### Fortune 10 Email Security Providers

Unnamed: 0,email_provider,count
0,Proofpoint,3
1,Symmantec,1


In [44]:
display(Markdown('### Fortune 10 Summary'))
df[df.domain.isin(f10)][['domain', 'mailserver_registered_domain', 'email_provider']].drop_duplicates().sort_values('email_provider')

### Fortune 10 Summary

Unnamed: 0,domain,mailserver_registered_domain,email_provider
283064,walmart.com,pphosted.com,Proofpoint
364491,ge.com,pphosted.com,Proofpoint
2980609,cvshealth.com,pphosted.com,Proofpoint
6274,ford.com,messagelabs.com,Symmantec
895915,gm.com,gm.com,
1003703,apple.com,apple.com,
1019948,berkshirehathaway.com,outlook.com,
3214516,exxonmobil.com,exxonmobil.com,
5247656,chevron.com,chevron.com,


In [45]:
display(Markdown('### Top Mailserver 2LDs'))
domain_count_by_key(df, 'mailserver_registered_domain')[:20]

### Top Mailserver 2LDs

Unnamed: 0,mailserver_registered_domain,count
0,google.com,1031219
1,outlook.com,530855
2,googlemail.com,511455
3,secureserver.net,320525
4,ovh.net,102265
5,mailspamprotection.com,65745
6,yandex.net,51512
7,emailsrvr.com,47438
8,registrar-servers.com,46698
9,dreamhost.com,45791


In [46]:
display(Markdown('### Top Mailserver 2LDs hosted in AWS'))
domain_count_by_key(df[df.is_aws], 'mailserver_registered_domain')[:20]

### Top Mailserver 2LDs hosted in AWS

Unnamed: 0,mailserver_registered_domain,count
0,h-email.net,27184
1,pickelhost.com,17653
2,messagelabs.com,12864
3,mailgun.org,12763
4,amazonaws.com,11430
5,b-io.co,6437
6,trendmicro.com,3970
7,trendmicro.eu,2525
8,mailerhost.net,2522
9,barracudanetworks.com,2204


In [47]:
display(Markdown('### Top Mailserver 2LDs hosted in Azure'))
domain_count_by_key(df[df.is_azure], 'mailserver_registered_domain')[:20]

### Top Mailserver 2LDs hosted in Azure

Unnamed: 0,mailserver_registered_domain,count
0,icoremail.net,355
1,mailinblack.com,244
2,scanscope.net,200
3,cali.co.uk,141
4,menufy.com,120
5,dotmailer.co.uk,66
6,helionmail.com,64
7,azure.com,42
8,brightberri.net,37
9,dsmail.es,32


In [48]:
display(Markdown('### Top Mailserver 2LDs hosted in GCP'))
domain_count_by_key(df[df.is_gcp], 'mailserver_registered_domain')[:20]

### Top Mailserver 2LDs hosted in GCP

Unnamed: 0,mailserver_registered_domain,count
0,mailspamprotection.com,44678
1,jouwweb.nl,143
2,neen.it,80
3,ccnotifier.nl,59
4,getontheweb.com,40
5,uservers.net,37
6,siteground.biz,32
7,sgvps.net,30
8,capnova.com,24
9,mailcannon.net,18


In [49]:
for domainname, provider in email_provider_domains.items():
    tmp = df[df.mailserver_registered_domain == domainname]
    tmp = tmp[tmp.mailserver_registered_domain_nameserver1 != '']
    if len(tmp.mailserver_registered_domain_nameserver1.unique()) > 0:
        print('{} Nameservers for {} ({})'.format(len(tmp.mailserver_registered_domain_nameserver1.unique()), domainname, provider))
        print(tmp.mailserver_registered_domain_nameserver1.value_counts())
        print('---\n')

1 Nameservers for activegate-ss.jp (Activegate SS)
ns-1269.awsdns-30.org    73
Name: mailserver_registered_domain_nameserver1, dtype: int64
---

1 Nameservers for antispameurope.com (hornetsecurity)
godzilla-haj2.antispameurope.de    8201
Name: mailserver_registered_domain_nameserver1, dtype: int64
---

1 Nameservers for appriver.com (AppRiver)
hugh.ns.cloudflare.com    331
Name: mailserver_registered_domain_nameserver1, dtype: int64
---

1 Nameservers for arsmtp.com (AppRiver)
mdns1.appriver.com    21902
Name: mailserver_registered_domain_nameserver1, dtype: int64
---

1 Nameservers for avgcloud.net (AVG)
ns-1350.awsdns-40.org    789
Name: mailserver_registered_domain_nameserver1, dtype: int64
---

1 Nameservers for baesystems.com (BAE Systems)
udns1.cscdns.net    11
Name: mailserver_registered_domain_nameserver1, dtype: int64
---

1 Nameservers for barracuda.de (Barracuda)
Series([], Name: mailserver_registered_domain_nameserver1, dtype: int64)
---

1 Nameservers for barracuda.net (B

1 Nameservers for sendio.com (Sendio)
Series([], Name: mailserver_registered_domain_nameserver1, dtype: int64)
---

1 Nameservers for snwlhosted.com (Sonic Wall)
Series([], Name: mailserver_registered_domain_nameserver1, dtype: int64)
---

1 Nameservers for snwlhostedeu.com (Sonic Wall)
Series([], Name: mailserver_registered_domain_nameserver1, dtype: int64)
---

1 Nameservers for sonicwall.com (Sonic Wall)
Series([], Name: mailserver_registered_domain_nameserver1, dtype: int64)
---

1 Nameservers for sophos.com (Sophos)
a1-100.akam.net    4424
Name: mailserver_registered_domain_nameserver1, dtype: int64
---

1 Nameservers for spamexperts.com (Solarwinds)
ns-1092.awsdns-08.org    11545
Name: mailserver_registered_domain_nameserver1, dtype: int64
---

1 Nameservers for antispamcloud.com (Solarwinds)
ns-1287.awsdns-32.org    2056
Name: mailserver_registered_domain_nameserver1, dtype: int64
---

1 Nameservers for spamhero.com (SpamHero)
ns.dnsbox.net    1131
Name: mailserver_registered_do

In [50]:
def provider_search(search):
    '''
    provider a mailserver domain name or string, and get a summary of the data we have about it
    '''
    tmp = df[['domain', 'mailserver', 'mailserver_registered_domain', 'maxmind_asname']][df.mailserver.str.contains(search) | df.maxmind_asname.str.contains(search)]
    print(tmp.mailserver_registered_domain.value_counts())
    print('---')
    print(tmp.mailserver.value_counts())
    print('---')
    print(tmp.maxmind_asname.value_counts())
    print('---')
    print(tmp.domain.value_counts())

In [51]:
def explore_mailservers_by_known_asns():
    for asn, provider in email_provider_asns.items():
        tmp = df[df.maxmind_asname == asn]
        print('MXs for {} ({})'.format(asn, provider))
        print(tmp.mailserver_registered_domain.value_counts())
        print('---')

In [52]:
def explore_asns_by_known_mailservers():
    for domainname, provider in email_provider_domains.items():
        tmp = df[df.mailserver_registered_domain == domainname]
        print('ASNs for {} ({})'.format(domainname, provider))
        print(tmp.maxmind_asname.value_counts())
        print('---')

In [53]:
def identify_more_provider_asns():
    import textdistance
    asns = df.maxmind_asname.map(lambda asn: str(asn)).unique()
    for asname,provider in email_provider_asns.items():
        count = 0
        for asn in asns:
            asname_score = textdistance.jaro_winkler(asn.lower(), asname.lower())
            provider_score = textdistance.jaro_winkler(asn.lower(), provider.lower())
            if asname.lower() in asn.lower() or provider.lower() in asn.lower() or asname_score > 0.8 or provider_score > 0.8:
                count += 1
                print('"{}": "{}",'.format(asn, provider))
        if count == 0:
            print('NONE FOUND: "{}" ("{}")'.format(asname,provider))

In [54]:
display(Markdown('### Top Mailserver Nameservers'))
df['mailserver_registered_domain_nameserver1'].value_counts().to_frame()[:30]

### Top Mailserver Nameservers

Unnamed: 0,mailserver_registered_domain_nameserver1
ns1.google.com,5270713
a1-245.akam.net,638790
ns1.msft.net,540042
ns-1and1.ui-dns.biz,295656
edns4.ultradns.biz,264138
dns10.ovh.net,256981
ns1.clev1.net,197145
dns1.p03.nsone.net,132351
a.b-one-dns.net,119823
ns1.p256.dynect.net,94131


In [55]:
display(Markdown('### Quick summary reports for analysis/exploration ...'))
#mask = df.mailserver_registered_domain_nameserver1.fillna('').str.contains('cscdns.net')
#mask = df.maxmind_asname == 'Linode, LLC'
#mask = df.maxmind_asname == 'DIGITALOCEAN-ASN'
#mask = df.maxmind_asname == 'RACKSPACE'
#mask = df.maxmind_asname.fillna('').str.contains('Alibaba')
#mask = df.mailserver_registered_domain == 'h-email.net'
#mask = df.mx_ip_ptr_registered_domain == 'deteque.com'
#mask = df.mx_ip_ptr_registered_domain == 'spamcloud.md'
#mask = df.domain.isin(f100)
mask = df.mailserver_registered_domain == 'psmtp.com'

display(df[mask].mailserver_registered_domain.value_counts().to_frame())
display(df[mask].maxmind_asname.value_counts().to_frame())
display(df[mask].mx_ip_ptr_registered_domain.value_counts().to_frame())
display(df[mask].mailserver_registered_domain_nameserver1.value_counts().to_frame())
display(df[mask].email_provider.value_counts().to_frame())
display(df[mask].mailserver.value_counts().to_frame())
#display(df[mask].domain.value_counts().to_frame())

### Quick summary reports for analysis/exploration ...

Unnamed: 0,mailserver_registered_domain
psmtp.com,6241


Unnamed: 0,maxmind_asname


Unnamed: 0,mx_ip_ptr_registered_domain


Unnamed: 0,mailserver_registered_domain_nameserver1
ns1.google.com,6241


Unnamed: 0,email_provider
Postini,6241


Unnamed: 0,mailserver
quinstreet.com.mail7.psmtp.com,149
quinstreet.com.mail5.psmtp.com,149
quinstreet.com.mail8.psmtp.com,149
quinstreet.com.mail6.psmtp.com,149
prestoinc.com.s9b2.psmtp.com,21
...,...
hybridmotorcycles.co.uk.s200b2.psmtp.com,1
stephenscountyschools.org.s9b1.psmtp.com,1
natchezgrandhotel.com.s7a2.psmtp.com,1
rkymtnhi.com.rkymtnhi.mail2.psmtp.com,1


In [56]:
end_time = datetime.datetime.now()
print(start_time)
print(end_time)

2020-06-26 21:19:18.876282
2020-06-26 21:26:40.483762
