In [16]:
import glob
import ipaddress
import itertools
import json
from collections import defaultdict, Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from tqdm import tqdm


In [24]:
DATA = "peerlist.json" # "Known peers" data

# Process file line by line as it's quite large 
def processLine(line):
    """Parse one JSON-encoded DHT record and fold it into the per-VP tallies.

    The record's ``VANTAGE`` field selects the vantage point. The keys of its
    ``Addrs`` mapping (peer IDs) are added to the module-level ``peers`` sets,
    and the flattened values (peer IPs / ports) to the module-level
    ``addresses`` sets.
    """
    record = json.loads(line)
    vantage = record['VANTAGE']
    addrsByPeer = record['Addrs']

    # Build both de-duplicated sets up front so nothing is recorded if the
    # record turns out to be malformed
    peerIds = set(addrsByPeer)
    seenAddrs = set().union(*addrsByPeer.values())

    peers[vantage].update(peerIds)
    addresses[vantage].update(seenAddrs)

# Load and process data
peers, addresses = defaultdict(set), defaultdict(set)

# Stream the file one line at a time and make sure the handle is closed
# once the pass is finished (the bare open() call leaked it)
with open(DATA) as dataFile:
    for line in tqdm(dataFile):
        processLine(line)

13707it [00:31, 433.51it/s]


In [54]:
# Generate list of seen IPs
def _parseMultiaddr(addr):
    """Split one multiaddr string into the fields stored for it.

    Returns an ``(ipType, address, proto, port, p2p)`` tuple, or ``None`` for a
    ``p2p-circuit`` address that is too short to carry a peer ID (skipped
    silently). Raises ``ValueError`` when any other form has too few components.
    """
    chunks = addr.split("/")
    if len(chunks) < 2:
        raise ValueError(addr)
    kind = chunks[1]

    # /ipfs/<peerID>
    if kind == 'ipfs':
        if len(chunks) < 3:
            raise ValueError(addr)
        # Pure P2P circuit
        return "p2p", chunks[2], "", "", True

    # /p2p-circuit/<...>/<peerID>
    if kind == 'p2p-circuit':
        # The recorded value is the fourth component, so four chunks are needed
        if len(chunks) < 4:
            return None
        return "p2p", chunks[3], "", "", True

    # /onion3/<onionID>:<port>
    if 'onion' in kind:
        if len(chunks) < 3:
            raise ValueError(addr)
        onionParts = chunks[2].split(":")
        if len(onionParts) < 2:
            raise ValueError(addr)
        return 'onion', onionParts[0], 'onion', onionParts[1], False

    # /<proto>/<ip>/<transport>/<port>
    if len(chunks) < 5:
        raise ValueError(addr)
    ipType, a, proto, port = chunks[1:5]
    return ipType, a, proto, port, ("p2p-circuit" in addr)


IPs = []
for vp, addrs in tqdm(addresses.items()):
    for addr in addrs:
        # Basic parsing of multiaddr format; empty entries carry no information
        if not addr:
            continue

        try:
            parsed = _parseMultiaddr(addr)
        except ValueError:
            # Report addresses that do not fit any of the handled layouts
            print(addr)
            continue
        if parsed is None:
            continue

        ipType, a, proto, port, p2p = parsed
        record = {'IPVersion': ipType, "Address": a, "Protocol": proto, "Port": port, "P2P": p2p, "VP": vp}
        IPs.append(record)

  0%|          | 0/9 [00:00<?, ?it/s]

/ip4/180.119.57.156
/ip4/182.245.227.125
/ip4/180.119.57.56
/dnsaddr/ipfs.lubar.me
/ip4/39.186.6.127
/ip4/182.245.226.184


 11%|█         | 1/9 [00:02<00:19,  2.38s/it]

/dnsaddr/bootstrap.libp2p.io
/ip4/180.119.57.156
/ip4/182.245.227.125
/ip4/182.245.226.184
/ip4/182.242.57.155


 22%|██▏       | 2/9 [00:03<00:13,  1.98s/it]

/dnsaddr/ipfs.lubar.me
/dnsaddr/bootstrap.libp2p.io
/ip4/39.186.6.127
/ip4/182.245.227.125
/ip4/182.245.226.184


 33%|███▎      | 3/9 [00:04<00:10,  1.81s/it]

/dnsaddr/ipfs.lubar.me
/dnsaddr/bootstrap.libp2p.io
/ip4/180.119.57.156
/ip4/39.186.6.127
/ip4/182.245.226.184
/ip4/182.242.57.155


 44%|████▍     | 4/9 [00:06<00:09,  1.84s/it]

/dnsaddr/ipfs.lubar.me
/dnsaddr/bootstrap.libp2p.io
/ip4/180.119.57.156
/ip4/39.186.6.127
/ip4/182.245.227.125
/ip4/182.245.226.184
/ip4/182.242.57.155


 56%|█████▌    | 5/9 [00:08<00:06,  1.71s/it]

/dnsaddr/ipfs.lubar.me
/dnsaddr/bootstrap.libp2p.io
/ip4/39.186.6.127
/ip4/182.245.227.125
/ip4/182.245.226.184


 67%|██████▋   | 6/9 [00:09<00:04,  1.55s/it]

/ip4/114.230.150.179
/dnsaddr/ipfs.lubar.me
/dnsaddr/bootstrap.libp2p.io
/ip4/180.119.57.156
/ip4/182.245.227.125
/ip4/182.245.226.184
/ip4/182.242.57.155


 78%|███████▊  | 7/9 [00:10<00:02,  1.46s/it]

/dnsaddr/ipfs.lubar.me
/ip4/180.119.57.156
/ip4/39.186.6.127
/ip4/182.245.227.125
/ip4/182.245.226.184


 89%|████████▉ | 8/9 [00:11<00:01,  1.38s/it]

/ip4/180.119.57.56
/dnsaddr/ipfs.lubar.me
/dnsaddr/bootstrap.libp2p.io
/ip4/180.119.57.156
/ip4/39.186.6.127
/ip4/182.245.226.184
/ip4/180.119.57.56
/dnsaddr/ipfs.lubar.me
/dnsaddr/bootstrap.libp2p.io


100%|██████████| 9/9 [00:13<00:00,  1.42s/it]


In [55]:
# Short label for each vantage point, keyed by the hostname recorded in the data
names = {
    'fruchter-ipfs-probe': "chs1",
    'fruchter-ipfs-probe-ams.c.ipri-229620.internal': "ams",
    'fruchter-ipfs-probe-ams2.c.ipri-229620.internal': "ams2",
    'fruchter-ipfs-probe-bom.c.ipri-229620.internal': "bom",
    'fruchter-ipfs-probe-chs2.c.ipri-229620.internal': "chs2",
    'fruchter-ipfs-probe-gru.c.ipri-229620.internal': "gru",
    'fruchter-ipfs-probe-hel.c.ipri-229620.internal': "hel",
    'fruchter-ipfs-probe-lax.c.ipri-229620.internal': "lax",
    'fruchter-ipfs-probe-sin.c.ipri-229620.internal': "sin",
}

# One row per parsed address, with the VP hostname replaced by its short label
df = pd.DataFrame(IPs)
df["VP"] = df["VP"].map(names)

In [82]:
# Per-VP occurrence count of every address, keeping at most the first
# 16000 (most frequent) addresses for each VP
counts = df.groupby("VP")["Address"].value_counts()
topAddrs = (
    counts.groupby('VP')
    .head(16000)
    .rename('Count')
    .reset_index()
)

In [178]:
# Geolocate IPs
ipsForGeo = set(topAddrs['Address'])

import geoip2.database, geoip2.errors
mm_asn = geoip2.database.Reader("GeoLite2-ASN.mmdb")
mm_city = geoip2.database.Reader("GeoLite2-City.mmdb")

def ipinfo(ip):
    """Return ASN and location details for ``ip`` from the MaxMind readers.

    Queries the module-level ``mm_asn`` and ``mm_city`` readers and returns a
    dict with the keys ``asn``, ``aso``, ``lat``, ``lon``, ``continent``,
    ``country`` and ``city``. Exceptions raised by either reader propagate
    to the caller.
    """
    asnRecord = mm_asn.asn(ip)
    cityRecord = mm_city.city(ip)

    # Correct one inaccurate organisation name in the database
    organisation = asnRecord.autonomous_system_organization
    if organisation == 'No.31,Jin-rong Street':
        organisation = 'China Telecom Backbone'

    return {
        'asn': asnRecord.autonomous_system_number,
        'aso': organisation,
        'lat': cityRecord.location.latitude,
        'lon': cityRecord.location.longitude,
        'continent': cityRecord.continent.code,
        'country': cityRecord.country.iso_code,
        'city': cityRecord.city.name,
    }

In [179]:
# Look every address up in MaxMind, separating hits from misses
ipResults, ipsNotFound = [], []
for ip in tqdm(ipsForGeo):
    try:
        result = ipinfo(ip)
    except geoip2.errors.AddressNotFoundError:
        # Not in the database; remember it for the follow-up pass
        ipsNotFound.append(ip)
    except ValueError:
        # Lookup rejected the entry (presumably not an IP address); drop it
        pass
    else:
        result['ip'] = ip
        ipResults.append(result)

100%|██████████| 61030/61030 [00:06<00:00, 9091.39it/s] 


In [180]:
# Deal with any others that weren't in the DB
# Reserved / non-routable ranges, written as CIDR blocks and matched against the
# parsed address. Plain string prefixes over-match: "10" also hits 100-109.x,
# "192" / "172" / "100" cover whole /8s, and "240" hits IPv6 such as "240e:...".
excludedPrefixes = [
    "192.168.0.0/16",    # private
    "192.0.2.0/24",      # TEST-NET-1
    "169.254.0.0/16",    # link-local
    "127.0.0.0/8",       # loopback
    "172.16.0.0/12",     # private
    "100.64.0.0/10",     # carrier-grade NAT
    "0.0.0.0/8",         # "this" network
    "198.51.100.0/24",   # TEST-NET-2
    "203.0.113.0/24",    # TEST-NET-3
    "224.0.0.0/4",       # multicast
    "240.0.0.0/4",       # reserved, includes 255.255.255.255
    "10.0.0.0/8",        # private
]
excludedNetworks = [ipaddress.ip_network(prefix) for prefix in excludedPrefixes]


def _isExcluded(ip):
    """Return True when ``ip`` parses as an address inside an excluded network."""
    try:
        parsed = ipaddress.ip_address(ip)
    except ValueError:
        # Not parseable as an IP address: keep it, as the prefix filter did
        return False
    return any(parsed in network for network in excludedNetworks)


ipsNotFound = [ip for ip in ipsNotFound if not _isExcluded(ip)]

def customIPLookup(ip):
    """Manually resolve a few stragglers that the database lookup did not find.

    Returns a dict with the keys ``ip``, ``asn``, ``aso``, ``lat``, ``lon``,
    ``continent``, ``country`` and ``city`` for the first rule that matches
    ``ip``, or ``False`` when no rule matches.

    ``asn`` is an int (``None`` when unknown) and ``lat`` / ``lon`` are floats,
    so these rows share one type with the numeric values ``ipinfo`` returns
    instead of mixing strings into the same columns.
    """
    # (does this rule match?, fields to report); the first match wins
    rules = (
        (ip.startswith("2002:644c"), {
            'asn': 7725,
            'aso': 'Comcast Cable Communications, LLC',
            'lat': 39.8283,
            'lon': -98.5795,
            'continent': 'NA',
            'country': 'US',
            'city': '',
        }),
        (ip.startswith("115.174"), {
            'asn': None,
            'aso': '',
            'lat': 39.9289,
            'lon': 116.3880,
            'continent': 'AS',
            'country': 'CN',
            'city': 'Beijing',
        }),
        (ip.startswith("211.161"), {
            'asn': 9395,
            'aso': 'BeiJing Guoxin bilin Telecom Technology Co.,Ltd',
            'lat': 31.1825,
            'lon': 121.3850,
            'continent': 'AS',
            'country': 'CN',
            'city': 'Shanghai',
        }),
        (ip == '193.19.254.155', {
            'asn': 198820,
            'aso': 'Limited Liability Company Radio Network',
            'lat': 49.3833,
            'lon': 23.5500,
            'continent': 'EU',
            'country': 'UA',
            'city': 'Rykhtychi',
        }),
    )

    for matched, fields in rules:
        if matched:
            result = {'ip': ip}
            result.update(fields)
            return result

    return False

# Keep only the stragglers that a manual rule could resolve
for ip in tqdm(ipsNotFound):
    result = customIPLookup(ip)
    if not result:
        continue
    ipResults.append(result)

100%|██████████| 1304/1304 [00:00<00:00, 184054.80it/s]


In [187]:
# Attach the lookup results to every observed address row
# (rows whose address has no lookup result are not kept)
dfLoc = pd.DataFrame(ipResults)
dfIPs = pd.merge(dfLoc, df, left_on='ip', right_on='Address')

In [189]:
dfIPs.head()

Unnamed: 0,asn,aso,city,continent,country,ip,lat,lon,Address,IPVersion,P2P,Port,Protocol,VP
0,12975,Palestine Telecommunications Company (PALTEL),Gaza,AS,PS,188.161.180.225,31.5019,34.4666,188.161.180.225,ip4,False,28516,tcp,ams2
1,12975,Palestine Telecommunications Company (PALTEL),Gaza,AS,PS,188.161.180.225,31.5019,34.4666,188.161.180.225,ip4,False,11494,tcp,ams2
2,12975,Palestine Telecommunications Company (PALTEL),Gaza,AS,PS,188.161.180.225,31.5019,34.4666,188.161.180.225,ip4,False,11821,tcp,ams2
3,12975,Palestine Telecommunications Company (PALTEL),Gaza,AS,PS,188.161.180.225,31.5019,34.4666,188.161.180.225,ip4,False,28297,tcp,ams2
4,34569,Networx-Bulgaria Ltd.,Rousse,EU,BG,109.120.215.11,43.8564,25.9708,109.120.215.11,ip4,False,4001,tcp,ams


In [194]:
asoGroup = dfIPs.groupby("VP").aso.value_counts()

VP    aso                                                     
ams   China Telecom Backbone                                      156961
      HK Broadband Network Ltd.                                    31397
      Amazon.com, Inc.                                             18568
      CHINA UNICOM China169 Backbone                               16808
      Google LLC                                                   13932
      China Telecom (Group)                                        10141
      China Mobile communications corporation                       8197
      Henan Mobile Communications Co.,Ltd                           4846
      Cloudflare, Inc.                                              4680
      Guangdong Mobile Communication Co.Ltd.                        3119
      AS Number for CHINANET jiangsu province backbone              2788
      Comcast Cable Communications, LLC                             2408
      HGC Global Communications Limited                      

In [196]:
dfIPs.groupby("VP").country.value_counts().groupby("VP").head(n=5)

VP    country
ams   CN         206794
      US          42963
      HK          34562
      FR           3137
      DE           1530
ams2  CN         224871
      US          53157
      HK          34724
      KR           3408
      FR           2037
bom   CN         202200
      US          45899
      HK          40999
      FR           1720
      KR           1666
chs1  CN         399239
      US          98388
      HK          46492
      FR           4520
      KR           2891
chs2  CN         272034
      US          67203
      HK          55982
      KR           2538
      FR           2412
gru   CN         181068
      US          35411
      HK          34225
      DE           1855
      FR           1608
hel   CN         205106
      US          57603
      HK          54709
      FR           3034
      DE           2744
lax   CN         240522
      US          49921
      HK          40301
      FR           2862
      DE           1658
sin   CN         177474
  

# Mapping

In [240]:
import folium
import folium.plugins as plugins
# Materialise one plain (lat, lon) tuple per row of dfIPs
locs = list(tqdm(dfIPs[['lat', 'lon']].itertuples(index=False, name=None)))


3270615it [00:01, 2235836.31it/s]


In [252]:
# Turn lat and lon columns into comma delimited string
dfIPs['LatLon'] = dfIPs.lat.astype(str).str.cat(dfIPs.lon.astype(str), sep=',')

# Rows with a missing coordinate cannot be plotted: their key would either fail
# to parse back into two floats or parse to NaN, so leave them out of the counts
dfLocated = dfIPs.dropna(subset=['lat', 'lon'])

# Count locations for weighting; each entry is ((LatLon, LatLon), count)
weightedLocations = dfLocated.groupby("LatLon").LatLon.value_counts()
weightedLocations = list(pd.DataFrame(weightedLocations).itertuples(index=True, name=None))

# Turn back into lat, lon, weight tuples
locForPlot = []
for key, weight in weightedLocations:
    lat, lon = map(float, key[0].split(","))
    locForPlot.append((lat, lon, weight))

In [384]:
# The largest weight anchors the top of the colour scale
max_count = max(weight for _lat, _lon, weight in locForPlot)

# Dark base map with a heatmap layer over all weighted locations
hmap = folium.Map(location=(25,2), zoom_start=2, tiles='cartodbdark_matter')
hm = plugins.HeatMap(
    locForPlot,
    min_opacity=0.25,
    max_val=max_count,
    radius=5,
    blur=1,
    max_zoom=1,
)
hmap.add_child(hm)

In [451]:
from IPython.display import display
# Heatmap which excludes the top `1-quantile` percent of peers
def quantileHeatmap(coords, q=0.99, direction='lte', folium_args=None):
    """
    Create a heatmap of the points on one side of a weight quantile.

    coords      -- list of (lat, lon, weight) tuples.
    q           -- quantile of the weights used as the cut-off.
    direction   -- 'lte' keeps points whose weight is at or below the cut-off
                   (excludes the top), 'gte' keeps those at or above it.
    folium_args -- optional dict overriding 'min_opacity' (default 0.25),
                   'radius' (default 5) and 'blur' (default 2). A value of
                   None falls back to the default; other keys are ignored.

    Returns the folium Map with the heatmap layer added.
    Raises ValueError for an unknown direction or when there is nothing to plot.
    """
    if direction not in ('lte', 'gte'):
        raise ValueError("Direction must be `lte` or `gte`.")

    quantileCounts = [i[2] for i in coords]
    if not quantileCounts:
        raise ValueError("coords must contain at least one (lat, lon, weight) tuple.")

    # Compute the cut-off once rather than once per point
    threshold = np.quantile(quantileCounts, q)
    if direction == 'lte':
        locForPlotNoTop = [i for i in coords if i[2] <= threshold]
    else:
        locForPlotNoTop = [i for i in coords if i[2] >= threshold]

    # Only a missing / None override uses the default, so explicit falsy
    # values such as blur=0 are honoured
    options = {'min_opacity': 0.25, 'radius': 5, 'blur': 2}
    for key, value in (folium_args or {}).items():
        if key in options and value is not None:
            options[key] = value

    # Create map
    max_count = max(i[2] for i in locForPlotNoTop)
    hmap = folium.Map(location=(25,2), zoom_start=2, tiles='cartodbdark_matter',)

    hm = plugins.HeatMap(
        locForPlotNoTop,
        min_opacity=options['min_opacity'], max_val=max_count,
        radius=options['radius'], blur=options['blur'], max_zoom=1
    )
    hmap.add_child(hm)
    return hmap

m = quantileHeatmap(locForPlot, 0.99)
display(m)

In [455]:
# Now let's display the top 1%
m = quantileHeatmap(locForPlot, 0.99, direction='gte', folium_args={'min_opacity': 0.4, 'radius': 8, 'blur': 4})
display(m)

In [389]:
# How many distinct addresses were seen
addr = df.Address.value_counts()
len(addr)

377127

In [406]:
# How many unique peer IDs (union over all vantage points)
ap = set()
for v in peers.values():
    ap.update(v)
len(ap)

352033

In [428]:
# First five (most frequent) ASOs for each VP, as a flat table with the count in N
top5ASO = (
    asoGroup.groupby('VP')
    .head()
    .rename('N')
    .reset_index()
)

In [445]:
# Share of all rows held by the ten most common countries and twenty most common ASOs
topCountry = dfIPs['country'].value_counts(normalize=True).head(10)
topASO = dfIPs['aso'].value_counts(normalize=True).head(20)

In [442]:
topCountry

CN    0.645186
US    0.157050
HK    0.119309
FR    0.007328
KR    0.005441
DE    0.004996
CA    0.003871
SG    0.003861
IL    0.003252
NL    0.002598
Name: country, dtype: float64

In [446]:
topASO

China Telecom Backbone                              0.491368
HK Broadband Network Ltd.                           0.100965
Google LLC                                          0.072590
Amazon.com, Inc.                                    0.049674
CHINA UNICOM China169 Backbone                      0.047826
China Telecom (Group)                               0.029389
China Mobile communications corporation             0.026892
Cloudflare, Inc.                                    0.015641
AS Number for CHINANET jiangsu province backbone    0.014575
Henan Mobile Communications Co.,Ltd                 0.012167
Guangdong Mobile Communication Co.Ltd.              0.009263
HGC Global Communications Limited                   0.008751
Comcast Cable Communications, LLC                   0.006584
CHINANET Guangdong province network                 0.005259
DigitalOcean, LLC                                   0.003911
HKT Limited                                         0.003389
Charter Communications I