In [4]:
# install Python packages used in this notebook
# (two geohash packages are needed: python-geohash supplies the `geohash` module and
# geohash2 supplies the `geohash2` module, both of which are imported in the next cell)
!pip install pandas numpy python-geohash geohash2 folium

Collecting python-geohash
  Downloading python-geohash-0.8.5.tar.gz
Building wheels for collected packages: python-geohash
  Running setup.py bdist_wheel for python-geohash ... [?25ldone
[?25h  Stored in directory: /gpfs/fs01/user/sa73-1acf9232f65bd2-cf1c60ef4a00/.cache/pip/wheels/38/0e/1e/d00bb723727485c0fb7951cbec8f55b91f5f5b11913d5559a0
Successfully built python-geohash
Installing collected packages: python-geohash
Successfully installed python-geohash-0.8.5


In [5]:
import os
import geohash
import geohash2
import numpy as np
import pandas as pd
import folium
from folium.features import DivIcon

# show at most 15 rows when a DataFrame is displayed; the fully qualified option name
# is used because a bare 'max_rows' pattern can match more than one option in newer pandas
pd.set_option('display.max_rows', 15)

# find the newest directory, in case there are old directories left over from previous runs
# (the lexicographically greatest matching name is taken, exactly as sorted(...)[-1] did)
maxmindDirectory = max(
    (f for f in os.listdir() if os.path.isdir(f) and f.startswith('GeoLite2-City-CSV')),
    default=None,
)
if maxmindDirectory is None:
    # fail with an explicit message instead of an IndexError on an empty list
    raise FileNotFoundError('no directory starting with "GeoLite2-City-CSV" found in ' + os.getcwd())

# load the MaxMind network and location data
maxmindNetworks = pd.read_csv(os.path.join(maxmindDirectory, 'GeoLite2-City-Blocks-IPv4.csv'), header=0)
maxmindLocations = pd.read_csv(os.path.join(maxmindDirectory, 'GeoLite2-City-Locations-en.csv'), header=0)

# discard networks with no location; .copy() makes the result an independent frame so
# the column assignments that follow cannot raise SettingWithCopyWarning
maxmindNetworks = maxmindNetworks.dropna(subset=['geoname_id']).copy()

# cast location codes to integers
maxmindNetworks['geoname_id'] = maxmindNetworks['geoname_id'].astype('int32')
maxmindLocations['geoname_id'] = maxmindLocations['geoname_id'].astype('int32')

def cleanName(name):
    """Normalize a stringified cell value for display.

    The literal string 'nan' (what str() produces for a missing value) is
    replaced by the placeholder '---'; any other string is returned with all
    of its commas removed.
    """
    return '---' if name == 'nan' else name.replace(',', '')

# networks: turn every postal code into a string, replacing NaN's by '---' and stripping commas
columns = ['postal_code']
for column in columns:
    maxmindNetworks[column] = maxmindNetworks[column].map(lambda value: cleanName(str(value)))

# locations: same treatment for the subdivision, city and metro columns
columns = [
    'subdivision_1_iso_code',
    'subdivision_1_name',
    'subdivision_2_iso_code',
    'subdivision_2_name',
    'city_name',
    'metro_code',
]
for column in columns:
    maxmindLocations[column] = maxmindLocations[column].map(lambda value: cleanName(str(value)))

In [6]:
# display all columns of the network data
# (not strictly raw any more: rows without a geoname_id were dropped and postal_code
# was cleaned in the previous cell, so missing postal codes show as '---')
maxmindNetworks.head(15)

Unnamed: 0,network,geoname_id,registered_country_geoname_id,represented_country_geoname_id,is_anonymous_proxy,is_satellite_provider,postal_code,latitude,longitude,accuracy_radius
0,1.0.0.0/24,2151718,2077456.0,,0,0,3095,-37.7,145.1833,1000.0
1,1.0.1.0/24,1810821,1814991.0,,0,0,---,26.0614,119.3061,50.0
2,1.0.2.0/23,1810821,1814991.0,,0,0,---,26.0614,119.3061,50.0
3,1.0.4.0/22,2077456,2077456.0,,0,0,---,-33.494,143.2104,1000.0
4,1.0.8.0/21,1809858,1814991.0,,0,0,---,23.1167,113.25,50.0
5,1.0.16.0/20,1850147,1861060.0,,0,0,190-0031,35.685,139.7514,500.0
6,1.0.32.0/19,1809858,1814991.0,,0,0,---,23.1167,113.25,50.0
7,1.0.64.0/20,1854383,1861060.0,,0,0,700-0827,34.6617,133.935,10.0
8,1.0.80.0/22,1854383,1861060.0,,0,0,700-0827,34.6617,133.935,10.0
9,1.0.84.0/23,1854383,1861060.0,,0,0,700-0827,34.6617,133.935,10.0


In [7]:
# display all columns of the location data
# (the subdivision, city and metro columns were cleaned in an earlier cell, so missing
# values in those columns show as '---')
maxmindLocations.head(15)

Unnamed: 0,geoname_id,locale_code,continent_code,continent_name,country_iso_code,country_name,subdivision_1_iso_code,subdivision_1_name,subdivision_2_iso_code,subdivision_2_name,city_name,metro_code,time_zone
0,18918,en,EU,Europe,CY,Cyprus,04,Ammochostos,---,---,Protaras,---,Asia/Famagusta
1,32909,en,AS,Asia,IR,Iran,07,Ostan-e Tehran,---,---,Shahre Jadide Andisheh,---,Asia/Tehran
2,49518,en,AF,Africa,RW,Rwanda,---,---,---,---,---,---,Africa/Kigali
3,49747,en,AF,Africa,SO,Somalia,BK,Bakool,---,---,Oddur,---,Africa/Mogadishu
4,51537,en,AF,Africa,SO,Somalia,---,---,---,---,---,---,Africa/Mogadishu
5,53654,en,AF,Africa,SO,Somalia,BN,Banaadir,---,---,Mogadishu,---,Africa/Mogadishu
6,54225,en,AF,Africa,SO,Somalia,SH,Lower Shabeelle,---,---,Merca,---,Africa/Mogadishu
7,55671,en,AF,Africa,SO,Somalia,JH,Lower Juba,---,---,Kismayo,---,Africa/Mogadishu
8,57289,en,AF,Africa,SO,Somalia,WO,Woqooyi Galbeed,---,---,Hargeisa,---,Africa/Mogadishu
9,58933,en,AF,Africa,SO,Somalia,NU,Nugaal,---,---,Garoowe,---,Africa/Mogadishu


In [None]:
# display only the network columns used in the rest of the notebook, indexed by network
maxmindNetworks.set_index('network')[['geoname_id','postal_code','latitude','longitude']].head(15)

Unnamed: 0_level_0,geoname_id,postal_code,latitude,longitude
network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0.0.0/24,2151718,3095,-37.7,145.1833
1.0.1.0/24,1810821,---,26.0614,119.3061
1.0.2.0/23,1810821,---,26.0614,119.3061
1.0.4.0/22,2077456,---,-33.494,143.2104
1.0.8.0/21,1809858,---,23.1167,113.25
1.0.16.0/20,1850147,190-0031,35.685,139.7514
1.0.32.0/19,1809858,---,23.1167,113.25
1.0.64.0/20,1854383,700-0827,34.6617,133.935
1.0.80.0/22,1854383,700-0827,34.6617,133.935
1.0.84.0/23,1854383,700-0827,34.6617,133.935


In [None]:
# add geohash of latitude/longitude and display relevant network data again
# (the two coordinate columns are zipped directly; a row-wise DataFrame.apply(axis=1)
# constructs a Series for every row and is far slower for the same result)
maxmindNetworks['geohash6'] = [
    geohash2.encode(latitude, longitude, precision=6)
    for latitude, longitude in zip(maxmindNetworks['latitude'], maxmindNetworks['longitude'])
]
maxmindNetworks[['network','geoname_id','postal_code','latitude','longitude','geohash6']].set_index('network').head(15)

In [None]:
# display only the location columns used in the rest of the notebook, indexed by geoname_id
maxmindLocations.set_index('geoname_id')[[
    'country_iso_code',
    'country_name',
    'subdivision_1_iso_code',
    'subdivision_1_name',
    'subdivision_2_iso_code',
    'subdivision_2_name',
    'city_name',
]].head(15)

In [None]:
# merge relevant network and location data and display results
# (DataFrame.join defaults to a left join, so every network row is kept and gains
# the columns of the location whose geoname_id it references)
maxmindNetworkLocations = maxmindNetworks[
    ['network','geoname_id','latitude','longitude','geohash6']
].join(
    maxmindLocations[
        ['geoname_id','country_iso_code','country_name','subdivision_1_name','subdivision_2_name','city_name']
    ].set_index('geoname_id'),
    on='geoname_id',
)
maxmindNetworkLocations.set_index('network').head(15)

In [12]:
# group networks by location; the summary and map functions defined in later cells
# fetch the networks of one location from this GroupBy object via get_group(geoname_id)
maxmindNetworksByLocation = maxmindNetworkLocations.groupby('geoname_id')

In [13]:
# summarize each group of network locations

def groupSummary(geoname_id):
    """Summarize the networks that belong to one location.

    Reads the notebook-level `maxmindNetworksByLocation` GroupBy object.

    Parameters
    ----------
    geoname_id : key of the group to summarize

    Returns
    -------
    list
        [geoname_id, networkCount, coordinateCount, averageLatitude,
        averageLongitude, geohash6], where coordinateCount is the number of
        distinct geohash6 values among the group's networks and geohash6 is
        the 6-character geohash of the average coordinate.
    """
    group = maxmindNetworksByLocation.get_group(geoname_id)
    networkCount = len(group)
    # count distinct geohash cells directly instead of building a grouped count() frame
    # only to take its length
    coordinateCount = group['geohash6'].nunique()
    averageLatitude = group['latitude'].mean()
    averageLongitude = group['longitude'].mean()
    geohash6 = geohash2.encode(averageLatitude,averageLongitude,precision=6)
    return [geoname_id,networkCount,coordinateCount,averageLatitude,averageLongitude,geohash6]

# build one summary row per location, in the order the GroupBy object lists its group keys
columns = [
    'geoname_id',
    'networkCount',
    'coordinateCount',
    'averageLatitude',
    'averageLongitude',
    'geohash6',
]
maxmindLocationSummary = pd.DataFrame(
    list(map(groupSummary, maxmindNetworksByLocation.groups)),
    columns=columns,
)

In [14]:
# display network location summary
# (coordinateCount is the number of distinct geohash6 cells among a location's networks)
maxmindLocationSummary.set_index('geoname_id').head(15)

Unnamed: 0_level_0,networkCount,coordinateCount,averageLatitude,averageLongitude,geohash6
geoname_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2883584,1,1,49.1167,7.5667,u0trkj
7602180,1,1,-6.2854,176.3153,ryvkhp
524294,27,2,55.57117,42.042233,ucvuhs
2883591,6,1,54.3833,9.4333,u1wtqs
2621448,28,1,55.8398,9.25,u1ytdm
2621449,9,1,56.9943,9.9909,u4ph5r
3407882,9,1,-3.2047,-52.2124,6z6vgb
524299,10,1,68.8167,32.8333,uspr7p
2621456,10,1,55.7343,11.5484,u3bjnf
524305,226,3,68.977949,33.09093,usr83j


In [15]:
# add additional location data to summary
# (name columns left over from a previous run of this cell are dropped first, so the cell
# can be re-run without join failing on overlapping column names)
locationNameColumns = ['country_name','subdivision_1_name','city_name']
maxmindLocationSummary = maxmindLocationSummary.drop(locationNameColumns, axis=1, errors='ignore').join(
    maxmindLocations[['geoname_id'] + locationNameColumns].set_index('geoname_id'),
    on='geoname_id',
)

In [16]:
# display summary with additional location data (first 15 rows; no filtering or sorting is applied here)
maxmindLocationSummary.set_index('geoname_id').head(15)

Unnamed: 0_level_0,networkCount,coordinateCount,averageLatitude,averageLongitude,geohash6,country_name,subdivision_1_name,city_name
geoname_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2883584,1,1,49.1167,7.5667,u0trkj,Germany,Rheinland-Pfalz,Kroppen
7602180,1,1,-6.2854,176.3153,ryvkhp,Tuvalu,Nanumanga,Tokelau Village
524294,27,2,55.57117,42.042233,ucvuhs,Russia,Vladimirskaya Oblast',Murom
2883591,6,1,54.3833,9.4333,u1wtqs,Germany,Schleswig-Holstein,Kropp
2621448,28,1,55.8398,9.25,u1ytdm,Denmark,South Denmark,Give
2621449,9,1,56.9943,9.9909,u4ph5r,Denmark,North Denmark,Gistrup
3407882,9,1,-3.2047,-52.2124,6z6vgb,Brazil,Para,Altamira
524299,10,1,68.8167,32.8333,uspr7p,Russia,Murmansk,Murmashi
2621456,10,1,55.7343,11.5484,u3bjnf,Denmark,Zealand,Gislinge
524305,226,3,68.977949,33.09093,usr83j,Russia,Murmansk,Murmansk


In [17]:
# display the 15 locations with the highest coordinateCount, largest first
# (rows are only sorted, not filtered by any threshold)
maxmindLocationSummary.sort_values('coordinateCount',ascending=False).set_index('geoname_id').head(15)

Unnamed: 0_level_0,networkCount,coordinateCount,averageLatitude,averageLongitude,geohash6,country_name,subdivision_1_name,city_name
geoname_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
756135,2149,199,52.244754,21.003443,u3qcjy,Poland,Mazovia,Warsaw
2950159,5479,193,52.513743,13.403263,u33d8z,Germany,Land Berlin,Berlin
3530597,4371,163,19.431441,-99.138599,9g3w81,Mexico,Mexico City,Mexico City
745044,3613,123,41.022007,28.978656,sxk977,Turkey,Istanbul,Istanbul
6167865,10114,101,43.666551,-79.431134,dpz82t,Canada,Ontario,Toronto
4699066,7078,99,29.778965,-95.445854,9vk1kj,United States,Texas,Houston
2193733,4101,99,-36.869048,174.767408,rckq2b,New Zealand,Auckland,Auckland
6077243,9190,97,45.509495,-73.582533,f25dvs,Canada,Quebec,Montreal
2911298,3018,95,53.561205,10.013645,u1x0ey,Germany,Hamburg,Hamburg
3094802,641,94,50.058009,19.971596,u2yhy2,Poland,Lesser Poland Voivodeship,Krakow


In [18]:
# this function plots the networks in a location on a map

def drawmap(geoname_id,zoomLevel):
    """Plot every distinct network coordinate of one location on a folium map.

    Reads the notebook-level `maxmindLocations` frame and
    `maxmindNetworksByLocation` GroupBy object, and prints a one-line summary
    (network count, coordinate count, city, subdivision, country).

    Parameters
    ----------
    geoname_id : identifier of the location to plot
    zoomLevel : initial zoom, passed to folium.Map as zoom_start

    Returns
    -------
    The folium map, centered on the mean latitude/longitude of the location's networks.

    Raises
    ------
    ValueError
        If geoname_id does not match exactly one row of maxmindLocations.
    """
    # look the location up once instead of filtering the whole frame once per field
    location = maxmindLocations.loc[maxmindLocations['geoname_id']==geoname_id, ['country_name','subdivision_1_name','city_name']]
    if len(location) != 1:
        raise ValueError('expected exactly one location with geoname_id ' + str(geoname_id) + ', found ' + str(len(location)))
    countryName, subdivisionName, cityName = location.iloc[0]

    networks = maxmindNetworksByLocation.get_group(geoname_id)
    networkCount = len(networks)
    averageLatitude, averageLongitude = networks[['latitude','longitude']].mean()

    # named locationMap rather than map so the builtin map() is not shadowed
    locationMap = folium.Map(location=[averageLatitude, averageLongitude], zoom_start=zoomLevel)

    # one circle per distinct coordinate, however many networks share it
    points = networks.groupby(['latitude','longitude'])
    coordinateCount = points.ngroups

    for (latitude, longitude), _ in points:
        folium.features.Circle(location=[latitude, longitude], radius=200, color='blue').add_to(locationMap)

    # str.format also copes with a non-string name (e.g. a NaN country_name), which '+' concatenation does not
    print('location {}: {} networks at {} coordinates in {}, {}, {}'.format(geoname_id, networkCount, coordinateCount, cityName, subdivisionName, countryName))
    return locationMap

In [19]:
# Pamplona, Spain: map of its network coordinates at zoom level 11
drawmap(3114472,11) # Pamplona

location 3114472: 233 networks at 6 coordinates in Pamplona, Navarre, Spain


In [20]:
# New Orleans, United States: map of its network coordinates at zoom level 12
drawmap(4335045,12) # New Orleans

location 4335045: 828 networks at 23 coordinates in New Orleans, Louisiana, United States


In [21]:
# Munich, Germany: map of its network coordinates at zoom level 11
drawmap(2867714,11) # Munich

location 2867714: 2538 networks at 67 coordinates in Munich, Bavaria, Germany


In [22]:
# Berlin, Germany: map of its network coordinates at zoom level 10
drawmap(2950159,10) # Berlin

location 2950159: 5479 networks at 197 coordinates in Berlin, Land Berlin, Germany


In [23]:
# this function finds a geohash that encloses all of the networks in a location and plots them on a map

def findGeohashBBox(minLatitude, minLongitude, maxLatitude, maxLongitude):
    """Search for one geohash cell that covers a whole latitude/longitude rectangle.

    The candidates are the cells containing the rectangle's midpoint, tried from
    precision 12 down to precision 1; the first candidate whose bounding box
    covers the rectangle is returned.

    Returns
    -------
    (geohashCode, geohashBBox)
        The geohash string and the value geohash.bbox returned for it (read
        here through the keys 's', 'n', 'w' and 'e'). When no candidate covers
        the rectangle a message is printed and (None, None) is returned.
    """
    centerLatitude = (maxLatitude+minLatitude)/2
    centerLongitude = (maxLongitude+minLongitude)/2

    precision = 12
    while precision > 0:
        code = geohash2.encode(centerLatitude, centerLongitude, precision=precision)
        box = geohash.bbox(code)
        # the rectangle sticks out of the cell if any of its four edges lies beyond the cell's edge
        sticksOut = (
            minLatitude < box['s']
            or maxLatitude > box['n']
            or minLongitude < box['w']
            or maxLongitude > box['e']
        )
        if not sticksOut:
            return (code, box)
        precision -= 1

    print('no geohash found that encloses latitude ' + str(minLatitude) + ' to ' + str(maxLatitude) + ', longitude ' + str(minLongitude) + ' to ' + str(maxLongitude))
    return (None, None)

def drawmapWithGeohashBBox(geoname_id,zoomLevel):
    """Plot a location's network coordinates together with the geohash cell that encloses them.

    Reads the notebook-level `maxmindLocations` frame and
    `maxmindNetworksByLocation` GroupBy object, uses findGeohashBBox to look
    for an enclosing geohash cell, and prints a one-line summary.

    Parameters
    ----------
    geoname_id : identifier of the location to plot
    zoomLevel : initial zoom, passed to folium.Map as zoom_start

    Returns
    -------
    The folium map with one circle per distinct coordinate and a shaded
    rectangle for the geohash cell, or None when findGeohashBBox finds no
    enclosing cell (it prints the reason itself).

    Raises
    ------
    ValueError
        If geoname_id does not match exactly one row of maxmindLocations.
    """
    # look the location up once instead of filtering the whole frame once per field
    location = maxmindLocations.loc[maxmindLocations['geoname_id']==geoname_id, ['country_name','subdivision_1_name','city_name']]
    if len(location) != 1:
        raise ValueError('expected exactly one location with geoname_id ' + str(geoname_id) + ', found ' + str(len(location)))
    countryName, subdivisionName, cityName = location.iloc[0]

    networks = maxmindNetworksByLocation.get_group(geoname_id)
    networkCount = len(networks)
    coordinates = networks[['latitude','longitude']]
    minLatitude, minLongitude = coordinates.min()
    avgLatitude, avgLongitude = coordinates.mean()
    maxLatitude, maxLongitude = coordinates.max()

    # search for the enclosing cell before drawing anything: without one no map is returned,
    # so building the map first would be wasted work
    (geohashCode, geohashBBox) = findGeohashBBox(minLatitude, minLongitude, maxLatitude, maxLongitude)
    if geohashCode is None: return None

    # named locationMap rather than map so the builtin map() is not shadowed
    locationMap = folium.Map(location=[avgLatitude, avgLongitude], zoom_start=zoomLevel)

    # one circle per distinct coordinate, however many networks share it
    points = networks.groupby(['latitude','longitude'])
    coordinateCount = points.ngroups

    for (latitude, longitude), _ in points:
        folium.features.Circle(location=[latitude, longitude], radius=200, color='blue').add_to(locationMap)

    # shade the enclosing geohash cell (south-west and north-east corners)
    folium.features.RectangleMarker([(geohashBBox['s'], geohashBBox['w']),(geohashBBox['n'], geohashBBox['e'])],weight=0, fill_color='blue', fill_opacity=0.2).add_to(locationMap)

    # str.format also copes with a non-string name (e.g. a NaN country_name), which '+' concatenation does not
    print('geohash "{}" for location {}: {} networks at {} coordinates in {}, {}, {}'.format(geohashCode, geoname_id, networkCount, coordinateCount, cityName, subdivisionName, countryName))
    return locationMap

In [24]:
# Pamplona, Spain: network coordinates plus the enclosing geohash cell, at zoom level 11
drawmapWithGeohashBBox(3114472,11) # Pamplona

geohash "ezwg" for location 3114472: 233 networks at 6 coordinates in Pamplona, Navarre, Spain


In [25]:
# New Orleans: no enclosing geohash cell is found for this location, so the call only
# prints the "no geohash found" message and returns None (no map is displayed)
drawmapWithGeohashBBox(4335045,12) # New Orleans

no geohash found that encloses latitude 29.9049 to 30.0801, longitude -90.2057 to -89.879


In [26]:
# Munich, Germany: network coordinates plus the enclosing geohash cell, at zoom level 8
drawmapWithGeohashBBox(2867714,8) # Munich

geohash "u28" for location 2867714: 2538 networks at 67 coordinates in Munich, Bavaria, Germany


In [27]:
# Berlin, Germany: network coordinates plus the enclosing geohash cell, at zoom level 8
drawmapWithGeohashBBox(2950159,8) # Berlin

geohash "u33" for location 2950159: 5479 networks at 197 coordinates in Berlin, Land Berlin, Germany


In [28]:
# this function finds a geohash that contains the centroid of the networks in a location and plots them on a map

def drawmapWithGeohashCentroid(geoname_id,zoomLevel,precision=5):
    """Plot a location's network coordinates together with the geohash cell of their centroid.

    Reads the notebook-level `maxmindLocations` frame and
    `maxmindNetworksByLocation` GroupBy object, and prints a one-line summary.

    Parameters
    ----------
    geoname_id : identifier of the location to plot
    zoomLevel : initial zoom, passed to folium.Map as zoom_start
    precision : number of geohash characters used for the centroid cell
        (defaults to 5, the value that was previously hard-coded)

    Returns
    -------
    The folium map with one circle per distinct coordinate and a shaded
    rectangle for the geohash cell containing the mean latitude/longitude.

    Raises
    ------
    ValueError
        If geoname_id does not match exactly one row of maxmindLocations.
    """
    # look the location up once instead of filtering the whole frame once per field
    location = maxmindLocations.loc[maxmindLocations['geoname_id']==geoname_id, ['country_name','subdivision_1_name','city_name']]
    if len(location) != 1:
        raise ValueError('expected exactly one location with geoname_id ' + str(geoname_id) + ', found ' + str(len(location)))
    countryName, subdivisionName, cityName = location.iloc[0]

    networks = maxmindNetworksByLocation.get_group(geoname_id)
    networkCount = len(networks)

    avgLatitude, avgLongitude = networks[['latitude','longitude']].mean()
    # named locationMap rather than map so the builtin map() is not shadowed
    locationMap = folium.Map(location=[avgLatitude, avgLongitude], zoom_start=zoomLevel)

    # one circle per distinct coordinate, however many networks share it
    points = networks.groupby(['latitude','longitude'])
    coordinateCount = points.ngroups

    for (latitude, longitude), _ in points:
        folium.features.Circle(location=[latitude, longitude], radius=200, color='blue').add_to(locationMap)

    # shade the geohash cell that contains the centroid (south-west and north-east corners)
    geohashCode = geohash2.encode(avgLatitude, avgLongitude, precision=precision)
    geohashBBox = geohash.bbox(geohashCode)
    folium.features.RectangleMarker([(geohashBBox['s'], geohashBBox['w']),(geohashBBox['n'], geohashBBox['e'])],weight=0, fill_color='blue', fill_opacity=.33).add_to(locationMap)

    # str.format also copes with a non-string name (e.g. a NaN country_name), which '+' concatenation does not
    print('geohash "{}" for location {}: {} networks at {} coordinates in {}, {}, {}'.format(geohashCode, geoname_id, networkCount, coordinateCount, cityName, subdivisionName, countryName))
    return locationMap

In [29]:
# Pamplona, Spain: network coordinates plus the geohash cell of their centroid, at zoom level 11
drawmapWithGeohashCentroid(3114472,11) # Pamplona

geohash "ezwgd" for location 3114472: 233 networks at 6 coordinates in Pamplona, Navarre, Spain


In [30]:
# New Orleans, United States: network coordinates plus the geohash cell of their centroid, at zoom level 12
drawmapWithGeohashCentroid(4335045,12) # New Orleans

geohash "9vrfq" for location 4335045: 828 networks at 23 coordinates in New Orleans, Louisiana, United States


In [31]:
# Munich, Germany: network coordinates plus the geohash cell of their centroid, at zoom level 11
drawmapWithGeohashCentroid(2867714,11) # Munich

geohash "u281z" for location 2867714: 2538 networks at 67 coordinates in Munich, Bavaria, Germany


In [32]:
# Berlin, Germany: network coordinates plus the geohash cell of their centroid, at zoom level 10
drawmapWithGeohashCentroid(2950159,10) # Berlin

geohash "u33d8" for location 2950159: 5479 networks at 197 coordinates in Berlin, Land Berlin, Germany
