In [203]:
!pip install --user googlemaps
!pip install --user geohash2

import os
import re
import pprint
import json
import requests
from requests_file import FileAdapter
import ipaddress
import googlemaps
import geohash2

import pandas as pd
pd.set_option('max_rows', 15)

[33mYou are using pip version 9.0.1, however version 9.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 9.0.1, however version 9.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [257]:
# This table converts various values found in the 'country' field of the 'whois.ibm.com'
# data into country names that will be recognized by the Google Maps geocoding API.

countrySubstitutions = {
    'ap': '',
    'ar': 'argentina',
    'au': 'australia',
    'brunei barussalam': 'brunei',
    'br': 'brazil',
    'ca': 'canada',
    'can': 'canada',
    'ch': 'switzerland',
    'cn': 'china',
    'co': 'colombia',
    'cr': 'costa rica',
    'cz': 'czech republic',
    'de': 'denmark',
    'ec': 'ecuador',
    'gcg': '',
    'hk': 'hong kong',
    'hongkong': 'hong kong',
    'hr': 'croatia',
    'ibm 348 edward st': 'australia',
    'id': 'indonesia',
    'in': 'india',
    'jp': 'japan',
    'kr': 'korea',
    'lk': 'sri lanka',
    'mex': 'mexico',
    'mx': 'mexico',
    'my': 'malaysia',
    'newzealand': 'new zealand',
    'nz': 'new zealand',
    'pe': 'peru',
    'ph': 'philippines',
    'phillipines': 'philippines',
    'republic of serbia': 'serbia',
    'russian federation': 'russia',
    'slovak republic': 'slovakia',
    'sg': 'singapore',
    'texas': 'united states',
    'th': 'thailand',
    'tw': 'taiwan',
    'uk': 'great britain',
    'untied kingdom': 'great britain',
    'united kingdom': 'great britain',
    'united state': 'united states',
    'united states of america': 'united states',
    'unitedstates': 'united states',
    'us canada warden ave': 'canada',
    'us canada': 'canada',
    'us': 'united states',
    'usa': 'united states',
    'uy': 'uruguay',
    've': 'venezuela',
    'vn': 'vietnam',
    }

# This function iterates through the 'whois.ibm.com' data retrieved from the specified URL, 
# returning a 'dictionary' containing one entry each time it is called, until the end of the 
# data is reached, after which it will return None.

def getWhoisEntry(url):
    
    # create an HTTP session that can handle a 'file' URL and send the request
    session = requests.Session()
    session.mount('file://', FileAdapter())
    response = session.get(url, stream=True)

    # if the HTTP response does not specify and encoding, force 'UTF-8'
    if response.encoding is None: response.encoding = 'utf-8'
    
    # read the HTTP response one line at a time, looking for the entry delimiters,
    # and returning one entry, converted to a JSON object, each time the function is called,
    # until the end of the response is reached
    body = ''
    state = 'prefix'
    for line in response.iter_lines(decode_unicode=True):
        if not line: continue
        if line == '  "object" : [ {': # starting delimiter for first entry 
            state = 'body'
        elif line == '  }, {': # delimiter between entries
            if len(body)>0: yield json.loads( '{' + body + '}' )
            body = ''
        elif line == '  } ]': # ending delimiter for last entry 
            if len(body)>0:  yield json.loads( '{' + body + '}' )
            state = 'suffix'
            body = ''
        else:
            if state=='body': body += line + '\n'
            
    # return 'None' when the end of the response is reached
    return None
    
    
# This function parses a 'dictionary' containing an entry from 'whois.ibm.com' and returns 
# a simpler dictionary containing the entry's address components (that is, 'building', 'site', 
# 'state', 'country' fields, after some normalization), plus a 'subnet' field containing 
# the CIDR representation of the entry's IP address range, or else 'None' if the entry 
# does not describe a subnet in the IBM internal network.

def parseWhoisEntry(entry):
    
    # ignore entries that do not describe subnets or locations
    if not re.fullmatch(r'inetnum|inet6num|domain', entry['type']): return None
    
    # extract the entry's key and convert it into a CIDR representation of an IP address range
    primaryKey = entry['primary-key']['attribute'][0]['value']        
    if entry['type']=='inetnum': # for IPv4 subnets
        if not primaryKey.startswith('9.'): return None
        match = re.fullmatch(r'([0-9.]+) - ([0-9.]+)', primaryKey)
        if not match: raise ValueError('IPv4 entry address range is malformed', primaryKey, entry)
        ranges = [range for range in ipaddress.summarize_address_range(ipaddress.IPv4Address(match.group(1)), ipaddress.IPv4Address(match.group(2)))]
        if len(ranges)<1: raise ValueError('IPv4 entry does not specify a subnet', ranges, primaryKey, entry)
        if len(ranges)>1: raise ValueError('IPv4 entry spans more than one subnet', ranges, primaryKey, entry)
        subnet = str(ranges[0])
    elif entry['type']=='inet6num': # for IPv6 subnets
        if not primaryKey.startswith('2620:1f7:'): return None
        subnet = primaryKey
    elif entry['type']=='domain': # for locations
        if not re.fullmatch(r'[A-Z0-9]{3}'): return None

    # extract the address-related components of the entry into a simpler dictionary
    result = { 'building': '', 'site': '', 'state': '', 'country': '' }
    for attribute in entry['attributes']['attribute']:
        if attribute['name'].lower()=='country': result['country'] = attribute['value'].lower()
        if attribute['name'].lower()=='remarks':
            match = re.fullmatch(r'.+ = (building|site|state):(.+)', attribute['value'], re.I)
            if match: result[match.group(1).lower()] = match.group(2).lower()
    
    # normalize the address component values by deleting placeholders, punctuation, and pure numbers
    for key,value in result.items(): 
        if re.fullmatch(r'n/a|unknown|unassigned|enterprise|lab|reserved|free', value, re.I): value = ''
        value = re.sub(r'[,/\-(): ]+', ' ', value).strip(' ')
        if re.fullmatch(r'[0-9 ]+', value): value = ''
        result[key] = value
        
    # substitute normalized country names for certain 'country' values
    if result['country'] in countrySubstitutions: result['country'] = countrySubstitutions[result['country']]
    
    # add the 'subnet' to the result dictionary and return it
    result['subnet'] = subnet
    return result


def convertAddressToGeocode(client, address):
    
    # convert address into geocode data with client for Google Maps geocode API
    geocode = client.geocode(address)
    ###########print('>>>>>>>>>>>googlemaps.Client.geocode(' + address + ') returned:')
    #############pprint.pprint(geocode,width=150)
    
    # return 'None' if address is not found
    if geocode is None: return None
    if len(geocode)<1: return None
    if 'address_components' not in geocode[0]: return None
    
    # extract address components from geocode data into simple dictionary
    result = { component['types'][0]: component['short_name'] for component in geocode[0]['address_components'] }

    # extract location components from geocode data into result dictionary
    result['addressIBM'] = address
    result['addressGoogle'] = geocode[0]['formatted_address']
    result['latitude'] = geocode[0]['geometry']['location']['lat']
    result['longitude'] = geocode[0]['geometry']['location']['lng']
    
    # add geohash representation of latitude+longitude to result dictionary
    result['geohash'] = geohash2.encode(result['latitude'],result['longitude'], precision=6)
    
    # return result dictionary of address and location components
    ############print('---------->convertAddressToGeocode(' + address + ') returned:')
    ###############pprint.pprint(result,width=150)
    return result

def loadCacheFromFile(filename):
    if os.path.exists(filename):
        with open(filename, 'r') as file: return json.load(file)
    else:
        return {}

def storeCacheToFile(filename, cache):
    if os.path.exists(filename+'.new'): os.remove(filename+'.new')
    with open(filename+'.new', 'w') as file: json.dump(cache, file)
    if os.path.exists(filename+'.old'): os.remove(filename+'.old')
    if os.path.exists(filename): os.rename(filename, filename+'.old')
    os.rename(filename+'.new', filename)
        

In [168]:
%%script false
for entry in getWhoisEntry('file:///Users/pring/ibm.inetnum.2018-03-20.json'): 
    print(entry['type'])

In [151]:
%%script false
for entry in getWhoisEntry('file:///Users/pring/ibm.inetnum.2018-03-20.json'): 
    address = parseWhoisEntry(entry)
    if address: print(address['subnet'],' ~' + address['country'] + '~' + address['state'] + '~' + address['site'] + '~' +  address['building'] + '~')

9.0.0.0/24  ~~~nul~~
9.0.0.0/30  ~~~~~
9.0.0.0/19  ~australia~~~~
9.0.128.0/26  ~united states~co~boulder~~
9.0.128.0/23  ~~~~~
9.0.128.128/29  ~united states~co~boulder~~
9.0.128.136/29  ~united states~co~boulder~~
9.0.128.192/26  ~united states~co~boulder~~
9.0.128.64/26  ~united states~co~boulder~~
9.0.129.0/26  ~united states~co~boulder~~
9.0.130.0/26  ~united states~ny~poughkeepsie~~
9.0.130.0/23  ~~~~~
9.0.130.128/29  ~united states~ny~poughkeepsie~~
9.0.130.136/29  ~united states~ny~poughkeepsie~~
9.0.130.192/26  ~united states~ny~poughkeepsie~~
9.0.130.64/26  ~united states~ny~poughkeepsie~~
9.0.131.0/26  ~united states~ny~poughkeepsie~~
9.0.131.128/25  ~~~~~
9.0.132.0/26  ~~~~~
9.0.132.0/23  ~~~~~
9.0.132.128/26  ~~~~~
9.0.132.240/28  ~~~~~
9.0.132.64/26  ~~~~~
9.0.133.0/26  ~~~~~
9.0.133.128/26  ~~~~~
9.0.133.192/26  ~~~~~
9.0.133.64/26  ~~~~~
9.0.134.0/26  ~argentina~buenos aires~buenos aires~building a~
9.0.134.0/23  ~argentina~~ama~building a~
9.0.134.128/29  ~argentina~bu

KeyboardInterrupt: 

In [156]:
%%script false
googlemapsClient = googlemaps.Client(key='AIzaSyAXrKyHrMa98L_e_CLtdi4UnQRPjHAEcYg')
xxx = convertAddressToGeocode(googlemapsClient, 'ibm, rochester, mn, united states')


>>>>>>>>>>>googlemaps.Client.geocode(ibm, rochester, mn, united states) returned:
[{'address_components': [{'long_name': '3605', 'short_name': '3605', 'types': ['street_number']},
                         {'long_name': 'U.S. 52', 'short_name': 'US-52', 'types': ['route']},
                         {'long_name': 'Rochester', 'short_name': 'Rochester', 'types': ['locality', 'political']},
                         {'long_name': 'Olmsted County', 'short_name': 'Olmsted County', 'types': ['administrative_area_level_2', 'political']},
                         {'long_name': 'Minnesota', 'short_name': 'MN', 'types': ['administrative_area_level_1', 'political']},
                         {'long_name': 'United States', 'short_name': 'US', 'types': ['country', 'political']},
                         {'long_name': '55901', 'short_name': '55901', 'types': ['postal_code']},
                         {'long_name': '7829', 'short_name': '7829', 'types': ['postal_code_suffix']}],
  'formatted_address':

In [184]:
#whoisURL = 'http://whois.ibm.com:8080/whois/search.json?inverse-attribute=org&type-filter=inetnum&query-string=ORG-IBM1-IGA'
#whoisURL = 'http://whois.ibm.com:8080/whois/search.json?inverse-attribute=org&type-filter=inet6num&query-string=ORG-IBM1-IGA'
whoisURL = 'file:///Users/pring/ibm.inetnum.2018-03-20.json'
#whoisURL = 'file:///Users/pring/ibm.inet6num.2018-03-20.json'

googlemapsClient = googlemaps.Client(key='AIzaSyAXrKyHrMa98L_e_CLtdi4UnQRPjHAEcYg')

whoisCacheFilename = 'ibm.whois.com.geocodeCache.json'
whoisCache = loadCacheFromFile(whoisCacheFilename)

for entry in getWhoisEntry(whoisURL): 
    
    entryFields = parseWhoisEntry(entry)
    if not entryFields: continue    
    
    ibmAddress = entryFields['site'] + ', ' + entryFields['state'] + ', ' + entryFields['country']
    if len(entryFields['site'])>0: ibmAddress = 'IBM, ' + ibmAddress
    if ibmAddress in whoisCache: continue    
        
    whoisCache[ibmAddress] = convertAddressToGeocode(googlemapsClient, ibmAddress)
    storeCacheToFile(whoisCacheFilename, whoisCache)
    if whoisCache[ibmAddress]: print(ibmAddress,'-->',whoisCache[ibmAddress]['address'])
    

IBM, hillsboro, , united states --> 1385 NW Amberglen Pkwy, Hillsboro, OR 97006, USA
IBM, 250 south high street columbus oh, oh,  --> 250 S High St #400, Columbus, OH 43215, USA
IBM, columbus, oh,  --> 4499 Fisher Rd, Columbus, OH 43228, USA
, oh,  --> Ohio, USA
, ny, united states --> New York, NY, USA
IBM, daytona, florida, united states --> 655 Century Point, Lake Mary, FL 32746, USA
IBM, brookhaven ga, , united states --> 1001 Summit Blvd NE, Brookhaven, GA 30319, USA
IBM, illinois, u.s.,  --> 321 N Clark St #325, Chicago, IL 60654, USA
IBM, illinois, , united states --> 71 S Wacker Dr, Chicago, IL 60606, USA
IBM, illinois, u.s,  --> 321 N Clark St #325, Chicago, IL 60654, USA
IBM, poughkeepsie, ny,  --> 710 Development Ct, Poughkeepsie, NY 12601, USA
IBM, truven, , united states --> 7700 Old Georgetown Rd, 6th floor, Bethesda, MD 20814, United States
IBM, 3039 e cornwallis rd, nc,  --> 3039 E Cornwallis Rd, Research Triangle Park, NC 27709, USA
IBM, dallas, tx,  --> 1177 S Belt Li

In [232]:
#googleAddresses = { v['address']: 1 for k,v in whoisCache.items() if v }
#len(googleAddresses)
#sorted(googleAddresses.keys())

#xxx = pd.DataFrame( [ dict([('addressIBM',k)]+list(v.items())) for k,v in whoisCache.items() if v ] )
#xxx.columns
#xxx[['ibmAddress','address','country', 'administrative_area_level_1','locality','latitude','longitude','geohash']]

Unnamed: 0,ibmAddress,address,country,administrative_area_level_1,locality,latitude,longitude,geohash
0,", , australia",Australia,AU,,,-25.274398,133.775136,qgx0hn
1,"IBM, boulder, co, united states","6300 Diagonal Hwy, Boulder, CO 80301, USA",US,CO,Boulder,40.089317,-105.198126,9xjhn4
2,"IBM, poughkeepsie, ny, united states","710 Development Ct, Poughkeepsie, NY 12601, USA",US,NY,Poughkeepsie,41.662296,-73.938723,dr7mj2
3,"IBM, buenos aires, buenos aires, argentina","Ing Enrique Butty 275, C1001AFA CABA, Argentina",AR,CABA,,-34.596097,-58.371448,69y7pt
4,"IBM, ehningen, , germany","IBM-Allee 1, 71139 Ehningen, Germany",DE,BW,Ehningen,48.651710,8.946660,u0wkg8
5,"IBM, portsmouth, , great britain","North Harbour, Portsmouth PO6 3AU, UK",GB,England,,50.842570,-1.085737,gcp1pr
6,", , japan",Japan,JP,,,36.204824,138.252924,xn6mfn
...,...,...,...,...,...,...,...,...
1333,"IBM, brasov, ,","Patinoarul Olimpic, Strada Turnului 5, Brașov ...",RO,BV,Brașov,45.666151,25.613842,u845yc
1334,"IBM, ehningen aag, , germany","IBM-Allee 1, 71139 Ehningen, Germany",DE,BW,Ehningen,48.651710,8.946660,u0wkg8


In [258]:
xxx = [ entry for entry in getWhoisEntry('file:///Users/pring/ibm.inetnum.2018-03-20.json') ]
yyy = [ parseWhoisEntry(entry) for entry in xxx ]
zzz = [ entry for entry in yyy if entry]
set( [ entry['country'] for entry in zzz ] )

{'',
 'algeria',
 'angola',
 'apac',
 'argentina',
 'aruba',
 'asean',
 'australia',
 'austria',
 'barbados',
 'belarus',
 'belgium',
 'bratislava',
 'brazil',
 'brunei',
 'bulgaria',
 'canada',
 'chile',
 'china',
 'colombia',
 'costa rica',
 'croatia',
 'cyprus',
 'czech republic',
 'denmark',
 'ecuador',
 'egypt',
 'estonia',
 'finland',
 'france',
 'germany',
 'ghana',
 'great britain',
 'greece',
 'hong kong',
 'hungary',
 'india',
 'indonesia',
 'ireland',
 'israel',
 'italy',
 'jamaica',
 'japan',
 'kazakhstan',
 'kenya',
 'korea',
 'latvia',
 'lithuania',
 'luxembourg',
 'macau',
 'malaysia',
 'mauritius',
 'mexico',
 'morocco',
 'netherlands',
 'new zealand',
 'nigeria',
 'norway',
 'pakistan',
 'peru',
 'philippines',
 'poland',
 'portugal',
 'puerto rico',
 'romania',
 'russia',
 'saudi arabia',
 'senegal',
 'serbia',
 'singapore',
 'slovakia',
 'slovenia',
 'south africa',
 'south korea',
 'spain',
 'sri lanka',
 'sweden',
 'switzerland',
 'taiwan',
 'thailand',
 'tiawan',
