In [10]:
import pandas as pd
import numpy as np
import requests
import gmaps
import re
import json
import geopy.distance
gmaps.configure(api_key="AIzaSyBHJpGnDIRbKDfWkgCQ7wosmrKbsTsNPz8") #API key
gplaces_api_key = 'AIzaSyDo4iMKKDIHujRDMXzxkr5P2wXCAB2dx5w'

In [11]:
# allow collapsible json objects in jupyter notebook
import uuid
from IPython.display import display_javascript, display_html, display
import json

class RenderJSON(object):
    def __init__(self, json_data):
        if isinstance(json_data, dict):
            self.json_str = json.dumps(json_data)
        else:
            self.json_str = json_data
        self.uuid = str(uuid.uuid4())

    def _ipython_display_(self):
        display_html('<div id="{}" style="height: 600px; width:100%;"></div>'.format(self.uuid), raw=True)
        display_javascript("""
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
        document.getElementById('%s').appendChild(renderjson(%s))
        });
        """ % (self.uuid, self.json_str), raw=True)

In [12]:
def latlng(postcode):
    '''
    convert postcode to a latitude and longitute tuple
    '''
    html = requests.get("http://api.postcodes.io/postcodes/" + str(postcode))
    lat = html.json()['result']['latitude']
    lng = html.json()['result']['longitude']
    return lat,lng

In [13]:
def distance(lat1, lon1, lat2, lon2):
    '''
    Calculate distance (in km) between two sets of latitude and longitude coordinates
    Uses geopy package
    '''
    x = lat1, lon1
    y = lat2, lon2
    return geopy.distance.vincenty(x, y).km

In [14]:
def standardise(company_name):
    stopwords = ['limited','ltd','ltd.','lp']
    querywords = str(company_name).split()
    resultwords  = [word for word in querywords if word.lower() not in stopwords]
    result = ' '.join(resultwords)
    result_no_symbols = re.sub(r'[^\w]', '', result) # removes symbols
    return result_no_symbols.lower()

In [15]:
def nearby_search(lat, lon, keyword):
    '''
    input: latitude and longitude (floats), api_key for Google Places (string)
    output: location_id of place closest to co-ordinates and that matches the keyword
    if the company is not found, then a message will be given
    '''
    
    # HTML wrapped for arguments
    html = requests.get('https://maps.googleapis.com/maps/api/place/nearbysearch/json?location=' + \
                        str(lat) + ',' + str(lon)  + '&' + 'keyword=' + keyword + '&' + \
                        'rankby=distance' + '&' + 'key=' + str(gplaces_api_key))
        
    # Error message if location not found
    try: 
        place_id = html.json()['results'][0]['place_id']
    except:
        return 'Location not found'
    
    # Check how close the search result is, reject if more than 1km distance
    
    # Coordinates from Google Places
    lat_g = html.json()['results'][0]['geometry']['location']['lat']
    lon_g = html.json()['results'][0]['geometry']['location']['lng']

    if distance(lat, lon, lat_g, lon_g) > 10: # more than 10km away
        return 'Location too far'
    
    return html.json()
    # return place_id

In [16]:
def place_search(place_id):
    '''
    input: place_id from Nearby Search API (string)
    output: company website (if found)
    If a website is not found, then an error message will be given
    '''
    
    # exception in case place_id doesn't yield results in Place Search
    try:
        html = requests.get('https://maps.googleapis.com/maps/api/place/details/json?' + \
                    'placeid=' + place_id + '&' + 'key=' + str(gplaces_api_key))
    except:
        return 'No results in Place Search'
    
    # exception if website is not found
    try:
        website = html.json()['result']['website']
    except:
        return 'Website not found'
    
    return html.json()
    # return website

In [17]:
def return_website(company_name,postcode):
    '''
    Input: company name and postcode (both strings)
    Output: Company website
    '''
    
    # Step 1: Convert postcodes to latitude and longitude using postcodes.io
    # Some companies may give invalid postcodes in Companies House so build exception
    
    try: 
        html = requests.get("http://api.postcodes.io/postcodes/" + postcode)
        lat = html.json()['result']['latitude']
        lon = html.json()['result']['longitude']
    except:
        return 'Postcode not found'
    
    # Step 2: Get place_id by calling upon a Nearby Search (see Section 3.2)
    place_id = nearby_search(lat,lon,company_name,YOUR_API_KEY)
    
    # Step 3: Return website by feeding place_id into Place Search
    website = place_search(place_id)
    
    return website    

In [18]:
# Layout settings for Google Maps
figure_layout = {
    'width': '1000px',
    'height': '1000px',
    'border': '1px solid black',
    'padding': '1px'
}

The first step is to import the Free Company Data Product, a downloadable data snapshot containing basic company data of live companies on the companies house register.

In [20]:
# import companies house data, use crn as index
ch = pd.read_csv('ch_2018-02.csv', index_col= 1)

In [28]:
# preview a sample of companies
ch.sample(5)

Unnamed: 0_level_0,name,address1,address2,postTown,county,country,postcode,category,status,origin,accounts_lastMadeUpDate,accountCategory,returns_lastMadeUpDate,sic1,sic2,sic3,sic4
crn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
8782261,MAXMAX LIMITED,29 WATERLOO RD,,WOLVERHAMPTON,,,WV1 4DJ,Private Limited Company,Active,United Kingdom,30/11/2017,DORMANT,19/11/2015,70100 - Activities of head offices,,,
9835348,SMYTHSON ROSE ENTERTAINMENT LTD,7 REDBRIDGE LANE EAST,,ILFORD,,ENGLAND,IG4 5ET,Private Limited Company,Active,United Kingdom,31/10/2016,TOTAL EXEMPTION SMALL,,56302 - Public houses and bars,,,
9017127,NYC NETWORKS LIMITED,INGLES MANOR,CASTLE HILL AVENUE,FOLKESTONE,,UNITED KINGDOM,CT20 2RD,Private Limited Company,Active,United Kingdom,30/04/2017,DORMANT,01/04/2016,60200 - Television programming and broadcastin...,,,
5812515,SALISBURY TRUST FOR THE HOMELESS LTD,148 FISHERTON STREET,,SALISBURY,WILTSHIRE,,SP2 7QW,"PRI/LTD BY GUAR/NSC (Private, limited by guara...",Active,United Kingdom,31/03/2017,SMALL,10/05/2016,68209 - Other letting and operating of own or ...,,,
8244467,LONDON INTERNATIONAL FILMMAKER FESTIVAL LTD,56 ADDINGTON STREET,56 ADDINGTON STREET,MARGATE,KENT,,CT9 1QS,Private Limited Company,Active,United Kingdom,31/10/2016,TOTAL EXEMPTION SMALL,08/10/2015,60200 - Television programming and broadcastin...,,,


Let's walk through a Google Places search with **Harrods** as an example. This is to higlight the importance of standardising to avoid picking up similiarly named businesses...

In [21]:
# Do the following:
# 1) Get the row of data relating to Harrods' CRN, 
# 2) pick out columns of interest, get coordinates from postcode
harrods = ch.loc['00030209'][['name','address1','address2','postcode','sic1']]

# Derive coordinates from the postcode on Companies House
harrods['lat'] = latlng(harrods['postcode'])[0]
harrods['lng'] = latlng(harrods['postcode'])[1]
harrods

name                                          HARRODS LIMITED
address1                                 87/135 BROMPTON ROAD
address2                                               LONDON
postcode                                             SW1X 7XL
sic1        47190 - Other retail sale in non-specialised s...
lat                                                   51.4994
lng                                                 -0.163234
Name: 00030209, dtype: object

In [22]:
# generate map of coordinates  
fig = gmaps.figure(
    center = (harrods['lat'],harrods['lng']), # centred on coordinates for Harrods 
    zoom_level = 17, # set zoom level
    layout = figure_layout)
fig

We now feed these details in the first of the APIs that are used: the **Nearby Search**. 

As the name implies, this API allows you to search for places within a specified area, but doesn't provide a lot of detail on the places themselves.

In [30]:
harrods_search = nearby_search(
                     harrods['lat'], harrods['lng'],
                     standardise('HARRODS LIMITED')) # reduces this to 'harrods'

RenderJSON(harrods_search)

Once we're confident that Nearby Search yields the correct business, we can feed it's place_id into the **Place Search** which has all the juicy details...

In [31]:
harrods_detail = place_search("ChIJ_zAq7TgFdkgRQg1OSw1j7hU")
RenderJSON(harrods_detail)

In [32]:
# Capturing the website
harrods_detail['result']['website']

'http://www.harrods.com/'