# Geocoding

This notebook retrieves the longitude and latitude of each company listed in the provided file 'Go_Factoring_Output_Perfect_Matches_20180912.XLSX'.  

### Data import

In [3]:
import os
import time

import numpy as np
import pandas as pd

from geopy.exc import GeocoderTimedOut
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

In [4]:
# Build the path to the source workbook from the current Windows user name.
user = os.environ['USERNAME']
datafolder = "C:/Users/{}/Dropbox/University/MscDataScience_Birkbeck/thesis_project/data/original_data/".format(user)
file = 'Go_Factoring_Output_Perfect_Matches_20180912.XLSX'

# header=1: take the column names from the second row of the sheet.
df = pd.read_excel(datafolder+file, header=1)

# Normalise column names to lower snake_case without parentheses.
# regex=False makes every pattern a literal string: '(' and ')' are not valid
# regular expressions, and the default of `regex` differs between pandas versions.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [5]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(4460, 41)

In [6]:
# Preview the first five records, transposed so each column name becomes a row.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
id,6,7,8,9,11
kunden_id,2004052,,2004078,,
k-debitoren_id,6604,3499,4139,12066,5732
debitorenname_1,1A Swiss Dienstleistungen AG,3.M.E.L. GmbH,36GRAD GmbH,3A Schreinerei,3S Partner AG
anschrift,CH-5034 Suhr; Bachstrasse 33,CH-5734 Reinach AG; Pilatusstrasse 32,CH-6039 Root D4; Platz 3,CH-2558 Aegerten; Schwadernaustrasse 63,CH-6020 Emmenbrücke; Gerliswilstrasse 19 b
straße,Bachstrasse,Pilatusstrasse 32,Platz 3,Schwadernaustrasse 63,Gerliswilstrasse
plz,5034,5734,6039,2558,6020
ort,Suhr,Reinach AG,Root D4,Aegerten,Emmenbrücke
customer_id,6,7,8,9,11
duns,480116741,486977734,480141611,486942464,481936891


In [50]:
# Shared rate-limited geocoder, created on first use.
# RateLimiter tracks the time of its last call per instance, so the delay between
# requests is only enforced when the SAME instance is reused across calls.
_GEOCODER = None


def _get_geocoder():
    """Return the shared rate-limited Nominatim geocode callable, creating it once."""
    global _GEOCODER
    if _GEOCODER is None:
        geoloc = Nominatim(user_agent='lon-lat-retrieval', timeout=25)
        # rate limiter to avoid being blocked
        _GEOCODER = RateLimiter(geoloc.geocode, min_delay_seconds=1, max_retries=50)
    return _GEOCODER


def do_geocode(address, max_attempts=3):
    """
    Geocode a free-text address with Nominatim.

    Parameters
    ----------
    address : str
        Query string sent to the geocoder.
    max_attempts : int, optional
        How many times to retry when a timeout escapes the rate limiter's own
        retry logic. Bounded so a persistent outage cannot recurse forever.

    Returns
    -------
    The location object returned by the geocoder, or None when nothing was
    found or every attempt timed out.
    """
    geocoder = _get_geocoder()
    for _ in range(max_attempts):
        try:
            return geocoder(address)
        except GeocoderTimedOut:
            # Timed out despite the limiter's retries: try again (bounded).
            continue
    return None
    
def create_add(geoinfo):
    """
    Join the address fragments in `geoinfo` into one space-separated query string.

    Returns None when `geoinfo` is empty, otherwise the fragments in their
    original order separated by single spaces.
    """
    if len(geoinfo) == 0:
        return None
    return ' '.join(geoinfo)

def geolocator_2(company, count, full_address, city, address, postcode, canton, country, req_limit=500, timesleep=250):
    """
    Look up the coordinates of a company, trying progressively coarser queries.

    Attempts, in order (the first one that yields a location wins):
      1. first whitespace-separated token of `full_address` (after removing ';')
         followed by `city`;
      2. `address` plus "<country>-<postcode>" (or whichever of the two is present);
      3. "<country>-<postcode>" (only when both are present) plus `canton`.

    Parameters
    ----------
    company : used only in the log messages.
    count : int
        Running request index; the function sleeps for `timesleep` seconds
        whenever `count` is a multiple of `req_limit` (and at least req_limit-1).
    full_address, city, address, postcode, canton, country :
        Address fields; any of them may be null (NaN/None) and is then skipped.
    req_limit : int
        Number of requests between two pauses.
    timesleep : int or float
        Length of the pause in seconds.

    Returns
    -------
    tuple
        (longitude, latitude) -- note the order. Both are np.nan when no
        attempt produced a location.
    """
    #TIMER
    if count>=req_limit-1 and count%req_limit==0:
        print("!!-- SCRIPT SLEEPING FOR {} SECS TO AVOID TIMEOUT ERROR --!!".format(timesleep))
        time.sleep(timesleep)

    #CASE WITH FULL ADDRESS
    # Skipped entirely when the full address is null instead of crashing on .replace().
    if not pd.isnull(full_address):
        full_address = str(full_address).replace(';','')
        query_parts = [full_address.split(' ')[0]]
        if not pd.isnull(city):
            query_parts.append(str(city))
        # Join with a space so the leading token and the city stay separate words.
        location = do_geocode(' '.join(query_parts))
        if location:
            print("Successful with full address {}!".format(full_address))
            return location.longitude, location.latitude

    print('Making a new attempt using address fields')
    geoinfo = []

    #ADDRESS
    if pd.isnull(address):
        print('no address available for company {}'.format(company))
    else:
        geoinfo.append(str(address))

    #POSTCODE
    pcodedata = not pd.isnull(country) and not pd.isnull(postcode)
    if pcodedata:
        geoinfo.append(str(country)+'-'+str(postcode))
    else:
        if pd.isnull(country):
            print('no country information available for company {}'.format(company))
        else:
            geoinfo.append(str(country))

        if pd.isnull(postcode):
            print('no postcode available for company {}'.format(company))
        else:
            geoinfo.append(str(postcode))

    ad = create_add(geoinfo)
    if ad is None:
        # Nothing at all to query with.
        print('--WARNING-- Geolocation data not available for company: {}'.format(company))
        return np.nan, np.nan

    location = do_geocode(ad)
    if location:
        return location.longitude, location.latitude

    #POSTCODE + CANTON
    geoinfo = []
    print("Making a new attempt for company {} with postcode and canton area".format(company))
    if pcodedata:
        geoinfo.append(str(country)+'-'+str(postcode))
    if pd.isnull(canton):
        print('no canton information available for company {}'.format(company))
    else:
        geoinfo.append(str(canton))

    ad = create_add(geoinfo)
    if ad is not None:
        location = do_geocode(ad)
        if location:
            print("Successful attempt!")
            return location.longitude, location.latitude

    print("--WARNING-- Geolocation data not available for company {} with input '{}'".format(company, ad))
    return np.nan, np.nan

In [51]:
#try
# Smoke test on a single record (positional row 66) before running the full batch.
# NOTE(review): the recorded result below (lon ~ 121.3, lat ~ 14.1) came from the
# postcode + canton fallback -- confirm it is plausible for this company.
try1=df.iloc[66]
geolocator_2(try1.debitorenname_1, 0, try1.anschrift, try1.ort, try1.postadresse, try1.plz, try1.kanton_postadresse, try1.land_postadresse)

Making a new attempt using address fields
Making a new attempt for company ADEFI SA with postcode and canton area
Successful attempt!


(121.325817199745, 14.07279125)

In [None]:
#try2
try1=df.iloc[0]
geolocator_2(try1.debitorenname_1, 0, try1.anschrift, try1.postadresse, try1.plz, try1.kanton_postadresse, try1.land_postadresse)

In [13]:
# Geocode every record in order, checkpointing partial results every 500 rows
# so a crash during the long run does not lose everything.
lon = []
lat = []
count = 0
for r in range(len(df)):
    row = df.iloc[r]
    # The row position doubles as the request counter that drives the throttle
    # inside geolocator_2. `row.ort` is the city argument the function requires.
    lonlat = geolocator_2(row.debitorenname_1, r, row.anschrift, row.ort, row.postadresse, row.plz, row.kanton_postadresse, row.land_postadresse)
    lon.append(lonlat[0])
    lat.append(lonlat[1])
    if r%500==0:
        count+=1
        dftemp = pd.DataFrame({'longitude':lon, 'latitude':lat})
        dftemp.to_pickle(datafolder+'lonlat_progress'+str(count))

# Write the complete result once the loop has finished. Inside the loop the
# index never reaches len(df), so a check for that value there can never fire.
dftemp = pd.DataFrame({'longitude':lon, 'latitude':lat})
dftemp.to_pickle(datafolder+'lonlat_final')
        

!!-- SCRIPT TEMPORARY SLEEPING FOR 250 SECS TO AVOID TIMEOUT ERROR --!!
!!-- SCRIPT TEMPORARY SLEEPING FOR 250 SECS TO AVOID TIMEOUT ERROR --!!
!!-- SCRIPT TEMPORARY SLEEPING FOR 250 SECS TO AVOID TIMEOUT ERROR --!!
!!-- SCRIPT TEMPORARY SLEEPING FOR 250 SECS TO AVOID TIMEOUT ERROR --!!
!!-- SCRIPT TEMPORARY SLEEPING FOR 250 SECS TO AVOID TIMEOUT ERROR --!!
!!-- SCRIPT TEMPORARY SLEEPING FOR 250 SECS TO AVOID TIMEOUT ERROR --!!
!!-- SCRIPT TEMPORARY SLEEPING FOR 250 SECS TO AVOID TIMEOUT ERROR --!!
!!-- SCRIPT TEMPORARY SLEEPING FOR 250 SECS TO AVOID TIMEOUT ERROR --!!


In [14]:
# Number of longitude values collected; should equal the number of rows in df.
len(lon)

4460

In [15]:
# Row count of the source frame, for comparison with len(lon).
len(df)

4460

In [16]:
# Attach the coordinates as new columns. The lists were filled while iterating
# df in positional order, so element i belongs to row i.
df['longitude']=lon
df['latitude']=lat

In [17]:
# Preview the frame with the new longitude / latitude columns.
df.head()

Unnamed: 0,id,kunden_id,k-debitoren_id,debitorenname_1,anschrift,straße,plz,ort,customer_id,duns,...,uid,regn_nbr_vat,rechtsform,duns_hauptsitz,name_hauptsitz,ort_hauptsitz,land_hauptsitz,aktivitätsstatus,longitude,latitude
0,6,2004052.0,6604,1A Swiss Dienstleistungen AG,CH-5034 Suhr; Bachstrasse 33,Bachstrasse,5034,Suhr,6,480116741,...,CHE-315.855.880,,Aktiengesellschaft,,,,,aktiv,8.070805,47.381473
1,7,,3499,3.M.E.L. GmbH,CH-5734 Reinach AG; Pilatusstrasse 32,Pilatusstrasse 32,5734,Reinach AG,7,486977734,...,CHE-258.353.008,,Gesellschaft mit beschränkter Haftung,,,,,gelöscht,8.181837,47.256605
2,8,2004078.0,4139,36GRAD GmbH,CH-6039 Root D4; Platz 3,Platz 3,6039,Root D4,8,480141611,...,CHE-447.812.384,,Gesellschaft mit beschränkter Haftung,,,,,aktiv,8.374256,47.103368
3,9,,12066,3A Schreinerei,CH-2558 Aegerten; Schwadernaustrasse 63,Schwadernaustrasse 63,2558,Aegerten,9,486942464,...,CHE-208.196.936,,Aktiengesellschaft,,,,,aktiv,7.289232,47.120476
4,11,,5732,3S Partner AG,CH-6020 Emmenbrücke; Gerliswilstrasse 19 b,Gerliswilstrasse,6020,Emmenbrücke,11,481936891,...,CHE-109.644.693,CHE-109.644.693 MWST,Aktiengesellschaft,,,,,aktiv,8.258342,47.092793


In [18]:
# Output folder: the project's data directory, one level above original_data.
datafolder2 = "C:/Users/{}/Dropbox/University/MscDataScience_Birkbeck/thesis_project/data/".format(user)

In [19]:
# Persist the enriched frame (original columns plus longitude / latitude) as a pickle.
df.to_pickle(datafolder2+'company_info_2.pkl')