# Geocoding

This notebook retrieves the longitude and latitude of each company listed in the provided file 'Go_Factoring_Output_Perfect_Matches_20180912.XLSX'.  

### Data import

In [1]:
import os
import time

import numpy as np
import pandas as pd
from geopy.exc import GeocoderTimedOut
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

In [2]:
# Locate the original data inside the Dropbox folder of the user named by the
# USERNAME environment variable.
user = os.environ['USERNAME']
datafolder = "C:/Users/{}/Dropbox/University/MscDataScience_Birkbeck/thesis_project/data/original_data/".format(user)
file = 'Go_Factoring_Output_Perfect_Matches_20180912.XLSX'

# header=1: the column labels are taken from the second row of the sheet.
df = pd.read_excel(datafolder + file, header=1)

# Normalise the column names: trim whitespace, lower-case, replace spaces with
# underscores and drop parentheses.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
    .str.replace('(', '')
    .str.replace(')', '')
)

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(4460, 41)

In [4]:
# Preview the first five records; transposed so that every column is shown as a row.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
id,6,7,8,9,11
kunden_id,2004052,,2004078,,
k-debitoren_id,6604,3499,4139,12066,5732
debitorenname_1,1A Swiss Dienstleistungen AG,3.M.E.L. GmbH,36GRAD GmbH,3A Schreinerei,3S Partner AG
anschrift,CH-5034 Suhr; Bachstrasse 33,CH-5734 Reinach AG; Pilatusstrasse 32,CH-6039 Root D4; Platz 3,CH-2558 Aegerten; Schwadernaustrasse 63,CH-6020 Emmenbrücke; Gerliswilstrasse 19 b
straße,Bachstrasse,Pilatusstrasse 32,Platz 3,Schwadernaustrasse 63,Gerliswilstrasse
plz,5034,5734,6039,2558,6020
ort,Suhr,Reinach AG,Root D4,Aegerten,Emmenbrücke
customer_id,6,7,8,9,11
duns,480116741,486977734,480141611,486942464,481936891


In [5]:
def do_geocode(address, max_attempts=3):
    """
    Geocode a free-text address with Nominatim through a rate limiter.

    The rate-limited geocoder is created once and cached on the function, so
    the minimum delay of one second is enforced *between* successive calls
    (a RateLimiter only tracks the time of its own last call; building a new
    one per address would bypass the throttling).

    Parameters
    ----------
    address : str
        Query string sent to the geocoder.
    max_attempts : int, optional
        How many times the lookup is attempted when a GeocoderTimedOut
        escapes the rate limiter's own retries (default 3).

    Returns
    -------
    The location object returned by the geocoder, or None when nothing was
    found or every attempt timed out.
    """
    geocoder = getattr(do_geocode, '_geocoder', None)
    if geocoder is None:
        geoloc = Nominatim(user_agent='lon-lat-retrieval', timeout=12)
        # rate limiter to avoid being blocked by the service
        geocoder = RateLimiter(geoloc.geocode, min_delay_seconds=1, max_retries=50)
        do_geocode._geocoder = geocoder

    # Bounded retry loop instead of unbounded recursion.
    for _ in range(max_attempts):
        try:
            return geocoder(address)
        except GeocoderTimedOut:
            continue
    return None
    
def create_add(geoinfo):
    """
    Build a single query string from the address fragments in `geoinfo`.

    The fragments are concatenated in order, separated by one space.
    Returns None when `geoinfo` is empty.
    """
    if len(geoinfo) == 0:
        return None
    return ' '.join(geoinfo)

def geolocator_2(company, count, address, postcode, canton, country, req_limit=500, timesleep=100):
    """
    Look up the coordinates of a company from its address fields.

    A first query is built from the address plus "<country>-<postcode>" (or
    whichever of country / postcode is available). If that lookup finds
    nothing, a second query is made from "<country>-<postcode>" (only when
    both are present) plus the canton.

    Parameters
    ----------
    company : name used in the progress / warning messages only
    count : running request counter; every `req_limit`-th call (except the
        first, count == 0) pauses for `timesleep` seconds before querying
    address : street address (Postadresse)
    postcode : postal code (PLZ)
    canton : canton (Kanton Postadresse)
    country : country (Land Postadresse)
    req_limit : number of calls between two pauses
    timesleep : length of the pause in seconds

    Returns
    -------
    tuple (longitude, latitude); both are NaN when no location was found.
    """
    not_found = (np.nan, np.nan)

    # Periodic pause between batches of requests.
    if count != 0 and count % req_limit == 0:
        print("!!-- SCRIPT TEMPORARY SLEEPING FOR {} SECS TO AVOID TIMEOUT ERROR --!!".format(timesleep))
        time.sleep(timesleep)

    # ---- first query: address + country/postcode ----
    parts = []
    if pd.isnull(address):
        print('no address available for company {}'.format(company))
    else:
        parts.append(str(address))

    country_missing = pd.isnull(country)
    postcode_missing = pd.isnull(postcode)
    pcodedata = not (country_missing or postcode_missing)

    if pcodedata:
        country_postcode = str(country) + '-' + str(postcode)
        parts.append(country_postcode)
    else:
        if country_missing:
            print('no country information available for company {}'.format(company))
        else:
            parts.append(str(country))

        if postcode_missing:
            print('no postcode available for company {}'.format(company))
        else:
            parts.append(str(postcode))

    query = create_add(parts)
    if query is None:
        print('--WARNING-- Geolocation data not available for company: {}'.format(company))
        return not_found

    location = do_geocode(query)
    if location:
        return location.longitude, location.latitude

    # ---- second query: country/postcode + canton ----
    print("Making a new attempt for company {} with postcode and canton area".format(company))
    fallback_parts = [country_postcode] if pcodedata else []
    if pd.isnull(canton):
        print('no canton information available for company {}'.format(company))
    else:
        fallback_parts.append(str(canton))

    fallback_query = create_add(fallback_parts)
    if fallback_query is not None:
        location = do_geocode(fallback_query)
        if location:
            print("Successful attempt!")
            return location.longitude, location.latitude

    # Either there was nothing to query with, or the second lookup failed too.
    print("--WARNING-- Geolocation data not available for company {} with input '{}'".format(company, fallback_query))
    return not_found

In [6]:
# Smoke test: geocode the first record only (count=0, so no pause is triggered).
# The result is a (longitude, latitude) tuple.
try1=df.iloc[0]
geolocator_2(try1.debitorenname_1, 0, try1.postadresse, try1.plz, try1.kanton_postadresse, try1.land_postadresse) # the `ort` column is not passed

(8.0792366, 47.382247)

In [7]:
# Geocode every row of df, collecting longitudes and latitudes in row order.
# Progress is pickled every `checkpoint_every` rows so that a crash does not
# lose everything, and the complete result is pickled after the last row.
lon = []
lat = []
count = 0                    # number of progress checkpoints written so far
checkpoint_every = 500
last_row = len(df) - 1
for r in range(len(df)):
    row = df.iloc[r]
    # r doubles as the request counter that triggers geolocator_2's periodic pause
    lonlat = geolocator_2(row.debitorenname_1, r, row.postadresse, row.plz, row.kanton_postadresse, row.land_postadresse)
    lon.append(lonlat[0])
    lat.append(lonlat[1])
    if r % checkpoint_every == 0:
        count += 1
        dftemp = pd.DataFrame({'longitude':lon, 'latitude':lat})
        dftemp.to_pickle(datafolder+'lonlat_progress'+str(count))
    # The last index is len(df)-1 (r never equals len(df)); checked with its own
    # `if` so the final file is written even when the last index is also a
    # checkpoint row.
    if r == last_row:
        dftemp = pd.DataFrame({'longitude':lon, 'latitude':lat})
        dftemp.to_pickle(datafolder+'lonlat_final')
        

Making a new attempt for company 4B Fenster AG with postcode and canton area
Successful attempt!
Making a new attempt for company A. Kuster AG with postcode and canton area
Successful attempt!
Making a new attempt for company A1-Bau GmbH with postcode and canton area
Successful attempt!
Making a new attempt for company AC Mächler AG with postcode and canton area
Successful attempt!
Making a new attempt for company Adecco Human Resources AG with postcode and canton area
Successful attempt!
Making a new attempt for company ADEFI SA with postcode and canton area
Successful attempt!
Making a new attempt for company Adnan Görgülü Industrieservice with postcode and canton area
Successful attempt!
Making a new attempt for company Adolf Barmettler Dachdeckerei AG with postcode and canton area
Successful attempt!
Making a new attempt for company Adolf Müller GmbH with postcode and canton area
Successful attempt!
Making a new attempt for company Adriatic Group SA with postcode and canton area
Su

RateLimiter caught an error, retrying (0/50 tries). Called with (*('Bahnhofstrasse 24 CH-8240',), **{}).
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 1318, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1285, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1234, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1026, in _send_output
    self.send(msg)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 964, in send
    self.connect()
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1400, in connect
    server_hostname=server_hos

Making a new attempt for company Elektro Buck GmbH with postcode and canton area
Successful attempt!
Making a new attempt for company Elektro Christoffel with postcode and canton area
Successful attempt!
Making a new attempt for company Elektro Christoffel with postcode and canton area
Successful attempt!
Making a new attempt for company Elektro Furrer AG with postcode and canton area
Successful attempt!
Making a new attempt for company Elektro Hediger Elektroinstallationsgeschäft with postcode and canton area
Successful attempt!
Making a new attempt for company Elektro Illi AG with postcode and canton area
Successful attempt!
Making a new attempt for company Elektro Illi AG with postcode and canton area
Successful attempt!
Making a new attempt for company Elektro Jucker & Partner GmbH with postcode and canton area
Successful attempt!
Making a new attempt for company Elektro Schmid AG with postcode and canton area
Successful attempt!
Making a new attempt for company Elektro Schmid AG E

RateLimiter caught an error, retrying (0/50 tries). Called with (*('CH-8507 TG',), **{}).
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 1318, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1285, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1234, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1026, in _send_output
    self.send(msg)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 964, in send
    self.connect()
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1392, in connect
    super().connect()
  File "C:\ProgramData\

Successful attempt!
!!-- SCRIPT TEMPORARY SLEEPING FOR 100 SECS TO AVOID TIMEOUT ERROR --!!
Making a new attempt for company Huber Dach und Wand AG with postcode and canton area
Successful attempt!
Making a new attempt for company Hurni Engineering Sàrl with postcode and canton area
Successful attempt!
Making a new attempt for company Husner AG Holzbau with postcode and canton area
Successful attempt!
Making a new attempt for company HW Manufacture SA with postcode and canton area
Successful attempt!
Making a new attempt for company ICM Swiss AG with postcode and canton area
Successful attempt!
Making a new attempt for company IGKG Züri with postcode and canton area
Successful attempt!
Making a new attempt for company Illudec AG with postcode and canton area
Successful attempt!
Making a new attempt for company Imfeld Metall- und Stahlbau AG with postcode and canton area
Successful attempt!
Making a new attempt for company Immobilien Aktiengesellschaft Doso with postcode and canton area

RateLimiter caught an error, retrying (0/50 tries). Called with (*('CH-3008 BE',), **{}).
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 1318, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1285, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1234, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1026, in _send_output
    self.send(msg)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 964, in send
    self.connect()
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1392, in connect
    super().connect()
  File "C:\ProgramData\

Successful attempt!
Making a new attempt for company Laima Holding AG with postcode and canton area
Successful attempt!
Making a new attempt for company Landollina Haustechnik GmbH with postcode and canton area
Successful attempt!
Making a new attempt for company L'AQ AG with postcode and canton area
Successful attempt!
Making a new attempt for company L'AQ AG with postcode and canton area
Successful attempt!
Making a new attempt for company Lauber SA with postcode and canton area
Successful attempt!
Making a new attempt for company Lauclair AG with postcode and canton area
Successful attempt!
Making a new attempt for company le Chalet d'Adrien Le Grenier with postcode and canton area
Successful attempt!
!!-- SCRIPT TEMPORARY SLEEPING FOR 100 SECS TO AVOID TIMEOUT ERROR --!!
Making a new attempt for company Leutwyler Kühlanlagen AG with postcode and canton area
Successful attempt!
Making a new attempt for company LGT Financial Service AG with postcode and canton area
no canton informat

RateLimiter caught an error, retrying (0/50 tries). Called with (*('Chemin de la Gravière 4 CH-1227',), **{}).
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 1318, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1285, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1234, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1026, in _send_output
    self.send(msg)
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 964, in send
    self.connect()
  File "C:\ProgramData\Anaconda3\lib\http\client.py", line 1400, in connect
    server_hostname=serv

Making a new attempt for company Profi Eisenleger AG with postcode and canton area
Successful attempt!
Making a new attempt for company PRONAVAL SA with postcode and canton area
Successful attempt!
Making a new attempt for company Propac AG with postcode and canton area
Successful attempt!
Making a new attempt for company PSP Trockenbau AG with postcode and canton area
Successful attempt!
Making a new attempt for company Publicitas AG with postcode and canton area
Successful attempt!
Making a new attempt for company PX Group SA with postcode and canton area
Successful attempt!
Making a new attempt for company Qerreti Bauunternehmen with postcode and canton area
Successful attempt!
no address available for company Quadroni Linard
Making a new attempt for company R & V GmbH with postcode and canton area
Successful attempt!
Making a new attempt for company R. Betschart Holzbau AG with postcode and canton area
Successful attempt!
Making a new attempt for company R. Rageth GmbH with postcod

In [8]:
# Number of longitude values collected by the geocoding loop.
len(lon)

4460

In [9]:
# Number of rows in df, for comparison with the number of collected coordinates.
len(df)

4460

In [10]:
# Attach the collected coordinates as new columns; the lists were filled in
# row order, so they line up with df positionally.
df['longitude']=lon
df['latitude']=lat

In [11]:
# Preview the frame with the new longitude / latitude columns.
# NOTE(review): in the saved output the third row (36GRAD GmbH, CH-6039 Root D4)
# has longitude 174.89 / latitude -37.01, which is not in Switzerland — the
# geocoder results are not validated and should be checked before use.
df.head()

Unnamed: 0,id,kunden_id,k-debitoren_id,debitorenname_1,anschrift,straße,plz,ort,customer_id,duns,...,uid,regn_nbr_vat,rechtsform,duns_hauptsitz,name_hauptsitz,ort_hauptsitz,land_hauptsitz,aktivitätsstatus,longitude,latitude
0,6,2004052.0,6604,1A Swiss Dienstleistungen AG,CH-5034 Suhr; Bachstrasse 33,Bachstrasse,5034,Suhr,6,480116741,...,CHE-315.855.880,,Aktiengesellschaft,,,,,aktiv,8.079237,47.382247
1,7,,3499,3.M.E.L. GmbH,CH-5734 Reinach AG; Pilatusstrasse 32,Pilatusstrasse 32,5734,Reinach AG,7,486977734,...,CHE-258.353.008,,Gesellschaft mit beschränkter Haftung,,,,,gelöscht,8.176528,47.253798
2,8,2004078.0,4139,36GRAD GmbH,CH-6039 Root D4; Platz 3,Platz 3,6039,Root D4,8,480141611,...,CHE-447.812.384,,Gesellschaft mit beschränkter Haftung,,,,,aktiv,174.892322,-37.014441
3,9,,12066,3A Schreinerei,CH-2558 Aegerten; Schwadernaustrasse 63,Schwadernaustrasse 63,2558,Aegerten,9,486942464,...,CHE-208.196.936,,Aktiengesellschaft,,,,,aktiv,7.296306,47.12477
4,11,,5732,3S Partner AG,CH-6020 Emmenbrücke; Gerliswilstrasse 19 b,Gerliswilstrasse,6020,Emmenbrücke,11,481936891,...,CHE-109.644.693,CHE-109.644.693 MWST,Aktiengesellschaft,,,,,aktiv,8.281529,47.069267


In [12]:
# Output folder: the project's data directory (parent of original_data) for the same user.
datafolder2 = "C:/Users/{}/Dropbox/University/MscDataScience_Birkbeck/thesis_project/data/".format(user)

In [13]:
# Persist the frame as a pickle named 'companydata_2' in the output folder.
df.to_pickle(datafolder2+'companydata_2')