# Geocoding

This script retrieves longitude and latitude data for the company contained in the file 'Go_Factoring_Output_Perfect_Matches_20180912.XLSX' provided.  

### Data import

In [1]:
import pandas as pd
import numpy as np
import time

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

import os

In [2]:
user = os.environ['USERNAME']
datafolder = "C:/Users/{}/Dropbox/University/MscDataScience_Birkbeck/thesis_project/data/original_data/".format(user)
file = 'Go_Factoring_Output_Perfect_Matches_20180912.XLSX'
df = pd.read_excel(datafolder+file, header=1)
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

In [3]:
df.shape

(4460, 41)

In [4]:
df.head().transpose()

Unnamed: 0,0,1,2,3,4
id,6,7,8,9,11
kunden_id,2004052,,2004078,,
k-debitoren_id,6604,3499,4139,12066,5732
debitorenname_1,1A Swiss Dienstleistungen AG,3.M.E.L. GmbH,36GRAD GmbH,3A Schreinerei,3S Partner AG
anschrift,CH-5034 Suhr; Bachstrasse 33,CH-5734 Reinach AG; Pilatusstrasse 32,CH-6039 Root D4; Platz 3,CH-2558 Aegerten; Schwadernaustrasse 63,CH-6020 Emmenbrücke; Gerliswilstrasse 19 b
straße,Bachstrasse,Pilatusstrasse 32,Platz 3,Schwadernaustrasse 63,Gerliswilstrasse
plz,5034,5734,6039,2558,6020
ort,Suhr,Reinach AG,Root D4,Aegerten,Emmenbrücke
customer_id,6,7,8,9,11
duns,480116741,486977734,480141611,486942464,481936891


In [8]:
def do_geocode(address):
    geoloc = Nominatim(user_agent='lon-lat-retrieval', timeout=12)
    geocoder = RateLimiter(geoloc.geocode, min_delay_seconds=1, max_retries=2) #rate limiter to avoid being blocked
    try:
        location = geocoder(address)
        return location
    except GeocoderTimedOut:
        return do_geocode(address)

def geolocator_2(company, count, address, postcode, location, canton, country, req_limit=500):
    """
    This function, given the following strings representing the address, will return latitude and longitude, if available.
    NaN otherwise.
    - Postadresse
    - PLZ
    - Ort Postadresse
    - Kanton Postadresse
    - Land Postadresse
    """
    
    if count!=0 and count%req_limit==0:
        print("!!-- SCRIPT TEMPORARY SLEEPING TO AVOID TIMEOUT ERROR --!!")
        time.sleep(50)
    
    geoinfo = []
    
    if pd.isnull(address):
        print('no address available for company {}'.format(company))
        addresse=''
    else:
        geoinfo.append(str(address))
    
    if not pd.isnull(country) and not pd.isnull(postcode):
        geoinfo.append(str(country)+'-'+str(postcode))
    
    else:
        if pd.isnull(country):
            print('no country information available for company {}'.format(company))
            country=''
        else:
            geoinfo.append(str(country))

        if pd.isnull(postcode):
            print('no postcode available for company {}'.format(company))
            postcode=''
        else:
            geoinfo.append(str(postcode))
        
    #if pd.isnull(location):
    #    print('no location available for company {}'.format(company))
    #    location=''
    #else:
    #    geoinfo.append(str(location))
    
    #if pd.isnull(canton):
    #    print('no canton information available for company {}'.format(company))
    #    canton=''
    #else:
    #    geoinfo.append(str(canton))
    
    if len(geoinfo)>0:
        ad = ''
        for s in geoinfo:
            ad+=' '
            ad+=s
        location = do_geocode(ad)
        
        if location:
            lat = location.latitude
            lon = location.longitude
            return lon, lat
        else:
            print("--WARNING-- Geolocation data not available for company {} with input '{}'".format(company, ad))
            lat = np.nan
            lon = np.nan
            return lon, lat
    else:
        print('--WARNING-- Geolocation data not available for company: {}'.format(company))
        lat = np.nan
        lon = np.nan
        return lon, lat
    
 #   if len(address)<3:
 #       print('--WARNING-- Geolocation data not available for address: {}'.format(address))
 #       lat = np.nan
 #       lon = np.nan
 #       return lon, lat
 #       
 #   
 #   
 #   else:
 #       #make a second attempt using only the entire address
 #       print('Making a second attempt with the entire address {}'.format(address))
 #       location = geocoder(address)
 #       
 #       if location:
 #           lat = location.latitude
 #           lon = location.longitude
 #           return lon, lat
 #           
 #       else:
 #           print
 #           
 #           print('--WARNING-- Geolocation data not available for address: {}'.format(address))
  #          lat = np.nan
  ##          lon = np.nan
  #          return lon, lat
    
    

In [9]:
#try
try1=df.iloc[0]
geolocator_2(try1.debitorenname_1, 0, try1.postadresse, try1.plz, try1.ort, try1.kanton_postadresse, try1.land_postadresse)

(8.0792366, 47.382247)

In [None]:
lon = []
lat = []
for r in range(len(df)):
    row = df.iloc[r]
    lonlat = geolocator_2(row.debitorenname_1, r, row.postadresse, row.plz, row.ort, row.kanton_postadresse, row.land_postadresse)
    lon.append(lonlat[0])
    lat.append(lonlat[1])



### Saving longitude and latitude data into dataframes

Geopy is used to retrieve latitude and longitude information for the companies of the given dataset. In order to avoid exceeding requests number, the dataset is split in 'sub-dataset'.

In [8]:
datafolder2 = "C:/Users/{}/Dropbox/University/MscDataScience_Birkbeck/thesis_project/data/".format(user)

In [9]:
#lonlat_0 = dfs[0].Anschrift.apply(geolocator)
#lonlat_0.to_pickle(datafolder2+'lonlat_0.pkl')
#done

In [10]:
#lonlat_1 = dfs[1].Anschrift.apply(geolocator)
#lonlat_1.to_pickle(datafolder2+'lonlat_1.pkl')
#done

In [11]:
#lonlat_2 = dfs[2].Anschrift.apply(geolocator)
#lonlat_2.to_pickle(datafolder2+'lonlat_2.pkl')
#done

In [12]:
#lonlat_3 = dfs[3].Anschrift.apply(geolocator)
#lonlat_3.to_pickle(datafolder2+'lonlat_3.pkl')
#done

In [13]:
#lonlat_4 = dfs[4].Anschrift.apply(geolocator)
#lonlat_4.to_pickle(datafolder2+'lonlat_4.pkl')
#done

Making a second attempt with the entire address GB-WD31JE Hertfordshire; Church street Rickmansworth
Making a second attempt with the entire address GB-BA134JT Westbury; West wilts Trading Estation, Station Road
Making a second attempt with the entire address GB-WD31JE Hertfordshire; Jubilee Trade Centre, Jubilee Rd, Letchworth
Making a second attempt with the entire address GB-GU32DX GUILFORD; SURREY; HENLEY BUSINESS PARK; PIRBRIGHT ROAD; NORMANDY


In [13]:
#lonlat_5 = dfs[5].Anschrift.apply(geolocator)
#lonlat_5.to_pickle(datafolder2+'lonlat_5.pkl')
#done

Making a second attempt with the entire address PT-4410083 VILA NOVA DE GAIA; RUA CAMINHO DO SENHOR 938


In [15]:
#lonlat_6 = dfs[6].Anschrift.apply(geolocator)
#lonlat_6.to_pickle(datafolder2+'lonlat_6.pkl')
#done

In [10]:
#lonlat_7 = dfs[7].Anschrift.apply(geolocator)
#lonlat_7.to_pickle(datafolder2+'lonlat_7.pkl')
#done

In [11]:
#lonlat_8 = dfs[8].Anschrift.apply(geolocator)
#lonlat_8.to_pickle(datafolder2+'lonlat_8.pkl')
#done

Making a second attempt with the entire address - ;


In [12]:
#lonlat_9 = dfs[9].Anschrift.apply(geolocator)
#lonlat_9.to_pickle(datafolder2+'lonlat_9.pkl')
#done

Making a second attempt with the entire address - ;


### Adding lon lat data to sub-dfs

In [15]:
for n in range(10):
    load = pd.read_pickle(datafolder2+'lonlat_'+str(n)+'.pkl')
    dfs[n]['longitude'] = load.apply(lambda x:x[0])
    dfs[n]['latitude'] = load.apply(lambda x:x[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a Da

### Merging dfs in a single df

In [16]:
df_output = pd.concat(dfs)
df_output.head()

Unnamed: 0,Kunden Id,K-Debitoren Id,Debitoren Id,Debitorenname 1,DUNS,UID,Namenszusatz,Anschrift,Währung,Genehmigtes Limit,Verfügbares Limit,Mahnsperre,Mahnsperre bis,Sperre Debitor,Eigene Factoringgebühren,Zahlungsbedingung,Limitgrundlage,Telefon 1,longitude,latitude
0,2004025,0,9999999999,CPD-Debitor,,,,CH-1201 Genève; Quai du Seujet 12,Schweizer Franken,0,0,keine,,nicht gesperrt,Nein,30 net,kein Limit,,6.145596,46.209789
1,che2004009,0,9999999999,CPD-Debitor,,,,CH-8872 Weesen; Hauptstrasse 12,Schweizer Franken,0,0,keine,,nicht gesperrt,Nein,90 Net,kein Limit,,9.111328,47.13756
2,2004056,0,9999999999,CPD-Debitor,,,,CH-8001 Zürich; Weinbergstrasse 22,Schweizer Franken,0,0,keine,,nicht gesperrt,Nein,10 Net,kein Limit,,8.541829,47.372767
3,2004071,0,9999999999,CPD-Debitor,,,,CH-8001 Zürich; Weinbergstrasse 22,Schweizer Franken,0,0,keine,,nicht gesperrt,Nein,30 Tage netto,kein Limit,,8.541829,47.372767
4,2004037,0,9999999999,CPD-Debitor,,,,CH-8001 Zürich; Weinbergstrasse 22,Schweizer Franken,0,0,keine,,nicht gesperrt,Nein,10 Net,kein Limit,,8.541829,47.372767


In [20]:
df_output[df_output.longitude.isnull()]

Unnamed: 0,Kunden Id,K-Debitoren Id,Debitoren Id,Debitorenname 1,DUNS,UID,Namenszusatz,Anschrift,Währung,Genehmigtes Limit,Verfügbares Limit,Mahnsperre,Mahnsperre bis,Sperre Debitor,Eigene Factoringgebühren,Zahlungsbedingung,Limitgrundlage,Telefon 1,longitude,latitude
123,che2004001,1831,1010008,ALTRAN Technologies SA,,,,"FR-78457 Velizy-Villacoublay; 2, Rue Paul Daut...",Schweizer Franken,000,000,keine,,nicht gesperrt,Nein,45 Net,kein Limit,,,
125,che2004001,1833,1010006,Alstom Power Systems SA,482495202.0,,,FR-90041 Belfort Cedex; 3 Avenue des Trois Che...,Schweizer Franken,000,000,keine,,nicht gesperrt,Nein,45 Net,kein Limit,,,
129,che2004001,1838,1010020,Akka Ingenerie Process SAS,487989840.0,,,FR-69258 LYON CEDEX 09; 21 RUE ANTONIN LABORDE,Schweizer Franken,000,000,keine,,nicht gesperrt,Nein,45 Net,kein Limit,,,
141,che2004001,1845,1180006,Siniat SA Pole Tech Agroparc Mft,,,,FR-84915 Avignon Cedex 9; 500 Rue Marcel Demon...,Schweizer Franken,000,000,keine,,nicht gesperrt,Nein,45 Net,kein Limit,,,
143,che2004001,1847,1180008,Spie Est SAS Geispolsheim Gare,486469567.0,,,FR-67411 Illkirch Cedex; 2 Route de Lingolshei...,Schweizer Franken,000,000,keine,,nicht gesperrt,Nein,45 Net,kein Limit,,,
301,che2004012,1972,50118,Derschlag Foliendruck GmbH & Co. KG,482342961.0,,,D-85813 Lüdenscheid; Werdohler Landstrasse 352,Schweizer Franken,000,000,keine,,nicht gesperrt,Nein,30 Net,kein Limit,,,
317,che2004012,1988,50538,Valles Plastic Poligono Ind. El Carrascot,484521195.0,,,ES-46850 L'Olleria - Valencia; Fase A Carrer B...,Schweizer Franken,000,000,keine,,nicht gesperrt,Nein,30 Net,kein Limit,,,
330,che2004012,1998,5502,UNISLEEVE,,,,FR-91421 Morangis Cedex; 15 avenue Arago,Schweizer Franken,000,000,keine,,nicht gesperrt,Nein,30 Net,kein Limit,,,
370,che2004012,2030,6104,Stanley Press Equipment Ltd,487037533.0,,,"GB-SK117JL Cheshire; Gunco Lane, Macclesfield",Schweizer Franken,000,000,keine,,nicht gesperrt,Nein,30 Net,kein Limit,,,
409,che2004012,2055,11307,DCM BP 406,,,,FR-92004 Nanterre Cedex; 45 Avenue des Guiller...,Schweizer Franken,000,000,keine,,nicht gesperrt,Nein,30 Net,kein Limit,,,


In [21]:
df_output.to_pickle(datafolder2+'company_info.pkl')