In [1]:
import pandas as pd
import numpy as np
import os
import re
import requests
import matplotlib.pyplot as plt
%matplotlib inline 

In [2]:
pc_file = os.path.join(os.pardir,"data","postcodes_uniq.csv")
pcdf = pd.read_csv(pc_file, header=None, names={'postalcode'})
pcdf = pcdf.fillna('')
pcdf['lat'] = ''
pcdf['lon'] = ''
print(len(pcdf))
print(pcdf.head())
pcdf.to_csv(os.path.join(os.pardir,"data","pc_latlon.csv"), index=False)

27393
  postalcode lat lon
0     400336        
1     546183        
2     578345        
3     650420        
4     460059        


In [3]:
pc_file = os.path.join(os.pardir,"data","pc_latlon.csv")
pcdf = pd.read_csv(pc_file)
pcdf = pcdf.fillna('')

In [4]:
print(len(pcdf))
pcdf.head(7)

27393


Unnamed: 0,postalcode,lat,lon
0,400336.0,,
1,546183.0,,
2,578345.0,,
3,650420.0,,
4,460059.0,,
5,,,
6,466507.0,,


In [5]:
validpc_regex = re.compile("\\b([0-9]{4}(?:[1-9][0-9]|[0-9][1-9]))")

In [6]:
pcdf['validity'] = pcdf['postalcode'].apply(lambda x: bool(re.search(validpc_regex, str(x))))
pcdf

Unnamed: 0,postalcode,lat,lon,validity
0,400336,,,True
1,546183,,,True
2,578345,,,True
3,650420,,,True
4,460059,,,True
5,,,,False
6,466507,,,True
7,357984,,,True
8,760776,,,True
9,560129,,,True


In [7]:
print(len(pcdf.loc[pcdf['validity'] == False]))
poorly_formed_ps = pcdf.loc[pcdf['validity'] == False][['postalcode', 'lat', 'lon']]
poorly_formed_ps = poorly_formed_ps.reset_index(drop=True)
print(poorly_formed_ps)

266
    postalcode lat lon
0                     
1       320100        
2       538800        
3         2057        
4       797500        
5       562700        
6       466000        
7       556800        
8       138600        
9       140100        
10      528800        
11      199600        
12      550500        
13      310200        
14      563700        
15      357900        
16        1335        
17      750300        
18      239100        
19      298100        
20      419400        
21        1231        
22      510700        
23      737900        
24      460100        
25      739000        
26      541200        
27      561700        
28      425300        
29      310100        
..         ...  ..  ..
236     328400        
237     535500        
238     467900        
239     805800        
240     547400        
241     S46401        
242     600300        
243     456200        
244       0315        
245      81200        
246     813000        
247    

In [8]:
print(len(pcdf.loc[pcdf['validity'] == True]))
vpcdf = pcdf.loc[pcdf['validity'] == True][['postalcode', 'lat', 'lon']]
vpcdf = vpcdf.reset_index(drop=True)
print(vpcdf)

27127
      postalcode lat lon
0         400336        
1         546183        
2         578345        
3         650420        
4         460059        
5         466507        
6         357984        
7         760776        
8         560129        
9         732685        
10        669558        
11        737785        
12        330004        
13        531470        
14        684686        
15        760150        
16        640489        
17        798595        
18        828785        
19        540142        
20        543124        
21        550407        
22        670183        
23        510709        
24        760797        
25        315079        
26        275018        
27        310153        
28        160043        
29        270026        
...          ...  ..  ..
27097     454965        
27098     549740        
27099     428717        
27100     535039        
27101     746760        
27102     577591        
27103     460449        
27104     548135   

In [9]:
print(type(vpcdf['postalcode'][7]))

<class 'str'>


In [10]:
def onemap_geocode(postalcode):
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    onemap_request = 'https://developers.onemap.sg/commonapi/search?searchVal=' + postalcode + '&returnGeom=Y&getAddrDetails=N'
#     print(onemap_request)
    response = requests.get(onemap_request)
    response_json = response.json()

    try:
        lat = response_json['results'][0]['LATITUDE']
        lon = response_json['results'][0]['LONGITUDE']
    except IndexError:
        lat = ''
        lon = ''
    print('response for ' + postalcode + ': ' + lat + ", " + lon)
    return lat + ', ' + lon

In [11]:
onemap_geocode('178880')

response for 178880: 1.28881003164, 103.85034901


'1.28881003164, 103.85034901'

In [17]:
%%time
tempdf = vpcdf.iloc[:100]
tempdf['latlon'] = tempdf['postalcode'].apply(lambda x: onemap_geocode(x))
print(tempdf)
# vpcdf['latlon'] = vpcdf['postalcode'].apply(lambda x: onemap_geocode(x))

response for 400336: 1.32572020348, 103.901822393
response for 546183: 1.3624250511, 103.876520751
response for 578345: 1.37403135668, 103.832480498
response for 650420: 1.36255043036, 103.745770795
response for 460059: 1.32454416186, 103.92061613
response for 466507: 1.31578148106, 103.941424516
response for 357984: 1.34656818171, 103.868524083
response for 760776: 1.42255934807, 103.83394406
response for 560129: 1.36989498896, 103.84205905
response for 732685: 1.4413334284, 103.806136165
response for 669558: 1.36249366464, 103.765891211
response for 737785: 1.42867767353, 103.786351064
response for 330004: 1.31620522831, 103.860021154
response for 531470: 1.37905480017, 103.901927832
response for 684686: 1.4041436942, 103.748355691
response for 760150: 1.4309694175, 103.833648679
response for 640489: 1.34917459624, 103.727239833
response for 798595: 1.38762349661, 103.879314929
response for 828785: 1.40503554983, 103.900043752
response for 540142: 1.39045415383, 103.905102918
respons

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [None]:
vpcdf['lat'] = vpcdf['latlon'].apply(lambda x: x.split(',')[0])
vpcdf['lon'] = vpcdf['latlon'].apply(lambda x: x.split(',')[1])
print(vpcdf.head(20))

In [None]:
vpcdf = vpcdf[['postalcode', 'lat', 'lon']]
vpcdf.reset_index()
print(len(pcdf))
pcdf

In [None]:
pc_latlon_file = os.path.join(os.pardir,"data","withlatlon.csv")
vpcdf.to_csv(pc_latlon_file)