In [52]:
import googlemaps
import pandas as pd
import re
import jellyfish
from itertools import combinations
from collections import Counter

# file to save suspicious venues
susp_ven_file = "suspicious_venues.txt"

aus_states_short2long = {"nsw": "new south wales", "vic": "victoria", "sa": "south australia", 
                         "tas": "tasmania", "qld": "queensland",
                  "wa": "western australia", "act": "australian capital territory", "nt": "northern territory"}
aus_cities = {"sydney", "melbourne", "perth", "adelaide", "brisbane", "canberra", "darwin", "hobart"}

aus_states_long2short = {v: k for k, v in aus_states_short2long.items()}

pois = ["airport", "park", "aquarium", "art gallery", "bakery", "bank", "bar", "store", "bowling alley", "cafe", "campground",
          "casino", "cemetery", "church", "city hall", "town hall", "courthouse", "embassy", "gym", "hospital", "library", "mosque", "island",
            "movie theater", "museum", "night club", "nightclub", "pharmacy", "police", "post office", "cinemas",
               "restaurant", "school", "shopping mall", "shopping centre", "spa", "stadium", "station", "synagogue", "university",
                   "zoo", "club", "casino", "theatre", "parklands", "hotel", "rsl", "oval", "showground", "racecourse", "gallery",
                   "resort", "square", "estate", "arena", "reserve", "winery", "wharf", "cathedral", "plaza", "opera house", "vineyard",
                   "farm", "aquatic centre", "pavillion", "convention centre", "community centre", "point", "institute", "business centre"]

street_types = """alley ally arcade arc avenue ave boulevard bvd bypass bypa circuit cct close cl corner crn court
                    ct crescent cres cul-de-sac cds drive dr esplanade esp green grn grove gr highway
                    hwy junction jnc lane lane link link mews mews parade pde place pl ridge rdge road rd 
                    square sq street st terrace tce""".split()

fake_venue_words = {"voucher", "tour", "vouchers", "tours", "testing"}

venues = pd.read_csv("aus_venues.txt", sep="\t")
print("venues to process: {}".format(len(venues)))

# make sure there are only single white spaces in names and addresses
for col in ["v_name", "v_addr"]:
    venues.loc[:,col] = venues.loc[:,col].str.replace("[.;-@_:#&()]"," ")
    venues.loc[:,col] = venues.loc[:,col].apply(lambda x: " ".join(str(x).strip().split()).lower())

# remove all numbers from addresses
venues.loc[:,"v_addr"] = venues.loc[:,"v_addr"].str.replace(r"\b\d+\b","")

# remove duplicates in names + addresses; some venues differ only in venue code
venues.drop_duplicates(subset=["v_name","v_addr"], inplace=True)
print("venues after removed duplicates: {}".format(len(venues)))

# first look for the outright suspicious venues
junk_venues = pd.concat([venues.loc[venues.v_addr.isnull() | venues.v_name.isnull(),:], 
                        venues.loc[venues.lat0.isnull() | venues.lng0.isnull(),:], 
                        venues.loc[(venues.v_name + ' ' + venues.v_addr).apply(lambda x: True if set(x.split()) & 
                                                                               fake_venue_words else False), :],
                        venues.loc[venues.v_addr.apply(lambda x: True if len(x.split()) < 2 else False), :]])
print("found {} junk venues".format(len(junk_venues)))
# remove these venues
venues = venues.loc[-venues.v_code.isin(junk_venues.v_code),:]
print("remaining venues: {}".format(len(venues)))

venues to process: 3027
venues after removed duplicates: 2866
found 118 junk venues
remaining venues: 2756


In [53]:
# find venues with some POI in name
venues_poi_name = venues.loc[venues.v_name.apply(lambda x: True if [1 for c in range(3) 
                                                                    for w in combinations(x.split(), c) 
                                                                    if {" ".join(w)} & set(pois)] else False),:]
# add the search line column which for these venues is just their names
venues_poi_name["search_line"] = venues_poi_name["v_name"].str.replace(r"[-;.!$@_&]"," ")
print("venues with some POI in name: {}".format(len(venues_poi_name)))

# remove these venues
venues = venues.loc[-venues.v_code.isin(venues_poi_name.v_code),:]
print("venues left: {}".format(len(venues)))

# find venues with some POI in address
venues_poi_addr = venues.loc[venues.v_addr.apply(lambda x: True if [1 for c in range(3) 
                                                                    for w in combinations(x.split(), c) 
                                                                    if {" ".join(w)} & set(pois)] else False),:]
venues_poi_addr["search_line"] = venues_poi_addr["v_addr"].str.replace(r"[-;.!$@_&]"," ")
print("venues with some POI in address: {}".format(len(venues_poi_addr)))

# remove these venues
venues = venues.loc[-venues.v_code.isin(venues_poi_addr.v_code),:]
print("venues left: {}".format(len(venues)))

venues with some POI in name: 1513
venues left: 1243
venues with some POI in address: 184
venues left: 1059


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [54]:
def find_states(st): 
    
    thestate = ""
    
    st_words = st.split()
    # one word states
    found_states = set(st_words) & set(aus_states_short2long.keys())
    
    if found_states:
        thestate = " ".join(found_states)
    else:
        for candidate_state in aus_states_long2short:
            res = re.search(r"\b({})\b".format(candidate_state), st)
            if res:
                thestate = aus_states_long2short[candidate_state]
    
    return thestate

# look at the addresses and find states
venues["state"] = venues["v_addr"].apply(find_states).str.upper()
print("found states for {} our of {} venues".format(sum(venues.state.notnull()), len(venues)))

found states for 1059 our of 1059 venues


In [55]:
subs_df = pd.read_csv("aus_subs_12APR2017.txt")
aus_suburbs = set(subs_df.loc[:,"sub"].tolist())

In [56]:
subu = []

for i, row in enumerate(venues.itertuples()):
    
    candidates = []
    
    # select only the part of suburb data frame related to this state
    s = set(subs_df.loc[subs_df.state == row.state,"sub"].tolist())
   
    oneword_suburbs = [sb for sb in s if len(sb.split()) == 1]
    #print("oneword suburbs: {}".format(len(oneword_suburbs)))
    
    st = str(row.v_addr).strip()
    
    for su in oneword_suburbs:
        if su in st.split():
            candidates.append(su)
    
    twoword_suburbs = [sb for sb in s if len(sb.split()) == 2]
    
    for su in twoword_suburbs:
        aw = st.split()
        if su in [w + " " + aw[i + 1] for i, w in enumerate(aw) if i + 1 < len(aw)]:
            candidates.append(su)
            
    threeword_suburbs = [sb for sb in s if len(sb.split()) == 2]
    for su in threeword_suburbs:
        aw = st.split()
        if su in [w + " " + aw[i + 1] + " " + aw[i + 2] for i, w in enumerate(aw) if i + 2 < len(aw)]:
            candidates.append(su)
    
    if len(candidates) > 1:
        #print("candidates=",candidates)
        for k in candidates:
            #print("st.split(k)=",st.split(k))
            if st.split(k):
                for part in st.split(k):
                    if set(part.split()) & set(street_types):
                        candidates = [k]
                    
    subu.append(",".join(candidates))
    
    
venues["suburb"] = subu

print("found suburbs for {} our of {} venues".format(sum(venues.suburb.notnull()), len(venues)))

candidates= ['perth', 'east perth']
st.split(k)= ['fielder st east ', ' wa']
st.split(k)= ['fielder st ', ' wa']
candidates= ['ophir', 'bathurst']
st.split(k)= ['', ' rd bathurst nsw']
st.split(k)= ['ophir rd ', ' nsw']
candidates= ['hindmarsh', 'adelaide']
st.split(k)= ['', ' sq adelaide sa']
st.split(k)= ['hindmarsh sq ', ' sa']
candidates= ['melbourne', 'port melbourne']
st.split(k)= ['ingles street port ', ' vic']
st.split(k)= ['ingles street ', ' vic']
candidates= ['southbank', 'melbourne']
st.split(k)= ['sturt street ', ' melbourne vic']
st.split(k)= ['sturt street southbank ', ' vic']
candidates= ['annangrove', 'rouse hill']
st.split(k)= ['- ', ' rd cnr edwards rd rouse hill nsw']
st.split(k)= ['- annangrove rd cnr edwards rd ', ' nsw']
candidates= ['penola', 'mount gambier']
st.split(k)= ['', ' road mount gambier sa']
st.split(k)= ['penola road ', ' sa']
candidates= ['redfern', 'cleveland']
st.split(k)= ['cleveland street ', ' nsw']
st.split(k)= ['', ' street redfern nsw']
cand

st.split(k)= ['brooker highway glenorchy ', ' tas']
candidates= ['dandenong', 'dingley village']
st.split(k)= ['cnr ', ' rd boundry rd dingley village vic']
st.split(k)= ['cnr dandenong rd boundry rd ', ' vic']
candidates= ['melbourne', 'flinders', 'flinders lane']
st.split(k)= ['flinders lane ', ' vic']
st.split(k)= ['', ' lane melbourne vic']
st.split(k)= ['', ' melbourne vic']
candidates= ['waters', 'sylvania', 'sylvania waters']
st.split(k)= ['belgrave esplanade sylvania ', ' nsw']
st.split(k)= ['belgrave esplanade ', ' waters nsw']
st.split(k)= ['belgrave esplanade ', ' nsw']
candidates= ['penrith', 'mulgoa']
st.split(k)= ['', ' panthers mulgoa road ', ' nsw']
st.split(k)= ['penrith panthers ', ' road penrith nsw']
candidates= ['perth', 'hay']
st.split(k)= ['hay street ', ' wa']
st.split(k)= ['', ' street perth wa']
candidates= ['dale', 'middle swan']
st.split(k)= ['', ' road middle swan wa']
st.split(k)= ['dale road ', ' wa']
candidates= ['dunsborough', 'naturaliste']
st.split(k)

candidates= ['federal', 'lake george']
st.split(k)= ['westering ', ' hwy lake george nsw']
st.split(k)= ['westering federal hwy ', ' nsw']
candidates= ['kuluin', 'maroochydore']
st.split(k)= ['maroochydore road ', ' qld']
st.split(k)= ['', ' road kuluin qld']
candidates= ['sydney', 'north sydney']
st.split(k)= ['- mount street north ', ' nsw']
st.split(k)= ['- mount street ', ' nsw']
candidates= ['melbourne', 'blackwood', 'north melbourne']
st.split(k)= ['blackwood street north ', ' vic']
st.split(k)= ['', ' street north melbourne vic']
st.split(k)= ['blackwood street ', ' vic']
candidates= ['ascot', 'epsom', 'ascot vale']
st.split(k)= ['epsom road ', ' vale vic']
st.split(k)= ['', ' road ascot vale vic']
st.split(k)= ['epsom road ', ' vic']
candidates= ['docklands', 'melbourne']
st.split(k)= ['waterfront way ', ' melbourne vic']
st.split(k)= ['waterfront way docklands ', ' vic']
candidates= ['adelaide', 'north adelaide']
st.split(k)= ['war memorial drive north ', ' sa']
st.split(k)= [

candidates= ['bowen', 'bowen hills']
st.split(k)= ['gregory terrace ', ' hills qld']
st.split(k)= ['gregory terrace ', ' qld']
candidates= ['bowen', 'bowen hills']
st.split(k)= ['', ' hills qld']
st.split(k)= ['', ' qld']
candidates= ['rockhampton', 'yaamba']
st.split(k)= ['yaamba rd ', ' qld']
st.split(k)= ['', ' rd rockhampton qld']
candidates= ['rockhampton', 'wandal']
st.split(k)= ['exhibition road wandal ', ' qld']
st.split(k)= ['exhibition road ', ' rockhampton qld']
candidates= ['crawley', 'stirling']
st.split(k)= ['uwa  stirling highway ', ' wa']
st.split(k)= ['uwa  ', ' highway crawley wa']
candidates= ['nicholson', 'carlton']
st.split(k)= ['', ' street carlton vic']
st.split(k)= ['nicholson street ', ' vic']
candidates= ['ascot', 'epsom', 'ascot vale']
st.split(k)= ['epsom rd ', ' vale vic']
st.split(k)= ['', ' rd ascot vale vic']
st.split(k)= ['epsom rd ', ' vic']
candidates= ['ascot', 'epsom', 'ascot vale']
st.split(k)= ['epsom road ', ' vale vic']
st.split(k)= ['', ' road 

candidates= ['melbourne', 'flinders', 'flinders lane']
st.split(k)= ['flinders lane ', ' vic']
st.split(k)= ['', ' lane melbourne vic']
st.split(k)= ['', ' melbourne vic']
candidates= ['sydney', 'newtown']
st.split(k)= ['king street newtown ', ' nsw']
st.split(k)= ['king street ', ' sydney nsw']
candidates= ['southbank', 'clarendon']
st.split(k)= ['corner of clarendon street and crown riverwalk crown entertainment complex ', ' vic']
st.split(k)= ['corner of ', ' street and crown riverwalk crown entertainment complex southbank vic']
candidates= ['melbourne', 'brunswick']
st.split(k)= ['leslie street brunswick ', ' victoria']
st.split(k)= ['leslie street ', ' melbourne victoria']
candidates= ['nelson', 'nelson bay']
st.split(k)= ['', ' bay road ', ' bay nsw']
st.split(k)= ['', ' road ', ' nsw']
candidates= ['glenlyon', 'gladstone']
st.split(k)= ['', ' rd gladstone qld']
st.split(k)= ['glenlyon rd ', ' qld']
candidates= ['glenvale', 'toowoomba']
st.split(k)= ['', ' rd toowoomba qld']
st.s

In [57]:
venues_no_suburb = venues.loc[venues["suburb"].isnull(),:]
print("venues without suburbs: {}".format(len(venues_no_suburb)))
venues = venues.loc[-venues.v_code.isin(venues_no_suburb.v_code),:]
print("remaining venues: {}".format(len(venues)))

venues without suburbs: 0
remaining venues: 1059


In [58]:
venues

Unnamed: 0,v_code,v_name,v_addr,lat0,lng0,state,suburb
0,proxywin,win entertainment centre wollongong,corner harbour and crown streets nsw,-34.426220,150.902150,NSW,
2,med,1 two 3 mediterranean,shop surf parade broadbeach qld,-28.035530,153.432890,QLD,broadbeach
4,rus,170 russell,russell st melbourne vic,-37.811907,144.968272,VIC,melbourne
14,abs,abbotsford convent,st heliers street abbotsford vic,-37.802170,145.003780,VIC,abbotsford
15,abp,abc perth studios,fielder st east perth wa,-31.951680,115.873080,WA,east perth
17,aho,abercrombie house,ophir rd bathurst nsw,-33.392609,149.518807,NSW,bathurst
18,kqu,aboard the kimberley quest ii,pro-fisherman's boat ramp quindalup wa,-33.613940,115.110719,WA,quindalup
20,acb,academy,bunda street canberra city act,-35.279020,149.133250,ACT,canberra
21,adc,academy cinema,hindmarsh sq adelaide sa,-34.923210,138.606350,SA,adelaide
22,aod,academy of design australia,ingles street port melbourne vic,-37.828871,144.941085,VIC,port melbourne


In [265]:
gmaps = googlemaps.Client(key='AIzaSyCsJnOb6VESNe9C-BXpkbrLppPA2ygCJMg')

In [267]:
def find_coords(st):
    
    attempted_search_res = gmaps.geocode(st)
    
    if attempted_search_res:
        res = (attempted_search_res[0]["geometry"]["location"], attempted_search_res[0]['formatted_address'])
        return res  
    else:   
        return ({'lat': None, 'lng': None}, None)  

landmark_name_venues["result"] = None
landmark_name_venues.ix[:,"result"] = landmark_name_venues.ix[:,"st"].apply(find_coords)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [268]:
landmark_name_venues["lng"] = landmark_name_venues.loc[:,"result"].apply(lambda x: x[0]['lng'])
landmark_name_venues["lat"] = landmark_name_venues.loc[:,"result"].apply(lambda x: x[0]['lat'])
landmark_name_venues["adr"] = landmark_name_venues.loc[:,"result"].apply(lambda x: x[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [272]:
res0 = landmark_name_venues.loc[:,["v_code","v_name","v_addr","lat0","lat", "lng0", "lng","adr"]]

In [284]:
res0["flagged"] = abs(res0["lat0"] - res0["lat"]) > 0.005

In [285]:
rr = res0[res0.flagged == True]

In [286]:
rr.drop("flagged",inplace=True,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [289]:
rr.to_csv("sample_380.csv",index=False)