In [138]:
import googlemaps
import pandas as pd
import re
import jellyfish
from itertools import combinations
from collections import Counter

# file to save suspicious venues
susp_ven_file = "suspicious_venues.txt"

aus_states_short2long = {"nsw": "new south wales", "vic": "victoria", "sa": "south australia", "tas": "tasmania", "qld": "queensland",
                  "wa": "western australia", "act": "australian capital territory", "nt": "northern territory"}
aus_cities = {"sydney", "melbourne", "perth", "adelaide", "brisbane", "canberra", "darwin", "hobart"}

aus_states_long2short = {v: k for k, v in aus_states_short2long.items()}

pois = ["airport", "park", "aquarium", "art gallery", "bakery", "bank", "bar", "store", "bowling alley", "cafe", "campground",
          "casino", "cemetery", "church", "city hall", "town hall", "courthouse", "embassy", "gym", "hospital", "library", "mosque", "island",
            "movie theater", "museum", "night club", "nightclub", "pharmacy", "police", "post office", "cinemas",
               "restaurant", "school", "shopping mall", "shopping centre", "spa", "stadium", "station", "synagogue", "university",
                   "zoo", "club", "casino", "theatre", "parklands", "hotel", "rsl", "oval", "showground", "racecourse", "gallery",
                   "resort", "square", "estate", "arena", "reserve", "winery", "wharf", "cathedral", "plaza", "opera house", "vineyard",
                   "farm", "aquatic centre", "pavillion", "convention centre", "community centre", "point", "institute", "business centre"]

street_types = """alley ally arcade arc avenue ave boulevard bvd bypass bypa circuit cct close cl corner crn court
                    ct crescent cres cul-de-sac cds drive dr esplanade esp green grn grove gr highway
                    hwy junction jnc lane lane link link mews mews parade pde place pl ridge rdge road rd 
                    square sq street st terrace tce""".split()

fake_venue_words = {"voucher", "tour", "vouchers", "tours", "testing"}

venues = pd.read_csv("aus_venues.txt", sep="\t")
print("venues to process: {}".format(len(venues)))

# make sure there are only single white spaces in names and addresses
for col in ["v_name", "v_addr"]:
    venues.loc[:,col] = venues.loc[:,col].apply(lambda x: " ".join(str(x).split()).lower())

# remove all numbers from addresses
venues.loc[:,"v_addr"] = venues.loc[:,"v_addr"].str.replace(r"\b\d+\b","")

# remove duplicates in names + addresses; some venues differ only in venue code
venues.drop_duplicates(subset=["v_name","v_addr"], inplace=True)
print("venues after removed duplicates: {}".format(len(venues)))

venues to process: 3027
venues after removed duplicates: 2867


In [139]:
# first look for the outright suspicious venues
junk_venues = pd.concat([venues.loc[venues.v_addr.isnull() | venues.v_name.isnull(),:], 
                        venues.loc[venues.lat0.isnull() | venues.lng0.isnull(),:], 
                        venues.loc[(venues.v_name + ' ' + venues.v_addr).apply(lambda x: True if set(x.split()) & 
                                                                               fake_venue_words else False), :],
                        venues.loc[venues.v_addr.apply(lambda x: True if len(x.split()) < 2 else False), :]])
print("found {} junk venues".format(len(junk_venues)))
# remove these venues
venues = venues.loc[-venues.v_code.isin(junk_venues.v_code),:]
print("remaining venues: {}".format(len(venues)))

found 118 junk venues
remaining venues: 2757


In [140]:
# find venues with some POI in name
venues_poi_name = venues.loc[venues.v_name.apply(lambda x: True if [1 for c in range(3) 
                                                                    for w in combinations(x.split(), c) 
                                                                    if {" ".join(w)} & set(pois)] else False),:]
# add the search line column which for these venues is just their names
venues_poi_name["search_line"] = venues_poi_name["v_name"].str.replace(r"[-;.!$@_&]"," ")
print("venues with some POI in name: {}".format(len(venues_poi_name)))

# remove these venues
venues = venues.loc[-venues.v_code.isin(venues_poi_name.v_code),:]
print("venues left: {}".format(len(venues)))

# find venues with some POI in address
venues_poi_addr = venues.loc[venues.v_addr.apply(lambda x: True if [1 for c in range(3) 
                                                                    for w in combinations(x.split(), c) 
                                                                    if {" ".join(w)} & set(pois)] else False),:]
venues_poi_addr["search_line"] = venues_poi_addr["v_addr"].str.replace(r"[-;.!$@_&]"," ")
print("venues with some POI in address: {}".format(len(venues_poi_addr)))

# remove these venues
venues = venues.loc[-venues.v_code.isin(venues_poi_addr.v_code),:]
print("venues left: {}".format(len(venues)))

venues with some POI in name: 1514
venues left: 1243
venues with some POI in address: 180
venues left: 1063


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [142]:
def find_states(st): 
    
    thestate = ""
    
    st_words = st.split()
    # one word states
    found_states = set(st_words) & set(aus_states_short2long.keys())
    
    if found_states:
        thestate = " ".join(found_states)
    else:
        for candidate_state in aus_states_long2short:
            res = re.search(r"\b({})\b".format(candidate_state), st)
            if res:
                thestate = aus_states_long2short[candidate_state]
    
    return thestate

# look at the addresses and find states
venues["state"] = venues["v_addr"].apply(find_states).str.upper()
print("found states for {} our of {} venues".format(sum(venues.state.notnull()), len(venues)))

found states for 1063 our of 1063 venues


In [143]:
subs_df = pd.read_csv("aus_subs_12APR2017.txt")
aus_suburbs = set(subs_df.loc[:,"sub"].tolist())

In [156]:
# set up a column where we will place the detected suburb
venues["suburb"] = None

for i, row in enumerate(venues.itertuples()):
    
    candidates = []
    
    # select only the part of suburb data frame related to this state
    s = set(subs_df.loc[subs_df.state == row.state,"sub"].tolist())
   
    oneword_suburbs = [sb for sb in s if len(sb.split()) == 1]
    #print("oneword suburbs: {}".format(len(oneword_suburbs)))
    
    for su in oneword_suburbs:
        if su in row.v_addr.strip().split():
            candidates.append(su)
    
    twoword_suburbs = [sb for sb in s if len(sb.split()) == 2]
    
    for su in twoword_suburbs:
        aw = row.v_addr.strip().split()
        if su in [w + " " + aw[i + 1] for i, w in enumerate(aw) if i + 1 < len(aw)]:
            candidates.append(su)
            
    threeword_suburbs = [sb for sb in s if len(sb.split()) == 2]
    for su in threeword_suburbs:
        aw = row.v_addr.strip().split()
        if su in [w + " " + aw[i + 1] + " " + aw[i + 2] for i, w in enumerate(aw) if i + 2 < len(aw)]:
            candidates.append(su)

    venues.ix[i,"suburb"] =  ",".join(candidates)

print("found suburbs for {} our of {} venues".format(sum(venues.suburb.notnull()), len(venues)))

found suburbs for 1799 our of 2203 venues


In [158]:
venues_no_suburb = venues.loc[venues["suburb"].isnull(),:]
print("venues without suburbs: {}".format(len(venues_no_suburb)))

venues without suburbs: 404


In [265]:
gmaps = googlemaps.Client(key='AIzaSyCsJnOb6VESNe9C-BXpkbrLppPA2ygCJMg')

In [267]:
def find_coords(st):
    
    attempted_search_res = gmaps.geocode(st)
    
    if attempted_search_res:
        res = (attempted_search_res[0]["geometry"]["location"], attempted_search_res[0]['formatted_address'])
        return res  
    else:   
        return ({'lat': None, 'lng': None}, None)  

landmark_name_venues["result"] = None
landmark_name_venues.ix[:,"result"] = landmark_name_venues.ix[:,"st"].apply(find_coords)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [268]:
landmark_name_venues["lng"] = landmark_name_venues.loc[:,"result"].apply(lambda x: x[0]['lng'])
landmark_name_venues["lat"] = landmark_name_venues.loc[:,"result"].apply(lambda x: x[0]['lat'])
landmark_name_venues["adr"] = landmark_name_venues.loc[:,"result"].apply(lambda x: x[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [272]:
res0 = landmark_name_venues.loc[:,["v_code","v_name","v_addr","lat0","lat", "lng0", "lng","adr"]]

In [284]:
res0["flagged"] = abs(res0["lat0"] - res0["lat"]) > 0.005

In [285]:
rr = res0[res0.flagged == True]

In [286]:
rr.drop("flagged",inplace=True,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [289]:
rr.to_csv("sample_380.csv",index=False)