In [359]:
import googlemaps
import pandas as pd
import re
import time
from itertools import combinations
from collections import Counter, defaultdict

In [370]:
states_sh2lng = {"nsw": "new south wales", "vic": "victoria", "sa": "south australia", 
                 "tas": "tasmania", "qld": "queensland",
                 "wa": "western australia", "act": "australian capital territory", 
                 "nt": "northern territory"}

aus_cities = {"sydney", "melbourne", "perth", "adelaide", "brisbane", "canberra", "darwin", "hobart",
             "gold coast", "cairns", "townsville", "launceston", "geelong", "alice springs"}

states_lng2sh = {v: k for k, v in states_sh2lng.items()}

pois = ["airport", "park", "aquarium", "art gallery", "bakery", "bank", "bar", "tavern","store", "bowling alley", 
        "cafe", "brewery", "campground", "casino", "studio", "cemetery", "church", "city hall", "town hall", "concert hall", "courthouse", 
        "embassy", "gym", "hospital", "hotel", "library", "mosque", "island", "academy", "cinema", "centre",
        "movie theater", "museum", "night club", "nightclub", "pharmacy", "police", "post office", "cinemas",
        "restaurant", "school", "shopping mall", "shopping centre", "spa", "stadium", "station", "synagogue", 
        "university", "zoo", "club", "casino", "theatre", "parklands", "hotel", "rsl", "oval", "showground", "showgrounds", 
        "racecourse", "gallery", "resort", "square", "estate", "arena", "reserve", "winery", "wharf", 
        "cathedral", "plaza", "opera house", "vineyard", "farm", "aquatic centre", "gardens", "valley",
        "pavillion", "convention centre", "community centre", "point", "institute", "business centre", "castle",
        "national park", "harbour", "studios", "world", "house", "hall", "lounge", "marina", "parks"]

only_w_suburb_pois = ["cinemas", "cinema", "movie theatre", "hotel", "wharf", "world", "house", "hall", "marina"]

nonspecific_pois = ["island", "parklands", "showground", "estate", "reserve", "winery", "wharf", "vineyard", "farm", 
                   "gardens", "point", "valley", "national park", "parks"]

In [361]:
street_types = """alley ally arcade arc avenue ave boulevard bvd bypass bypa circuit cct close cl corner crn court
                    ct crescent cres cul-de-sac cds drive dr esplanade esp green grn grove gr highway
                    hwy junction jnc lane lane link link mews mews parade pde place pl ridge rdge road rd 
                    square sq street st terrace tce way""".split()

fake_venue_words = {"voucher", "tour", "vouchers", "tours", "various", "testing", "test", "cruise", "departs", 
                    "membership", "members", "memberships", "n/a"}

venues = pd.read_csv("aus_venues.txt", sep="\t")
nv_init = len(venues)
print("venues to process: {}".format(nv_init))

# make sure there are only single white spaces in names and addresses
for col in ["v_name", "v_addr"]:
    venues.loc[:,col] = venues.loc[:,col].str.replace("[.;-@_:#&()]"," ")
    venues.loc[:,col] = venues.loc[:,col].str.replace(r"\bmt\b","mount")
    venues.loc[:,col] = venues.loc[:,col].apply(lambda x: " ".join(str(x).strip().split()).lower())

# remove all numbers from addresses
venues.loc[:,"v_addr"] = venues.loc[:,"v_addr"].str.replace(r"\b\d+\b","")

# remove duplicates in names + addresses; some venues differ only in venue code
venues.drop_duplicates(subset=["v_name","v_addr"], inplace=True)
nv_dpl = len(venues)
print("removed duplicates, remaining venues: {}".format(nv_dpl))

venues to process: 3027
removed duplicates, remaining venues: 2866


In [362]:
v_bad_codes = set(venues.loc[venues.v_name.isnull() | 
                   venues.v_addr.isnull() |
                   venues.v_addr.isin(["","nan","national"]) |
                   venues.lat0.isnull() | venues.lng0.isnull(),"v_code"].tolist())

print("venues with missing name, address or coordinates: {}".format(len(v_bad_codes)))

# first look for the outright suspicious venues
v_bad_codes.update(set(venues.loc[(venues["v_name"] + ' ' + venues["v_addr"]).apply(lambda x: True if set(x.split()) & 
                                                                               fake_venue_words else False), "v_code"].tolist()))
print("venues with something missing and suspicious words: {}".format(len(v_bad)))

venues = venues.loc[-venues.v_code.isin(v_bad_codes),:]
nv_nosusp = len(venues)
print("removed suspicious word venues, remaining venues: {}".format(nv_nosusp))

venues with missing name, address or coordinates: 28
venues with something missing and suspicious words: 107
removed suspicious word venues, remaining venues: 2760


In [363]:
subs_df = pd.read_csv("aus_subs_12APR2017.txt")
aus_suburbs = set(subs_df.loc[:,"sub"].tolist())

In [364]:
def find_states(st): 
    
    thestate = None    
    st_words = st.split()
    found_states = set(st_words) & set(states_sh2lng.keys())
    
    if found_states:
        thestate = " ".join(found_states)
    else:
        for candidate_state in states_lng2sh:
            res = re.search(r"\b({})\b".format(candidate_state), st)
            if res:
                thestate = states_lng2sh[candidate_state]
    
    return thestate

def find_city(st): 
    
    city = None   
    st_words = st.split()
    
    found_cities = {" ".join(w) for c in range(1,3) for w in combinations(st_words, c) if " ".join(w) in aus_cities}
    
    if found_cities:
        city = " ".join(found_cities)
    
    return city

def find_suburb(st):
    
    all_suburbs = set(subs_df["sub"].tolist()) # all australian suburbs
    st = " " + st + " "
    suburb_candidates = set()
    
    for s in all_suburbs:
        if " " + s + " " in st:
            suburb_candidates.add(s)
    
    if suburb_candidates:
        return suburb_candidates
    else:
        return None

def find_poi(st):
    
    st = " " + st + " "
    poi_candidates = set()
    
    for s in pois:
        if " " + s + " " in st:
            poi_candidates.add(s)
    
    if poi_candidates:
        return poi_candidates
    else:
        return None
   

In [365]:
cd_dict = defaultdict(set)

# look at the addresses and find states

venues["state"] = venues["v_addr"].astype(str).apply(find_states).str.upper()
venues["city"] = (venues["v_addr"].astype(str) + " " + venues["v_name"].astype(str)).apply(find_city)
venues["suburb"] = (venues["v_addr"].astype(str) + " " + venues["v_name"].astype(str)).apply(find_suburb)

for what in "state city suburb".split():
    cd_dict["with " + what] = set(venues.loc[venues[what].notnull(),"v_code"].tolist())
    cd_dict["without " + what] = set(venues.v_code.tolist()) - cd_dict["with " + what]

for k in cd_dict:
    print("venues " + k + ": {}".format(len(cd_dict[k])))

venues with state: 2751
venues without state: 9
venues with city: 948
venues without city: 1812
venues with suburb: 2718
venues without suburb: 42


In [366]:
venues["POI"] = (venues.v_name + " " + venues.v_addr).apply(find_poi)

In [367]:
venues_nostate = venues.loc[venues.v_code.isin(cd_dict["without state"]),:]
# no state and no POI
v_bad_codes.update(set(venues_nostate.loc[venues_nostate.POI.isnull(),"v_code"].tolist()))

venues_nosuburb = venues.loc[venues.v_code.isin(cd_dict["without suburb"]),:]
# no suburb and no suitable POI - put these into rubbish
v_bad_codes.update(set(venues_nosuburb.loc[venues_nosuburb.POI.apply(lambda x: True if (not x) or (x & set(only_w_suburb_pois)) else False),"v_code"].tolist()))

venues = venues.loc[-venues.v_code.isin(v_bad_codes),:]
print("venues left: {}".format(len(venues)))

for k in cd_dict:
    cd_dict[k] = cd_dict[k] - v_bad_codes

venues left: 2735


In [368]:
venues["search_line"] = None
venues["suburb_upd"] = None

for row in venues.itertuples():
    
    cnd = set()
    
    # if found a street type in address, priority to suburb candidates to the right
    for street in street_types:
        if row.suburb:
            for k in row.suburb:
                lst = row.v_addr.split(street)
                if (len(lst) > 1) and (k in lst[-1]):   # i.e. if street was there
                    cnd.add(k)
        else:
            pass

    if len(cnd) == 1: # if a single candidate selected, 
        venues.ix[row.Index,"suburb_upd"] = cnd.pop()
    else:
        if row.suburb and row.state:  # if state is available, priority to suburb candidate right before state       
            for k in row.suburb:
                if (re.search(r"\b({})\b".format(" ".join([k, row.state.lower()])), row.v_addr) or
                    re.search(r"\b({})\b".format(" ".join([k, states_sh2lng[row.state.lower()]])), row.v_addr)):
                    venues.ix[row.Index,"suburb_upd"] = k
    
    # if still unclear what suburb but there's a city name, just take the city name
    if not venues.ix[row.Index,"suburb_upd"]:  # if still undefined
        if row.city:
            venues.ix[row.Index,"suburb_upd"] = row.city
    
    if not venues.ix[row.Index,"suburb_upd"]:  # if still undefined
        if row.suburb and len(row.suburb) == 1:
            venues.ix[row.Index,"suburb_upd"] = row.suburb.pop()
    
    if row.state and row.POI and venues.ix[row.Index,"suburb_upd"] and (row.POI & set(pois)):
        venues.ix[row.Index,"search_line"] = " ".join([row.v_name, venues.ix[row.Index,"suburb_upd"], row.state])
    
    if not venues.ix[row.Index,"search_line"]:
        if (not venues.ix[row.Index,"suburb_upd"]) and row.state and row.POI and (len(row.POI & set(only_w_suburb_pois)) == 0):
            venues.ix[row.Index,"search_line"] = " ".join([row.v_name, row.state])
    if not venues.ix[row.Index,"search_line"]:
         if venues.ix[row.Index,"suburb_upd"] and row.state and row.POI and (row.POI & set(only_w_suburb_pois)):
            venues.ix[row.Index,"search_line"] = " ".join([row.v_name, venues.ix[row.Index,"suburb_upd"], row.state])
    if not venues.ix[row.Index,"search_line"]:
         if venues.ix[row.Index,"suburb_upd"] and row.POI and (row.POI & set(row.v_name.split())):
            venues.ix[row.Index,"search_line"] = " ".join([row.v_name, venues.ix[row.Index,"suburb_upd"]])
        

In [369]:
print(len(venues.loc[venues.search_line.isnull()]))
venues.loc[venues.search_line.isnull()]

646


Unnamed: 0,v_code,v_name,v_addr,lat0,lng0,state,city,suburb,POI,search_line,suburb_upd
2,med,1 two 3 mediterranean,shop surf parade broadbeach qld,-28.035530,153.432890,QLD,,{broadbeach},,,broadbeach
4,rus,170 russell,russell st melbourne vic,-37.811907,144.968272,VIC,melbourne,"{melbourne, russell}",,,melbourne
14,abs,abbotsford convent,st heliers street abbotsford vic,-37.802170,145.003780,VIC,,{abbotsford},,,abbotsford
18,kqu,aboard the kimberley quest ii,pro-fisherman's boat ramp quindalup wa,-33.613940,115.110719,WA,,"{quindalup, kimberley}",,,quindalup
23,vaccamelb,acca,sturt street southbank melbourne vic,-37.826700,144.966560,VIC,melbourne,"{melbourne, southbank, sturt}",,,melbourne
28,rou,action paintball - rouse hill,- annangrove rd cnr edwards rd rouse hill nsw,-33.661080,150.923610,NSW,,"{annangrove, rouse hill}",,,rouse hill
29,ser,action paintball - serpentine,punrak road serpentine wa,-32.378840,115.921190,WA,,{serpentine},,,serpentine
31,afp,active flight paragliding bright,bright vic,-36.730200,146.960900,VIC,,{bright},,,bright
32,spa,actt the old performance space,cleveland street redfern nsw,-33.889410,151.203450,NSW,,"{redfern, cleveland}",,,redfern
34,agr,adelaide,goldrush ballooning strathalbyn sa,-35.258880,138.890350,SA,adelaide,"{strathalbyn, adelaide}",,,strathalbyn


In [58]:
venues.loc[venues.suburb2.isnull()]

Unnamed: 0,v_code,v_name,v_addr,lat0,lng0,state,city,suburb,suburb2
0,proxywin,win entertainment centre wollongong,corner harbour and crown streets nsw,-34.426220,150.902150,NSW,,{wollongong},
66,aco,albany centennial oval,lockyer ave wa,-35.015420,117.885110,WA,,"{lockyer, albany}",
82,agl,albury gateway village,lincolin causeway wadonga nsw,-36.106579,146.895767,NSW,,"{village, albury}",
124,arc,arana leagues club,dawson parade kepera qld,-27.399570,152.961290,QLD,,{dawson},
164,azo,australia zoo,steve irwin way beerwah sunshine coast qld,-26.862930,152.962320,QLD,,"{irwin, beerwah, sunshine}",
223,btv,batavia marina geraldton,wiebbe hayes lane wa,-28.768240,114.611250,WA,,"{hayes, geraldton}",
274,bic,bicentennial community centre,bicentennial drive off ben lexcon dve sunshine...,-26.406170,153.110150,QLD,,"{sunshine beach, sunshine}",
321,bor,borenore field days site,nsw,-31.253220,146.921100,NSW,,"{field, borenore}",
350,bci,broadford circuit,vic,-37.471310,144.785150,VIC,,{broadford},
368,bhp,bunbury hay park north,cnr parade rd bussel highway bunnury wa,-33.370730,115.642110,WA,,"{bunbury, hay}",


In [23]:
v_bad = pd.concat([v_bad,
                    venues.loc[venues.suburb.apply(lambda x: False if len(x)>0 else True),:]])

venues = venues.loc[-venues.v_code.isin(v_bad.v_code.tolist()),:]
nv_nosub = len(venues)
print("removed no suburb venues, remaining venues: {}".format(nv_nosub))

removed no suburb venues, remaining venues: 2715


In [24]:
poi_v_codes = ( set(venues.loc[venues.loc[:,"v_name"].apply(lambda x: True if [1 for c in range(1,3) 
                                                                    for w in combinations(x.split(), c) 
                                                                    if {" ".join(w)} & set(pois)] else False),"v_code"].tolist()) | 
set(venues.loc[venues.loc[:,"v_addr"].apply(lambda x: True if [1 for c in range(3) 
                                                                    for w in combinations(x.split(), c) 
                                                                    if {" ".join(w)} & set(pois)] else False),"v_code"].tolist()) )
   

In [25]:
nv_pois = len(poi_v_codes)
print("venues with POIs in name or address: {}".format(nv_pois))

venues with POIs in name or address: 1883


In [26]:
nopoi_v_codes = set(venues.v_code.tolist()) - poi_v_codes

In [27]:
venues_poi = venues.loc[venues.v_code.isin(poi_v_codes),:]
venues_poi.head()

Unnamed: 0,v_code,v_name,v_addr,lat0,lng0,state,city,suburb
0,proxywin,win entertainment centre wollongong,corner harbour and crown streets nsw,-34.42622,150.90215,NSW,,{wollongong}
1,level 2,newtown rsl,petersham rsl club enmore road newtown nsw,-33.89842,151.17648,NSW,,"{petersham, newtown, enmore}"
3,stk,1300smiles stadium townsville,golf links drive kirwan townsville qld,-19.30748,146.71389,QLD,townsville,"{townsville, kirwan}"
5,mmr,2012 bmw magic millions race day,gold coast turf club racecourse drive bundall qld,-28.01086,153.40712,QLD,,"{racecourse, bundall}"
6,ahb,21 arms nightclub,armstrong street north ballarat vic,-37.56055,143.85635,VIC,,"{ballarat, armstrong}"


In [28]:
venues_nopoi = venues.loc[venues.v_code.isin(nopoi_v_codes),:]
venues_nopoi.head()

Unnamed: 0,v_code,v_name,v_addr,lat0,lng0,state,city,suburb
2,med,1 two 3 mediterranean,shop surf parade broadbeach qld,-28.03553,153.43289,QLD,,{broadbeach}
4,rus,170 russell,russell st melbourne vic,-37.811907,144.968272,VIC,melbourne,"{melbourne, russell}"
14,abs,abbotsford convent,st heliers street abbotsford vic,-37.80217,145.00378,VIC,,{abbotsford}
15,abp,abc perth studios,fielder st east perth wa,-31.95168,115.87308,WA,perth,"{east perth, perth}"
17,aho,abercrombie house,ophir rd bathurst nsw,-33.392609,149.518807,NSW,,"{abercrombie, ophir, bathurst}"


In [23]:
venues_w_poi_name.loc[venues_w_poi_name.state.isnull(),"state"] = \
    venues_w_poi_name.loc[venues_w_poi_name.state.isnull(),"state"].fillna("")

venues_w_poi_name["search_line"] = venues_w_poi_name["v_name"] + " " + venues_w_poi_name["state"]

venues = venues.loc[-venues.v_code.isin(venues_w_poi_name.v_code.tolist()),:]

venues_poi_addr.loc[venues_poi_addr.state.isnull(),"state"] = venues_poi_addr.loc[venues_poi_addr.state.isnull(),"state"].fillna("")


venues_poi_addr["search_line"] = venues_poi_addr["v_addr"] + " " + venues_poi_addr["state"] 

# remove these venues
venues = venues.loc[-venues.v_code.isin(venues_poi_addr.v_code),:]

venues_w_poi = pd.concat([venues_w_poi_name, venues_poi_addr])

In [24]:
venues_w_poi.drop(["city","state"], inplace=True, axis=1)
print("venues with some POI: {}".format(len(venues_w_poi)))
print("venues left: {}".format(len(venues)))

venues with some POI: 1587
venues left: 975


In [33]:
venues["suburb"] = None

for i, row in enumerate(venues.itertuples()):
    
    candidates = []
    
    s = set(subs_df.loc[subs_df.state == row.state,"sub"].tolist())
   
    oneword_suburbs = [sb for sb in s if len(sb.split()) == 1]
    
    st = str(row.v_addr).strip()
    
    for su in oneword_suburbs:
        if su in st.split():
            candidates.append(su)
    
    twoword_suburbs = [sb for sb in s if len(sb.split()) == 2]
    
    for su in twoword_suburbs:
        aw = st.split()
        if su in [w + " " + aw[i + 1] for i, w in enumerate(aw) if i + 1 < len(aw)]:
            candidates.append(su)
            
    threeword_suburbs = [sb for sb in s if len(sb.split()) == 2]
    for su in threeword_suburbs:
        aw = st.split()
        if su in [w + " " + aw[i + 1] + " " + aw[i + 2] for i, w in enumerate(aw) if i + 2 < len(aw)]:
            candidates.append(su)
    
    if len(candidates) > 1:
        for k in candidates:
            if st.split(k):
                for part in st.split(k):
                    if set(part.split()) & set(street_types):
                        candidates = [k]
                    
    venues.ix[i, "suburb"] = ",".join(candidates)
    
print("found suburbs for {} out of {} venues".format(sum(venues.suburb.notnull()), len(venues)))

found suburbs for 975 out of 1643 venues


In [350]:
venues_no_suburb = venues.loc[(venues["suburb"] == "") | venues["suburb"].isnull(),:]

venues_no_suburb_w_city = venues_no_suburb.loc[venues_no_suburb.loc[:,"city"].notnull(),:]
venues_no_suburb_w_city["search_line"] = venues_no_suburb_w_city["v_name"] + " " + venues_no_suburb_w_city["city"]

venues_no_suburb_w_city.drop(["state", "city"], axis=1, inplace=True)

venues_no_suburb = venues_no_suburb.loc[-venues_no_suburb.v_code.isin(venues_no_suburb_w_city.v_code),:]

# collect all baddies
v_bad = pd.concat([v_bad, venues_no_suburb.drop(["state", "city", "suburb"], axis=1)])

venues = venues.loc[-venues.v_code.isin(venues_no_suburb.v_code),:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [351]:
venues.loc[venues.loc[:,"state"].isnull(),"state"] = venues.loc[venues.loc[:,"state"].isnull(),"state"].fillna("")
venues.loc[venues.loc[:,"v_name"].isnull(),"v_name"] = venues.loc[venues.loc[:,"v_name"].isnull(),"v_name"].fillna("")

In [352]:
venues["search_line"] = venues["v_name"] + " " + venues["suburb"] + " " + venues["state"]

In [353]:
venues = pd.concat([venues, venues_no_suburb_w_city])
Counter(venues.search_line)

Counter({'1 two 3 mediterranean broadbeach QLD': 1,
         '170 russell melbourne VIC': 1,
         'abbotsford convent abbotsford VIC': 1,
         'abc perth studios east perth WA': 1,
         'abercrombie house ophir NSW': 1,
         'aboard the kimberley quest ii quindalup WA': 1,
         'academy canberra ACT': 1,
         'academy cinema hindmarsh SA': 1,
         'academy of design australia port melbourne VIC': 1,
         'acca melbourne VIC': 1,
         'action paintball - rouse hill rouse hill NSW': 1,
         'action paintball - serpentine serpentine WA': 1,
         'activ8 health and sports centre mount gambier SA': 1,
         'active flight paragliding bright bright VIC': 1,
         'actt the old performance space cleveland NSW': 1,
         'adelaide entertainment centre hindmarsh SA': 1,
         'adelaide festival centre adelaide SA': 1,
         'adelaide festival centre trust adelaide SA': 1,
         'adelaide film festival unley SA': 1,
         'adelaide

Collect all venues suitable for giving to Google Maps in one data frame with a search line.

In [354]:
venues.drop(["city","state","suburb"], axis=1, inplace=True)

In [355]:
venues

Unnamed: 0,lat0,lng0,search_line,v_addr,v_code,v_name
2,-28.035530,153.432890,1 two 3 mediterranean broadbeach QLD,shop surf parade broadbeach qld,med,1 two 3 mediterranean
4,-37.811907,144.968272,170 russell melbourne VIC,russell st melbourne vic,rus,170 russell
14,-37.802170,145.003780,abbotsford convent abbotsford VIC,st heliers street abbotsford vic,abs,abbotsford convent
15,-31.951680,115.873080,abc perth studios east perth WA,fielder st east perth wa,abp,abc perth studios
17,-33.392609,149.518807,abercrombie house ophir NSW,ophir rd bathurst nsw,aho,abercrombie house
18,-33.613940,115.110719,aboard the kimberley quest ii quindalup WA,pro-fisherman's boat ramp quindalup wa,kqu,aboard the kimberley quest ii
20,-35.279020,149.133250,academy canberra ACT,bunda street canberra city act,acb,academy
21,-34.923210,138.606350,academy cinema hindmarsh SA,hindmarsh sq adelaide sa,adc,academy cinema
22,-37.828871,144.941085,academy of design australia port melbourne VIC,ingles street port melbourne vic,aod,academy of design australia
23,-37.826700,144.966560,acca melbourne VIC,sturt street southbank melbourne vic,vaccamelb,acca


In [356]:
gmaps = googlemaps.Client(key='AIzaSyCsJnOb6VESNe9C-BXpkbrLppPA2ygCJMg')

In [357]:
def find_coords(st):
    
    attempted_search_res = gmaps.geocode(st)
    
    if attempted_search_res:
        res = (attempted_search_res[0]["geometry"]["location"], attempted_search_res[0]['formatted_address'])
        return res  
    else:   
        return ({'lat': None, 'lng': None}, None)  

venues["result"] = None
venues.ix[:,"result"] = venues.ix[:,"search_line"].apply(find_coords)


In [360]:
venues["lng"] = venues.loc[:,"result"].apply(lambda x: x[0]['lng'])
venues["lat"] = venues.loc[:,"result"].apply(lambda x: x[0]['lat'])
venues["adr"] = venues.loc[:,"result"].apply(lambda x: x[1])

In [362]:
res0 = venues.loc[:,["v_code","v_name","v_addr","lat0","lat", "lng0", "lng","adr"]]

In [366]:
res0.head()

Unnamed: 0,v_code,v_name,v_addr,lat0,lat,lng0,lng,adr,flagged
2,med,1 two 3 mediterranean,shop surf parade broadbeach qld,-28.03553,-28.028273,153.43289,153.430982,"Shop 10 & 11 Phoenician Tower East Surf Pde, B...",False
4,rus,170 russell,russell st melbourne vic,-37.811907,-37.812043,144.968272,144.968038,"170 Russell St, Melbourne VIC 3000, Australia",False
14,abs,abbotsford convent,st heliers street abbotsford vic,-37.80217,-37.802465,145.00378,145.0029,"1 St Heliers St, Abbotsford VIC 3067, Australia",False
15,abp,abc perth studios,fielder st east perth wa,-31.95168,-31.951747,115.87308,115.873288,"30 Fielder St, East Perth WA 6004, Australia",False
17,aho,abercrombie house,ophir rd bathurst nsw,-33.392609,-33.392716,149.518807,149.518933,"311 Ophir Rd, Bathurst NSW 2795, Australia",False


In [381]:
res0["flagged"] = (abs(res0["lat0"] - res0["lat"]) >= 0.1) |  (abs(res0["lng0"] - res0["lng"]) >= 0.1)

In [392]:
rr1 = res0[res0.flagged == False]

In [384]:
len(rr)

54

In [385]:
rr.drop("flagged",inplace=True,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [386]:
rr.to_csv("venues_to_investigate.csv",index=False, sep="\t")

In [390]:
v_bad.to_csv("bad_venues.csv",index=False, sep="\t")

In [391]:
v_bad.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 248 entries, 2964 to 2952
Data columns (total 5 columns):
v_code    248 non-null object
v_name    185 non-null object
v_addr    185 non-null object
lat0      219 non-null float64
lng0      219 non-null float64
dtypes: float64(2), object(3)
memory usage: 11.6+ KB


In [393]:
rr1.to_csv("venues_by_gmaps.csv",index=False, sep="\t")