In [1]:
import pandas as pd
import json
import re
from pprint import pprint
from itertools import chain
from collections import defaultdict
import googlemaps

In [2]:
class VenueMatcher:
    """
    Build a list of Ticketek venues and enrich it with Google Maps data.

    All useful Ticketek venue information is contained in two tables which
    are joined together (on ``venue_name``) into ``TKT_VENUES``.

    NOTE: defining this class has side effects -- both CSV tables and the
    Google credentials file are read, and a Google Maps client is created,
    at class-definition time.

    Each venue in ``self.tkt_venues`` is a JSON-serialisable dict with the keys
    ``name`` and ``code`` and, when they could be determined, ``state`` (a
    single state abbreviation) and/or ``state_`` (a list of candidate
    states).  The Google Maps steps add ``place_id``, ``address``,
    ``venue_type``, ``coordinates``, ``name_googlemaps``, ``opening_hours``,
    ``rating``, ``url_googlemaps`` and ``website``.
    """

    # NOTE(review): ``error_bad_lines`` was deprecated in pandas 1.3 and
    # removed in pandas 2.0 in favour of ``on_bad_lines='skip'``; switch once
    # the pandas version used by this project is confirmed to be >= 1.3.
    TKT_VENUES = pd.read_csv('data/sample_venue_dim.csv.gz',
                             encoding='latin-1',
                             error_bad_lines=False,
                             sep='\t')[['pk_venue_dim', 'venue_name', 'venue_desc', 'venue_region_desc']] \
        .join(pd.read_csv('data/sample_VenuesPowerWebAddresses.csv.gz',
                          sep='\t',
                          encoding='latin-1')[['venue_name', 'vcName', 'paAddressLine1', 'paAddressLine2', 'vcRegionName']]
              .set_index('venue_name'), on='venue_name', how='left').fillna('')

    PREFERRED_STATES = 'nsw vic qld wa act sa tas nt'.split()

    # the credentials file is closed as soon as the client has been created;
    # the temporary name is removed so it does not become a class attribute
    with open('credentials/google.json') as _credentials_file:
        gmaps = googlemaps.Client(**json.load(_credentials_file))
    del _credentials_file

    # where the venue list is persisted between runs
    _VENUES_FILE = 'data/tkt_venues.json'

    # state abbreviation -> full state name (both forms are searched for)
    _STATE_NAMES = {'nsw': 'new south wales',
                    'act': 'australian capital territory',
                    'vic': 'victoria',
                    'tas': 'tasmania',
                    'wa': 'western australia',
                    'nt': 'northern territory',
                    'sa': 'south australia',
                    'qld': 'queensland'}

    # (venue key, label used in the "not found" message, extractor applied to
    # the place-details dict returned by Google Maps)
    _PLACE_FIELDS = (
        ('name_googlemaps', 'googlemap name', lambda d: d['name'].lower()),
        ('opening_hours', 'opening_hours',
         lambda d: [day.lower() for day in d['opening_hours']['weekday_text']]),
        ('rating', 'rating', lambda d: float(d['rating'])),
        ('url_googlemaps', 'url', lambda d: d['url']),
        ('website', 'website', lambda d: d['website']),
    )

    def __init__(self, read_local=True):
        """
        Load the suburb lookup and, optionally, the locally saved venues.

        read_local: when true, start from the venues stored in
        data/tkt_venues.json; otherwise start from an empty list.
        """
        self.suburbs = self._load_json('data/aus_suburbs_auspost_APR2017.json')
        self.tkt_venues = self._load_json(self._VENUES_FILE) if read_local else []

        print(f'{len(self.tkt_venues)} ticketek venues at the moment')

    @staticmethod
    def _load_json(path):
        """
        read a JSON file and close it again
        """
        with open(path) as fh:
            return json.load(fh)

    def _save_venues(self, path=_VENUES_FILE):
        """
        write the current venue list to disk (the file is flushed and closed)
        """
        with open(path, 'w') as fh:
            json.dump(self.tkt_venues, fh)

    def select_ticketek_venues(self):
        """
        select and restructure relevant ticketek venue data

        Filters the class-level TKT_VENUES table in place (rows whose
        venue_name is purely alphabetic and whose description contains none
        of the excluded words), appends one venue dict per remaining row to
        self.tkt_venues, then merges venues that share a name so that each
        name appears once with all of its codes.  Returns self.
        """
        VenueMatcher.TKT_VENUES = VenueMatcher.TKT_VENUES[VenueMatcher.TKT_VENUES['venue_name'].str.isalpha()]
        print(f'venues with names: {len(VenueMatcher.TKT_VENUES)} rows, {len(set(VenueMatcher.TKT_VENUES.pk_venue_dim))} unique keys')

        bad_words = set("""games ticketek voucher circus winery cruise cirque buses cruises coach reserve vineyard office""".split())

        has_bad_word = VenueMatcher.TKT_VENUES['venue_desc'] \
            .apply(lambda desc: not bad_words.isdisjoint(desc.lower().split()))
        VenueMatcher.TKT_VENUES = VenueMatcher.TKT_VENUES[~has_bad_word]

        print(f'filtered venues: {len(VenueMatcher.TKT_VENUES)} rows, {len(set(VenueMatcher.TKT_VENUES.pk_venue_dim))} unique keys')

        for i, (_, row) in enumerate(VenueMatcher.TKT_VENUES.iterrows(), 1):

            if i % 100 == 0:
                print(f'processing row {i}...')

            this_venue = {'name': self._normalize(row['venue_desc']),
                          'code': [row['venue_name'].lower()]}

            self._assign_state(row, this_venue)

            self.tkt_venues.append(this_venue)

        # merge venues with multiple codes: the first venue seen under a name
        # is kept and collects the codes of every later venue with that name
        merged = []
        by_name = {}

        for venue in self.tkt_venues:
            known = by_name.get(venue['name'])
            if known is None:
                by_name[venue['name']] = venue
                merged.append(venue)
            else:
                # this name is already available, must be under another code
                known['code'] = sorted(set(known['code']) | set(venue['code']))

        self.tkt_venues = merged

        return self

    def _assign_state(self, row, venue):
        """
        work out the state of a venue from one table row and store it in
        venue['state'] (or the candidates in venue['state_'])

        The columns are searched for a state name in priority order; the
        search stops as soon as one column mentions exactly one state.
        While a column mentions no state or several, suburbs found in the
        description columns are used to guess the state instead.

        NOTE: a state guessed from suburbs does not stop the column search,
        so an explicit single state in a later column overrides the guess.
        """
        suburb_cache = {}

        def suburbs_in(column):
            # the suburb lookup is expensive and depends only on the row, so
            # it is computed at most once per column
            if column not in suburb_cache:
                suburb_cache[column] = self._find_suburb(self._normalize(row[column]))
            return suburb_cache[column]

        for state_column in ('venue_desc', 'vcRegionName', 'venue_region_desc'):

            candidate_states = self._find_state(self._normalize(row[state_column]))

            if len(candidate_states) == 1:
                # a single candidate state
                venue['state'] = candidate_states.pop()
                return

            # many or no candidate states
            for suburb_column in ('venue_desc', 'venue_region_desc'):
                matches = suburbs_in(suburb_column)
                if matches and self._state_from_suburbs(matches, candidate_states, venue):
                    break

    @staticmethod
    def _state_from_suburbs(matches, candidate_states, venue):
        """
        use suburb matches (a set of (suburb, state) tuples) to set
        venue['state'] or venue['state_']; returns True when the remaining
        suburb columns do not need to be looked at
        """
        def ordered(match):
            # deterministic ordering for the otherwise unordered set
            return (match[0], str(match[1]))

        suburb_states = {state for _, state in matches}

        if len(matches) == 1:
            only_state = next(iter(suburb_states))
            if candidate_states and only_state not in candidate_states:
                return False
            venue['state'] = only_state
            return True

        if candidate_states:
            # several suburbs and several states: take the first suburb whose
            # state is also mentioned explicitly
            for _, state in sorted(matches, key=ordered):
                if state in candidate_states:
                    venue['state'] = state
                    break
            return False

        if len(suburb_states) == 1:
            venue['state'] = suburb_states.pop()
            return True

        # prefer the suburb whose *name* has the most words: a multi-word
        # suburb name is unlikely to be an accidental match
        longest_sub = max(sorted(matches, key=ordered), key=lambda m: len(m[0].split()))
        if len(longest_sub[0].split()) > 1:
            venue['state'] = longest_sub[1]
            return False

        # still ambiguous: remember every candidate state
        venue['state_'] = sorted(suburb_states, key=str)
        return True

    def _normalize(self, st):
        """
        normalize a string st: lower case, separators (- / _ .) replaced by
        spaces, anything that is not a letter, digit or white space removed,
        and runs of white space collapsed to a single space
        """
        st = st.lower()
        # replace separators with white spaces
        st = re.sub(r'[-/_.]', ' ', st)
        # keep only letters, numbers and white spaces
        st = ''.join(ch for ch in st if ch.isalnum() or ch.isspace())
        st = re.sub(r'\s{2,}', ' ', st)

        return st

    def _find_state(self, st):
        """
        find state names in string st; returns a set of these names

        Both abbreviations and full names are recognised (as whole words);
        the returned set always contains abbreviations and is empty when no
        state is mentioned.
        """
        st_norm = self._normalize(st)

        states_found = set()

        for abbreviation, full_name in self._STATE_NAMES.items():
            for token in (abbreviation, full_name):
                if re.search(r'\b' + token + r'\b', st_norm):
                    states_found.add(abbreviation)
                    break

        return states_found

    def _find_suburb(self, st):
        """
        find suburb names in string st; returns a set of tuples (suburb, state)
        or None when no suburb was found

        self.suburbs is indexed by first letter; every suburb filed under the
        first letter of a word is searched for (as whole words) in the text
        starting at that word.
        """
        words_ = self._normalize(st).split()

        suburbs_found = set()

        for i, w in enumerate(words_):

            first_letter = w[0]

            if first_letter not in self.suburbs:
                continue

            remaining_text = ' '.join(words_[i:])

            for record in self.suburbs[first_letter]:

                name = record.get('name')
                if not name or not isinstance(name, str):
                    continue

                # the name is data, not a pattern: escape it before matching
                found = re.search(r'\b' + re.escape(name) + r'\b', remaining_text)

                if found:
                    suburbs_found.add((found.group(0), record['state']))

        return suburbs_found or None

    @staticmethod
    def _basics_from_result(query_result):
        """
        pick the fields we keep from one Google Maps geocoding result
        """
        return {'place_id': query_result.get('place_id', None),
                'address': query_result.get('formatted_address', None),
                'venue_type': query_result.get('types', None),
                'coordinates': query_result['geometry']['location']}

    @staticmethod
    def _mentions_state(query_result, state):
        """
        tell whether any address component of a geocoding result has the
        given state abbreviation as its short name
        """
        wanted = state.lower()
        return any(str(cm.get('short_name', '')).lower() == wanted
                   for cm in query_result.get('address_components', []))

    def get_googlemaps_basics(self, local_file='data/tkt_venues.json'):
        """
        ask google maps to find places by name; the key here is to hopefully
        grab a place id

        local_file: when given, the venue list is (re)loaded from this file
        first.  Only venues without a place_id are queried.  A venue with a
        known state accepts the top result for "<name> <state>"; a venue
        with several candidate states tries them in turn and accepts the
        first top result that actually mentions the state searched for.
        The first failed request stops all querying (most likely the quota is
        exhausted).  The venue list is always saved to data/tkt_venues.json
        before returning self.
        """
        if local_file:
            self.tkt_venues = self._load_json(local_file)
            print(f'collected {len(self.tkt_venues)} venues from the locally saved file {local_file}')
            print(f'{sum(["place_id" in v for v in self.tkt_venues])} of these already have place_ids')

        for i, v in enumerate(self.tkt_venues, 1):

            print(f'venue {i}: {v["name"].upper()}...')

            # we want to query Google Maps for the venues that don't have a place_id yet
            if 'place_id' in v:
                continue

            if 'state' in v:
                # so we have a specific state..
                states_to_try = [v['state']]
                must_verify = False
            else:
                # problem with the state - there are multiple candidates
                # (or none at all, in which case there is nothing to query)
                states_to_try = v.get('state_', [])
                must_verify = True

            request_failed = False

            for possible_state in states_to_try:

                q = ' '.join([v['name'], possible_state])

                try:
                    qr_ = self.gmaps.geocode(q)
                except Exception:
                    print(f'no response, probably exceeded quota')
                    request_failed = True
                    break

                if not qr_:
                    continue

                query_result = qr_[0]  # pick the top result only

                # if the state we search for is actually mentioned somewhere in
                # the result components, we say it's a suitable query result
                if must_verify and not self._mentions_state(query_result, possible_state):
                    continue

                v.update(self._basics_from_result(query_result))
                break

            if request_failed:
                # no point in sending further requests for the other venues
                break

        self._save_venues()

        return self

    def get_googlemaps_place_info(self, local_file='data/tkt_venues.json'):
        """
        ask google maps for place details using a place id

        local_file: when given, the venue list is (re)loaded from this file
        first.  Only venues that have a place_id but no name_googlemaps yet
        are queried.  Each detail that is missing from the response is
        reported and skipped.  If a request fails, the venue list is saved
        and the method returns immediately; otherwise the list is saved after
        all venues have been processed.  Returns self.
        """
        if local_file:
            self.tkt_venues = self._load_json(local_file)
            print(f'collected {len(self.tkt_venues)} venues from the locally saved file {local_file}')
            print(f'{sum(["name_googlemaps" in v for v in self.tkt_venues])} of these already have googlemaps name')

        for i, v in enumerate(self.tkt_venues, 1):

            print(f'venue {i}: {v["name"].upper()}...')

            if ('place_id' not in v) or ('name_googlemaps' in v):
                continue

            try:
                place_details = self.gmaps.place(v['place_id'])['result']
            except Exception:
                print(f'can\'t get any place details for place_id {v["place_id"]}')
                self._save_venues()
                return self

            for key, label, extract in self._PLACE_FIELDS:
                try:
                    v[key] = extract(place_details)
                except (KeyError, TypeError, ValueError, AttributeError):
                    # the detail is absent or malformed in the response
                    print(f'no {label} found for place_id {v["place_id"]}!')

        self._save_venues()

        return self
        
        
    
if __name__ == '__main__':
    
    # Script entry point: create the matcher (with the default arguments it
    # starts from the locally saved venue list) and fetch Google Maps place
    # details for the venues.
    vm = VenueMatcher()
    # The step that rebuilds the venue list from the Ticketek tables is
    # currently disabled:
#     vm.select_ticketek_venues()
    vm.get_googlemaps_place_info()

2886 ticketek venues at the moment
collected 2886 venues from the locally saved file data/tkt_venues.json
1002 of these already have googlemaps name
venue 1: ABC SOUTHBANK CENTRE MELBOURNE...
venue 2: AVOCA BEACH THEATRE 69 AVOCA DRIVE AVOCA...
venue 3: ACCA SOUTHBANK MELBOURNE...
venue 4: ACADEMY BUNDA STREET CIVIC...
venue 5: ADELAIDE ENT CENTRE...
venue 6: ACMI...
venue 7: SYDNEY ATHLETIC CENTRE HOMEBUSH...
venue 8: ACADEMY CINEMA CITY ADELAIDE...
venue 9: THE ANNANDALE HOTEL...
venue 10: AUSTRALIAN GOLF CLUB...
venue 11: SYDNEY ART GALLERY ULTIMO...
venue 12: AIS ARENA CANBERRA...
venue 13: ROYAL RANDWICK...
venue 14: CHAPEL HALL ALBURY SCOTS COLLEGE NSW...
venue 15: ALTONA SPORT LEISURE CENTRE MELBOURNE...
venue 16: ALEXANDRA HILLS HOTEL QLD...
venue 17: ALMA SPORTS CLUB...
venue 18: AMA...
venue 19: AMBIWERRA FESTIVAL ERINVALE ST CORINDA...
venue 20: ANA HOTEL GOLD COAST...
venue 21: ANDERSON CINEMA VISCTORIA GARDENS...
venue 22: AUSTRALIAN NATIONAL THEATRE MELBOURNE...
venue 23: