In [110]:
import pandas as pd
import json
import re
from itertools import chain
from collections import defaultdict

In [106]:
class VenueMatcher:
    
    """
    all useful Ticketek venue information is contained in two tables which we join together
    """
    TKT_VENUES = pd.read_csv('data/sample_venue_dim.csv.gz', 
                             encoding='latin-1', 
                             error_bad_lines=False, 
                             sep='\t')[['pk_venue_dim', 'venue_name', 'venue_desc', 'venue_region_desc']] \
    .join(pd.read_csv('data/sample_VenuesPowerWebAddresses.csv.gz', 
                             sep='\t', 
                             encoding='latin-1')[['venue_name', 'vcName', 'paAddressLine1', 'paAddressLine2','vcRegionName']] \
                             .set_index('venue_name'), on='venue_name', how='left').fillna('')
    
    def __init__(self):
        
        self.suburbs = json.load(open('data/aus_suburbs_auspost_APR2017.json'))
        
        pass
    
    def select_ticketek_venues(self):
        """
        select and restructure relevant ticketek venue data
        """
        VenueMatcher.TKT_VENUES = VenueMatcher.TKT_VENUES[VenueMatcher.TKT_VENUES['venue_name'].str.isalpha()]
        print(f'venues with names: {len(VenueMatcher.TKT_VENUES)} rows, {len(set(VenueMatcher.TKT_VENUES.pk_venue_dim))} unique keys')
        
        return self
    
    def _normalize(self, st):
        """
        normalize a string st
        """
        st = st.lower()
        # replace separators with white spaces
        st = re.sub(r'[-/_.]', ' ', st)
        # keep only letters, numbers and white spaces
        st = ''.join([l for l in st if str(l).isalnum() or str(l).isspace()])
        st = re.sub(r'\s{2,}', ' ', st)
        
        return st
    
    def _find_state(self, st):
        """
        find state names in string st
        """
        states = {'nsw': 'new south wales', 
                    'act': 'australian capital territory', 
                    'vic': 'victoria',
                    'tas': 'tasmania',
                    'wa': 'western australia',
                    'nt': 'northern teritory',
                    'sa': 'south australia',
                    'qld': 'queensland'}
        
        states_ = {v: k for k, v in states.items()}
        
        states_found = set()
        
        st_norm = self._normalize(st)
        
        for s in (set(states) | set(states_)):
            try:
                states_found.add(re.search(r'\b' + s + r'\b', st_norm).group(0))
            except:
                continue
        
        return {s if s not in states_ else states_[s] for s in states_found}
    
    def _find_suburb(self, st):
        """
        find suburb names in string st; returns a set of tuples (suburb, state)
        """
        st_norm = self._normalize(st)
        print(st_norm)
        
        suburbs_found = set()
        
        words_ = st_norm.split()
        
        for i, w in enumerate(words_):
            
            l1_ = w[0]
            
            if l1_ in self.suburbs:
            
                for r in self.suburbs[l1_]:
                    
                    sub_ = None
                    
                    try:
                        sub_ = re.search(r'\b' + r['name'] + r'\b', ' '.join(words_[i:])).group(0)
                    except:
                        continue
                        
                    if sub_:
                        suburbs_found.add((sub_, r['state']))
                    
        return suburbs_found 
    
if __name__ == '__main__':
    
    vm = VenueMatcher()
    vm.select_ticketek_venues()

venues with names: 3037 rows, 3037 unique keys


In [107]:
vm._normalize("svoca be3ch 0---- hotel nsw -99u western australia in #$ canberra " + str('NaN'))

'svoca be3ch 0 hotel nsw 99u western australia in canberra nan'

In [108]:
vm._find_state("svoca be3ch 0---- hotel nsw -99u western australia in #$ canberra ")

{'nsw', 'wa'}

In [109]:
vm._find_suburb("avoca beach 0---- coogee hotel nsw -99u western australia in #$ canberra ")

avoca beach 0 coogee hotel nsw 99u western australia in canberra 


{('avoca', 'nsw'),
 ('avoca', 'qld'),
 ('avoca', 'tas'),
 ('avoca', 'vic'),
 ('avoca beach', 'nsw'),
 ('canberra', 'act'),
 ('coogee', 'nsw'),
 ('coogee', 'wa')}

In [72]:
vm.TKT_VENUES.head()

Unnamed: 0,pk_venue_dim,venue_name,venue_desc,venue_region_desc,vcName,paAddressLine1,paAddressLine2,vcRegionName
2,3,ABC,ABC SOUTHBANK CENTRE - MELBOURNE,MELBOURNE REGION,alex test,,,NSW/ACT
3,4,ABT,AVOCA BEACH THEATRE - 69 AVOCA DRIVE AVOCA,SYDNEY REGION,Avoca Beach Theatre,69 Avoca Drive,,NSW - Central Coast
4,5,ACA,ACCA SOUTHBANK - MELBOURNE,MELBOURNE REGION,,,,
5,6,ACB,ACADEMY BUNDA STREET - CIVIC,SYDNEY REGION,Academy,Bunda Street,,ACT
6,7,ACC,ADELAIDE ENT. CENTRE,ADELAIDE REGION,,,,


In [20]:
ws = [w for _ in l for w in _.lower().split()]

In [73]:
vm.suburbs

{'a': [{'name': 'aarons pass', 'state': 'nsw', 'postcode': 2850},
  {'name': 'abba river', 'state': 'wa', 'postcode': 6280},
  {'name': 'abbey', 'state': 'wa', 'postcode': 6280},
  {'name': 'abbeyard', 'state': 'vic', 'postcode': 3737},
  {'name': 'abbeywood', 'state': 'qld', 'postcode': 4613},
  {'name': 'abbotsbury', 'state': 'nsw', 'postcode': 2176},
  {'name': 'abbotsford', 'state': 'vic', 'postcode': 3067},
  {'name': 'abbotsford', 'state': 'qld', 'postcode': 4670},
  {'name': 'abbotsford', 'state': 'nsw', 'postcode': 2046},
  {'name': 'abbotsham', 'state': 'tas', 'postcode': 7315},
  {'name': 'abeckett street', 'state': 'vic', 'postcode': 8006},
  {'name': 'abels bay', 'state': 'tas', 'postcode': 7112},
  {'name': 'abercorn', 'state': 'qld', 'postcode': 4627},
  {'name': 'abercrombie', 'state': 'nsw', 'postcode': 2795},
  {'name': 'abercrombie river', 'state': 'nsw', 'postcode': 2795},
  {'name': 'aberdare', 'state': 'nsw', 'postcode': 2325},
  {'name': 'aberdeen', 'state': 'tas'