In [24]:
import pandas as pd
import json
from collections import defaultdict
from itertools import chain

In [36]:
DATA_DIR = '/Users/ik/Data/'

class StringNormalizer(object):
    """
    normalise a string
    """
    def __init__(self):
        
        # english stopwords
        self.STOP_WORDS = {line.strip() for line in open(DATA_DIR + 'stopwords' + '/english_stopwords_nltk.txt', 'r').readlines() if line.strip()}      

    def normalize(self, s):
        
        # check if s is really a string
        assert isinstance(s, str), 'you are trying to normalise something that is NOT a string!'
        # note s might still be empty but that's fine as we take care of it next
        _ = "".join([ch.lower() for ch in s if ch.isalnum() or ch.isspace()])
        # remove stopwords
        _ = " ".join([w for w in _.split() if w not in self.STOP_WORDS])
        
        return _

class SportIdentifier(object):
    """
    """
    def __init__(self):
        self.sids = json.load(open(DATA_DIR + 'sports/' + 'sports-identifiers/' + "sports-identifiers-12092017.json","r"))
        
    def _find_competitions(self, s):
        
        comps = defaultdict(int)
        for sport in self.sids:
            for ckind in ["generic", "nongeneric"]:
                for gc in self.sids[sport]["competitions"][ckind]:
                    if gc in s:
                        comps["_".join([sport, ckind[0], 'comps'])] += 1
        return comps
    
    def _find_sport_name(self, s):
        comps = defaultdict(int)
        for sport in self.sids:
            if sport in s.split():
                comps[sport] += 1
        return comps
    
    def _find_abbreviation(self, s):
        comps = defaultdict(int)
        for sport in self.sids:
            if set(self.sids[sport]["abbreviations"]) & set(s.split()):
                comps["_".join([sport, 'abbr'])] += 1
        return comps
    
    def find_identifiers(self, s):
        return {k: v for k, v in chain.from_iterable([self._find_competitions(s).items(), 
                                                      self._find_sport_name(s).items(),
                                                     self._find_abbreviation(s).items()])}
            
        

In [37]:
si = SportIdentifier()

In [39]:
si._find_abbreviation('cfa top tour australia')

KeyError: 'abbreviations'

In [40]:
si.sids

{'badminton': {'abbreviations': ['bwf'],
  'competitions': {'generic': ['international series', 'championships'],
   'nongeneric': ['sudirman cup',
    'li-ning sydney',
    'australian open superseries']}},
 'cycling': {'abbreviations': ['uci', 'its', 'ncxs', 'bmx'],
  'competitions': {'generic': ['world championships',
    'classic',
    'challenge',
    'series',
    'race'],
   'nongeneric': ['mountain bike world championships',
    'road world championships',
    'battle recharge',
    'junior track series',
    'national road series',
    'masters road national championships',
    'omnium national championships',
    'national madison championships',
    'bay cycling classic',
    'santos tour down under',
    'cadel evans great ocean road race',
    'jayco herald sun tour',
    'oceania road championship',
    'mersey valley tour',
    'grafton to inverell',
    'bmx world championships',
    "amy's ride",
    'grafton to inverell cyclosportif',
    'marysville lake mountain cha

In [38]:
si.find_identifiers('fiba world badminton cup iff')

KeyError: 'abbreviations'

In [63]:
tkt_events = pd.read_csv('~/Data/ticketek-shows/all-tkt-events-11092017.csv.gz', encoding='latin-1', sep='\t')

In [62]:
def find_abbreviatons(s):
    return True if set(normalise(s).split()) & abbrs_volleyball else False

def find_countries(s):
    return True if len(set(normalise(s).split()) & cntrs_volleyball) == 2 else False

def find_competition(s):
    for compet in comps_volleyball:
        if ' ' + compet + ' ' in ' ' + s + ' ':
            return True
    return False     

In [70]:
abbrs = {"championships": ["champs"]}
country_alt = {"united states": ["usa", "united states of america", "us"],
              "russia": ["russian federation"],
              "chinese taipei": ["taiwan"],
              "macedonia": ["fyrom"],
              "netherlands": ["holland"]}
volleyball_words = {"volleyroos", "volleyball"}

In [35]:
si.sids

{'badminton': {'abbreviations': ['bwf'],
  'competitions': {'generic': ['international series', 'championships'],
   'nongeneric': ['sudirman cup',
    'li-ning sydney',
    'australian open superseries']}},
 'cycling': {'abbreviations': ['uci', 'its', 'ncxs', 'bmx'],
  'competitions': {'generic': ['world championships',
    'classic',
    'challenge',
    'series',
    'race'],
   'nongeneric': ['mountain bike world championships',
    'road world championships',
    'battle recharge',
    'junior track series',
    'national road series',
    'masters road national championships',
    'omnium national championships',
    'national madison championships',
    'bay cycling classic',
    'santos tour down under',
    'cadel evans great ocean road race',
    'jayco herald sun tour',
    'oceania road championship',
    'mersey valley tour',
    'grafton to inverell',
    'bmx world championships',
    "amy's ride",
    'grafton to inverell cyclosportif',
    'marysville lake mountain cha

In [72]:
tkt_events.loc[tkt_events.description.apply(find_abbreviatons) | tkt_events.description.apply(lambda x: True if volleyball_words & set(x.split()) else False),:]

Unnamed: 0,event_id,description
24054,24057,2001 goodwill games brisbane - all events ...
24055,24058,2001 goodwill games brisbane - all events ...
24056,24059,2001 goodwill games brisbane - all events ...
24057,24060,2001 goodwill games brisbane - all events ...
24058,24061,2001 goodwill games brisbane - all events ...
24059,24062,2001 goodwill games brisbane - all events ...
24060,24063,2001 goodwill games brisbane - all events ...
24061,24064,2001 goodwill games brisbane - all events ...
24062,24065,2001 goodwill games brisbane - all events ...
24063,24066,2001 goodwill games brisbane - all events ...
