In [1]:
import json
import math
import os
import time
from collections import defaultdict, Counter
from itertools import chain

import pandas as pd

In [2]:
# Root folder that holds every input data set used by this notebook.
# The location can be overridden with the SPORTS_DATA_DIR environment variable so the
# notebook is not tied to one user's machine; the previous hard-coded path stays the default.
# os.path.join(..., '') guarantees a trailing separator, which matters because the paths
# below are built by plain string concatenation (DATA_DIR + 'stopwords' + ...).
DATA_DIR = os.path.join(os.environ.get('SPORTS_DATA_DIR', '/Users/ik/Data/'), '')

class StringNormalizer(object):
    """
    Normalise free text so that identifiers can be compared as whole words.

    Attributes set by the constructor:
      STOP_WORDS    -- set of stopwords, one per non-empty line of the stopwords file
      COUNTRY_ABBRS -- DataFrame with columns country / abbr1 / abbr2
                       (loaded here; not used by normalize)
      MISC_ABBRS    -- single-word abbreviation -> expansion
      COUNTRY_ALT   -- canonical country name -> list of alternative names
    """
    def __init__(self):

        # english stopwords; the context manager closes the file handle once it is read
        with open(DATA_DIR + 'stopwords' + '/english_stopwords_nltk.txt', 'r') as stopwords_file:
            self.STOP_WORDS = {line.strip() for line in stopwords_file if line.strip()}
        # country abbreviation according to the UN(!) standards
        self.COUNTRY_ABBRS = pd.read_csv(DATA_DIR + 'country-abbreviations' + '/country_abbreviations.txt', header=None,
                                        names="country abbr1 abbr2".split())
        self.MISC_ABBRS = {"champs": "championships", "champ": "championship", "intl": "international",
                           "int": "international", "aust": "australian"}
        self.COUNTRY_ALT = {"united states": ["usa", "united states of america", "us"],
                              "russia": ["russian federation"], "chinese taipei": ["taiwan"], "macedonia": ["fyrom"],
                                  "netherlands": ["holland"]}

    @staticmethod
    def _replace_phrase(tokens, phrase, replacement):
        """Return a copy of `tokens` with every whole-word occurrence of `phrase` swapped for `replacement`."""
        width = len(phrase)
        result = []
        pos = 0
        while pos < len(tokens):
            if tokens[pos:pos + width] == phrase:
                result.extend(replacement)
                pos += width
            else:
                result.append(tokens[pos])
                pos += 1
        return result

    def normalize(self, s):
        """
        Lower-case `s`, drop every character that is neither alphanumeric nor whitespace,
        remove stopwords, expand the abbreviations in MISC_ABBRS and map alternative
        country names to their canonical name.

        Returns the resulting words joined by single spaces and padded with ONE leading
        and ONE trailing space (an empty result is two spaces). The padding is kept on
        purpose: it lets callers test `normalize(a) in normalize(b)` as a whole-word match.

        Raises AssertionError when `s` is not a str.
        """
        # check if s is really a string
        assert isinstance(s, str), 'you are trying to normalise something that is NOT a string!'
        # note s might still be empty but that's fine: it simply yields no tokens
        cleaned = "".join([ch.lower() for ch in s if ch.isalnum() or ch.isspace()])
        # remove stopwords
        tokens = [w for w in cleaned.split() if w not in self.STOP_WORDS]
        # unfold "other" abbreviations
        tokens = [self.MISC_ABBRS.get(w, w) for w in tokens]
        # country alternative names
        for country, alt_names in self.COUNTRY_ALT.items():
            country_tokens = country.split()
            for alt_name in alt_names:
                # stopwords were already stripped from the text, so strip them from the
                # alternative name too; otherwise a name containing a stopword could never match
                alt_tokens = [w for w in alt_name.split() if w not in self.STOP_WORDS]
                # a name made only of stopwords has nothing left to match on
                if alt_tokens:
                    # token-wise replacement also catches adjacent repeats ("usa usa"),
                    # which str.replace on space-padded needles misses
                    tokens = self._replace_phrase(tokens, alt_tokens, country_tokens)
        return ' ' + ' '.join(tokens) + ' '

class SportIdentifier(object):
    """
    Decide which supported sport (if any) a free-text event description refers to.

    The sport definitions are read from a JSON file into `self.sids`, a dict keyed by
    sport name. Per sport the code looks up the optional keys "competitions" (with
    "generic" / "nongeneric" entries), "team_nicknames", "abbreviations", "sponsors"
    and "key_participants".
    """

    # Number of matched key participants that counts as decisive evidence.
    # NOTE(review): presumably two named participants indicate a head-to-head fixture -- confirm.
    PARTICIPANTS_IN_FIXTURE = 2

    def __init__(self):

        # the context manager closes the file handle once the JSON is parsed
        with open(DATA_DIR + 'sports/' + 'sports-identifiers/' + "sports-identifiers-27092017.json", "r") as sids_file:
            self.sids = json.load(sids_file)
        print("supported sports: {}".format(len(self.sids)))
        print(" ".join(["{}. {}".format(i, sport) for i, sport in enumerate(sorted(self.sids), 1)]))

        # NOTE(review): "trear" below looks like a typo (possibly "treat"); left unchanged -- confirm.
        self.SHOW_SYNS = set("""appearance display fair pageant parade presentation program spectacle expo exposition
                    fanfare fireworks grandstand occurrence pageantry panoply representation shine showboat
                    showing sight splash view anniversary commemoration competition fair feast gala
                    holiday carnival entertainment festivities fete fiesta jubilee merrymaking trear
                    bazar celebration display exhibit festival gala market pageant
                    show centennial occasion spectacle act concert portrayal production burlesque
                    ceremony gig matinee recital rehearsal revue rigmarole rite special
                    spectacle stunt stage circus""".split())
        self.MEAL_SYNS = set("breakfast lunch dinner banquet feast supper".split())

        self.NONSPORT_TYPES = {"theatre", "theater", "movie", "cinema", "circus", "opera", "musical",
                              "exhibition", "market", "event", "encounter", "night", "casino", "comedy",
                              "trivia", "charity", "fundraiser", "museum", "donation", "parking"}

        self.norm = StringNormalizer()
        # identifier -> normalised identifier; filled lazily by _normalized_identifier
        self._norm_cache = {}

    def _normalized_identifier(self, identifier):
        """
        Return `self.norm.normalize(identifier)`, memoised per identifier string.

        The identifiers come from `self.sids` and are compared against every description,
        so normalising each of them once instead of once per description removes most of
        the work. The cache assumes `self.norm` is not reconfigured after first use.
        """
        # setdefault keeps instances that were built without __init__ working
        cache = self.__dict__.setdefault("_norm_cache", {})
        if identifier not in cache:
            cache[identifier] = self.norm.normalize(identifier)
        return cache[identifier]

    def _count_matches(self, identifiers, normalized_text):
        """
        Count the entries of `identifiers` whose normalised form occurs in `normalized_text`.

        `identifiers` may be None or empty. Entries that are not strings, or that
        normalise to nothing but whitespace, are skipped rather than raising.
        """
        hits = 0
        for identifier in identifiers or ():
            if not isinstance(identifier, str):
                continue
            needle = self._normalized_identifier(identifier)
            # a blank needle would "match" a blank description
            if needle.strip() and needle in normalized_text:
                hits += 1
        return hits

    @staticmethod
    def _sport_names(sport):
        """Split a key such as "afl (australian football league)" into its individual names."""
        if "(" in sport:
            return [sport_name.strip() for sport_name in sport.replace(")", "").split("(")]
        return [sport]

    def _find_identifiers(self, s):
        """
        Collect, per sport, how many identifiers of each kind occur in the description `s`.

        Returns a defaultdict mapping sport -> defaultdict(int) whose keys are a subset of
        "generic_comps", "nongeneric_comps", "team_nicknames", "sport_name",
        "abbreviations", "sponsors" and "key_participants". Only kinds with at least one
        hit are stored, and sports without any hit get no entry.
        """
        comps = defaultdict(lambda: defaultdict(int))

        # the description does not change inside the loop: normalise it once
        normalized_text = self.norm.normalize(s)
        # NOTE(review): abbreviations are compared against the raw, case-sensitive words of
        # `s`, unlike every other identifier -- kept as is; confirm this is intentional.
        raw_words = set(s.split())

        for sport, entry in self.sids.items():
            competitions = entry.get("competitions")
            if not isinstance(competitions, dict):
                # missing or malformed competitions simply contribute no evidence
                competitions = {}

            counts = (
                ("generic_comps", self._count_matches(competitions.get("generic"), normalized_text)),
                ("nongeneric_comps", self._count_matches(competitions.get("nongeneric"), normalized_text)),
                ("team_nicknames", self._count_matches(entry.get("team_nicknames"), normalized_text)),
                ("sport_name", self._count_matches(self._sport_names(sport), normalized_text)),
                ("abbreviations", len(set(entry.get("abbreviations") or ()) & raw_words)),
                ("sponsors", self._count_matches(entry.get("sponsors"), normalized_text)),
                # stored under the same key pick_sport reads
                ("key_participants", self._count_matches(entry.get("key_participants"), normalized_text)),
            )
            for kind, hits in counts:
                if hits:
                    comps[sport][kind] = hits

        return comps

    def _is_confident(self, evidence):
        """
        Tell whether the identifier counts found for one sport are enough to accept it.

        Rules:
          * a non-generic competition plus any of abbreviation / key participant /
            sport name / team nickname;
          * a generic competition plus either PARTICIPANTS_IN_FIXTURE key participants
            or any of abbreviation / sport name / team nickname;
          * an abbreviation or a sponsor plus PARTICIPANTS_IN_FIXTURE key participants.
        """
        found = set(evidence)
        # .get() neither raises nor inserts a zero entry into the defaultdict
        has_fixture = evidence.get("key_participants") == self.PARTICIPANTS_IN_FIXTURE

        # first, check for the specific competitions
        if "nongeneric_comps" in found:
            return bool(found & {"abbreviations", "key_participants", "sport_name", "team_nicknames"})
        if "generic_comps" in found and (
                has_fixture or found & {"abbreviations", "sport_name", "team_nicknames"}):
            return True
        if "abbreviations" in found or "sponsors" in found:
            return has_fixture
        return False

    def pick_sport(self, s):
        """
        Return the name of the sport that the description `s` is about, or None.

        None is returned when `s` is not a string (e.g. NaN from a DataFrame), when it
        contains any word from SHOW_SYNS or NONSPORT_TYPES, or when no sport has enough
        evidence. Sports are tried in the iteration order of `self.sids` and the first
        one accepted wins; a sport with insufficient evidence no longer stops the search.

        Side effect: the per-sport evidence of the last classified description is kept
        in `self.d`.
        """
        if not isinstance(s, str):
            return None

        # if any show synonym or non-sport word found, it's clearly not a sport
        words_in_descr = set(s.lower().split())

        if (self.SHOW_SYNS | self.NONSPORT_TYPES) & words_in_descr:
            return None

        # now if there's a chance that this is sport...
        self.d = self._find_identifiers(s)

        for sport, evidence in self.d.items():
            if self._is_confident(evidence):
                return sport
        return None


In [None]:
# Load the gzipped dump of all events; the file is decoded as latin-1.
tkt_events = pd.read_csv(
    "".join([DATA_DIR, 'events/', 'all-events-18092017.csv.gz']),
    encoding='latin-1',
)
# report how many rows will be classified
print("total events to check: {}".format(tkt_events.shape[0]))

total events to check: 397753


In [None]:
si = SportIdentifier()

t0 = time.time()
# read_csv turns empty descriptions into float NaN; only real strings are classified,
# anything else is labelled None (= no sport) instead of failing inside pick_sport.
tkt_events["is_sport"] = tkt_events.description.apply(
    lambda descr: si.pick_sport(descr) if isinstance(descr, str) else None)
# round to whole seconds BEFORE splitting into minutes/seconds so the seconds part never shows 60
print("elapsed time: {:.0f} m {:.0f} s".format(*divmod(round(time.time() - t0), 60)))
# keep only the rows for which a sport was recognised
tkt_events[tkt_events.is_sport.notnull()].to_csv("all-sport-events.csv")

supported sports: 44
1. afl (australian football league) 2. archery 3. badminton 4. baseball 5. bodybuilding 6. boxing 7. bullriding 8. canoeing 9. cricket 10. crossfit 11. cycling 12. dancesport 13. darts 14. diving 15. equestrian 16. fencing 17. golf 18. gridiron (american football) 19. gymnastics 20. handball 21. hockey 22. ice skating 23. karate 24. kickboxing 25. lacrosse 26. motorcycle racing 27. nrl (national rugby league) 28. pentathlon 29. rowing 30. rugby union 31. sailing 32. shooting 33. skating 34. softball 35. squash 36. supercars 37. swimming 38. table tennis 39. taekwondo 40. triathlon 41. volleyball 42. water polo 43. weightlifting 44. wrestling


In [None]:
# Read the exported sport events back from disk.
# The file was written with its row index as the first (unnamed) column, so use that
# column as the index again instead of getting a spurious "Unnamed: 0" data column.
sports = pd.read_csv("all-sport-events.csv", index_col=0)

In [None]:
# preview the first five recognised sport events
sports.head(n=5)

In [None]:
# number of recognised events per sport
Counter(sports["is_sport"])

In [None]:
# column dtypes and non-null counts of the exported subset (printed to stdout)
sports.info(buf=None)