In [49]:
import pandas as pd
import json
import math
from collections import defaultdict
from itertools import chain

In [50]:
# Root folder for every input file read in this notebook. Sub-paths are appended
# to it by plain string concatenation, so the trailing slash is required.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
DATA_DIR = '/Users/ik/Data/'

class StringNormalizer(object):
    """
    Normalise free text so that names can be compared by simple containment.

    Normalisation lower-cases the text, drops every character that is neither
    alphanumeric nor whitespace, removes English stop words, expands a few
    common abbreviations and maps alternative country names onto one
    canonical name.
    """
    def __init__(self):

        # english stopwords: one word per line, blank lines ignored
        with open(DATA_DIR + 'stopwords' + '/english_stopwords_nltk.txt', 'r') as stopwords_file:
            self.STOP_WORDS = {line.strip() for line in stopwords_file if line.strip()}
        # country abbreviation according to the UN(!) standards
        # (loaded here but not used by normalize())
        self.COUNTRY_ABBRS = pd.read_csv(DATA_DIR + 'country-abbreviations' + '/country_abbreviations.txt', header=None,
                                        names="country abbr1 abbr2".split())
        # abbreviation -> expansion, applied word by word
        self.MISC_ABBRS = {"champs": "championships", "champ": "championship", "intl": "international",
                           "int": "international", "aust": "australian"}
        # canonical country name -> alternative names that are rewritten to it
        self.COUNTRY_ALT = {"united states": ["usa", "united states of america", "us"],
                            "russia": ["russian federation"], "chinese taipei": ["taiwan"], "macedonia": ["fyrom"],
                            "netherlands": ["holland"]}

    def normalize(self, s):
        """
        Return the normalised form of `s`.

        Parameters
        ----------
        s : str
            Text to normalise; may be empty.

        Returns
        -------
        str
            Lower-case words separated by single spaces, with no leading or
            trailing whitespace. An empty or all-punctuation input gives "".

        Raises
        ------
        AssertionError
            If `s` is not a string.
        """
        # check if s is really a string
        assert isinstance(s, str), 'you are trying to normalise something that is NOT a string!'

        # keep letters, digits and whitespace only, lower-cased
        cleaned = "".join(ch.lower() for ch in s if ch.isalnum() or ch.isspace())
        # remove stopwords
        words = [w for w in cleaned.split() if w not in self.STOP_WORDS]
        # unfold "other" abbreviations
        words = [self.MISC_ABBRS.get(w, w) for w in words]

        # Pad ONCE so that whole words can be matched with ' name ' patterns,
        # and strip the padding again before returning. Padding inside the loop
        # would leave a growing run of spaces around the result and break
        # substring comparisons between two normalised strings.
        padded = ' ' + ' '.join(words) + ' '
        for country, alt_names in self.COUNTRY_ALT.items():
            for alt_name in alt_names:
                # stop words were already removed from the text, so remove them
                # from the alternative name too, otherwise a name such as
                # "united states of america" could never match
                alt_key = ' '.join(w for w in alt_name.split() if w not in self.STOP_WORDS)
                if alt_key:
                    padded = padded.replace(' ' + alt_key + ' ', ' ' + country + ' ')
        return padded.strip()

class SportIdentifier(object):
    """
    Rule-based guess of which supported sport (if any) a text description is about.

    The per-sport identifiers (competitions, team nicknames, abbreviations,
    sponsors, key participants) are read from a JSON file keyed by sport name.
    `pick_sport` is the entry point; `_find_identifiers` collects the evidence
    that `pick_sport` applies its rules to.
    """
    def __init__(self):

        with open(DATA_DIR + 'sports/' + 'sports-identifiers/' + "sports-identifiers-21092017.json", "r") as sids_file:
            self.sids = json.load(sids_file)
        print("supported sports: {}".format(len(self.sids)))
        print(" ".join(["{}. {}".format(i, sport) for i, sport in enumerate(sorted([sp for sp in self.sids]), 1)]))

        # feature weights (not referenced by the methods of this class)
        self.w = {"nongeneric_comps": lambda _: 1,
                  "generic_comps": lambda _: _*0.25, # so 1 generic comp gets 0.25, 2 get 0.5
                  "sport_name": lambda _: 1, "teams": lambda _:  _*0.2,  # 1 team gets 0.2, 2 get 0.4
                     "spons": lambda _: 0.5, "abbreviations": lambda _: _}

        # words that mark a description as a show/festival rather than a sport
        self.SHOW_SYNS = set("""appearance display fair pageant parade presentation program spectacle expo exposition
                    fanfare fireworks grandstand occurrence pageantry panoply representation shine showboat
                    showing sight splash view anniversary commemoration competition fair feast gala
                    holiday carnival entertainment festivities fete fiesta jubilee merrymaking trear
                    bazar celebration display exhibit festival gala market pageant
                    show centennial occasion spectacle act concert portrayal production burlesque
                    ceremony gig matinee recital rehearsal revue rigmarole rite special
                    spectacle stunt stage circus""".split())

        # event types that are never sport
        self.NONSPORT_TYPES = {"theatre", "theater", "movie", "cinema", "circus", "opera", "musical",
                              "exhibition", "market", "event", "encounter", "night", "casino", "comedy",
                              "trivia", "charity", "fundraiser", "museum"}

        self.norm = StringNormalizer()


    def _normalized(self, text):
        """Normalise `text` and strip outer whitespace so substring checks are reliable."""
        # the normaliser may hand back text with padding spaces around it;
        # left in place, that padding would defeat every `needle in haystack` test
        return self.norm.normalize(text).strip()


    def _count_matches(self, candidates, norm_s):
        """Count the candidates whose normalised form occurs as a substring of `norm_s`."""
        hits = 0
        for candidate in candidates:
            needle = self._normalized(candidate)
            # an empty needle is contained in every string - never count it
            if needle and needle in norm_s:
                hits += 1
        return hits


    def _find_identifiers(self, s):
        """
        Collect, for every sport, how many of its identifiers occur in `s`.

        Returns a nested defaultdict {sport: {feature: count}} that only holds
        sports and features with at least one hit. Feature keys are
        "generic_comps", "nongeneric_comps", "team_nicknames", "sport_name",
        "abbreviations", "spons" and "teams" (key participants).
        """
        comps = defaultdict(lambda: defaultdict(int))

        # loop-invariant: normalise / tokenise the description once
        norm_s = self._normalized(s)
        raw_tokens = set(s.split())

        for sport, info in self.sids.items():
            # competitions
            for ckind in ["generic", "nongeneric"]:
                hits = self._count_matches(info["competitions"][ckind], norm_s)
                if hits:
                    comps[sport][ckind + "_" + 'comps'] = hits
            # nicknames
            if "team_nicknames" in info:
                hits = self._count_matches(info["team_nicknames"], norm_s)
                if hits:
                    comps[sport]["team_nicknames"] = hits
            # sport names: "a (b)" provides the two names "a" and "b"
            if "(" in sport:
                sport_names = [sport_name.strip() for sport_name in sport.replace(")", "").split("(")]
            else:
                sport_names = [sport]
            hits = self._count_matches(sport_names, norm_s)
            if hits:
                comps[sport]["sport_name"] = hits
            # abbreviations: compared with the raw whitespace-separated tokens
            # of s, i.e. case-sensitive and without normalisation
            if "abbreviations" in info:
                cmn_abbrs = set(info["abbreviations"]) & raw_tokens
                if cmn_abbrs:
                    comps[sport]['abbreviations'] = len(cmn_abbrs)
            # sponsors
            if "sponsors" in info:
                hits = self._count_matches(info["sponsors"], norm_s)
                if hits:
                    comps[sport]['spons'] = hits
            # participants (stored under "teams", the same key the weights use)
            if "key_participants" in info:
                hits = self._count_matches(info["key_participants"], norm_s)
                if hits:
                    comps[sport]['teams'] = hits

        return comps


    def pick_sport(self, s):
        """
        Return the name of the first sport whose identifiers satisfy one of the
        rules below, or None when `s` does not look like a sport event.

        Parameters
        ----------
        s : str
            Event description. Anything that is not a string (for example a
            missing value) yields None.

        Side effects
        ------------
        Stores the evidence gathered for `s` in `self.d`.
        """
        # missing / non-text descriptions cannot be classified
        if not isinstance(s, str):
            return None

        # if any show synonym or non-sport word found, it's clearly not a sport
        words_in_descr = set(s.lower().split())

        if (self.SHOW_SYNS | self.NONSPORT_TYPES) & words_in_descr:
            return None

        # now if there's a chance that this is sport...
        self.d = self._find_identifiers(s)

        # the description consists of exactly one word after normalisation
        single_word = len(self._normalized(s).split()) == 1

        for sport, found in self.d.items():
            # participant hits are recorded under "teams" by _find_identifiers
            has_participants = "teams" in found
            has_generic = "generic_comps" in found

            # first, check for the specific competitions
            if "nongeneric_comps" in found:
                if ("abbreviations" in found) or has_participants or ("sport_name" in found):
                    return sport
            # 1 nickname and 1 participant
            if ("team_nicknames" in found) and has_participants:
                return sport
            # 2 participants and generic competitions
            # NOTE(review): this needs MORE than one generic-competition hit;
            # confirm whether a single hit was meant to be enough
            if has_participants and has_generic:
                if (found["teams"] > 1) and (found["generic_comps"] > 1):
                    return sport
            # 1 abbreviation and a generic competition
            if ("abbreviations" in found) and has_generic:
                return sport
            # sport name on its own (one-word description) or with a generic competition
            if "sport_name" in found:
                if single_word or has_generic:
                    return sport

        return None

In [51]:
# Load the event descriptions from the gzip-compressed CSV, decoded as latin-1.
tkt_events = pd.read_csv(DATA_DIR + 'events/' + 'all-events-18092017.csv.gz', encoding='latin-1')

In [52]:
# Preview the first rows to check which columns were loaded.
tkt_events.head()

Unnamed: 0,event_id,description
0,1,unknown - - - - - - - - -
1,2,unknown - - - - - - - - -
2,3,r carlos nakai - iwaki auditorium r. carlos ...
3,4,karen schaupp - iwaki auditorium karin scha...
4,5,unknown - - - - - - - - -


In [None]:
import time

# Only events whose event_id lies in range(EVENT_ID_LIMIT) are classified;
# all other rows get a missing value in the new "is_sport" column.
EVENT_ID_LIMIT = 20000

si = SportIdentifier()

t0 = time.time()
in_scope = tkt_events.event_id.isin(range(EVENT_ID_LIMIT))
# "is_sport" holds the detected sport name, or None when nothing was detected
tkt_events["is_sport"] = tkt_events.loc[in_scope, "description"].apply(si.pick_sport)
print("elapsed time: {:.0f} m {:.0f} s".format(*divmod(time.time() - t0, 60)))

# show the events for which a sport was found
tkt_events[tkt_events.is_sport.notnull()]

supported sports: 30
1. badminton 2. baseball 3. bodybuilding 4. boxing 5. bullriding 6. cricket 7. crossfit 8. cycling 9. dancesport 10. darts 11. diving 12. equestrian 13. fencing 14. golf 15. gridiron (american football) 16. gymnastics 17. hockey 18. ice skating 19. karate 20. kickboxing 21. pentathlon 22. rowing 23. skating 24. squash 25. supercars 26. swimming 27. table tennis 28. volleyball 29. weightlifting 30. wrestling


In [23]:
# Look up the raw description text of a single event (event_id 2209).
tkt_events.loc[tkt_events.event_id == 2209, "description"].tolist()

['astor theatre season - prahran - melbourne  angelas ashes (m) sat 26 aug 2000 7:30pm   the astor theatre   the astor theatre   angelas ashes (m)     (2000)  sat 26 aug 2000 7:30pm']

In [24]:
# Display the country-abbreviation table held by StringNormalizer.
# Building a fresh instance here re-reads its input files from DATA_DIR.
StringNormalizer().COUNTRY_ABBRS    

Unnamed: 0,country,abbr1,abbr2
0,afghanistan,af,afg
1,albania,al,alb
2,algeria,dz,dza
3,american samoa,as,asm
4,andorra,ad,and
5,angola,ao,ago
6,anguilla,ai,aia
7,antarctica,aq,ata
8,antigua and barbuda,ag,atg
9,argentina,ar,arg
