In [39]:
import pandas as pd
import json
import math
from collections import defaultdict
from itertools import chain

In [44]:
# Root directory that holds every input file used in this notebook.
# Paths are built by plain string concatenation (DATA_DIR + 'subdir/...'),
# so the value must end with a path separator.
DATA_DIR = '/Users/ik/Data/'

class StringNormalizer(object):
    """
    Normalise free text so that phrases can be compared by substring matching.

    Attributes
    ----------
    STOP_WORDS : set of str
        English stopwords read from the stopwords file under DATA_DIR.
    COUNTRY_ABBRS : pandas.DataFrame
        Country abbreviation table with columns country, abbr1, abbr2
        (loaded here; not used by normalize()).
    MISC_ABBRS : dict
        Abbreviation -> expanded word, applied token by token in normalize().
    COUNTRY_ALT : dict
        Country name -> list of alternative names (not used by normalize()).
    """
    def __init__(self):

        # english stopwords: one per line, blank lines are skipped.
        # The file is read inside a `with` block so the handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(DATA_DIR + 'stopwords' + '/english_stopwords_nltk.txt', 'r') as stopwords_file:
            self.STOP_WORDS = {line.strip() for line in stopwords_file if line.strip()}
        self.COUNTRY_ABBRS = pd.read_csv(DATA_DIR + 'country-abbreviations' + '/country_abbreviations.txt', header=None,
                                        names="country abbr1 abbr2".split())
        self.MISC_ABBRS = {"champs": "championships", "champ": "championship", "intl": "international", "int": "international"}
        self.COUNTRY_ALT = {"united states": ["usa", "united states of america", "us"],
              "russia": ["russian federation"],
              "chinese taipei": ["taiwan"],
              "macedonia": ["fyrom"],
              "netherlands": ["holland"]}

    def normalize(self, s):
        """
        Return a lower-cased, cleaned-up copy of `s` padded with one space on each side.

        Steps: drop every character that is neither alphanumeric nor whitespace,
        lower-case the rest, remove stopwords, expand the abbreviations in
        MISC_ABBRS, join the tokens with single spaces and pad with a leading
        and a trailing space (so that `normalize(a) in normalize(b)` only
        matches whole words).

        Parameters
        ----------
        s : str
            Text to normalise; may be empty, in which case the result is two spaces.

        Returns
        -------
        str

        Raises
        ------
        AssertionError
            If `s` is not a string.
        """
        # check if s is really a string
        assert isinstance(s, str), 'you are trying to normalise something that is NOT a string!'
        # keep letters, digits and whitespace only; s may be empty, which is fine
        cleaned = "".join([ch.lower() for ch in s if ch.isalnum() or ch.isspace()])
        # remove stopwords first, then unfold "other" abbreviations on the survivors
        words = [self.MISC_ABBRS.get(w, w) for w in cleaned.split() if w not in self.STOP_WORDS]
        # pad with white spaces
        return ' ' + " ".join(words) + ' '

class SportIdentifier(object):
    """
    Guess which sport (if any) a free-text event description refers to.

    Usage: ``SportIdentifier().find_identifiers(text).score_sports()``.

    The identifiers come from a JSON file that is read as a mapping
    ``sport -> {"competitions": {"generic": [...], "nongeneric": [...]},
    "abbreviations": [...], "sponsors": [...], "key_participants": [...]}``;
    every key below the sport is treated as optional.

    Attributes
    ----------
    sids : dict
        The sport identifiers described above.
    w : dict
        Feature name -> function turning the feature count into a score contribution.
    SHOW_SYNS : set of str
        Words that mark a description as a show/festival; if any of them is
        present, no identifiers are collected at all.
    norm : StringNormalizer
        Normaliser applied to both the identifiers and the description.
    d : defaultdict
        sport -> {feature: count} collected by the latest find_identifiers() call.
    """
    def __init__(self):
        # read inside a `with` block so the file handle is closed straight away
        with open(DATA_DIR + 'sports/' + 'sports-identifiers/' + "sports-identifiers-21092017.json", "r") as sids_file:
            self.sids = json.load(sids_file)
        # feature weights
        self.w = {"nongeneric_comps": lambda n: 1,      # flat 1 however many were found
                  "generic_comps": lambda n: n*0.25,    # so 1 generic comp gets 0.25, 2 get 0.5
                  "sport_name": lambda n: 1,            # flat 1
                  # 1 team gets 0.2, two get 0.4
                  # NOTE(review): the original comment said "two get 1", which this
                  # formula does not do - confirm the intended weight
                  "teams": lambda n: n*0.2,
                  "spons": lambda n: 0.5,               # flat 0.5
                  "abbreviations": lambda n: n}         # 1 per matched abbreviation
        self.SHOW_SYNS = set("""appearance display fair pageant parade presentation program spectacle expo exposition
        fanfare fireworks grandstand occurrence pageantry panoply representation shine showboat
        showing sight splash view anniversary commemoration competition fair feast gala
        holiday carnival entertainment festivities fete fiesta jubilee merrymaking trear
        bazar celebration display exhibit festival gala market pageant
        show centennial occasion spectacle act concert portrayal production burlesque
        ceremony gig matinee recital rehearsal revue rigmarole rite special
        spectacle stunt stage circus""".split())
        self.norm = StringNormalizer()
        # start with an empty result so score_sports() is safe before find_identifiers()
        self.d = defaultdict(lambda: defaultdict(int))

    def _find_competitions(self, s):
        """
        Count the generic and non-generic competition names of each sport found in `s`.

        Returns a mapping sport -> {"generic_comps": n, "nongeneric_comps": m};
        only sports (and features) with at least one match are present.
        """
        comps = defaultdict(lambda: defaultdict(int))
        # the description does not change inside the loops: normalise it once
        norm_s = self.norm.normalize(s)
        for sport, ids in self.sids.items():
            # sports without competitions are skipped, like the other optional keys
            competitions = ids.get("competitions", {})
            for ckind in ["generic", "nongeneric"]:
                for comp in competitions.get(ckind, []):
                    if self.norm.normalize(comp) in norm_s:
                        comps[sport][ckind + "_" + 'comps'] += 1
        return comps

    def _find_sport_name(self, s):
        """
        Count how many names of each sport occur in `s`.

        A key such as "a (b)" is treated as two names, "a" and "b".
        Returns a mapping sport -> {"sport_name": n} for the matched sports.
        """
        comps = defaultdict(lambda: defaultdict(int))
        norm_s = self.norm.normalize(s)
        for sport in self.sids:
            if "(" in sport:
                sport_names = [sport_name.strip() for sport_name in sport.replace(")", "").split("(")]
            else:
                sport_names = [sport]
            for sp in sport_names:
                if self.norm.normalize(sp) in norm_s:
                    comps[sport]["sport_name"] += 1
        return comps

    def _find_abbreviation(self, s):
        """
        Count the abbreviations of each sport that appear as whitespace-separated tokens of `s`.

        Unlike the other finders, the tokens of `s` are compared as they are
        (no normalisation). Returns sport -> {"abbreviations": n} for the matched sports.
        """
        comps = defaultdict(lambda: defaultdict(int))
        # tokenise once rather than once per sport
        tokens = set(s.split())
        for sport, ids in self.sids.items():
            if "abbreviations" in ids:
                cmn_abbrs = set(ids["abbreviations"]) & tokens
                if cmn_abbrs:
                    comps[sport]['abbreviations'] = len(cmn_abbrs)
        return comps

    def _find_sponsors(self, s):
        """
        Count the sponsors of each sport mentioned in `s`.

        Returns sport -> {"spons": n} for the matched sports.
        """
        comps = defaultdict(lambda: defaultdict(int))
        norm_s = self.norm.normalize(s)
        for sport, ids in self.sids.items():
            if "sponsors" in ids:
                for sponsor in ids["sponsors"]:
                    if self.norm.normalize(sponsor) in norm_s:
                        comps[sport]['spons'] += 1
        return comps

    def _find_participants(self, s):
        """
        Count the key participants of each sport mentioned in `s`.

        Returns sport -> {"teams": n} for the matched sports.
        """
        comps = defaultdict(lambda: defaultdict(int))
        norm_s = self.norm.normalize(s)
        for sport, ids in self.sids.items():
            if "key_participants" in ids:
                n_found = sum(1 for participant in ids["key_participants"]
                              if self.norm.normalize(participant) in norm_s)
                if n_found > 0:
                    comps[sport]['teams'] = n_found

        return comps

    def find_identifiers(self, s):
        """
        Collect every sport identifier found in `s` into self.d.

        self.d is reset on every call. If `s` contains any word from SHOW_SYNS
        the description is taken to be a show and self.d is left empty.

        Parameters
        ----------
        s : str
            Event description.

        Returns
        -------
        SportIdentifier
            self, so that score_sports() can be chained.
        """
        self.d = defaultdict(lambda: defaultdict(int))

        if set(s.split()) & self.SHOW_SYNS:
            return self

        # every finder uses its own feature names, so merging never overwrites a count
        for k, v in chain.from_iterable([self._find_competitions(s).items(),
                                         self._find_sport_name(s).items(),
                                         self._find_abbreviation(s).items(),
                                         self._find_sponsors(s).items(),
                                         self._find_participants(s).items()]):
            self.d[k].update(v)

        return self

    def score_sports(self):
        """
        Score the sports collected by find_identifiers() and pick the likeliest one.

        The score of a sport is the sum of its weighted feature counts (see self.w).

        Returns
        -------
        str or None
            The top-scoring sport if its score is at least 1, otherwise None
            (also None when nothing was found). On equal scores the sport
            encountered first wins.
        """
        sport_scores = {sport: sum(self.w[f](n) for f, n in feats.items())
                        for sport, feats in self.d.items()}
        if not sport_scores:
            return None
        # find max score and pick the likeliest sport if it has scored at least 1
        top_sp, top_sc = max(sport_scores.items(), key=lambda kv: kv[1])
        return top_sp if top_sc >= 1 else None

In [45]:
# Load the events table from the gzipped CSV under DATA_DIR; the file is decoded as latin-1.
tkt_events = pd.read_csv(DATA_DIR + 'events/' + 'all-events-18092017.csv.gz', encoding='latin-1')

In [46]:
# Preview the first rows of the events table.
tkt_events.head()

Unnamed: 0,event_id,description
0,1,unknown - - - - - - - - -
1,2,unknown - - - - - - - - -
2,3,r carlos nakai - iwaki auditorium r. carlos ...
3,4,karen schaupp - iwaki auditorium karin scha...
4,5,unknown - - - - - - - - -


In [47]:
# Classify the events whose event_id is in 0..1999 and report how long it took.
si = SportIdentifier()
import time
# perf_counter() is the clock meant for measuring elapsed intervals
t0 = time.perf_counter()
# rows outside the selected subset get no value (NaN) in "is_sport" through index alignment
tkt_events["is_sport"] = tkt_events.loc[tkt_events.event_id.isin(range(2000)), "description"].apply(lambda x: si.find_identifiers(x).score_sports())
# truncate to whole seconds before splitting into minutes and seconds, so the
# seconds part can never be rounded up to 60
print("elapsed time: {:d} m {:d} s".format(*divmod(int(time.perf_counter() - t0), 60)))
# show only the events that were recognised as a sport
tkt_events[tkt_events.is_sport.notnull()]

# print("sport: {}".format(si.find_identifiers("big bash").score_sports()))

elapsed time: 1 m 1 s


Unnamed: 0,event_id,description,is_sport
28,29,samsung hockey champions - canberra hockey cen...,hockey
69,70,iska kickboxing titles - bicentennial centre i...,kickboxing
201,202,dancesport championship - canberra dancesport ...,dancesport
202,203,dancesport championship - canberra dancesport ...,dancesport
225,226,1999 foxsport canberra cup - ais arena 1999 fo...,gymnastics
226,227,1999 foxsport canberra cup - ais arena 1999 fo...,gymnastics
227,228,1999 foxsport canberra cup - ais arena 1999 fo...,gymnastics
228,229,1999 foxsport canberra cup - ais arena 1999 fo...,gymnastics
245,246,national capital dancesport championships danc...,dancesport
246,247,national capital dancesport championships danc...,dancesport


In [48]:
# Look at the full description text of event 1594.
tkt_events.loc[tkt_events.event_id == 1594, "description"].tolist()

['vodafone wallabies v south africa - suncorp wallabies v south africa  pgm offer 2 aug 2003   suncorp stadium   suncorp stadium  wallabies v south africa  pgm offer 2 aug 2003']

In [185]:
# Display the country-abbreviation table; note that constructing a
# StringNormalizer re-reads the stopword and abbreviation files from disk.
StringNormalizer().COUNTRY_ABBRS    

Unnamed: 0,country,abbr1,abbr2
0,afghanistan,af,afg
1,albania,al,alb
2,algeria,dz,dza
3,american samoa,as,asm
4,andorra,ad,and
5,angola,ao,ago
6,anguilla,ai,aia
7,antarctica,aq,ata
8,antigua and barbuda,ag,atg
9,argentina,ar,arg
