In [52]:
import pandas as pd
import json
import math
from collections import defaultdict
from itertools import chain

In [95]:
DATA_DIR = '/Users/ik/Data/'

class StringNormalizer(object):
    """
    normalise a string
    """
    def __init__(self):
        
        # english stopwords
        self.STOP_WORDS = {line.strip() for line in open(DATA_DIR + 'stopwords' + '/english_stopwords_nltk.txt', 'r').readlines() if line.strip()}      

    def normalize(self, s):
        
        # check if s is really a string
        assert isinstance(s, str), 'you are trying to normalise something that is NOT a string!'
        # note s might still be empty but that's fine as we take care of it next
        _ = "".join([ch.lower() for ch in s if ch.isalnum() or ch.isspace()])
        # remove stopwords
        _ = " ".join([w for w in _.split() if w not in self.STOP_WORDS])
        # pad with white spaces
        _ = ' ' + _ + ' '
        
        return _

class SportIdentifier(object):
    """
    """
    def __init__(self):
        self.sids = json.load(open(DATA_DIR + 'sports/' + 'sports-identifiers/' + "sports-identifiers-12092017.json","r"))
        # feature weights
        self.w = {"nongeneric_comps": lambda _: 1,
                  "generic_comps": lambda _: _*0.25, # so 1 generic conmp gets 0.25, 2 get 0.5
                  "sport_name": lambda _: 1, "teams": lambda _:  _*0.5,  # 1 team only 0.5, two get 1
                 "spons": lambda _: 0.5, "abbreviations": lambda _: _}
        self.norm = StringNormalizer()
        
    def _find_competitions(self, s):
        
        comps = defaultdict(lambda: defaultdict(int))
        for sport in self.sids:
            for ckind in ["generic", "nongeneric"]:
                for gc in self.sids[sport]["competitions"][ckind]:
                    if self.norm.normalize(gc) in self.norm.normalize(s):
                        comps[sport][ckind + "_" + 'comps'] += 1
        return comps
    
    def _find_sport_name(self, s):
        comps = defaultdict(lambda: defaultdict(int))
        for sport in self.sids:
            if "(" in sport:
                sport_names = [sport_name.strip() for sport_name in sport.replace(")", "").split("(")]
            else:
                sport_names = [sport]
            for sp in sport_names:
                if self.norm.normalize(sp) in self.norm.normalize(s):
                    comps[sport]["sport_name"] += 1
        return comps
    
    def _find_abbreviation(self, s):
        comps = defaultdict(lambda: defaultdict(int))
        for sport in self.sids:
            if "abbreviations" in self.sids[sport]:
                any_abbrs = set(self.sids[sport]["abbreviations"]) & set(s.split())
                if set(self.sids[sport]["abbreviations"]) & set(s.split()):
                    comps[sport]['abbr'] = len()
        return comps
    
    def _find_sponsors(self, s):
        comps = defaultdict(lambda: defaultdict(int))
        for sport in self.sids:
            if "sponsors" in self.sids[sport]:
                for sponsor in self.sids[sport]["sponsors"]:
                    if sponsor in s:
                        comps[sport]['spons'] += 1
        return comps
    
    def _find_participants(self, s):
        comps = defaultdict(lambda: defaultdict(int))
        for sport in self.sids:
            if "key_participants" in self.sids[sport]:
                np = 0
                for participant in self.sids[sport]["key_participants"]:
                    if participant in s:
                        np += 1
                if np > 0:
                    comps[sport]['teams'] = np
                
        return comps
    
    def find_identifiers(self, s):
        
        self.d = defaultdict(lambda: defaultdict(int))
        
        for k, v in chain.from_iterable([self._find_competitions(s).items(), 
                                            self._find_sport_name(s).items(),
                                                self._find_abbreviation(s).items(),
                                                    self._find_sponsors(s).items(),
                                                        self._find_participants(s).items()]):
            self.d[k].update(v)
        
        print(self.d)
        return self
    
    def score_sports(self):
        sport_scores = defaultdict(float)
        for sport in self.d:
            sport_scores[sport] = sum([self.w[f](self.d[sport][f]) for f in self.d[sport]])
        return sport_scores

In [96]:
si = SportIdentifier()
si.find_identifiers("wolfpack v league kookaburra bowl american football").score_sports()

defaultdict(<function SportIdentifier.find_identifiers.<locals>.<lambda> at 0x11ef43f28>, {'volleyball': defaultdict(<class 'int'>, {'generic_comps': 1}), 'hockey': defaultdict(<class 'int'>, {'generic_comps': 1}), 'gridiron (american football)': defaultdict(<class 'int'>, {'nongeneric_comps': 1, 'sport_name': 1, 'teams': 1})})


defaultdict(float,
            {'gridiron (american football)': 2.5,
             'hockey': 0.25,
             'volleyball': 0.25})

In [97]:
h = si.find_identifiers('china cuba weightlifting volleyball championship trials battle recharge')

defaultdict(<function SportIdentifier.find_identifiers.<locals>.<lambda> at 0x11e8e28c8>, {'cycling': defaultdict(<class 'int'>, {'nongeneric_comps': 1}), 'supercars': defaultdict(<class 'int'>, {'generic_comps': 1}), 'swimming': defaultdict(<class 'int'>, {'generic_comps': 1}), 'darts': defaultdict(<class 'int'>, {'generic_comps': 1}), 'weightlifting': defaultdict(<class 'int'>, {'generic_comps': 1, 'sport_name': 1}), 'volleyball': defaultdict(<class 'int'>, {'sport_name': 1, 'teams': 2}), 'hockey': defaultdict(<class 'int'>, {'teams': 1})})


In [57]:
pd.DataFrame.from_dict(h, orient='index').fillna(0)

TypeError: object of type 'NoneType' has no len()

In [42]:
tkt_events = pd.read_csv(DATA_DIR + 'events/' + 'all-events-18092017.csv.gz', encoding='latin-1')

In [43]:
tkt_events.head()

Unnamed: 0,event_id,description
0,1,unknown - - - - - - - - -
1,2,unknown - - - - - - - - -
2,3,r carlos nakai - iwaki auditorium r. carlos ...
3,4,karen schaupp - iwaki auditorium karin scha...
4,5,unknown - - - - - - - - -


In [70]:
abbrs = {"championships": ["champs"]}
country_alt = {"united states": ["usa", "united states of america", "us"],
              "russia": ["russian federation"],
              "chinese taipei": ["taiwan"],
              "macedonia": ["fyrom"],
              "netherlands": ["holland"]}
volleyball_words = {"volleyroos", "volleyball"}

In [16]:
gg = defaultdict(lambda: defaultdict(int))

In [17]:
gg["badminton"]["teams"] = 1

In [18]:
gg

defaultdict(<function __main__.<lambda>>,
            {'badminton': defaultdict(int, {'teams': 1})})

In [72]:
tkt_events.loc[tkt_events.description.apply(find_abbreviatons) | tkt_events.description.apply(lambda x: True if volleyball_words & set(x.split()) else False),:]

Unnamed: 0,event_id,description
24054,24057,2001 goodwill games brisbane - all events ...
24055,24058,2001 goodwill games brisbane - all events ...
24056,24059,2001 goodwill games brisbane - all events ...
24057,24060,2001 goodwill games brisbane - all events ...
24058,24061,2001 goodwill games brisbane - all events ...
24059,24062,2001 goodwill games brisbane - all events ...
24060,24063,2001 goodwill games brisbane - all events ...
24061,24064,2001 goodwill games brisbane - all events ...
24062,24065,2001 goodwill games brisbane - all events ...
24063,24066,2001 goodwill games brisbane - all events ...
