In [1]:
from pprint import pprint
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from atod import Heroes

In [2]:
heroes = Heroes.all()

No abilities for this HeroID == 16


In [3]:
def replace_many(string: str, replacements: list):
    ''' Performs many str.replace() functions in a row.
    
    Args:
        string: string to be changed
        replacements (list of tuples): tuples are args for replace function
            in form (old, new)
            
    Returns:
        str: `string` on which all replace() functions were performed
        
    '''
    for repl in replacements:
        string = string.replace(*repl)
        
    return string

In [4]:
def count_words(doc):
    ''' Returns amount of unique words in the document.'''
    words = set()
    for text in doc:
        words = words.union([w for w in text.split()])
        
    return len(words)

In [5]:
# replace (old, new)
replacements = [('%', ''), ('\\n', ' '), ('%%', ''), ('_', ' ')]

abilities = {h.name: list(map(lambda x: replace_many(x, replacements), 
                              h.abilities.get_texts()['description']))
                     for h in heroes}

texts_list = [text for hero in abilities.values() for text in hero]
print(count_words(texts_list))

# What above dict comprehension does: 
# abilities = dict()
# for hero in heroes:
#     abilities[hero.name] = list()
#     for ability in hero.abilities.get_texts():
#         abilities[hero.name].append(ability.replace('\\n', ' ').replace('%%', '%'))
# I just love lambdas and comprehensions, so :)

2835


In [6]:
# Create list of stop words 
# names of heroes commonly occur in descriptions, so
# there is need to remove them
heroes_names = [h.name for h in heroes]
words_in_heroes_names = [word.lower() 
                         for name in heroes_names 
                         for word in name.split(' ')]

eng_stop_words = TfidfVectorizer(stop_words='english').get_stop_words()
stop_words = set(words_in_heroes_names + list(eng_stop_words) 
                 + ['font', 'color', '7998b5', 'target', 'enemy', 'friendly', 'allied',
                    'remnant', 'aghanim', 'scepter', 'units', 'deal damage'
                    'cause', 'creep'])

In [8]:
# stemmer = EnglishStemmer()
corpus = dict()

# # for every hero
for hero, texts in abilities.items():    
#     stemmed_corpus[hero] = list()
#     # concatenate all abilities descriptions into one
#     # and stem all words inside
    corpus[hero] = ' '.join([word for doc in texts
                                  for word in doc.split(' ')])
    
# print(count_words(list(stemmed_corpus.values())))
    
# stemmed_stop_words = [stemmer.stem(word) for word in stop_words]

In [9]:
vectorizer = TfidfVectorizer(stop_words=stop_words,
                             ngram_range=(1,2),
                             min_df=2,)
vectorizer.fit(corpus.values())

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'warrunner', 'viper', 'since', 'devourer', 'allied', 'without', 'night', 'find', 'pugna', 'morphling', 'former', 'ours', 'as', 'each', 'riki', 'clinkz', 'fifty', 'warlock', 'spirit', 'axe', 'stalker', 'few', 'further', 'mill', 'describe', 'fifteen', 'perhaps', 'cry', 'slardar', 'huskar',...', 'done', 'co', 'well', 'monkey', 'anything', 'winter', 'front', 'storm', 'razor', 'ten', 'anyway'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [10]:
print(len(vectorizer.vocabulary_))

1151


In [11]:
print(vectorizer.vocabulary_)

{'neutral': 723, 'wave dealing': 1143, 'arrows': 67, 'current': 240, 'effects': 398, 'invis': 594, 'lost': 653, 'slowing movement': 915, 'upgradable launches': 1116, 'min': 683, 'increasing movement': 578, '50': 14, 'golem': 491, 'gathers': 482, 'seconds unit': 867, 'rolls': 851, 'destroys trees': 352, 'short distance': 887, 'leaps': 624, 'speed movement': 944, 'cause': 176, 'initial': 582, 'burns enemies': 163, 'silencing': 899, 'evasion': 430, 'collides': 211, 'damage area': 247, 'speed slowed': 945, 'providing': 783, 'explodes': 439, 'gains bonus': 479, 'duration damage': 388, 'additional': 33, 'impassable terrain': 559, '35': 12, 'stacks diminishingly': 968, 'levels': 632, 'lets loose': 630, 'impassable': 558, 'spikes': 957, 'heroes': 532, 'takes': 1017, 'kills': 608, 'targeted location': 1023, 'impact': 557, 'launch': 620, 'preventing': 773, '100': 2, 'devastating': 354, 'ally': 46, 'based max': 115, 'separate': 871, 'nearby dealing': 714, 'main': 662, 'damaging energy': 311, 'mel

In [15]:
# find the most popular words
most_popular_words = [('', 0)] * 20
print(most_popular_words)

id2word = {str(id_): word for word, id_ in vectorizer.vocabulary_.items()}
corpus_matrix = vectorizer.transform(corpus.values())

for index in range(corpus_matrix.shape[1]):
    col = corpus_matrix.getcol(index)
    
    if col.nnz > most_popular_words[0][1]:
        most_popular_words[0] = (id2word[str(index)], 
                                  col.nnz)
        most_popular_words = sorted(most_popular_words, key=lambda x: x[1])

[('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0)]


In [16]:
print(most_popular_words)

[('damaging', 36), ('attacks', 38), ('dealing damage', 39), ('hero', 41), ('time', 44), ('movement speed', 45), ('bonus', 45), ('duration', 46), ('deals', 46), ('area', 47), ('dealing', 53), ('seconds', 56), ('nearby', 56), ('movement', 59), ('speed', 60), ('enemies', 60), ('attack', 67), ('unit', 77), ('upgradable', 84), ('damage', 108)]
