# Topic modeling of abilities texts
In this file I'm trying to cluster the abilities by performing topic modeling (with NMF decomposition) on their descriptions.

In [1]:
import pandas as pd
from pprint import pprint
from sklearn.decomposition import NMF
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from atod import Abilities, Heroes

## NMF

In [2]:
# create stop-words list
heroes = Heroes.all()
heroes_names = [h.name for h in heroes]
# names of heroes commonly occur in descriptions, so
# deleting them is a good idea
words_in_heroes_names = [word.lower() for name in heroes_names for word in name.split(' ')]

eng_stop_words = TfidfVectorizer(stop_words='english').get_stop_words()
stop_words = set(words_in_heroes_names + list(eng_stop_words) 
                 + ['font', 'color', '7998b5', 'target', 'enemy', 'friendly', 'allied',
                    'remnant', 'aghanim', 'scepter', 'units', 'deal damage'
                    'cause', 'creep'])

No abilities for this HeroID == 16


In [3]:
# get all texts for all abilities
texts = Abilities.all().get_texts()
# get abilities names and  descriptions
descriptions = texts[['description', 'name']]
corpus = [a.replace('\\n', ' ').replace('%%', '%') for a in descriptions['description']]
corpus.extend(['stun', 'silence', 
               'blink is short distance teleportation', 
               'silence', 
               'healing',
               'invisibility', 'area of usage', 'armor', 'percentage',
               'DOT is damage over time (seconds)',
               'summon or place a ward', 'illusions, images, duplicates, nemesisis',
               'critical damage', 'movement speed', 'attack speed'])

stemmer = EnglishStemmer()
stemmed_corpus = []
for doc in corpus:
    stemmed_corpus.append(' '.join([stemmer.stem(word) for word in doc.split(' ')]))

corpus = stemmed_corpus

In [4]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,  
                                   stop_words=stop_words,
                                   ngram_range=(1,3))
tfidf_vectorizer.fit(corpus)
tf_corpus = tfidf_vectorizer.transform(corpus)
tf_features_names = tfidf_vectorizer.get_feature_names()

tf_corpus.shape

(503, 1554)

In [5]:
nmf = NMF(n_components=40, init='nndsvd').fit(tf_corpus)

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic %d:" % (topic_idx))
    print(", ".join([tf_features_names[i]
                    for i in topic.argsort()[:-10 - 1:-1]]))

Topic 0:
unit, enemi unit, enemi, send, beast, radius, upgrad, unit damag, spells, swarm
Topic 1:
movement speed, movement, speed, increas movement, increas movement speed, increas, cast, slow movement speed, movement speed attack, bonus movement
Topic 2:
dispel, dispel type basic, type basic dispel, type basic, basic dispel, basic, dispel type, type, purg, magic
Topic 3:
damag enemi, damag enemi unit, deal damag enemi, damag, enemi unit, deal damag, deal, enemi, unit, area deal
Topic 4:
nearbi enemi, nearbi, enemi, nearbi enemi unit, burn, damag nearbi enemi, damag nearbi, radius, struck, damag
Topic 5:
damag time, time, deal damag, deal, deal damag time, damag, poison, time seconds, time slow, damag time slow
Topic 6:
level, base level, base, invok, base level quas, level quas, quas, level cabe68 exort, level cabe68, base level cabe68
Topic 7:
attack, damag attack, great, slow attack, allow, attack movement, poison slow, element, attack damag, attack slow
Topic 8:
area, unit area, en

In [6]:
# find the ability with `index` in descriptions DataFrame
# and define categories of this ability
index = 19
test = tfidf_vectorizer.transform([descriptions['description'][index]])
weights = nmf.transform(test)
topics = weights.argsort()[0][-3:]

print(descriptions['name'][index])
print(descriptions['description'][index])
print('-' * len(descriptions['name'][index]))

for topic in topics:
    for i in nmf.components_[topic].argsort()[-3:]:
        print(tf_features_names[i], end=', ')
        
    print()

fissure
Slams the ground with a mighty totem, creating an impassable ridge of stone while stunning and damaging enemy units along its line.
-------
morphl, ani enemi, ani, 
deal damag, time, damag time, 
slam, stun, ground, 


In [7]:
# distribution of categories
%matplotlib inline
categories_dist = dict()
skills_by_category = dict()

for index, text in enumerate(descriptions['description']):
    vect = tfidf_vectorizer.transform([text])
    weights = nmf.transform(vect)
    topics = weights.argsort()[0][-2:]
    
    for topic in topics:
        # find words describing topic
        for i in nmf.components_[topic].argsort()[-1:]:
            skills_by_category.setdefault(tf_features_names[i], [])
            skills_by_category[tf_features_names[i]].append(descriptions['name'][index])
            
# categories_dist = categories_dist.dropna()
categories_dist = pd.Series({k: len(skills_by_category[k]) for k in skills_by_category})
# categories_dist.plot()

In [8]:
print('NOTE: the name of category can be misleading, check the above example with fissure')
print('#categories', categories_dist.shape[0])
print(categories_dist)

NOTE: the name of category can be misleading, check the above example with fissure
#categories 39
allow              11
ani                12
area               47
armor              21
attack             58
attack speed       19
bonus              11
caus               24
control            25
creat               5
critic              5
damag enemi         3
damag time         17
damage            170
dispel             25
enemies            61
friend              5
ground             12
heal                9
health             40
hero               40
invis              10
level              24
mana               15
movement speed     64
nearbi enemi        1
path               16
percentag           4
rang               24
second             22
seconds            43
silenc             17
slow               18
stun               13
teleport           18
tree               19
uniqu attack       16
unit               20
ward               12
dtype: int64


In [9]:
# print abilities by category
print(skills_by_category['rang'])

['morph_agi', 'morph_str', 'ether_shock', 'reincarnation', 'voodoo_restoration', 'sadist', 'lunar_blessing', 'dragon_tail', 'shadow_wave', 'battery_assault', 'darkness', 'flaming_lasso', 'invoke', 'drunken_brawler', 'primal_split', 'spirit_bear', 'true_form', 'spell_steal', 'thunder_strike', 'song_of_the_siren', 'stone_caller', 'reflection', 'tempest_double', 'untransform']


In [10]:
# print categories which were separated good enough from others
sorted_abilities = dict()
for category in skills_by_category:
    if len(skills_by_category[category]) <= 30:
        sorted_abilities[category] = skills_by_category[category]
        
pprint(sorted_abilities)

{'allow': ['reincarnation',
           'guardian_angel',
           'quas',
           'wex',
           'exort',
           'invoke',
           'deafening_blast',
           'primal_split',
           'conjure_image',
           'splinter_blast',
           'tempest_double'],
 'ani': ['voodoo',
         'morph_replicate',
         'phase_shift',
         'teleportation',
         'wall_of_replica',
         'devour',
         'enrage',
         'divided_we_stand',
         'relocate',
         'launch_snowball',
         'mischief',
         'untransform'],
 'armor': ['berserkers_call',
           'dark_lord',
           'eye_of_the_storm',
           'carrion_swarm',
           'warcry',
           'amplify_damage',
           'gush',
           'wave_of_terror',
           'frost_armor',
           'meld',
           'dragon_blood',
           'weave',
           'the_swarm',
           'acid_spray',
           'forge_spirit',
           'reality_rift',
           'rip_tide',
     