# Topic modeling of abilities texts
In this file I'm trying to cluster the abilities by performing topic modeling (with NMF decomposition) on their descriptions.

In [1]:
import pandas as pd
from pprint import pprint
from sklearn.decomposition import NMF
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from atod import Abilities, Heroes

## NMF

In [2]:
# create stop-words list
heroes = Heroes.all()
heroes_names = [h.name for h in heroes]
# names of heroes commonly occur in descriptions, so
# deleting them is a good idea
words_in_heroes_names = [word.lower() for name in heroes_names for word in name.split(' ')]

eng_stop_words = TfidfVectorizer(stop_words='english').get_stop_words()
stop_words = set(words_in_heroes_names + list(eng_stop_words) 
                 + ['font', 'color', '7998b5', 'target', 'enemy', 'friendly', 'allied',
                    'remnant', 'aghanim', 'scepter', 'units', 'deal damage'
                    'cause', 'creep'])

No abilities for this HeroID == 16


In [3]:
# get all texts for all abilities
texts = Abilities.all().get_texts()
# get abilities names and  descriptions
descriptions = texts[['description', 'name']]
corpus = [a.replace('\\n', ' ').replace('%%', '%') for a in descriptions['description']]
corpus.extend(['stun holds enemies in place', 'silence', 
               'blink is short distance teleportation', 
               'silence prevents from casting spells', 
               'healing is the process of restoring friendly unit health',
               'invisibility', 'area of usage', 'armor', 'percentage',
               'DOT is damage over time (seconds)',
               'summon or place a ward', 'creating illusions, images, duplicates, nemesisis',
               'critical damage', 'movement speed', 'attack speed'])

stemmer = EnglishStemmer()
stemmed_corpus = []
for doc in corpus:
    stemmed_corpus.append(' '.join([stemmer.stem(word) for word in doc.split(' ')]))

corpus = stemmed_corpus

In [4]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,  
                                   stop_words=stop_words,
                                   ngram_range=(1,3))
tfidf_vectorizer.fit(corpus)
tf_corpus = tfidf_vectorizer.transform(corpus)
tf_features_names = tfidf_vectorizer.get_feature_names()

tf_corpus.shape

(503, 1558)

In [5]:
nmf = NMF(n_components=40, init='nndsvd').fit(tf_corpus)

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic %d:" % (topic_idx))
    print(", ".join([tf_features_names[i]
                    for i in topic.argsort()[:-10 - 1:-1]]))

Topic 0:
unit, enemi unit, enemi, send, beast, radius, slowed, swarm, unit location, deal random
Topic 1:
movement speed, movement, speed, increas movement, increas movement speed, increas, movement speed attack, slow movement speed, bonus movement speed, bonus movement
Topic 2:
dispel, type basic dispel, basic dispel, type basic, dispel type basic, basic, dispel type, type, purg, purg buff
Topic 3:
deal, deal damag, damag, deal damag stun, damag stun, stun, wave deal, wave deal damag, unleash, unit wave deal
Topic 4:
nearbi enemi, nearbi, enemi, nearbi enemi unit, damag nearbi enemi, damag nearbi, burn, radius, damag, struck
Topic 5:
level, base level, base, invok, quas, level quas, base level quas, cabe68 exort, exort, cabe68
Topic 6:
dealt, damag dealt, damag, addit damag, addit, damag base, addit damag base, caus, deal addit, base
Topic 7:
attack, allow, great, attack damag, element, damag attack, attack speed, uniqu attack, uniqu, movement attack
Topic 8:
area, unit area, enemi un

In [6]:
# find the ability with `index` in descriptions DataFrame
# and define categories of this ability
index = 19
test = tfidf_vectorizer.transform([descriptions['description'][index]])
weights = nmf.transform(test)
topics = weights.argsort()[0][-3:]

print(descriptions['name'][index])
print(descriptions['description'][index])
print('-' * len(descriptions['name'][index]))

for topic in topics:
    for i in nmf.components_[topic].argsort()[-3:]:
        print(tf_features_names[i], end=', ')
        
    print()

fissure
Slams the ground with a mighty totem, creating an impassable ridge of stone while stunning and damaging enemy units along its line.
-------
nearbi, nearbi enemies, enemies, 
deal extra, extra damag, extra, 
ground, stun damag, stun, 


In [7]:
# distribution of categories
%matplotlib inline
categories_dist = dict()
skills_by_category = dict()

for index, text in enumerate(descriptions['description']):
    vect = tfidf_vectorizer.transform([text])
    weights = nmf.transform(vect)
    topics = weights.argsort()[0][-2:]
    
    for topic in topics:
        # find words describing topic
        for i in nmf.components_[topic].argsort()[-1:]:
            skills_by_category.setdefault(tf_features_names[i], [])
            skills_by_category[tf_features_names[i]].append(descriptions['name'][index])
            
# categories_dist = categories_dist.dropna()
categories_dist = pd.Series({k: len(skills_by_category[k]) for k in skills_by_category})
# categories_dist.plot()

In [8]:
print('NOTE: the name of category can be misleading, check the above example with fissure')
print('#categories', categories_dist.shape[0])
print(categories_dist)

NOTE: the name of category can be misleading, check the above example with fissure
#categories 39
ani                12
area               40
armor              21
attack             58
attack speed        7
bonus              18
caus               17
creat               1
critic              7
damag enemi         5
damage            123
deal                3
dealt              18
dispel             31
enemies            81
extra              12
friend              6
gain               40
health             38
hero               31
invis              13
level              17
mana               17
morphl             19
movement speed     64
nearbi enemi        2
poison              7
rang               25
second             30
seconds            37
silenc             28
slow                3
spells             16
stun               25
teleport           14
time               37
tree               21
unit               24
ward                8
dtype: int64


In [9]:
# print abilities by category
print(skills_by_category['rang'])

['morph_agi', 'morph_str', 'ether_shock', 'reincarnation', 'heat_seeking_missile', 'sadist', 'lunar_blessing', 'dragon_tail', 'shadow_wave', 'battery_assault', 'untouchable', 'darkness', 'global_silence', 'shapeshift', 'drunken_brawler', 'primal_split', 'spirit_bear', 'true_form', 'null_field', 'thunder_strike', 'song_of_the_siren', 'concussive_shot', 'reflection', 'tempest_double', 'untransform']


In [10]:
# print categories which were separated good enough from others
sorted_abilities = dict()
for category in skills_by_category:
    if len(skills_by_category[category]) <= 30:
        sorted_abilities[category] = skills_by_category[category]
        
pprint(sorted_abilities)

{'ani': ['waveform',
         'illusory_orb',
         'ethereal_jaunt',
         'plasma_field',
         'chain_frost',
         'wild_axes',
         'chronosphere',
         'wall_of_replica',
         'the_swarm',
         'devour',
         'last_word',
         'launch_snowball'],
 'armor': ['berserkers_call',
           'dark_lord',
           'eye_of_the_storm',
           'carrion_swarm',
           'warcry',
           'amplify_damage',
           'gush',
           'wave_of_terror',
           'frost_armor',
           'meld',
           'dragon_blood',
           'weave',
           'the_swarm',
           'acid_spray',
           'forge_spirit',
           'reality_rift',
           'rip_tide',
           'berserkers_rage',
           'reactive_armor',
           'viscous_nasal_goo',
           'natural_order'],
 'attack speed': ['inner_beast',
                  'strafe',
                  'geminate_attack',
                  'overpower',
                  'alacrity',
   