# Topic modeling of abilities texts
In this file I'm trying to cluster the abilities by performing topic modeling (with NMF decomposition) on their descriptions.

In [1]:
import pandas as pd
from pprint import pprint
from sklearn.decomposition import NMF
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from atod import Abilities, Heroes

## NMF

In [2]:
# create stop-words list
heroes = Heroes.all()
heroes_names = [h.name for h in heroes]
# names of heroes commonly occur in descriptions, so
# deleting them is a good idea
words_in_heroes_names = [word.lower() for name in heroes_names for word in name.split(' ')]

eng_stop_words = TfidfVectorizer(stop_words='english').get_stop_words()
stop_words = set(words_in_heroes_names + list(eng_stop_words) 
                 + ['font', 'color', '7998b5', 'target', 'enemy', 'friendly', 'allied',
                    'remnant', 'aghanim', 'scepter', 'units', 'deal damage'
                    'cause', 'creep'])

No abilities for this HeroID == 16


In [3]:
# get all texts for all abilities
texts = Abilities.all().get_texts()
# get abilities names and  descriptions
descriptions = texts[['description', 'name']]
corpus = [a.replace('\\n', ' ').replace('%%', '%') for a in descriptions['description']]
corpus.extend(['stun', 'silence', 
               'blink is short distance teleportation', 
               'silence', 
               'healing',
               'invisibility', 'area of usage', 'armor', 'percentage',
               'DOT is damage over time (seconds)',
               'summon or place a ward', 'illusions, images, duplicates, nemesisis',
               'critical damage', 'movement speed', 'attack speed'])

stemmer = EnglishStemmer()
stemmed_corpus = []
for doc in corpus:
    stemmed_corpus.append(' '.join([stemmer.stem(word) for word in doc.split(' ')]))

corpus = stemmed_corpus

In [4]:
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words,
                                   ngram_range=(1,3))
tfidf_vectorizer.fit(corpus)
tf_corpus = tfidf_vectorizer.transform(corpus)
tf_features_names = tfidf_vectorizer.get_feature_names()

tf_corpus.shape

(503, 12251)

In [5]:
nmf = NMF(n_components=40, init='nndsvd').fit(tf_corpus)

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic %d:" % (topic_idx))
    print(", ".join([tf_features_names[i]
                    for i in topic.argsort()[:-10 - 1:-1]]))

Topic 0:
area, area deal, raze, raze area, area deal damag, raze area deal, deal damag enemi, enemi unit area, unit area, damag enemi unit
Topic 1:
movement speed, movement, speed, increas movement speed, increas movement, increas, attack movement speed, attack movement, bear, bonus movement
Topic 2:
damag time, time, deal damag time, deal damag, deal, damag, poison, enemi unit, seconds, unit
Topic 3:
dispel, type, dispel type, dispel type basic, basic dispel, type basic dispel, type basic, basic, purg, type strong
Topic 4:
base level, level, base, invok, level quas, base level quas, quas, level cabe68 exort, base level cabe68, level cabe68
Topic 5:
slow movement, slow movement speed, damag slow movement, damag slow, slow, movement, movement speed, speed, unit damag slow, kick unit damag
Topic 6:
friend, friend unit, heal friend unit, heal friend, unit, friend unit damag, unit damag, instant heal, unit damag nearbi, instant heal friend
Topic 7:
stun, damag stun, stun deal, stun damag, 

In [6]:
# find the ability with `index` in descriptions DataFrame
# and define categories of this ability
index = 19
test = tfidf_vectorizer.transform([descriptions['description'][index]])
weights = nmf.transform(test)
topics = weights.argsort()[0][-3:]

print(descriptions['name'][index])
print(descriptions['description'][index])
print('-' * len(descriptions['name'][index]))

for topic in topics:
    for i in nmf.components_[topic].argsort()[-3:]:
        print(tf_features_names[i], end=', ')
        
    print()

fissure
Slams the ground with a mighty totem, creating an impassable ridge of stone while stunning and damaging enemy units along its line.
-------
enemi, enemi unit, unit, 
deal extra damag, extra damag, extra, 
damag nearbi enemi, nearbi, nearbi enemi, 


In [7]:
# distribution of categories
%matplotlib inline
categories_dist = dict()
skills_by_category = dict()

for index, text in enumerate(descriptions['description']):
    vect = tfidf_vectorizer.transform([text])
    weights = nmf.transform(vect)
    topics = weights.argsort()[0][-2:]
    
    for topic in topics:
        # find words describing topic
        for i in nmf.components_[topic].argsort()[-1:]:
            skills_by_category.setdefault(tf_features_names[i], [])
            skills_by_category[tf_features_names[i]].append(descriptions['name'][index])
            
# categories_dist = categories_dist.dropna()
categories_dist = pd.Series({k: len(skills_by_category[k]) for k in skills_by_category})
# categories_dist.plot()



In [8]:
print('#categories', categories_dist.shape[0])
print(categories_dist)

NOTE: the name of category can be misleading, check the above example with fissure
#categories 39
ani                        37
armor                      20
attack speed               29
base level                 18
bonus                      10
creat                      43
critic                     14
damag time                  1
damage                    161
deal damag stun            10
dispel                     27
enemies                    78
extra                      12
friend                     11
gain                       79
great                       8
heal                       15
hero                       29
instanc provid increas      7
invis                      13
lightn                     28
mana                       24
miss                       32
movement speed             68
nearbi enemi                7
percentag                  30
point                      14
poison slow                 8
second                     27
short distanc teleport     16
si

In [14]:
# print abilities by category
print(skills_by_category['deal damag stun'])

['dragon_slave', 'death_pulse', 'sonic_wave', 'shadow_wave', 'dual_breath', 'deafening_blast', 'fireblast', 'unrefined_fireblast', 'rip_tide', 'shockwave']


In [None]:
# print categories which were separated good enough from others
sorted_abilities = dict()
for category in skills_by_category:
    if len(skills_by_category[category]) <= 30:
        sorted_abilities[category] = skills_by_category[category]
        
pprint(sorted_abilities)