# Topic modeling of abilities texts
In this file I'm trying to cluster the abilities by performing topic modeling (with NMF decomposition) on their descriptions.

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from nltk.stem.snowball import EnglishStemmer

from atod import Abilities, Heroes

In [2]:
# create stop-words list
heroes = Heroes.all()
heroes_names = [h.name for h in heroes]
# names of heroes commonly occur in descriptions, so
# deleting them is a good idea
words_in_heroes_names = [word.lower() for name in heroes_names for word in name.split(' ')]

eng_stop_words = TfidfVectorizer(stop_words='english').get_stop_words()
stop_words = set(words_in_heroes_names + list(eng_stop_words) 
                 + ['font', 'color', '7998b5'])

No abilities for this HeroID == 16


In [3]:
# get all texts for all abilities
texts = Abilities.all().get_texts()
# get abilities names and  descriptions
descriptions = texts[['description', 'name']]
corpus = [a.replace('\\n', ' ').replace('%%', '%') for a in descriptions['description']]
corpus.extend(['stun', 'silence', 'short distance teleportation', 'casting spells'])

stemmer = EnglishStemmer()
stemmed_corpus = []
for doc in corpus:
    stemmed_corpus.append(' '.join([stemmer.stem(word) for word in doc.split(' ')])])

corpus = stemmed_corpus

In [4]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,  
                                   stop_words=stop_words,
                                   ngram_range=(1,3))
tfidf_vectorizer.fit(corpus)
tf_corpus = tfidf_vectorizer.transform(corpus)
tf_features_names = tfidf_vectorizer.get_feature_names()

tf_corpus.shape

(492, 1627)

In [5]:
nmf = NMF(n_components=30, init='nndsvd').fit(tf_corpus)

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic %d:" % (topic_idx))
    print(", ".join([tf_features_names[i]
                    for i in topic.argsort()[:-10 - 1:-1]]))

Topic 0:
unit, enemi unit, enemi, target, target unit, target enemi, target enemi unit, send, damag, unit damag
Topic 1:
upgrad aghanim scepter, scepter, upgrad aghanim, upgrad, aghanim, aghanim scepter, seconds upgrad aghanim, seconds upgrad, damage upgrad aghanim, damage upgrad
Topic 2:
movement speed, movement, speed, increas movement, increas movement speed, increas, slow movement, slow movement speed, movement speed attack, caus
Topic 3:
units, enemi units, nearbi enemi units, nearbi enemi, nearbi, enemi, damag nearbi, damag nearbi enemi, burn, damag
Topic 4:
dispel, type, dispel type, basic dispel, type basic dispel, type basic, dispel type basic, basic, strong dispel, strong
Topic 5:
bonus, bonus damag, deal bonus, deal bonus damag, damag, grant, grant bonus, deal, buildings, doubl
Topic 6:
level, base level, base, invok, quas, base level quas, level quas, exort, cabe68, cabe68 exort
Topic 7:
friend, friend unit, heal, heal friend, heal friend unit, unit, target friend, unit hea

In [46]:
# find the ability with `index` in descriptions DataFrame
# and define categories of this ability
index = 49
test = tfidf_vectorizer.transform([descriptions['description'][index]])
weights = nmf.transform(test)
topics = weights.argsort()[0][-3:]

print(descriptions['name'][index])
print('-' * len(descriptions['name'][index]))

for topic in topics:
    for i in nmf.components_[topic].argsort()[-3:]:
        print(tf_features_names[i], end=', ')
        
    print()

shadowraze1
-----------
nearbi enemi units, enemi units, units, 
damag enemi unit, damag enemi, area, 
target, area, target area, 


## Conclusion
NMF is able to define categories of abilities pretty well by taking the most important words in 3 the most valuable topics of the ability description.