# Topic Finding for Short Texts

## 1. Introduction


In [1]:
%matplotlib inline

In [29]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from sklearn.preprocessing import normalize

## 2. Topic Finding Models

First, let's generate some toy texts to test the models on.

In [145]:
def generate_clearcut_topics(n_per_topic=1000):
    """Return a toy corpus made of two repeated sentences.

    The corpus holds ``n_per_topic`` copies of "we love cats" followed by
    ``n_per_topic`` copies of "we hate dogs".

    n_per_topic: number of copies of each sentence (default 1000, which
        yields 2000 documents in total)
    Returns a 1-D numpy array of strings.
    """
    return np.repeat(["we love cats", "we hate dogs"], [n_per_topic, n_per_topic])

def generate_unbalanced_topics(n_cats=10, n_dogs=1000):
    """Return a toy corpus in which one sentence heavily outnumbers the other.

    The corpus holds ``n_cats`` copies of "we love cats" followed by
    ``n_dogs`` copies of "we love dogs".

    n_cats: number of copies of "we love cats" (default 10)
    n_dogs: number of copies of "we love dogs" (default 1000)
    Returns a 1-D numpy array of strings.
    """
    return np.repeat(["we love cats", "we love dogs"], [n_cats, n_dogs])

# Build both toy corpora once so that later cells can reuse them.
clearcut_topics, unbalanced_topics = (
    generate_clearcut_topics(),
    generate_unbalanced_topics(),
)

In [146]:
def find_topic(texts, topic_model, n_topics, vec_model="tf", thr=1e-2, random_state=None):
    """Return a list of keyword strings, one per topic found in ``texts``.

    Intended for demonstration on simple data.

    texts: array-like of strings
    topic_model: {"nmf", "svd", "lda"} for LSA_NMF, LSA_SVD, LDA;
        any other key raises KeyError
    n_topics: # of topics in texts
    vec_model: {"tf", "tfidf"} for term_freq, term_freq_inverse_doc_freq;
        any value other than "tf" selects TF-IDF
    thr: threshold for finding keywords in a topic model; a word is kept
        when the absolute value of its weight in the topic is >= thr
    random_state: optional seed forwarded to the topic model so that
        repeated runs give the same result; None keeps the model's default

    Returns a list with one string per topic. Each string joins the topic's
    keywords with " | "; a keyword whose weight is not positive is prefixed
    with "^".
    """
    ## vectorization
    vectorizer = CountVectorizer() if vec_model == "tf" else TfidfVectorizer()
    text_vec = vectorizer.fit_transform(texts)
    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); use whichever one is available.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = np.asarray(vectorizer.get_feature_names_out())
    else:
        words = np.array(vectorizer.get_feature_names())

    ## topic finding
    topic_models = {"nmf": NMF, "svd": TruncatedSVD, "lda": LatentDirichletAllocation}
    # The number of components is the first positional parameter of all
    # three estimators, so it is passed positionally.
    topicfinder = topic_models[topic_model](n_topics, random_state=random_state).fit(text_vec)
    topic_dists = topicfinder.components_

    ## keywords for topics
    ## Unlike other models, LSA_SVD will generate negative values in topic_word distribution,
    ## which makes it ambiguous to choose keywords for topics. I choose to keep the sign
    ## with the words here.
    def _topic_keywords(topic_dist):
        # Boolean mask of the words whose weight magnitude reaches the threshold.
        keywords_index = np.abs(topic_dist) >= thr
        keywords_prefix = np.where(np.sign(topic_dist) > 0, "", "^")[keywords_index]
        return " | ".join(
            "".join(pair) for pair in zip(keywords_prefix, words[keywords_index])
        )

    # Build a real list: on Python 3 a bare map() is a one-shot iterator that
    # neither displays its contents nor supports indexing.
    return [_topic_keywords(topic_dist) for topic_dist in topic_dists]

## SVD: starting from the complete vector, flip some bits, then flip more bits, so that the directions stay orthogonal
At each step, pick the most principal (most significant) direction to flip off.

In [152]:
# LSA via truncated SVD on raw term counts, asking for three components.
find_topic(clearcut_topics, topic_model="svd", n_topics=3, vec_model="tf")

[u'cats | dogs | hate | love | we',
 u'cats | ^dogs | ^hate | love',
 u'^cats | ^dogs | ^hate | ^love | we']