In [1]:
import numpy, pandas, time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from sklearn.utils import shuffle
from sklearn.decomposition import TruncatedSVD

In [3]:
# Load the normalized messages and shuffle them. The shuffle is seeded so the
# 100,000-document sample taken below is identical on every run.
# NOTE(review): astype(str) may turn missing messages into the literal text
# "nan" (pandas-version dependent) — confirm whether the column contains nulls.
docs = shuffle(
    pandas.read_csv('../../data/dataset_old.csv').NormalizedMessage.astype(str),
    random_state=0,
)
# NLTK English stop words plus four extra terms to exclude from the vocabulary.
stop_words = list(stopwords.words('english')) + ["navy", "federal", "credit", "union"]
vec = TfidfVectorizer(stop_words=stop_words)
# Fit TF-IDF on the first 100,000 shuffled documents only.
doc_term_mat = vec.fit_transform(docs.head(100000))

In [4]:
start_time = time.time()
# Reduce the TF-IDF matrix to 100 components. Despite the variable name this is
# TruncatedSVD, not PCA. Its default solver is randomized, so a fixed seed is
# passed to make the projection (and everything built on it) reproducible.
pca = TruncatedSVD(n_components=100, random_state=0)
result = pca.fit_transform(doc_term_mat)
print("Took " + str(time.time()-start_time) + " seconds to perform TruncatedSVD.")

Took 7.546389818191528 seconds to perform TruncatedSVD.


In [5]:
# Sanity check: (number of documents, number of SVD components).
result.shape

(100000, 100)

In [6]:
start_time = time.time()
# Two clusters over the SVD-reduced features; 100 initialisations, fixed seed.
kmeans = KMeans(n_clusters=2, random_state=0, n_init=100)
kmeans.fit(result)
elapsed = time.time() - start_time
print("Took " + str(elapsed) + " seconds to compute clusters.")

Took 52.631526708602905 seconds to compute clusters.


In [7]:
def extract_topics_lda(words, doc_term_mat, number_topics = 10, number_words = 10, random_state = None):
    """Fit an LDA model and return the top-weighted words of each topic.

    Args:
        words: Indexable vocabulary; ``words[i]`` is expected to be the term
            for column ``i`` of ``doc_term_mat``.
        doc_term_mat: Document-term matrix passed to ``LDA.fit``.
        number_topics: Number of LDA components (topics) to fit.
        number_words: How many top words to return per topic.
        random_state: Optional seed forwarded to the LDA model so repeated
            calls yield the same topics. ``None`` (the default) keeps the
            original, unseeded behaviour.

    Returns:
        A list with one entry per topic; each entry is a list of up to
        ``number_words`` words ordered from highest to lowest topic weight.
    """
    # Create and fit the LDA model
    lda = LDA(n_components=number_topics, n_jobs=-1, random_state=random_state)
    lda.fit(doc_term_mat)
    # argsort is ascending, so the reversed tail slice yields the
    # `number_words` highest-weighted column indices, best first.
    return [
        [words[i] for i in topic.argsort()[:-number_words - 1:-1]]
        for topic in lda.components_
    ]

In [8]:
# Split the TF-IDF rows by K-Means cluster assignment. The row indices for each
# cluster are computed vectorised (ascending order, same rows as a per-index
# Python loop would pick) instead of iterating over every document in Python.
cluster_labels = kmeans.labels_
doc_term_mat1 = doc_term_mat[numpy.flatnonzero(cluster_labels == 0)]
doc_term_mat2 = doc_term_mat[numpy.flatnonzero(cluster_labels == 1)]

In [9]:
start_time = time.time()
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the replacement when it exists and fall back to
# the old method on older installations. list() keeps `words` a plain list.
if hasattr(vec, "get_feature_names_out"):
    words = list(vec.get_feature_names_out())
else:
    words = vec.get_feature_names()
# One LDA topic per cluster, reporting its ten highest-weighted words.
# NOTE(review): LDA is fit on TF-IDF weights here, whereas scikit-learn's LDA is
# normally used with raw term counts (CountVectorizer is imported but unused in
# the visible cells) — confirm this is intended.
# NOTE(review): the topics table displayed further down shows alphabetically-last
# vocabulary terms for cluster 2, which may mean that cluster's rows carry
# little or no term weight — worth checking the cluster sizes/contents.
topics1 = extract_topics_lda(words, doc_term_mat1, number_topics=1, number_words=10)
topics2 = extract_topics_lda(words, doc_term_mat2, number_topics=1, number_words=10)

print("Took " + str(time.time()-start_time) + " seconds to compute topics by cluster.")

Took 19.17492914199829 seconds to compute topics by cluster.


In [10]:
# One column per cluster, each holding the top words of that cluster's
# single LDA topic (element 0 of the per-cluster topic list).
topic_columns = {}
topic_columns['cluster1_topics'] = topics1[0]
topic_columns['cluster2_topics'] = topics2[0]
topics = pandas.DataFrame(data=topic_columns)

In [11]:
# Display the per-cluster top-word table built in the previous cell.
topics

Unnamed: 0,cluster1_topics,cluster2_topics
0,thank,
1,cards,zzzzs
2,thanks,fuuuu
3,us,fuzzies
