In [174]:
import gensim
import pandas as pd
import logging
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'\w+')
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [175]:
data = pd.read_csv("data/sample.csv")
data.head()

Unnamed: 0,url,title,content,tag
0,https://www.reuters.com/article/clariant-resul...,Clariant keeps full year guidance after meetin...,"ZURICH, July 27 Swiss chemicals group Clariant...",markets
1,https://www.reuters.com/article/clariant-resul...,Clariant keeps full year guidance after meetin...,"ZURICH, July 27 Swiss chemicals group Clariant...",markets
2,https://www.reuters.com/article/clariant-resul...,Clariant keeps full year guidance after meetin...,"ZURICH, July 27 Swiss chemicals group Clariant...",markets
3,https://www.reuters.com/article/clariant-resul...,Clariant keeps full year guidance after meetin...,"ZURICH, July 27 Swiss chemicals group Clariant...",markets
4,https://www.reuters.com/article/clariant-resul...,Clariant keeps full year guidance after meetin...,"ZURICH, July 27 Swiss chemicals group Clariant...",markets


In [176]:
# remove common words and tokenize
data['full_content'] = data['title'] + ' ' + data['content']
full_content = data['full_content'].tolist()
full_content = [[z for z in tokenizer.tokenize(t.lower()) if z not in STOPWORDS] for t in full_content]
print full_content[:2]

[['clariant', 'keeps', 'full', 'year', 'guidance', 'meeting', 'forecasts', 'h1', 'zurich', 'july', '27', 'swiss', 'chemicals', 'group', 'clariant', 'carrying', '20', 'billion', 'merger', 'u', 'peer', 'huntsman', 'reported', 'first', 'half', 'operating', 'profit', 'line', 'expectations', 'thursday', 'confirmed', 'guidance', 'year'], ['clariant', 'keeps', 'full', 'year', 'guidance', 'meeting', 'forecasts', 'h1', 'zurich', 'july', '27', 'swiss', 'chemicals', 'group', 'clariant', 'carrying', '20', 'billion', 'merger', 'u', 'peer', 'huntsman', 'reported', 'first', 'half', 'operating', 'profit', 'line', 'wi', 'guidance', 'year']]


In [177]:
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in full_content:
    for token in text:
        frequency[token] += 1

full_content = [[token for token in text if frequency[token] > 1] for text in full_content]
print full_content[:2]

[['clariant', 'keeps', 'full', 'year', 'guidance', 'meeting', 'forecasts', 'h1', 'zurich', 'july', '27', 'swiss', 'chemicals', 'group', 'clariant', 'carrying', '20', 'billion', 'merger', 'u', 'peer', 'huntsman', 'reported', 'first', 'half', 'operating', 'profit', 'line', 'guidance', 'year'], ['clariant', 'keeps', 'full', 'year', 'guidance', 'meeting', 'forecasts', 'h1', 'zurich', 'july', '27', 'swiss', 'chemicals', 'group', 'clariant', 'carrying', '20', 'billion', 'merger', 'u', 'peer', 'huntsman', 'reported', 'first', 'half', 'operating', 'profit', 'line', 'wi', 'guidance', 'year']]


In [178]:
dictionary = gensim.corpora.Dictionary(full_content)
dictionary.save('model/contents.dict')
print dictionary

2017-07-27 14:03:46,051 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-07-27 14:03:46,067 : INFO : built Dictionary(28 unique tokens: [u'zurich', u'h1', u'year', u'line', u'27']...) from 282 documents (total 8712 corpus positions)
2017-07-27 14:03:46,068 : INFO : saving Dictionary object under model/contents.dict, separately None
2017-07-27 14:03:46,069 : INFO : saved model/contents.dict


Dictionary(28 unique tokens: [u'zurich', u'h1', u'year', u'line', u'27']...)


In [179]:
corpus = [dictionary.doc2bow(text) for text in full_content]
gensim.corpora.MmCorpus.serialize('model/contents.mm', corpus)

2017-07-27 14:03:46,221 : INFO : storing corpus in Matrix Market format to model/contents.mm
2017-07-27 14:03:46,222 : INFO : saving sparse matrix to model/contents.mm
2017-07-27 14:03:46,222 : INFO : PROGRESS: saving document #0
2017-07-27 14:03:46,245 : INFO : saved 282x28 matrix, density=99.658% (7869/7896)
2017-07-27 14:03:46,245 : INFO : saving MmCorpus index to model/contents.mm.index


# LDA

In [180]:
# Config 
NUM_TOPICS = 10
NUM_TERMS = 100

In [181]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, 
                                      num_topics=10, update_every=1, 
                                      chunksize=10000, passes=1)

lda.save('model/lda.save')

2017-07-27 14:03:46,876 : INFO : using symmetric alpha at 0.1
2017-07-27 14:03:46,877 : INFO : using symmetric eta at 0.0357142857143
2017-07-27 14:03:46,877 : INFO : using serial LDA version on this node
2017-07-27 14:03:46,881 : INFO : running online LDA training, 10 topics, 1 passes over the supplied corpus of 282 documents, updating model once every 282 documents, evaluating perplexity every 282 documents, iterating 50x with a convergence threshold of 0.001000
2017-07-27 14:03:47,651 : INFO : -4.120 per-word bound, 17.4 perplexity estimate based on a held-out corpus of 282 documents with 8712 words
2017-07-27 14:03:47,651 : INFO : PROGRESS: pass 0, at document #282/282
2017-07-27 14:03:48,143 : INFO : topic #3 (0.100): 0.067*"year" + 0.063*"guidance" + 0.062*"clariant" + 0.038*"zurich" + 0.038*"group" + 0.037*"meeting" + 0.036*"20" + 0.036*"wi" + 0.036*"forecasts" + 0.035*"half"
2017-07-27 14:03:48,144 : INFO : topic #9 (0.100): 0.068*"guidance" + 0.050*"clariant" + 0.049*"year" + 

In [182]:
lda.print_topics(3)

2017-07-27 14:03:48,155 : INFO : topic #0 (0.100): 0.075*"guidance" + 0.057*"clariant" + 0.054*"year" + 0.041*"first" + 0.040*"peer" + 0.040*"carrying" + 0.039*"huntsman" + 0.038*"swiss" + 0.037*"u" + 0.037*"keeps"
2017-07-27 14:03:48,155 : INFO : topic #7 (0.100): 0.067*"year" + 0.063*"guidance" + 0.061*"clariant" + 0.036*"operating" + 0.036*"huntsman" + 0.036*"full" + 0.035*"reported" + 0.035*"profit" + 0.034*"july" + 0.034*"keeps"
2017-07-27 14:03:48,156 : INFO : topic #6 (0.100): 0.068*"clariant" + 0.067*"year" + 0.063*"guidance" + 0.034*"carrying" + 0.034*"chemicals" + 0.034*"line" + 0.034*"forecasts" + 0.034*"keeps" + 0.034*"first" + 0.033*"wi"


[(0,
  u'0.075*"guidance" + 0.057*"clariant" + 0.054*"year" + 0.041*"first" + 0.040*"peer" + 0.040*"carrying" + 0.039*"huntsman" + 0.038*"swiss" + 0.037*"u" + 0.037*"keeps"'),
 (7,
  u'0.067*"year" + 0.063*"guidance" + 0.061*"clariant" + 0.036*"operating" + 0.036*"huntsman" + 0.036*"full" + 0.035*"reported" + 0.035*"profit" + 0.034*"july" + 0.034*"keeps"'),
 (6,
  u'0.068*"clariant" + 0.067*"year" + 0.063*"guidance" + 0.034*"carrying" + 0.034*"chemicals" + 0.034*"line" + 0.034*"forecasts" + 0.034*"keeps" + 0.034*"first" + 0.033*"wi"')]

In [183]:
# select top 100 words for each of the 10 LDA topics
TOP_KEYWORDS = [[word for word, _ in lda.show_topic(topicno, topn=NUM_TERMS)]
             for topicno in range(lda.num_topics)]
print(TOP_KEYWORDS[:3])

[[u'guidance', u'clariant', u'year', u'first', u'peer', u'carrying', u'huntsman', u'swiss', u'u', u'keeps', u'full', u'wi', u'20', u'chemicals', u'group', u'meeting', u'27', u'merger', u'billion', u'forecasts', u'operating', u'half', u'reported', u'line', u'zurich', u'h1', u'profit', u'july'], [u'guidance', u'clariant', u'year', u'27', u'operating', u'wi', u'first', u'merger', u'half', u'peer', u'full', u'u', u'keeps', u'huntsman', u'h1', u'july', u'line', u'zurich', u'billion', u'chemicals', u'20', u'swiss', u'meeting', u'forecasts', u'reported', u'profit', u'carrying', u'group'], [u'year', u'guidance', u'clariant', u'meeting', u'profit', u'reported', u'h1', u'zurich', u'billion', u'chemicals', u'july', u'operating', u'huntsman', u'wi', u'27', u'20', u'carrying', u'half', u'group', u'swiss', u'first', u'peer', u'merger', u'line', u'full', u'keeps', u'u', u'forecasts']]


In [184]:
# Export 
n = 0
for _ in TOP_KEYWORDS:
    save_to = "exported/lda/lda_topic_%s.csv" % n
    pd.DataFrame(_, columns=['keyword']).to_csv(save_to, index=False)
    n += 1
    print 'Saved to ', save_to

Saved to  exported/lda/lda_topic_0.csv
Saved to  exported/lda/lda_topic_1.csv
Saved to  exported/lda/lda_topic_2.csv
Saved to  exported/lda/lda_topic_3.csv
Saved to  exported/lda/lda_topic_4.csv
Saved to  exported/lda/lda_topic_5.csv
Saved to  exported/lda/lda_topic_6.csv
Saved to  exported/lda/lda_topic_7.csv
Saved to  exported/lda/lda_topic_8.csv
Saved to  exported/lda/lda_topic_9.csv


# TF-IDF

In [185]:
NUM_CLUSTERS = 10
NUM_TERMS = 100

In [186]:
tfidf = gensim.models.tfidfmodel.TfidfModel(corpus, id2word=dictionary)
corpus_tfidf = tfidf[corpus]
tfidf.save("model/tfidf.save")

2017-07-27 14:03:48,177 : INFO : collecting document frequencies
2017-07-27 14:03:48,178 : INFO : PROGRESS: processing document #0
2017-07-27 14:03:48,180 : INFO : calculating IDF weights for 282 documents and 27 features (7869 matrix non-zeros)
2017-07-27 14:03:48,181 : INFO : saving TfidfModel object under model/tfidf.save, separately None
2017-07-27 14:03:48,182 : INFO : saved model/tfidf.save


In [187]:
corpus_tfidf_sorted = []
TOP_KEYWORDS = []
for c in corpus_tfidf:
    top_keywords = sorted(c, key=lambda t: t[1], reverse=True)
    top_keywords = top_keywords[:NUM_TERMS]
    corpus_tfidf_sorted.append(top_keywords)
    TOP_KEYWORDS.append([dictionary[id] for id, _ in top_keywords])

In [188]:
print TOP_KEYWORDS[:2]

[[u'year', u'guidance', u'zurich', u'h1', u'line', u'27', u'20', u'merger', u'meeting', u'full', u'huntsman', u'forecasts', u'reported', u'chemicals', u'carrying', u'operating', u'half', u'peer', u'group', u'july', u'billion', u'profit', u'u', u'swiss', u'first'], [u'wi', u'year', u'guidance', u'zurich', u'h1', u'line', u'27', u'20', u'merger', u'meeting', u'full', u'huntsman', u'forecasts', u'reported', u'chemicals', u'carrying', u'operating', u'half', u'peer', u'group', u'july', u'billion', u'profit', u'u', u'swiss', u'first']]


In [189]:
# Combine top keywords
import itertools
TOP_KEYWORDS_MERGED = list(itertools.chain(*TOP_KEYWORDS))
print "Total keywords: ", len(TOP_KEYWORDS_MERGED)
TOP_KEYWORDS_MERGED = list(set(TOP_KEYWORDS_MERGED))
print "Total keywords after combined: ", len(TOP_KEYWORDS_MERGED)
print TOP_KEYWORDS_MERGED[:10]

Total keywords:  7305
Total keywords after combined:  26
[u'zurich', u'h1', u'year', u'line', u'27', u'20', u'merger', u'guidance', u'meeting', u'full']


In [190]:
# Export 
save_to = "exported/tfidf/tfidf_keywords.csv"
pd.DataFrame(TOP_KEYWORDS_MERGED, columns=['keyword']).to_csv(save_to, index=False)
print 'Saved to ', save_to

Saved to  exported/tfidf/tfidf_keywords.csv


# TF-IDF with K-means

In [170]:
NUM_CLUSTERS = 10
NUM_TERMS = 100

In [171]:
from sklearn.cluster import KMeans
import numpy as np

In [172]:
X = np.array(corpus_tfidf).tolist()

In [173]:
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=0).fit(X)

ValueError: setting an array element with a sequence.