In [23]:
import gensim
import pandas as pd
import logging
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'\w+')
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [24]:
data = pd.read_csv("data/sample.csv")
data.head()

Unnamed: 0,url,title,content,tag
0,https://www.reuters.com/article/clariant-resul...,Clariant keeps full year guidance after meetin...,"ZURICH, July 27 Swiss chemicals group Clariant...",markets
1,https://www.reuters.com/article/clariant-resul...,Clariant keeps full year guidance after meetin...,"ZURICH, July 27 Swiss chemicals group Clariant...",markets
2,https://www.reuters.com/article/clariant-resul...,Clariant keeps full year guidance after meetin...,"ZURICH, July 27 Swiss chemicals group Clariant...",markets
3,https://www.reuters.com/article/clariant-resul...,Clariant keeps full year guidance after meetin...,"ZURICH, July 27 Swiss chemicals group Clariant...",markets
4,https://www.reuters.com/article/clariant-resul...,Clariant keeps full year guidance after meetin...,"ZURICH, July 27 Swiss chemicals group Clariant...",markets


In [25]:
# remove common words and tokenize
data['full_content'] = data['title'] + ' ' + data['content']
full_content = data['full_content'].tolist()
full_content = [[z for z in tokenizer.tokenize(t.lower()) if z not in STOPWORDS] for t in full_content]
print full_content[:2]

[['clariant', 'keeps', 'full', 'year', 'guidance', 'meeting', 'forecasts', 'h1', 'zurich', 'july', '27', 'swiss', 'chemicals', 'group', 'clariant', 'carrying', '20', 'billion', 'merger', 'u', 'peer', 'huntsman', 'reported', 'first', 'half', 'operating', 'profit', 'line', 'expectations', 'thursday', 'confirmed', 'guidance', 'year'], ['clariant', 'keeps', 'full', 'year', 'guidance', 'meeting', 'forecasts', 'h1', 'zurich', 'july', '27', 'swiss', 'chemicals', 'group', 'clariant', 'carrying', '20', 'billion', 'merger', 'u', 'peer', 'huntsman', 'reported', 'first', 'half', 'operating', 'profit', 'line', 'wi', 'guidance', 'year']]


In [26]:
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in full_content:
    for token in text:
        frequency[token] += 1

full_content = [[token for token in text if frequency[token] > 1] for text in full_content]
print full_content[:2]

[['clariant', 'keeps', 'full', 'year', 'guidance', 'meeting', 'forecasts', 'h1', 'zurich', 'july', '27', 'swiss', 'chemicals', 'group', 'clariant', 'carrying', '20', 'billion', 'merger', 'u', 'peer', 'huntsman', 'reported', 'first', 'half', 'operating', 'profit', 'line', 'guidance', 'year'], ['clariant', 'keeps', 'full', 'year', 'guidance', 'meeting', 'forecasts', 'h1', 'zurich', 'july', '27', 'swiss', 'chemicals', 'group', 'clariant', 'carrying', '20', 'billion', 'merger', 'u', 'peer', 'huntsman', 'reported', 'first', 'half', 'operating', 'profit', 'line', 'wi', 'guidance', 'year']]


In [27]:
dictionary = gensim.corpora.Dictionary(full_content)
dictionary.save('model/contents.dict')
print dictionary

2017-07-27 14:04:53,866 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-07-27 14:04:53,882 : INFO : built Dictionary(28 unique tokens: [u'zurich', u'h1', u'year', u'line', u'27']...) from 282 documents (total 8712 corpus positions)
2017-07-27 14:04:53,884 : INFO : saving Dictionary object under model/contents.dict, separately None
2017-07-27 14:04:53,885 : INFO : saved model/contents.dict


Dictionary(28 unique tokens: [u'zurich', u'h1', u'year', u'line', u'27']...)


In [28]:
corpus = [dictionary.doc2bow(text) for text in full_content]
gensim.corpora.MmCorpus.serialize('model/contents.mm', corpus)

2017-07-27 14:04:53,899 : INFO : storing corpus in Matrix Market format to model/contents.mm
2017-07-27 14:04:53,900 : INFO : saving sparse matrix to model/contents.mm
2017-07-27 14:04:53,901 : INFO : PROGRESS: saving document #0
2017-07-27 14:04:53,926 : INFO : saved 282x28 matrix, density=99.658% (7869/7896)
2017-07-27 14:04:53,927 : INFO : saving MmCorpus index to model/contents.mm.index


# LDA

In [29]:
# Config 
NUM_TOPICS = 10
NUM_TERMS = 100

In [30]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, 
                                      num_topics=10, update_every=1, 
                                      chunksize=10000, passes=1)

lda.save('model/lda.save')

2017-07-27 14:04:53,935 : INFO : using symmetric alpha at 0.1
2017-07-27 14:04:53,936 : INFO : using symmetric eta at 0.0357142857143
2017-07-27 14:04:53,936 : INFO : using serial LDA version on this node
2017-07-27 14:04:53,940 : INFO : running online LDA training, 10 topics, 1 passes over the supplied corpus of 282 documents, updating model once every 282 documents, evaluating perplexity every 282 documents, iterating 50x with a convergence threshold of 0.001000
2017-07-27 14:04:54,650 : INFO : -4.113 per-word bound, 17.3 perplexity estimate based on a held-out corpus of 282 documents with 8712 words
2017-07-27 14:04:54,650 : INFO : PROGRESS: pass 0, at document #282/282
2017-07-27 14:04:55,105 : INFO : topic #8 (0.100): 0.038*"clariant" + 0.037*"guidance" + 0.036*"year" + 0.036*"h1" + 0.036*"group" + 0.036*"chemicals" + 0.036*"forecasts" + 0.036*"huntsman" + 0.036*"u" + 0.036*"profit"
2017-07-27 14:04:55,106 : INFO : topic #5 (0.100): 0.067*"clariant" + 0.067*"guidance" + 0.056*"yea

In [31]:
lda.print_topics(3)

2017-07-27 14:04:55,116 : INFO : topic #8 (0.100): 0.038*"clariant" + 0.037*"guidance" + 0.036*"year" + 0.036*"h1" + 0.036*"group" + 0.036*"chemicals" + 0.036*"forecasts" + 0.036*"huntsman" + 0.036*"u" + 0.036*"profit"
2017-07-27 14:04:55,117 : INFO : topic #5 (0.100): 0.067*"clariant" + 0.067*"guidance" + 0.056*"year" + 0.037*"zurich" + 0.037*"profit" + 0.036*"20" + 0.036*"full" + 0.034*"july" + 0.033*"chemicals" + 0.033*"27"
2017-07-27 14:04:55,119 : INFO : topic #9 (0.100): 0.075*"year" + 0.062*"clariant" + 0.060*"guidance" + 0.038*"group" + 0.036*"u" + 0.036*"carrying" + 0.035*"operating" + 0.035*"27" + 0.034*"july" + 0.034*"peer"


[(8,
  u'0.038*"clariant" + 0.037*"guidance" + 0.036*"year" + 0.036*"h1" + 0.036*"group" + 0.036*"chemicals" + 0.036*"forecasts" + 0.036*"huntsman" + 0.036*"u" + 0.036*"profit"'),
 (5,
  u'0.067*"clariant" + 0.067*"guidance" + 0.056*"year" + 0.037*"zurich" + 0.037*"profit" + 0.036*"20" + 0.036*"full" + 0.034*"july" + 0.033*"chemicals" + 0.033*"27"'),
 (9,
  u'0.075*"year" + 0.062*"clariant" + 0.060*"guidance" + 0.038*"group" + 0.036*"u" + 0.036*"carrying" + 0.035*"operating" + 0.035*"27" + 0.034*"july" + 0.034*"peer"')]

In [32]:
# select top 100 words for each of the 10 LDA topics
TOP_KEYWORDS = [[word for word, _ in lda.show_topic(topicno, topn=NUM_TERMS)]
             for topicno in range(lda.num_topics)]
print(TOP_KEYWORDS[:3])

[[u'clariant', u'year', u'guidance', u'swiss', u'forecasts', u'merger', u'chemicals', u'profit', u'line', u'billion', u'h1', u'half', u'operating', u'u', u'reported', u'keeps', u'wi', u'first', u'july', u'carrying', u'20', u'full', u'meeting', u'zurich', u'peer', u'huntsman', u'27', u'group'], [u'clariant', u'guidance', u'year', u'20', u'july', u'group', u'zurich', u'carrying', u'h1', u'full', u'profit', u'line', u'forecasts', u'keeps', u'merger', u'chemicals', u'half', u'27', u'huntsman', u'billion', u'operating', u'meeting', u'reported', u'u', u'swiss', u'wi', u'first', u'peer'], [u'guidance', u'year', u'clariant', u'reported', u'peer', u'wi', u'meeting', u'huntsman', u'27', u'zurich', u'full', u'20', u'first', u'half', u'july', u'line', u'u', u'merger', u'group', u'swiss', u'operating', u'chemicals', u'keeps', u'carrying', u'billion', u'h1', u'forecasts', u'profit']]


In [33]:
# Export 
n = 0
for _ in TOP_KEYWORDS:
    save_to = "exported/lda/lda_topic_%s.csv" % n
    pd.DataFrame(_, columns=['keyword']).to_csv(save_to, index=False)
    n += 1
    print 'Saved to ', save_to

Saved to  exported/lda/lda_topic_0.csv
Saved to  exported/lda/lda_topic_1.csv
Saved to  exported/lda/lda_topic_2.csv
Saved to  exported/lda/lda_topic_3.csv
Saved to  exported/lda/lda_topic_4.csv
Saved to  exported/lda/lda_topic_5.csv
Saved to  exported/lda/lda_topic_6.csv
Saved to  exported/lda/lda_topic_7.csv
Saved to  exported/lda/lda_topic_8.csv
Saved to  exported/lda/lda_topic_9.csv


# TF-IDF

In [34]:
NUM_CLUSTERS = 10
NUM_TERMS = 100

In [35]:
tfidf = gensim.models.tfidfmodel.TfidfModel(corpus, id2word=dictionary)
corpus_tfidf = tfidf[corpus]
tfidf.save("model/tfidf.save")

2017-07-27 14:04:55,148 : INFO : collecting document frequencies
2017-07-27 14:04:55,148 : INFO : PROGRESS: processing document #0
2017-07-27 14:04:55,150 : INFO : calculating IDF weights for 282 documents and 27 features (7869 matrix non-zeros)
2017-07-27 14:04:55,152 : INFO : saving TfidfModel object under model/tfidf.save, separately None
2017-07-27 14:04:55,153 : INFO : saved model/tfidf.save


In [36]:
corpus_tfidf_sorted = []
TOP_KEYWORDS = []
for c in corpus_tfidf:
    top_keywords = sorted(c, key=lambda t: t[1], reverse=True)
    top_keywords = top_keywords[:NUM_TERMS]
    corpus_tfidf_sorted.append(top_keywords)
    TOP_KEYWORDS.append([dictionary[id] for id, _ in top_keywords])

In [37]:
print TOP_KEYWORDS[:2]

[[u'year', u'guidance', u'zurich', u'h1', u'line', u'27', u'20', u'merger', u'meeting', u'full', u'huntsman', u'forecasts', u'reported', u'chemicals', u'carrying', u'operating', u'half', u'peer', u'group', u'july', u'billion', u'profit', u'u', u'swiss', u'first'], [u'wi', u'year', u'guidance', u'zurich', u'h1', u'line', u'27', u'20', u'merger', u'meeting', u'full', u'huntsman', u'forecasts', u'reported', u'chemicals', u'carrying', u'operating', u'half', u'peer', u'group', u'july', u'billion', u'profit', u'u', u'swiss', u'first']]


In [38]:
# Combine top keywords
import itertools
TOP_KEYWORDS_MERGED = list(itertools.chain(*TOP_KEYWORDS))
print "Total keywords: ", len(TOP_KEYWORDS_MERGED)
TOP_KEYWORDS_MERGED = list(set(TOP_KEYWORDS_MERGED))
print "Total keywords after combined: ", len(TOP_KEYWORDS_MERGED)
print TOP_KEYWORDS_MERGED[:10]

Total keywords:  7305
Total keywords after combined:  26
[u'zurich', u'h1', u'year', u'line', u'27', u'20', u'merger', u'guidance', u'meeting', u'full']


In [39]:
# Export 
save_to = "exported/tfidf/tfidf_keywords.csv"
pd.DataFrame(TOP_KEYWORDS_MERGED, columns=['keyword']).to_csv(save_to, index=False)
print 'Saved to ', save_to

Saved to  exported/tfidf/tfidf_keywords.csv


# TF-IDF with K-means

In [40]:
NUM_CLUSTERS = 10
NUM_TERMS = 100

In [41]:
from sklearn.cluster import KMeans
import numpy as np

In [42]:
# X = np.array(corpus_tfidf).tolist()

In [43]:
# kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=0).fit(X)