In [301]:
import glob
import gensim
import pandas as pd
import logging
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

wordnet_lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [437]:
KEYWORD_TYPE = 'entrepreneur' # entrepreneur, layoff, reuter
INPUT_FILE = "data/"+ KEYWORD_TYPE +".csv"

data = pd.read_csv(INPUT_FILE, encoding='utf-8', dtype={'short_text': str, 'long_text': str})
print 'Reading', INPUT_FILE, ' => ' , len(data.index), 'rows'
data.drop_duplicates(inplace=True)
data['short_text'] = data.short_text.astype(str)
print "Unique row:", len(data)
data.head()

Reading data/entrepreneur.csv  =>  826 rows
Unique row: 826


Unnamed: 0,url,title,short_text,long_text,tag,time
0,/article/296425,3 Touch Points to Better Engage a Multigenerat...,Smart HCM technology can help organizations cr...,Free Webinar | August 16thFind out how to opti...,Human Resources,2017-07-24 16:30:00
1,/article/297614,The 'Al Capone Theory' Shows How Sexual Harass...,"When you hire and partner, look for people who...","Recode reported yesterday that Vinod Khosla, t...",Inspiring Your Team,2017-07-21 17:48:00
2,/article/296710,This Question Reveals the Truth About Workplac...,How is your business handling workplace divers...,Imagine your organization is expanding into Te...,Inspiring Your Team,2017-07-13 16:45:00
3,/article/296696,How to Know It's Time to Add an HR Department,HR activities at startups are often reactive i...,"When starting their businesses, most entrepren...",Business Moving Forward,2017-07-10 19:15:00
4,/article/296461,4 Gamification Platforms That Show Why You Can...,"Companies are letting their employees play ""ga...",Free Webinar | August 16thFind out how to opti...,Gamification,2017-06-29 17:00:00


In [404]:
allFiles = glob.glob("data/*.csv")
data = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_, dtype={'short_text': str, 'long_text': str})
    list_.append(df)
data = pd.concat(list_)

data.drop_duplicates(inplace=True)
print "Unique row:", len(data)
data.head()

data.to_csv("concat.csv")

Unique row: 1858


In [438]:
# remove common words and tokenize
data['full_content'] = data['title'] + ' ' + data['long_text']
full_content = data['full_content'].tolist()
full_content = [[wordnet_lemmatizer.lemmatize(z) for z in tokenizer.tokenize(str(t).decode('utf-8').lower()) if z not in STOPWORDS] for t in full_content]
# full_content = [[z for z in str(t).lower().split(" ") if z not in STOPWORDS] for t in full_content]
print full_content[:2]
with open("full_content", "wb") as f:
    f.write(str(full_content))

[[u'3', u'touch', u'point', u'better', u'engage', u'multigenerational', u'workforce', u'free', u'webinar', u'august', u'16thfind', u'optimize', u'website', u'give', u'customer', u'experience', u'biggest', u'roi', u'business', u'register', u'many', u'workplace', u'today', u'unprecedented', u'position', u'five', u'generation', u'working', u'together', u'side', u'side', u'exact', u'definition', u'generation', u'may', u'vary', u'slightly', u'office', u'workplace', u'today', u'could', u'include', u'member', u'traditionalist', u'born', u'1927', u'1945', u'baby', u'boomer', u'1946', u'1964', u'generation', u'x', u'1965', u'1980', u'millennials', u'generation', u'1981', u'1996', u'generation', u'z', u'born', u'1997', u'later', u'would', u'agree', u'generalization', u'like', u'generational', u'bucket', u'helpful', u'point', u'multigenerational', u'workforce', u'challenge', u'employer', u'meet', u'broad', u'range', u'need', u'expectation', u'making', u'matter', u'complicated', u'typical', u'full

In [439]:
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in full_content:
    for token in text:
        frequency[token] += 1

# full_content = [[token for token in text if frequency[token] > 1] for text in full_content]
print full_content[:2]

[[u'3', u'touch', u'point', u'better', u'engage', u'multigenerational', u'workforce', u'free', u'webinar', u'august', u'16thfind', u'optimize', u'website', u'give', u'customer', u'experience', u'biggest', u'roi', u'business', u'register', u'many', u'workplace', u'today', u'unprecedented', u'position', u'five', u'generation', u'working', u'together', u'side', u'side', u'exact', u'definition', u'generation', u'may', u'vary', u'slightly', u'office', u'workplace', u'today', u'could', u'include', u'member', u'traditionalist', u'born', u'1927', u'1945', u'baby', u'boomer', u'1946', u'1964', u'generation', u'x', u'1965', u'1980', u'millennials', u'generation', u'1981', u'1996', u'generation', u'z', u'born', u'1997', u'later', u'would', u'agree', u'generalization', u'like', u'generational', u'bucket', u'helpful', u'point', u'multigenerational', u'workforce', u'challenge', u'employer', u'meet', u'broad', u'range', u'need', u'expectation', u'making', u'matter', u'complicated', u'typical', u'full

In [440]:
for i in frequency.keys():
    if 'layoff' in i:
        print i, frequency[i]

layoff 43


In [441]:
dictionary = gensim.corpora.Dictionary(full_content)
dictionary.save('model/contents.dict')
print dictionary

2017-07-28 11:18:41,926 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-07-28 11:18:42,213 : INFO : built Dictionary(18469 unique tokens: [u'aided', u'himmelryan', u'opinionsdr', u'fishermargie', u'candidates6']...) from 826 documents (total 305244 corpus positions)
2017-07-28 11:18:42,213 : INFO : saving Dictionary object under model/contents.dict, separately None
2017-07-28 11:18:42,254 : INFO : saved model/contents.dict


Dictionary(18469 unique tokens: [u'aided', u'himmelryan', u'opinionsdr', u'fishermargie', u'candidates6']...)


In [442]:
corpus = [dictionary.doc2bow(text) for text in full_content]
gensim.corpora.MmCorpus.serialize('model/contents.mm', corpus)

2017-07-28 11:18:44,603 : INFO : storing corpus in Matrix Market format to model/contents.mm
2017-07-28 11:18:44,605 : INFO : saving sparse matrix to model/contents.mm
2017-07-28 11:18:44,605 : INFO : PROGRESS: saving document #0
2017-07-28 11:18:45,192 : INFO : saved 826x18469 matrix, density=1.293% (197310/15255394)
2017-07-28 11:18:45,193 : INFO : saving MmCorpus index to model/contents.mm.index


# LDA

In [443]:
# Config 
NUM_TOPICS = 10
NUM_TERMS = 100

In [444]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, 
                                      num_topics=NUM_TOPICS, update_every=1, 
                                      chunksize=10000, passes=1)

lda.save('model/lda.save')

2017-07-28 11:18:53,041 : INFO : using symmetric alpha at 0.1
2017-07-28 11:18:53,041 : INFO : using symmetric eta at 5.41447831501e-05
2017-07-28 11:18:53,045 : INFO : using serial LDA version on this node
2017-07-28 11:18:54,816 : INFO : running online LDA training, 10 topics, 1 passes over the supplied corpus of 826 documents, updating model once every 826 documents, evaluating perplexity every 826 documents, iterating 50x with a convergence threshold of 0.001000
2017-07-28 11:19:03,794 : INFO : -11.083 per-word bound, 2169.0 perplexity estimate based on a held-out corpus of 826 documents with 305244 words
2017-07-28 11:19:03,795 : INFO : PROGRESS: pass 0, at document #826/826
2017-07-28 11:19:05,591 : INFO : topic #4 (0.100): 0.015*"employee" + 0.010*"company" + 0.008*"business" + 0.007*"time" + 0.006*"people" + 0.005*"team" + 0.005*"work" + 0.005*"job" + 0.004*"say" + 0.004*"make"
2017-07-28 11:19:05,592 : INFO : topic #7 (0.100): 0.013*"employee" + 0.012*"company" + 0.008*"busine

In [445]:
lda.print_topics(2, 100)

2017-07-28 11:19:05,652 : INFO : topic #8 (0.100): 0.021*"employee" + 0.009*"company" + 0.008*"work" + 0.008*"business" + 0.006*"time" + 0.006*"hr" + 0.006*"job" + 0.006*"related" + 0.005*"need" + 0.005*"say" + 0.005*"team" + 0.005*"one" + 0.004*"new" + 0.004*"make" + 0.004*"percent" + 0.004*"get" + 0.003*"may" + 0.003*"process" + 0.003*"performance" + 0.003*"hire" + 0.003*"employer" + 0.003*"manager" + 0.003*"help" + 0.003*"would" + 0.003*"people" + 0.003*"hiring" + 0.003*"best" + 0.003*"way" + 0.003*"take" + 0.003*"worker" + 0.003*"organization" + 0.003*"leader" + 0.003*"want" + 0.002*"small" + 0.002*"like" + 0.002*"year" + 0.002*"first" + 0.002*"read" + 0.002*"also" + 0.002*"human" + 0.002*"experience" + 0.002*"know" + 0.002*"candidate" + 0.002*"many" + 0.002*"great" + 0.002*"resource" + 0.002*"plan" + 0.002*"find" + 0.002*"use" + 0.002*"cost" + 0.002*"person" + 0.002*"pay" + 0.002*"right" + 0.002*"skill" + 0.002*"management" + 0.002*"good" + 0.002*"important" + 0.002*"data" + 0.002

[(8,
  u'0.021*"employee" + 0.009*"company" + 0.008*"work" + 0.008*"business" + 0.006*"time" + 0.006*"hr" + 0.006*"job" + 0.006*"related" + 0.005*"need" + 0.005*"say" + 0.005*"team" + 0.005*"one" + 0.004*"new" + 0.004*"make" + 0.004*"percent" + 0.004*"get" + 0.003*"may" + 0.003*"process" + 0.003*"performance" + 0.003*"hire" + 0.003*"employer" + 0.003*"manager" + 0.003*"help" + 0.003*"would" + 0.003*"people" + 0.003*"hiring" + 0.003*"best" + 0.003*"way" + 0.003*"take" + 0.003*"worker" + 0.003*"organization" + 0.003*"leader" + 0.003*"want" + 0.002*"small" + 0.002*"like" + 0.002*"year" + 0.002*"first" + 0.002*"read" + 0.002*"also" + 0.002*"human" + 0.002*"experience" + 0.002*"know" + 0.002*"candidate" + 0.002*"many" + 0.002*"great" + 0.002*"resource" + 0.002*"plan" + 0.002*"find" + 0.002*"use" + 0.002*"cost" + 0.002*"person" + 0.002*"pay" + 0.002*"right" + 0.002*"skill" + 0.002*"management" + 0.002*"good" + 0.002*"important" + 0.002*"data" + 0.002*"might" + 0.002*"someone" + 0.002*"talent

In [446]:
# select top 100 words for each of the 10 LDA topics
TOP_KEYWORDS = [[word for word, _ in lda.show_topic(topicno, topn=NUM_TERMS)]
             for topicno in range(lda.num_topics)]
print(TOP_KEYWORDS[:3])

[[u'employee', u'company', u'business', u'time', u'work', u'people', u'need', u'say', u'job', u'help', u'one', u'way', u'related', u'new', u'year', u'hr', u'may', u'team', u'employer', u'entrepreneur', u'candidate', u'make', u'hire', u'also', u'hiring', u'read', u'get', u'percent', u'take', u'process', u'thing', u'worker', u'right', u'even', u'plan', u'resource', u'small', u'performance', u'know', u'leader', u'best', u'many', u'like', u'talent', u'go', u'question', u'based', u'experience', u'pay', u'person', u'benefit', u'want', u'give', u'important', u'well', u'often', u'organization', u'others', u'goal', u'someone', u'human', u'culture', u'ceo', u'u', u'interview', u'service', u'individual', u'manager', u'owner', u'find', u'leave', u'skill', u'customer', u'would', u'good', u'use', u'3', u'keep', u'day', u'issue', u'first', u'could', u'workplace', u'management', u'start', u'big', u'ask', u'better', u'offer', u'think', u'provide', u'information', u'review', u'1', u'every', u'place', u'

In [447]:
# Export 
n = 0
combined = []
for _ in TOP_KEYWORDS:
    combined += _

combined_df = pd.DataFrame(combined)
combined_df = pd.DataFrame(combined_df[0].value_counts().sort_values()).reset_index()
combined_df.columns = ['keyword', 'c']

save_to = "exported/%s.csv" % KEYWORD_TYPE
combined_df.to_csv(save_to, index=False)
print 'Saved to ', save_to

Saved to  exported/entrepreneur.csv


In [312]:
break

SyntaxError: 'break' outside loop (<ipython-input-312-b20fcc2397e7>, line 1)

# TF-IDF

In [None]:
NUM_CLUSTERS = 10
NUM_TERMS = 50

In [None]:
tfidf = gensim.models.tfidfmodel.TfidfModel(corpus, id2word=dictionary)
corpus_tfidf = tfidf[corpus]
tfidf.save("model/tfidf.save")

In [None]:
corpus_tfidf_sorted = []
TOP_KEYWORDS = []
for c in corpus_tfidf:
    top_keywords = sorted(c, key=lambda t: t[1], reverse=True)
    top_keywords = top_keywords[:NUM_TERMS]
    corpus_tfidf_sorted.append(top_keywords)
    TOP_KEYWORDS.append([dictionary[id] for id, _ in top_keywords])

In [None]:
print TOP_KEYWORDS[:2]

In [None]:
# Combine top keywords
import itertools
TOP_KEYWORDS_MERGED = list(itertools.chain(*TOP_KEYWORDS))
print "Total keywords: ", len(TOP_KEYWORDS_MERGED)
TOP_KEYWORDS_MERGED = list(set(TOP_KEYWORDS_MERGED))
print "Total keywords after combined: ", len(TOP_KEYWORDS_MERGED)
print TOP_KEYWORDS_MERGED[:10]

In [None]:
# Export 
save_to = "exported/tfidf/tfidf_keywords.csv"
pd.DataFrame(TOP_KEYWORDS_MERGED, columns=['keyword']).to_csv(save_to, index=False, encoding='utf-8')
print 'Saved to ', save_to

# TF-IDF with K-means

In [None]:
# NUM_CLUSTERS = 10
# NUM_TERMS = 100

In [None]:
# from sklearn.cluster import KMeans
# from scipy.sparse import csr_matrix
# import numpy as np

In [None]:
# _corpus_tfidf = gensim.matutils.corpus2csc(corpus_tfidf).transpose()
# print _corpus_tfidf[:5]

In [None]:
# kmeans = KMeans(n_clusters=NUM_CLUSTERS)
# X = kmeans.fit_transform(_corpus_tfidf)

In [None]:
# X.shape

In [None]:
# dir(kmeans)
# kmeans.n_clusters