In [1]:
import pandas as pd
import re
import gensim
import pickle
from datetime import datetime
from matplotlib import pyplot as plt
import networkx as nx
import operator
%matplotlib inline

In [2]:
# Load the raw CSV exports.
# data_folder holds the input CSVs; util_folder is where derived artifacts
# (dictionary, trained models) are saved and loaded further down.
data_folder = '../data/csv_export/'
util_folder = '../util/'
# Force the two funding-date columns to be read as plain strings instead of
# letting pandas infer a type for them.
df_organizations = pd.read_csv(
    data_folder + 'organizations.csv',
    dtype={'first_funding_on': str, 'last_funding_on': str},
)
df_description = pd.read_csv(data_folder + 'organization_descriptions.csv')

In [4]:
# df_organizations['founded_on'] = pd.to_datetime(df_organizations['founded_on'], errors='ignore')

In [3]:
def clean(string):
    """Turn raw text into a list of lower-case alphabetic word tokens.

    Every character outside a-z / A-Z is treated as a separator, the text is
    lower-cased, and tokens of length 1 are discarded.

    input: string
    output: list of clean word tokens
    raises: TypeError when ``string`` is not text (e.g. a float NaN), because
        ``re.sub`` rejects it.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", string)
    tokens = letters_only.lower().split()
    return [token for token in tokens if len(token) > 1]

In [4]:
# 1. create corpus (sentences): one token list per row of df_description
# 2. learn bigram from corpus
print('reading descriptions')
sentences = []
# Iterate the column directly; iterrows() builds a Series per row and is far
# slower, while only the 'description' value is needed here.
for index, description in zip(df_description.index, df_description['description']):
    try:
        token_list = clean(description)
    except TypeError:
        # clean() raises TypeError for non-text values (missing descriptions
        # show up as NaN). Log the value and store an EMPTY document so that
        # `sentences` stays aligned row-for-row with df_description, rather
        # than re-appending the previous row's tokens.
        print(description)
        token_list = []
    sentences.append(token_list)
    if index % 50000 == 0:
        print('{} / {}'.format(index, df_description.shape[0]))

print('learning bigrams')
bigram_transformer = gensim.models.Phrases(sentences)
print('getting bigram sentences')
# Indexing the transformer with the corpus yields the corpus with detected
# bigrams applied.
bigram_sentences = bigram_transformer[sentences]

reading descriptions
0 / 346275
50000 / 346275
nan
nan
100000 / 346275
150000 / 346275
200000 / 346275
250000 / 346275
300000 / 346275
nan
learning bigrams
getting bigram sentences




In [16]:
# Switch: True rebuilds the dictionary from the bigram corpus and saves it;
# False skips straight to loading the previously saved copy.
CREATE = False
if CREATE:
    print('creating dictionary of {} documents'.format(len(sentences)))
    # Passing the corpus to the constructor registers all documents at once.
    dictionary = gensim.corpora.dictionary.Dictionary(bigram_sentences)

    # Prune tokens seen in fewer than 10 documents or in more than 10% of them.
    print('before: {}'.format(dictionary))
    dictionary.filter_extremes(no_below=10, no_above=0.1)
    print('after: {}'.format(dictionary))

    # Persist so later runs can leave CREATE off.
    dictionary.save(util_folder + 'dictionary')

# Always (re)load from disk, whether or not it was just rebuilt.
dictionary = gensim.corpora.dictionary.Dictionary.load(util_folder + 'dictionary')

creating dictionary of 346275 documents
before: Dictionary(128515 unique tokens: [u'must_pass', u'trekkingpartners', u'majumder', u'digipath', u'capistrano_california']...)
after: Dictionary(15275 unique tokens: [u'publicly_traded', u'yellow', u'advices', u'different_categories', u'woods']...)


In [32]:
# Convert every tokenized description into the bag-of-words form used for LDA:
# apply the learned bigram transformer, then map tokens to (id, count) pairs
# with the filtered dictionary.
bigram_sentences_for_lda = []
for count, sentence in enumerate(sentences):
    # Progress report every 10,000 documents. print() is called with a single
    # pre-formatted string so the output is identical on Python 2 and 3.
    if count % 10000 == 0:
        print('{} {}'.format(count, len(sentences)))
    bigram_sentence = bigram_transformer[sentence]
    bigram_sentences_for_lda.append(dictionary.doc2bow(bigram_sentence))

0 346275
10000 346275
20000 346275
30000 346275
40000 346275
50000 346275
60000 346275
70000 346275
80000 346275
90000 346275
100000 346275
110000 346275
120000 346275
130000 346275
140000 346275
150000 346275
160000 346275
170000 346275
180000 346275
190000 346275
200000 346275
210000 346275
220000 346275
230000 346275
240000 346275
250000 346275
260000 346275
270000 346275
280000 346275
290000 346275
300000 346275
310000 346275
320000 346275
330000 346275
340000 346275


In [42]:
# Trial run: fit an LDA model on only the first 10,000 bag-of-words documents,
# timing the fit with %time.
# num_topics is not passed, so the library default applies (presumably 100 —
# TODO confirm against the installed gensim version).
%time model = gensim.models.ldamodel.LdaModel(corpus = bigram_sentences_for_lda[:10000], id2word = dictionary)
# Last expression of the cell: the returned topic list becomes the cell output.
model.show_topics()

CPU times: user 30.5 s, sys: 353 ms, total: 30.8 s
Wall time: 31.1 s


[(69,
  u'0.028*"research" + 0.017*"state" + 0.017*"art" + 0.014*"patient" + 0.013*"organization" + 0.012*"owned" + 0.011*"clinical" + 0.010*"scientific" + 0.010*"north" + 0.009*"region"'),
 (15,
  u'0.025*"video" + 0.020*"videos" + 0.020*"investing" + 0.013*"home" + 0.013*"led" + 0.012*"statistics" + 0.012*"source" + 0.011*"content" + 0.011*"found" + 0.009*"natural"'),
 (46,
  u'0.015*"media" + 0.014*"board" + 0.012*"waters" + 0.011*"premier" + 0.011*"radio" + 0.009*"content" + 0.009*"gift" + 0.009*"adults" + 0.009*"medicine" + 0.008*"came"'),
 (60,
  u'0.059*"software" + 0.047*"web" + 0.027*"mobile" + 0.016*"applications" + 0.015*"website" + 0.011*"design" + 0.010*"customers" + 0.010*"enterprise" + 0.009*"solution" + 0.009*"integration"'),
 (74,
  u'0.017*"will" + 0.012*"education" + 0.011*"skills" + 0.011*"uk" + 0.011*"opportunity" + 0.010*"organic" + 0.010*"program" + 0.010*"electrical" + 0.010*"residents" + 0.009*"designed"'),
 (72,
  u'0.045*"students" + 0.036*"school" + 0.031*"c

In [43]:
# Fit LDA on the full bag-of-words corpus, timing the fit with %time. This
# rebinds `model`, replacing the 10,000-document trial model.
# NOTE(review): no random_state is passed, so the topics are presumably not
# reproducible between runs — confirm and consider fixing a seed.
%time model = gensim.models.ldamodel.LdaModel(corpus = bigram_sentences_for_lda, id2word = dictionary)
# Last expression of the cell: the returned topic list becomes the cell output.
model.show_topics()

CPU times: user 11min 54s, sys: 7.61 s, total: 12min 2s
Wall time: 12min 7s


[(34,
  u'0.030*"we_re" + 0.027*"everyone" + 0.022*"great" + 0.021*"love" + 0.018*"but" + 0.016*"mind" + 0.014*"think" + 0.013*"getting" + 0.013*"do_not" + 0.013*"too"'),
 (79,
  u'0.036*"innovation" + 0.030*"entrepreneurs" + 0.021*"growth" + 0.016*"ideas" + 0.016*"start_up" + 0.015*"global" + 0.014*"vision" + 0.013*"grow" + 0.012*"culture" + 0.012*"build"'),
 (62,
  u'0.127*"digital" + 0.125*"brands" + 0.099*"consumers" + 0.065*"consumer" + 0.034*"premium" + 0.024*"award_winning" + 0.018*"leading" + 0.017*"combines" + 0.017*"owns" + 0.017*"exclusive"'),
 (15,
  u'0.118*"businesses" + 0.098*"small" + 0.033*"enterprise" + 0.023*"large" + 0.021*"buyers" + 0.018*"microsoft" + 0.017*"marketplace" + 0.015*"accounting" + 0.013*"business_owners" + 0.013*"workflow"'),
 (0,
  u'0.041*"projects" + 0.038*"project" + 0.026*"over" + 0.023*"has_been" + 0.022*"since" + 0.020*"first" + 0.019*"now" + 0.018*"one" + 0.018*"currently" + 0.016*"year"'),
 (1,
  u'0.052*"commercial" + 0.045*"house" + 0.042*"

In [53]:
# Print every topic of `model`, one per line, as "<topic id> <top words>".
# The topic count is read from the model instead of being hard-coded, and
# print() receives one pre-formatted string so the output is the same on
# Python 2 and Python 3.
for i in range(model.num_topics):
    print('{} {}'.format(i, model.print_topic(i)))

0 0.041*"projects" + 0.038*"project" + 0.026*"over" + 0.023*"has_been" + 0.022*"since" + 0.020*"first" + 0.019*"now" + 0.018*"one" + 0.018*"currently" + 0.016*"year"
1 0.052*"commercial" + 0.045*"house" + 0.042*"agents" + 0.035*"booking" + 0.018*"guides" + 0.018*"residential" + 0.018*"guests" + 0.018*"bar" + 0.018*"all_types" + 0.017*"quick"
2 0.201*"app" + 0.095*"llc" + 0.048*"mobile_app" + 0.046*"developer" + 0.042*"apps" + 0.041*"has_developed" + 0.034*"over_million" + 0.034*"ios" + 0.028*"original" + 0.024*"apple"
3 0.097*"tech" + 0.048*"natural" + 0.026*"player" + 0.026*"photography" + 0.025*"senior" + 0.025*"near" + 0.023*"organic" + 0.021*"certain" + 0.021*"found" + 0.021*"lets"
4 0.053*"face" + 0.045*"cash" + 0.043*"machine" + 0.042*"contacts" + 0.040*"starting" + 0.036*"changes" + 0.035*"hub" + 0.030*"rapid" + 0.025*"machines" + 0.024*"manner"
5 0.105*"marketing" + 0.039*"social_media" + 0.033*"web" + 0.029*"advertising" + 0.022*"sales" + 0.020*"search_engine" + 0.020*"clients

92 0.045*"risk" + 0.043*"advice" + 0.040*"compliance" + 0.034*"globally" + 0.033*"billion" + 0.031*"board" + 0.028*"managers" + 0.027*"european" + 0.023*"germany" + 0.022*"consulting"
93 0.147*"network" + 0.048*"cloud" + 0.043*"infrastructure" + 0.039*"networks" + 0.028*"secure" + 0.027*"wireless" + 0.025*"provider" + 0.024*"internet" + 0.024*"storage" + 0.015*"customers"
94 0.066*"students" + 0.042*"university" + 0.030*"school" + 0.026*"college" + 0.018*"student" + 0.014*"education" + 0.014*"academic" + 0.013*"campus" + 0.012*"programs" + 0.011*"schools"
95 0.096*"database" + 0.079*"mobile_application" + 0.053*"list" + 0.032*"learn_more" + 0.031*"recognition" + 0.031*"function" + 0.030*"result" + 0.021*"high_tech" + 0.021*"databases" + 0.020*"clinics"
96 0.055*"usa" + 0.053*"transportation" + 0.051*"logistics" + 0.044*"ads" + 0.037*"campaign" + 0.036*"ad" + 0.032*"segment" + 0.031*"builds" + 0.029*"contract" + 0.028*"third_party"
97 0.315*"mobile" + 0.107*"applications" + 0.051*"limit

In [55]:
%time model_500 = gensim.models.ldamodel.LdaModel(corpus = bigram_sentences_for_lda, id2word = dictionary, iterations = 500)
model_500.show_topics()
model.save(util_folder + 'topic_500')

CPU times: user 11min 50s, sys: 7.31 s, total: 11min 57s
Wall time: 11min 59s


[(44,
  u'0.056*"family" + 0.054*"children" + 0.034*"safe" + 0.030*"series" + 0.029*"kids" + 0.026*"patented" + 0.025*"television" + 0.021*"produces" + 0.017*"shows" + 0.016*"less_than"'),
 (37,
  u'0.085*"provider" + 0.065*"llc" + 0.043*"secure" + 0.041*"managed" + 0.031*"address" + 0.028*"risk" + 0.028*"serves" + 0.027*"service_providers" + 0.026*"telecommunications" + 0.023*"outsourcing"'),
 (60,
  u'0.066*"clients" + 0.050*"team" + 0.043*"experience" + 0.023*"industry" + 0.017*"expertise" + 0.016*"client" + 0.016*"us" + 0.015*"design" + 0.014*"work" + 0.014*"help"'),
 (65,
  u'0.075*"planning" + 0.061*"employees" + 0.057*"plan" + 0.048*"plans" + 0.036*"print" + 0.034*"campaigns" + 0.030*"project_management" + 0.030*"methods" + 0.024*"does" + 0.024*"execution"'),
 (33,
  u'0.157*"web" + 0.072*"social_media" + 0.059*"marketing" + 0.053*"advertising" + 0.040*"design" + 0.036*"search_engine" + 0.028*"seo" + 0.026*"optimization" + 0.023*"clients" + 0.018*"digital_marketing"'),
 (71,
  u

In [None]:
# Before vs. after the year 2000
