In [43]:
import pandas as pd
import pyarrow
import os
import numpy as np
import gensim
import nltk
import pickle
import ipdb
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from wordcloud import WordCloud
from tqdm import tqdm
from pprint import pprint
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer




In [126]:
# Let pandas print up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500
# When True, wordlist_for_ticker() also writes a word-cloud PNG per ticker.
GEN_WORDCLOUD = False
# Directory holding one "<ticker>.parquet" file per company.
DATA_DIR = "bz_desc_data/"
# Output directory for the per-ticker word clouds (see save_wordcloud).
WORDCLOUD_DIR = "wordcloud/"
# Output directory for the per-topic word clouds (see visualize_topics).
WORDCLOUD_TOPICS_DIR = "wordcloud_topics/"

# NLTK resources: 'stopwords' feeds the stop-word list below; 'wordnet' backs
# WordNetLemmatizer. 'omw-1.4' is presumably required by the installed NLTK's
# WordNet reader -- TODO confirm it is still needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to /home/fanpu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/fanpu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/fanpu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [45]:
def get_available_tickers(data_dir=None):
    """
    List the ticker symbols that have a parquet file in the data directory.

    Args:
        data_dir: directory to scan. Defaults to the notebook-level DATA_DIR.

    Returns:
        File names with the ".parquet" extension removed, in the order
        reported by os.listdir. Entries without that extension are skipped.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    suffix = ".parquet"
    # str.rstrip(".parquet") strips any trailing run of the characters
    # {., p, a, r, q, u, e, t} rather than the literal suffix, so a name such
    # as "tape.parquet" would collapse to "". Slice the suffix off instead.
    return [
        name[:-len(suffix)]
        for name in os.listdir(data_dir)
        if name.endswith(suffix) and len(name) > len(suffix)
    ]

In [46]:
# Ticker symbols derived from the file names found in DATA_DIR.
tickers = get_available_tickers()

In [160]:
# NLTK's English stop words plus filler terms specific to this corpus.
# The extra terms are matched against lemmatized tokens in
# wordlist_for_ticker(), so they are written in lemmatized form.
stop_words = stopwords.words('english') + [
    'product', 'company', 'include', 'service', 'market', 'business', 'u', 'customer', 'financial', 'also', 'state', 'may', 'use', 'provide', 'new', 'year', 'result', 'cost', 'certain'
]

# A broader alternative list, kept for reference (currently unused):
#     ['business', 'also', 'financial', 'market', 'u', 'may', 'company', 'include',
#      'service', 'provide', 'customers', 'cost', 'operate', 'management', 'risk', 'report', 'information',
#      'result', 'approximately', 'require', 'use', 'us', 'relate', 'certain', 'operation', 'system', 'regulation', 'state', 'new',
#      'program', 'segment', 'base', 'sale', 'regulatory', 'million', 'year', 'time', 'subject', 'continue', 'change', 'well',
#      'december', 'requirement', 'addition', 'term', 'could', 'increase', 'plan', 'revenue', 'believe', 'affect', 'product', 'customer'
#     ]

# Shared lemmatizer instance used by wordlist_for_ticker().
lemmatizer = WordNetLemmatizer()

def save_wordcloud(ticker, tokens):
    """
    Render a word cloud for one ticker and save it as a PNG.

    Args:
        ticker: ticker symbol; used as the output file name.
        tokens: iterable of string tokens to visualize.

    Side effects:
        Writes "<WORDCLOUD_DIR>/<ticker>.png", creating WORDCLOUD_DIR first
        if it does not exist.
    """
    # Create the output directory up front so a fresh checkout does not fail
    # when the image is written.
    os.makedirs(WORDCLOUD_DIR, exist_ok=True)
    # Create a WordCloud object
    wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
    # Generate a word cloud from the tokens joined into one text blob
    wordcloud.generate(",".join(tokens))
    # Write the rendered image to disk
    wordcloud.to_file(f"{WORDCLOUD_DIR}/{ticker}.png")
    
def wordlist_for_ticker(ticker, min_tokens=1000):
    """
    Load and clean the token list for one ticker.

    The ticker's parquet file is read from DATA_DIR, the entries of its
    "word_list" column are concatenated into one flat token sequence, and the
    tokens are then cleaned: purely numeric tokens are dropped, the remainder
    is lemmatized, and anything found in stop_words is removed.

    Args:
        ticker: ticker symbol naming the "<ticker>.parquet" file to read.
        min_tokens: minimum number of raw tokens required; files with fewer
            are treated as malformed. Defaults to 1000.

    Returns:
        tokens, is_malformed
        tokens is an empty list when is_malformed is True.

    Side effects:
        Saves a word cloud via save_wordcloud() when GEN_WORDCLOUD is set.
    """
    data = pd.read_parquet(os.path.join(DATA_DIR, f'{ticker}.parquet'), engine='pyarrow')
    if len(data["word_list"]) == 0:
        return [], True
    # Presumably each row of "word_list" holds an array of tokens; flatten
    # them into a single sequence.
    tokens = np.concatenate(data["word_list"])
    if len(tokens) < min_tokens:
        return [], True

    # Set membership is O(1); testing against the stop_words list scanned it
    # once for every token.
    stop_set = set(stop_words)

    # Drop numeric tokens, then lemmatize what is left.
    tokens = [lemmatizer.lemmatize(token) for token in tokens if not token.isnumeric()]
    # NOTE(review): stop words are matched *after* lemmatization, so entries
    # in stop_words only match if they equal the lemmatized form -- confirm
    # that standard stop words were already removed upstream.
    tokens = [token for token in tokens if token not in stop_set]

    if GEN_WORDCLOUD:
        save_wordcloud(ticker, tokens)
    return tokens, False

In [161]:
# Tickers whose data passed the malformed check, in processing order.
clean_tickers = []
# Cleaned token list for each ticker in clean_tickers.
ticker_wordlist = {}
for ticker in tqdm(tickers):
    tokens, is_malformed = wordlist_for_ticker(ticker)
    if is_malformed:
        # print(ticker, "is malformed, discarding")
        continue
    clean_tickers.append(ticker)
    ticker_wordlist[ticker] = tokens

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 726/726 [01:28<00:00,  8.21it/s]


In [162]:
# Number of tickers found vs. number that survived the malformed-data filter.
len(tickers), len(clean_tickers)

(726, 536)

In [163]:
# Vocabulary: maps every token seen in the cleaned word lists to an integer id.
id2word = corpora.Dictionary(ticker_wordlist.values())

# Bag-of-words vector per ticker, ordered like clean_tickers so that
# corpus[i] belongs to clean_tickers[i].
corpus = list(map(id2word.doc2bow, (ticker_wordlist[ticker] for ticker in clean_tickers)))

In [164]:
# Number of latent topics to fit.
num_topics = 50
# Fixed seed so the topic assignment can be reproduced between runs. Per the
# gensim documentation, LdaMulticore results may still vary slightly because
# of how the OS schedules the worker processes.
LDA_RANDOM_STATE = 42
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=LDA_RANDOM_STATE)
# Print the keywords of every topic
pprint(lda_model.print_topics(num_topics=num_topics))
# Topic distribution for each document in the corpus.
doc_lda = lda_model[corpus]

[(0,
  '0.005*"sale" + 0.005*"report" + 0.004*"system" + 0.004*"information" + '
  '0.004*"base" + 0.004*"million" + 0.004*"regulation" + 0.004*"operate" + '
  '0.004*"technology" + 0.004*"solution"'),
 (1,
  '0.005*"require" + 0.005*"health" + 0.004*"information" + 0.004*"operation" '
  '+ 0.004*"system" + 0.004*"sale" + 0.004*"federal" + 0.004*"operate" + '
  '0.003*"regulation" + 0.003*"law"'),
 (2,
  '0.005*"operate" + 0.005*"operation" + 0.004*"report" + 0.004*"base" + '
  '0.004*"approximately" + 0.004*"facility" + 0.003*"regulation" + '
  '0.003*"relate" + 0.003*"risk" + 0.003*"require"'),
 (3,
  '0.005*"operate" + 0.005*"management" + 0.005*"offer" + 0.004*"system" + '
  '0.004*"operation" + 0.004*"sale" + 0.004*"technology" + 0.004*"data" + '
  '0.003*"security" + 0.003*"information"'),
 (4,
  '0.004*"risk" + 0.004*"bank" + 0.004*"regulation" + 0.004*"operation" + '
  '0.004*"subject" + 0.004*"requirement" + 0.004*"base" + 0.004*"act" + '
  '0.003*"capital" + 0.003*"require"')

In [165]:
import pyLDAvis
import pyLDAvis.gensim_models as gensim_models

# Visualize the topics
pyLDAvis.enable_notebook()

# Directory that receives the cached visualization data and the HTML export.
RESULTS_DIR = './results'
# Preparing the visualization is somewhat time consuming. Set this to False to
# reuse the data cached on disk by a previous run instead of recomputing it.
REBUILD_LDAVIS = True

# Make sure the output directory exists before anything is written to it.
os.makedirs(RESULTS_DIR, exist_ok=True)
LDAvis_data_filepath = os.path.join(RESULTS_DIR, 'ldavis_prepared_' + str(num_topics))

if REBUILD_LDAVIS:
    LDAvis_prepared = gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
LDAvis_prepared


  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [166]:
# Sanity check: the corpus should hold exactly one bag-of-words per clean ticker.
len(corpus), len(clean_tickers)

(536, 536)

In [167]:
# Display the weighted top terms of all num_topics topics as formatted strings.
lda_model.show_topics(num_topics)

[(0,
  '0.005*"sale" + 0.005*"report" + 0.004*"system" + 0.004*"information" + 0.004*"base" + 0.004*"million" + 0.004*"regulation" + 0.004*"operate" + 0.004*"technology" + 0.004*"solution"'),
 (1,
  '0.005*"require" + 0.005*"health" + 0.004*"information" + 0.004*"operation" + 0.004*"system" + 0.004*"sale" + 0.004*"federal" + 0.004*"operate" + 0.003*"regulation" + 0.003*"law"'),
 (2,
  '0.005*"operate" + 0.005*"operation" + 0.004*"report" + 0.004*"base" + 0.004*"approximately" + 0.004*"facility" + 0.003*"regulation" + 0.003*"relate" + 0.003*"risk" + 0.003*"require"'),
 (3,
  '0.005*"operate" + 0.005*"management" + 0.005*"offer" + 0.004*"system" + 0.004*"operation" + 0.004*"sale" + 0.004*"technology" + 0.004*"data" + 0.003*"security" + 0.003*"information"'),
 (4,
  '0.004*"risk" + 0.004*"bank" + 0.004*"regulation" + 0.004*"operation" + 0.004*"subject" + 0.004*"requirement" + 0.004*"base" + 0.004*"act" + 0.003*"capital" + 0.003*"require"'),
 (5,
  '0.005*"regulation" + 0.005*"information"

In [168]:
# Formatted term string of topic 0. Its value is discarded here because a
# notebook cell only echoes its last expression.
lda_model.show_topics(num_topics)[0][1]
# (word id, weight) pairs for topic 0; ids can be resolved through id2word.
lda_model.get_topic_terms(topicid=0)

[(751, 0.005160336),
 (726, 0.004656824),
 (838, 0.0042452235),
 (428, 0.0042172153),
 (86, 0.0039578597),
 (531, 0.0038578364),
 (710, 0.0037940484),
 (574, 0.00355582),
 (843, 0.0035242664),
 (793, 0.0035089357)]

def visualize_topics(topic_id, topn=30):
    """
    Render the highest-weighted terms of one LDA topic as a word-cloud PNG.

    Args:
        topic_id: id of the topic in lda_model to visualize.
        topn: number of top terms to include. Defaults to 30.

    Side effects:
        Writes "<WORDCLOUD_TOPICS_DIR>/<topic_id>.png", creating
        WORDCLOUD_TOPICS_DIR first if it does not exist.
    """
    # Create the output directory up front so a fresh checkout does not fail
    # when the image is written.
    os.makedirs(WORDCLOUD_TOPICS_DIR, exist_ok=True)

    topic_terms = lda_model.get_topic_terms(topicid=topic_id, topn=topn)
    # Translate word ids back to their tokens; the term weight sets the size.
    freq_dict = {id2word[word_id]: freq for word_id, freq in topic_terms}

    wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
    wordcloud.generate_from_frequencies(freq_dict)
    # Write the rendered image to disk
    wordcloud.to_file(f"{WORDCLOUD_TOPICS_DIR}/{topic_id}.png")

# Produce one word cloud per fitted topic.
for topic_id in range(num_topics):
    visualize_topics(topic_id)

In [169]:
def topics_for_ticker(ticker, ticker_index, doc_topics=None):
    """
    Return the topic distribution of one document, strongest topic first.

    Args:
        ticker: ticker symbol of the document. Kept for interface
            compatibility; it is not used in the lookup.
        ticker_index: position of the document within doc_topics.
        doc_topics: indexable collection of per-document lists of
            (topic_id, weight) pairs. Defaults to the notebook-level doc_lda.

    Returns:
        The document's (topic_id, weight) pairs sorted by descending weight.
    """
    if doc_topics is None:
        doc_topics = doc_lda
    # The caller already reports which ticker is being processed, so the
    # former debug print of the ticker here only duplicated that output.
    return sorted(doc_topics[ticker_index], key=lambda pair: pair[1], reverse=True)
    
    
# Sorted topic distribution for every clean ticker.
tickers_and_topics = {}
# The position in clean_tickers doubles as the document index, because the
# corpus was built in clean_tickers order.
for index, ticker in enumerate(clean_tickers):
    print("Ticker", ticker)
    topics = topics_for_ticker(ticker, index)
    # Only the five strongest topics are printed; the full list is stored.
    print("Topics", topics[:5])
    tickers_and_topics[ticker] = topics

# Persist the mapping to 'tickers_and_topics.pkl' in the working directory.
with open('tickers_and_topics.pkl', 'wb') as f:
    pickle.dump(tickers_and_topics, f)


    

Ticker CAH
CAH
Topics [(1, 0.33378452), (27, 0.12716901), (37, 0.07881262), (11, 0.058325604), (12, 0.03676731)]
Ticker FLS
FLS
Topics [(13, 0.39536703), (45, 0.095369875), (36, 0.06890167), (30, 0.05370329), (9, 0.043349426)]
Ticker HBAN
HBAN
Topics [(31, 0.5501809), (4, 0.4142187), (46, 0.014183641)]
Ticker TSLA
TSLA
Topics [(45, 0.34868646), (44, 0.104018174), (19, 0.10190891), (24, 0.100066304), (36, 0.08941529)]
Ticker EMN
EMN
Topics [(8, 0.23391593), (13, 0.23071912), (42, 0.19171426), (11, 0.09675678), (17, 0.05424794)]
Ticker HLT
HLT
Topics [(9, 0.56429857), (21, 0.08657837), (38, 0.06740098), (12, 0.063067876), (15, 0.04909628)]
Ticker ETN
ETN
Topics [(17, 0.8651563), (26, 0.03685063), (37, 0.03675851), (13, 0.016530469)]
Ticker VRSK
VRSK
Topics [(3, 0.21240066), (7, 0.16008802), (1, 0.076986626), (0, 0.049978845), (11, 0.046793398)]
Ticker NLSN
NLSN
Topics [(35, 0.22681284), (3, 0.08651455), (9, 0.068575464), (19, 0.0596712), (32, 0.057262056)]
Ticker AWK
AWK
Topics [(2, 0.11

Topics [(10, 0.12732962), (37, 0.109119095), (30, 0.06613854), (11, 0.06198711), (20, 0.046205696)]
Ticker J
J
Topics [(1, 0.09797286), (39, 0.07073906), (10, 0.061821755), (18, 0.058909193), (11, 0.054877345)]
Ticker OTIS
OTIS
Topics [(10, 0.17565167), (7, 0.09815061), (20, 0.07477274), (30, 0.0524277), (31, 0.04568953)]
Ticker CPB
CPB
Topics [(6, 0.74951416), (17, 0.06494423), (47, 0.028567996), (36, 0.026700143), (26, 0.024598673)]
Ticker MA
MA
Topics [(15, 0.21267861), (14, 0.12442113), (12, 0.07038718), (43, 0.064368576), (35, 0.063581385)]
Ticker LRCX
LRCX
Topics [(10, 0.3188747), (11, 0.22088046), (19, 0.1197106), (35, 0.0838145), (33, 0.05468206)]
Ticker ENPH
ENPH
Topics [(37, 0.2705908), (48, 0.18511853), (24, 0.1725513), (0, 0.13360058), (35, 0.053291354)]
Ticker DGX
DGX
Topics [(1, 0.39324903), (35, 0.38524768), (5, 0.026070086), (22, 0.024279743), (28, 0.021290146)]
Ticker PWR
PWR
Topics [(44, 0.12309118), (18, 0.07822147), (24, 0.077911176), (10, 0.07124359), (2, 0.0630827

Topics [(1, 0.8772271), (31, 0.03525925), (29, 0.014096313), (19, 0.010851927), (46, 0.010145498)]
Ticker SE
SE
Topics [(18, 0.88094723), (44, 0.10944321)]
Ticker FANG
FANG
Topics [(17, 0.35981047), (18, 0.25000075), (25, 0.23787361), (44, 0.09714728), (34, 0.031713028)]
Ticker PEP
PEP
Topics [(11, 0.7699981), (15, 0.13972072), (5, 0.071768366)]
Ticker BF
BF
Topics [(36, 0.15532523), (11, 0.13619505), (17, 0.08556725), (45, 0.064432174), (27, 0.05854447)]
Ticker CB
CB
Topics [(7, 0.44493178), (43, 0.15760559), (46, 0.12634002), (10, 0.03855742), (31, 0.03695773)]
Ticker IT
IT
Topics [(14, 0.28935575), (32, 0.098621145), (3, 0.09073622), (0, 0.090282105), (9, 0.08350518)]
Ticker SJM
SJM
Topics [(11, 0.482373), (15, 0.12957391), (27, 0.08716104), (2, 0.04935801), (36, 0.04576362)]
Ticker NOW
NOW
Topics [(9, 0.27197155), (3, 0.21353556), (35, 0.1632739), (48, 0.10185015), (0, 0.086470306)]
Ticker HOG
HOG
Topics [(45, 0.6819567), (37, 0.13218725), (10, 0.027857102), (31, 0.02122715), (23, 

Topics [(26, 0.27434188), (21, 0.22049858), (27, 0.19625556), (36, 0.15958421), (37, 0.030052189)]
Ticker ALTR
ALTR
Topics [(35, 0.5271668), (0, 0.15082029), (9, 0.115094975), (48, 0.09820874), (11, 0.026665188)]
Ticker TT
TT
Topics [(26, 0.1078617), (42, 0.081851274), (36, 0.060853343), (17, 0.058829747), (10, 0.057709124)]
Ticker AFL
AFL
Topics [(31, 0.3719166), (43, 0.29827243), (4, 0.07293611), (46, 0.06778872), (7, 0.044050008)]
Ticker AKAM
AKAM
Topics [(9, 0.26124144), (3, 0.23758039), (35, 0.11438791), (48, 0.11365909), (14, 0.10956018)]
Ticker ANET
ANET
Topics [(0, 0.6421303), (3, 0.11182425), (9, 0.10860753), (35, 0.08327835), (48, 0.023699619)]
Ticker NSM
NSM
Topics [(30, 0.14108583), (9, 0.11798396), (35, 0.09851532), (26, 0.08278554), (48, 0.08082828)]
Ticker WYNN
WYNN
Topics [(47, 0.5370229), (21, 0.23791927), (23, 0.058616944), (28, 0.04280849), (9, 0.04022948)]
Ticker MCHP
MCHP
Topics [(30, 0.45674458), (35, 0.11173096), (48, 0.08561297), (42, 0.08521582), (11, 0.0455030

Topics [(20, 0.5831671), (8, 0.2367044), (17, 0.04364496), (19, 0.030819206), (34, 0.018559042)]
Ticker BBY
BBY
Topics [(27, 0.5389853), (26, 0.11679684), (13, 0.058666088), (37, 0.0464917), (39, 0.034186274)]
Ticker COO
COO
Topics [(1, 0.31172323), (42, 0.18669964), (11, 0.16262472), (28, 0.101697), (9, 0.07056425)]
Ticker AMAT
AMAT
Topics [(10, 0.43141082), (11, 0.2306791), (35, 0.06928979), (48, 0.068299636), (42, 0.052066006)]
Ticker EQT
EQT
Topics [(18, 0.4344834), (44, 0.26824722), (25, 0.23709922), (34, 0.021013502), (17, 0.017455764)]
Ticker RIG
RIG
Topics [(17, 0.69420224), (25, 0.117295325), (34, 0.0939835), (26, 0.04216933), (44, 0.010174057)]
Ticker RDC
RDC
Topics [(17, 0.51662296), (25, 0.32934192), (2, 0.09827096), (34, 0.013413398), (40, 0.012150188)]
Ticker WAB
WAB
Topics [(36, 0.6507768), (32, 0.08255182), (9, 0.08129395), (26, 0.062848754), (29, 0.030497478)]
Ticker MKTX
MKTX
Topics [(35, 0.8491018), (3, 0.11245139)]
Ticker MMI
MMI
Topics [(1, 0.20195091), (0, 0.13697

Topics [(24, 0.42771342), (39, 0.2306074), (10, 0.20038296), (7, 0.033578552), (45, 0.031023437)]
Ticker SLM
SLM
Topics [(31, 0.59181), (4, 0.24546379), (46, 0.06718821), (10, 0.031485707), (27, 0.012109888)]
Ticker HRL
HRL
Topics [(6, 0.520002), (2, 0.29609233), (36, 0.096531324), (26, 0.029072428), (15, 0.019582564)]
Ticker NFLX
NFLX
Topics [(32, 0.2956352), (35, 0.1394172), (21, 0.11941686), (47, 0.102520026), (6, 0.0526831)]
Ticker ZBRA
ZBRA
Topics [(35, 0.38291103), (0, 0.14751127), (21, 0.07001457), (9, 0.0591328), (10, 0.05399974)]
Ticker CEPH
CEPH
Topics [(22, 0.5151306), (12, 0.4151143), (18, 0.031928364)]
Ticker BLL
BLL
Topics [(17, 0.2236429), (4, 0.1048941), (15, 0.10118023), (26, 0.0629949), (36, 0.055555735)]
Ticker WAG
WAG
Topics [(1, 0.73628765), (19, 0.06721201), (36, 0.02572658), (11, 0.023999454), (28, 0.02109265)]
Ticker DXC
DXC
Topics [(14, 0.37865287), (3, 0.32028988), (0, 0.10061), (32, 0.045859676), (13, 0.03403147)]
Ticker MGM
MGM
Topics [(21, 0.3826336), (6, 0

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
