In [1]:
import pandas as pd 
import numpy as np
from gensim import corpora, models
from collections import defaultdict
import os
from pyLDAvis import gensim, show
os.chdir('C:/Users/g1sml02/Dropbox (Research)/Projects/fed-statements')

In [32]:
# specs 
MIN_SPEECH_LENGTH = 50
N = 2
N_GRAMS = '{}-gram'.format(N)
VOCAB_SIZE = 15000
NUM_TOPICS = 5
PCT_TOPIC_THRESHOLD = 0.1

# path info  - call from root
ROOT_DIR = os.getcwd()
DATA_DIR = os.path.join(ROOT_DIR, 'data')
MASTER_DF = os.path.join(DATA_DIR, 'master_df.csv')
DF_WITH_LDA = os.path.join(DATA_DIR, '{}_topic_lda.csv'.format(NUM_TOPICS))
LDA_FILE_NAME_TEMPLATE = os.path.join(DATA_DIR, 'models', '{}_topic_lda')

In [3]:
def get_ngrams(df): 
    def ngram_str_to_lst(ngram_str):
        if isinstance(ngram_str, float): 
            ngram_str = ''
        return ngram_str.split('.')
    col = df[N_GRAMS].copy()
    return col.apply(ngram_str_to_lst)

def print_topics(): 
    global lda
    for idx, topic in lda.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic), end='\n\n')

In [4]:
print('Reading data...')
df = pd.read_csv(MASTER_DF).drop('Unnamed: 0', axis=1)
dates = df['Date'].copy()

# get ngrams, build vocab, and bag of ngrams
doc_ngrams = get_ngrams(df)
dates = df['Date'].copy()

Reading data...


In [5]:
# get ngrams, build vocab, and bag of ngrams
doc_ngrams = get_ngrams(df)
vocab_dict = corpora.Dictionary(doc_ngrams)
vocab_dict.filter_extremes(no_below=15, no_above=0.25, keep_n=VOCAB_SIZE)
corpus = [vocab_dict.doc2bow(doc) for doc in doc_ngrams]

In [33]:
print('Fitting model...')
lda_fname = LDA_FILE_NAME_TEMPLATE.format(NUM_TOPICS)
lda = models.LdaMulticore(corpus=corpus, id2word=vocab_dict, num_topics=NUM_TOPICS, passes=2, workers=1)
# lda.save(lda_fname)
print_topics()

Fitting model...
Topic: 0 
Words: 0.001*"toobigtofail problem" + 0.001*"central clear" + 0.001*"macroprudenti polici" + 0.001*"consolid supervis" + 0.001*"capit assess" + 0.001*"assess program" + 0.001*"neutral rate" + 0.001*"incent compens" + 0.001*"securit market" + 0.001*"labor cost"

Topic: 1 
Words: 0.006*"neutral rate" + 0.002*"macroprudenti polici" + 0.002*"central clear" + 0.001*"home countri" + 0.001*"financi disrupt" + 0.001*"price level" + 0.001*"new normal" + 0.001*"neutral interest" + 0.001*"sheet normal" + 0.001*"bank independ"

Topic: 2 
Words: 0.002*"basel framework" + 0.002*"oper risk" + 0.001*"financi educ" + 0.001*"neutral rate" + 0.001*"volcker rule" + 0.001*"longrun inflat" + 0.001*"feder reserveâ" + 0.001*"incom wealth" + 0.001*"senior manag" + 0.001*"econom opportun"

Topic: 3 
Words: 0.002*"treasuri market" + 0.002*"task forc" + 0.001*"margin requir" + 0.001*"real feder" + 0.001*"headlin inflat" + 0.001*"neutral rate" + 0.001*"credit deriv" + 0.001*"central clea

## Look at a specific set of speeches

In [34]:
date = df['Date'].iloc[0]
dates = df['Speech Dates'].iloc[0]
document = corpus[0]
print(date)
print(dates)                      # <- url to read: https://www.federalreserve.gov/newsevents/speeches.htm

2019-09-18
[Timestamp('2019-09-05 00:00:00'), Timestamp('2019-09-04 00:00:00'), Timestamp('2019-08-23 00:00:00'), Timestamp('2019-08-20 00:00:00'), Timestamp('2019-08-05 00:00:00')]


In [35]:
topics = lda.get_document_topics(document)
print(topics)

[(4, 0.99850863)]


#### Implicaitions 
This says that the speeches on the dates above are most representative of topics 2 and 5, i.e. the focus on words. 

In [36]:
lda.show_topic(topics[0][0])

[('mortgag servic', 0.0018232445),
 ('headlin inflat', 0.0013142938),
 ('mortgag securit', 0.0010453563),
 ('macroprudenti polici', 0.0010249977),
 ('asset bubbl', 0.0010223262),
 ('financi turmoil', 0.0010091946),
 ('develop countri', 0.00100722),
 ('neutral rate', 0.0009840349),
 ('countercycl capit', 0.0009570446),
 ('subprim loan', 0.0009505573)]

In [37]:
vis = gensim.prepare(lda, corpus, vocab_dict)
show(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))



Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8894/    [Ctrl-C to exit]


127.0.0.1 - - [27/Sep/2019 08:45:25] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [27/Sep/2019 08:45:25] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [27/Sep/2019 08:45:25] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [27/Sep/2019 08:45:25] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...
