In [1]:
import nltk
from nltk.stem import *

import pandas as pd
import numpy as np

from sklearn.datasets import fetch_20newsgroups

import re

import gensim
import pickle 

from gensim.models import CoherenceModel

import matplotlib.pyplot as plt

import pyLDAvis
import pyLDAvis.gensim_models

from FedTools import FederalReserveMins

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
nltk.download('punkt') 
nltk.download('wordnet') 
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
stopwords = set(nltk.corpus.stopwords.words('english'))

# Functions

In [4]:
def remove_url(text):
    return re.sub(r'https?:\S*','',text)


def remove_mentions_and_tags(text):
     text = re.sub(r'@\S*','',text)
     return re.sub(r'#\S*','',text)


def text_preprocessing(df):
    corpus=[]
    
    lem = WordNetLemmatizer() # For Lemmatization
    for news in df['text']:
        words=[w for w in nltk.tokenize.word_tokenize(news) if (w not in stopwords)] # word_tokenize function tokenizes text on each word by default
        words=[lem.lemmatize(w) for w in words if len(w)>2]
        corpus.append(words)
    return corpus





# Download

In [5]:
fed_mins = FederalReserveMins(
            main_url = 'https://www.federalreserve.gov', 
            calendar_url ='https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm',
            start_year = 2015,        
            historical_split = 2017,
            verbose = True,
            thread_num = 10)

df = fed_mins.find_minutes()

Constructing links between 2015 and 2023
Extracting Federal Reserve Minutes.
Retrieving articles.
...................................................................

In [6]:
df.rename(columns={'Federal_Reserve_Mins': 'text'}, inplace=True)

In [7]:
df

Unnamed: 0,text
2015-01-28,"The Federal Reserve, the central bank of the U..."
2015-03-18,"The Federal Reserve, the central bank of the U..."
2015-04-29,"The Federal Reserve, the central bank of the U..."
2015-06-17,"The Federal Reserve, the central bank of the U..."
2015-07-29,"The Federal Reserve, the central bank of the U..."
...,...
2022-11-02,"The Federal Reserve, the central bank of the U..."
2022-12-14,"The Federal Reserve, the central bank of the U..."
2023-02-01,"The Federal Reserve, the central bank of the U..."
2023-03-22,"The Federal Reserve, the central bank of the U..."


In [8]:
#fetch20newsgroups = fetch_20newsgroups(subset='train')

In [9]:
#df = pd.DataFrame(fetch20newsgroups.data, columns=['text'])

# Clean Data

In [10]:
df.text = df.text.apply(remove_url)

In [11]:
df.text = df.text.apply(remove_mentions_and_tags)

In [12]:
df

Unnamed: 0,text
2015-01-28,"The Federal Reserve, the central bank of the U..."
2015-03-18,"The Federal Reserve, the central bank of the U..."
2015-04-29,"The Federal Reserve, the central bank of the U..."
2015-06-17,"The Federal Reserve, the central bank of the U..."
2015-07-29,"The Federal Reserve, the central bank of the U..."
...,...
2022-11-02,"The Federal Reserve, the central bank of the U..."
2022-12-14,"The Federal Reserve, the central bank of the U..."
2023-02-01,"The Federal Reserve, the central bank of the U..."
2023-03-22,"The Federal Reserve, the central bank of the U..."


# Pre-Processing

In [37]:
# Apply this function on our data frame
corpus = text_preprocessing(df)

In [38]:
#corpus = text_preprocessing(df.iloc[[23]])

In [39]:
# Transform to gensim dictionary
dic = gensim.corpora.Dictionary(corpus) 
bow_corpus = [dic.doc2bow(doc) for doc in corpus]

# Useful for storing big datasets
pickle.dump(bow_corpus, open('./data/corpus.pkl', 'wb'))
dic.save('./data/dictionary.gensim')

# Model

In [55]:
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                   num_topics = 7,
                                    id2word = dic,
                                      passes = 10,
                                      workers = 20)
lda_model.save('./data/model4.gensim')

In [56]:
# We print words occuring in each of the topics as we iterate through them
for idx, topic in lda_model.print_topics(num_words=10):    
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.016*"inflation" + 0.013*"rate" + 0.012*"Board" + 0.010*"participant" + 0.009*"policy" + 0.009*"market" + 0.009*"Committee" + 0.008*"The" + 0.008*"Reserve" + 0.008*"Federal"
Topic: 1 
Words: 0.019*"Committee" + 0.012*"The" + 0.010*"inflation" + 0.010*"rate" + 0.010*"Bank" + 0.009*"market" + 0.008*"foreign" + 0.007*"Federal" + 0.007*"Reserve" + 0.007*"policy"
Topic: 2 
Words: 0.017*"inflation" + 0.013*"rate" + 0.008*"market" + 0.008*"Committee" + 0.007*"The" + 0.007*"participant" + 0.007*"percent" + 0.006*"year" + 0.006*"September" + 0.006*"would"
Topic: 3 
Words: 0.014*"rate" + 0.014*"inflation" + 0.012*"Committee" + 0.011*"market" + 0.009*"The" + 0.009*"economic" + 0.008*"price" + 0.008*"policy" + 0.007*"participant" + 0.007*"would"
Topic: 4 
Words: 0.015*"rate" + 0.012*"inflation" + 0.010*"participant" + 0.009*"Committee" + 0.009*"market" + 0.008*"The" + 0.008*"policy" + 0.007*"Board" + 0.007*"economic" + 0.007*"Federal"
Topic: 5 
Words: 0.012*"inflation" + 0.011*"p

# Evaluation

In [57]:
# instantiate topic coherence model
cm = CoherenceModel(model=lda_model, corpus=bow_corpus, texts=corpus, coherence='c_v')

In [58]:
# get topic coherence score
coherence_lda = cm.get_coherence()
print(coherence_lda)

nan


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


topics = []
score = []
for i in range(2,7,1):
     lda = gensim.models.LdaMulticore(corpus=bow_corpus, id2word=dic, iterations=5, num_topics=i, workers = 20, passes=10, random_state=42)
     cm = CoherenceModel(model=lda, corpus=bow_corpus, texts=corpus, coherence='c_v')
     topics.append(i) # Append number of topics modeled
     score.append(cm.get_coherence()) # Append coherence scores to list
plt.plot(topics, score)
plt.xlabel('# of Topics')
plt.ylabel('Coherence Score')
plt.show()

In [59]:
# Loading the dictionary and corpus files we saved earlier
dictionary = gensim.corpora.Dictionary.load('./data/dictionary.gensim')
corpus = pickle.load(open('./data/corpus.pkl', 'rb'))

In [60]:
# Loading the num_of_topics = 2 model we saved earlier
lda = gensim.models.ldamodel.LdaModel.load('./data/model4.gensim')
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, bow_corpus, dic, sort_topics=False)
pyLDAvis.display(vis)