In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import pos_tag

import spacy
import scattertext as st

from FedTools import FederalReserveMins

In [2]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import pyLDAvis
import pyLDAvis.gensim_models
from sklearn.datasets import fetch_20newsgroups
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:



def lemmatize(docs, allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]):
  '''
  Performs lemmization of input documents.

  Args:
    - docs: list of strings with input documents
    - allowed_postags: list of accepted Part of Speech (POS) types
  Output:
    - list of strings with lemmatized input
  '''
  nlp = spacy.load("en_core_web_sm", disable = ["parser", "ner"])
  lemmatized_docs = []
  for doc in docs:
    doc = nlp(doc)
    tokens = []
    for token in doc:
      if token.pos_ in allowed_postags:
        tokens.append(token.lemma_)
    lemmatized_docs.append(" ".join(tokens))
  return (lemmatized_docs)


def tokenize(docs):
  '''
  Performs tokenization of input documents.

  Args:
    - docs: list of strings with input documents
  Output:
    - list of strings with tokenized input
  '''
  tokenized_docs = []
  for doc in docs:
    tokens = gensim.utils.simple_preprocess(doc, deacc=True)
    tokenized_docs.append(tokens)
  return (tokenized_docs)




# Load Data

In [4]:
fed_mins = FederalReserveMins(
            main_url = 'https://www.federalreserve.gov', 
            calendar_url ='https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm',
            start_year = 2015,        
            historical_split = 2017,
            verbose = True,
            thread_num = 10)

df = fed_mins.find_minutes()

Constructing links between 2015 and 2023
Extracting Federal Reserve Minutes.
Retrieving articles.
...................................................................

In [5]:
df.rename(columns={'Federal_Reserve_Mins': 'Text'}, inplace=True)
#df['Date'] = df.index
#df['year'] = df['Date'].dt.year

In [6]:
df

Unnamed: 0,Text
2015-01-28,"The Federal Reserve, the central bank of the U..."
2015-03-18,"The Federal Reserve, the central bank of the U..."
2015-04-29,"The Federal Reserve, the central bank of the U..."
2015-06-17,"The Federal Reserve, the central bank of the U..."
2015-07-29,"The Federal Reserve, the central bank of the U..."
...,...
2022-11-02,"The Federal Reserve, the central bank of the U..."
2022-12-14,"The Federal Reserve, the central bank of the U..."
2023-02-01,"The Federal Reserve, the central bank of the U..."
2023-03-22,"The Federal Reserve, the central bank of the U..."


# Process Docs

In [7]:

# Pre-process input: lemmatization and tokenization
lemmatized_docs = lemmatize(df)
tokenized_docs = tokenize(lemmatized_docs)

# Mapping from word IDs to words
id2word = corpora.Dictionary(tokenized_docs)

# Prepare Document-Term Matrix
corpus = []
for doc in tokenized_docs:
    corpus.append(id2word.doc2bow(doc))



# Model

In [8]:
# Fit LDA model: See [1] for more details
topic_model = gensim.models.ldamodel.LdaModel(
    corpus = corpus,      # Document-Term Matrix
    id2word = id2word,    # Map word IDs to words
    num_topics = 5,      # Number of latent topics to extract
    random_state = 100,
    passes = 100,         # N° of passes through the corpus during training
    )



In [9]:
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                   num_topics = 3,
                                    id2word = dic,
                                      passes = 10,
                                      workers = 20)
lda_model.save('./data/model4.gensim')

NameError: name 'bow_corpus' is not defined

# Visualization

In [None]:
# Visualize with pyLDAvis: See [2] for more details
pyLDAvis.enable_notebook()
visualization = pyLDAvis.gensim_models.prepare(
    topic_model, 
    corpus,
    id2word, 
    mds = "mmds", 
    R = 30)

visualization

In [None]:
df

In [None]:
df = preprocess_speech(df)