Links

https://notebook.community/pombredanne/gensim/docs/notebooks/ldaseqmodel

https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/dtm_example.ipynb

https://deepnote.com/workspace/first-deepnote-workspace-f527-ed9b-a85a7f85-3f6d-4f29-ba3f-8f29b3c555fe/project/gensim-1f425538-54de-41ab-b117-339019a0b104/%2Fdocs%2Fnotebooks%2Fdtm_example.ipynb


In [26]:
# setting up our imports
import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import pos_tag

from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary, bleicorpus
from gensim.matutils import hellinger

import spacy
import scattertext as st
from scattertext.termranking import AbsoluteFrequencyRanker

from FedTools import FederalReserveMins

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [33]:
nltk.download("averaged_perceptron_tagger")
nltk.download("omw-1.4")
nltk.download("wordnet")

try:
    nltk.data.find("punkt")
except LookupError:
    nltk.download("punkt")

try:
    nltk.data.find("stopwords")
except LookupError:
    nltk.download("stopwords")

try:
    nltk.data.find("vader_lexicon")
except LookupError:
    nltk.download("vader_lexicon")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Functions

In [14]:
def stem_token(token):
    """
    Stems the given token using the PorterStemmer from the nltk library
    Input: a single token
    Output: the stem of the token
    """
    ps = PorterStemmer()
    stemmed_word = ps.stem(token)
    return stemmed_word


def penn2morphy(penntag):
    """Converts Penn Treebank tags to WordNet."""
    morphy_tag = {"NN": "n", "JJ": "a", "VB": "v", "RB": "r"}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return "n"
    
    
    
    
def lemmatize_token(token):
    """
    Lemmatize the token using nltk library
    Input: a single token
    Output: the lemmatization of the token
    """
    wordnet = WordNetLemmatizer()
    token_tagged = pos_tag([token])
    tag = token_tagged[0][1]
    morphy_tag = penn2morphy(tag)
    lemmatized_word = wordnet.lemmatize(token, pos=morphy_tag)
    return lemmatized_word





def filter_common_words(words):
    common_words = [
        "first",
        "like",
        "welcome",
        "pleased",
        "let",
        "good",
        "afternoon",
        "press",
        "conference",
        "meeting",
        "would",
        "outcome",
        "going",
        "know",
        "said",
        "along",
        "together",
        "also",
        "formally",
        "meetings",
        "evening",
        "annual",
        "one",
        "two",
        "second",
        "third",
        "last",
        "next",
        "point",
        "per",
        "answer",
        "ask",
        "say",
        "said",
        "mention",
        "talk",
        "tell",
        "told",
        "suggest",
        "think",
        "wonder",
        "mean",
        "understand",
        "know",
        "maybe",
        "perhaps",
        "remain",
        "generally",
        "thus",
        "member",
        "seem",
        "see",
        "look",
        "consider",
        "regard",
        "include",
        "hear",
        "going",
        "go",
        "goes",
        "come",
        "came",
        "give",
        "use",
        "using",
        "get",
        "can",
        "could",
        "should",
        "may",
        "might",
        "way",
        "yes",
        "no",
        "lot",
        "bit",
        "also",
        "case",
        "fact",
        "like",
        "want",
        "believe",
        "feel",
        "actual",
        "well",
        "kin",
        "moment",
        "time",
        "now"
    ]
    return [word for word in words if word not in common_words]





def preprocess_speech(speech):
    """
    This function does the preprocessing
    """
    # put all characters in lower case
    speech["Text"] = speech["Text"].str.lower()
    speech["Tokens"] = speech["Text"].apply(lambda x: nltk.word_tokenize(str(x)))
    # remove stop words and non-alphabetic from all the text
    stop_word = nltk.corpus.stopwords.words("english")
    speech["Tokens"] = speech["Tokens"].apply(
        lambda x: [word for word in x if (word not in stop_word) and word.isalpha()]
    )
    # lemmatize
    speech["Tokens"] = speech["Tokens"].apply(
        lambda x: [lemmatize_token(token) for token in x]
    )
    # additional filter
    speech["Tokens"] = speech["Tokens"].apply(filter_common_words)
    speech["Joined_Tokens"] = speech["Tokens"].apply(lambda x: " ".join(x))
    speech = speech.sort_values(by="year").reset_index(drop=True)
    #speech = country_code_cleanup(speech)
    # create a scattertext object for visualization
    speech['parse'] = speech.Joined_Tokens.apply(st.whitespace_nlp_with_sentences)
    return speech

# Load Data

In [15]:
fed_mins = FederalReserveMins(
            main_url = 'https://www.federalreserve.gov', 
            calendar_url ='https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm',
            start_year = 2015,        
            historical_split = 2017,
            verbose = True,
            thread_num = 10)

df = fed_mins.find_minutes()

Constructing links between 2015 and 2023
Extracting Federal Reserve Minutes.
Retrieving articles.
...................................................................

In [16]:
df.rename(columns={'Federal_Reserve_Mins': 'Text'}, inplace=True)
df['Date'] = df.index
df['year'] = df['Date'].dt.year

In [17]:
df

Unnamed: 0,Text,Date,year
2015-01-28,"The Federal Reserve, the central bank of the U...",2015-01-28,2015
2015-03-18,"The Federal Reserve, the central bank of the U...",2015-03-18,2015
2015-04-29,"The Federal Reserve, the central bank of the U...",2015-04-29,2015
2015-06-17,"The Federal Reserve, the central bank of the U...",2015-06-17,2015
2015-07-29,"The Federal Reserve, the central bank of the U...",2015-07-29,2015
...,...,...,...
2022-11-02,"The Federal Reserve, the central bank of the U...",2022-11-02,2022
2022-12-14,"The Federal Reserve, the central bank of the U...",2022-12-14,2022
2023-02-01,"The Federal Reserve, the central bank of the U...",2023-02-01,2023
2023-03-22,"The Federal Reserve, the central bank of the U...",2023-03-22,2023


In [21]:
df_short = df[df['year'] > 2020].copy()

In [22]:
time_slice = df_short['year'].value_counts(sort=True, normalize=False).tolist()
time_slice

[8, 8, 3]

In [23]:
df_short = preprocess_speech(df_short)

# Create Corpus

In [30]:
df_short['parse'] = df_short.Joined_Tokens.apply(st.whitespace_nlp_with_sentences)
#df_2023 = df[df["year"] == 2023]

In [31]:
corpus = (
    st.CorpusWithoutCategoriesFromParsedDocuments(df_short, parsed_col="parse")
    .build()
    .get_unigram_corpus()
)
corpus.remove_infrequent_words(
    minimum_term_count=6, term_ranker=AbsoluteFrequencyRanker
)
corpus.get_categories()

['_']

In [38]:
df_short.head()

Unnamed: 0,Text,Date,year,Tokens,Joined_Tokens,parse
0,"the federal reserve, the central bank of the u...",2021-01-27,2021,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."
1,"the federal reserve, the central bank of the u...",2021-03-17,2021,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."
2,"the federal reserve, the central bank of the u...",2021-04-28,2021,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."
3,"the federal reserve, the central bank of the u...",2021-06-16,2021,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."
4,"the federal reserve, the central bank of the u...",2021-07-28,2021,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."


In [40]:
texts = df_short['Tokens']
texts

0     [federal, reserve, central, bank, united, stat...
1     [federal, reserve, central, bank, united, stat...
2     [federal, reserve, central, bank, united, stat...
3     [federal, reserve, central, bank, united, stat...
4     [federal, reserve, central, bank, united, stat...
5     [federal, reserve, central, bank, united, stat...
6     [federal, reserve, central, bank, united, stat...
7     [federal, reserve, central, bank, united, stat...
8     [federal, reserve, central, bank, united, stat...
9     [federal, reserve, central, bank, united, stat...
10    [federal, reserve, central, bank, united, stat...
11    [federal, reserve, central, bank, united, stat...
12    [federal, reserve, central, bank, united, stat...
13    [federal, reserve, central, bank, united, stat...
14    [federal, reserve, central, bank, united, stat...
15    [federal, reserve, central, bank, united, stat...
16    [federal, reserve, central, bank, united, stat...
17    [federal, reserve, central, bank, united, 

In [42]:
dictionary = Dictionary(texts)

In [44]:
corpus.

<scattertext.ParsedCorpus.ParsedCorpus at 0x296d20a90>

# Model

In [43]:
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=time_slice, num_topics=5, passes=20)

TypeError: 'ParsedCorpus' object is not iterable

# stuff

In [45]:
from gensim.test.utils import common_corpus
from gensim.models import LdaSeqModel

ldaseq = LdaSeqModel(corpus=common_corpus, time_slice=[2, 4, 3], num_topics=2, chunksize=1)

  convergence = np.fabs((bound - old_bound) / old_bound)


In [46]:
ldaseq

<gensim.models.ldaseqmodel.LdaSeqModel at 0x1779ba980>

In [48]:
import matplotlib.pyplot as plt
import gensim
import numpy as np
import spacy

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim

import os, re, operator, warnings
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now
%matplotlib inline

ModuleNotFoundError: No module named 'gensim.models.wrappers'

In [None]:
mallet_path = './src/mallet-2.0.8/bin/mallet'


In [49]:
import gensim.models.wrappers.LdaMallet

ModuleNotFoundError: No module named 'gensim.models.wrappers'

# LDA

In [5]:
# Essentials
import base64
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datapane as dp
#dp.login(token='INSERT_TOKEN_HERE')
# Gensim and LDA
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
# NLP stuff
import contractions
import demoji
import string
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
nltk.download('wordnet')
import spacy
# Plotting tools
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
%matplotlib inline
# Miscellaneous
from sklearn.manifold import TSNE
from pprint import pprint

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
from FedTools import FederalReserveMins

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Functions

In [7]:
def preprocess(text_col):
    """This function will apply NLP preprocessing lambda functions over a pandas series such as df['text'].
       These functions include converting text to lowercase, removing emojis, expanding contractions, removing punctuation,
       removing numbers, removing stopwords, lemmatization, etc."""
    
    # convert to lowercase
    text_col = text_col.apply(lambda x: ' '.join([w.lower() for w in x.split()]))
    
    # remove emojis
    text_col = text_col.apply(lambda x: demoji.replace(x, ""))
    
    # expand contractions  
    text_col = text_col.apply(lambda x: ' '.join([contractions.fix(word) for word in x.split()]))

    # remove punctuation
    text_col = text_col.apply(lambda x: ''.join([i for i in x if i not in string.punctuation]))
    
    # remove numbers
    text_col = text_col.apply(lambda x: ' '.join(re.sub("[^a-zA-Z]+", " ", x).split()))

    # remove stopwords
    stopwords = [sw for sw in nltk.corpus.stopwords.words('english') if sw not in ['not', 'no']]
    text_col = text_col.apply(lambda x: ' '.join([w for w in x.split() if w not in stopwords]))

    # lemmatization
    text_col = text_col.apply(lambda x: ' '.join([WordNetLemmatizer().lemmatize(w) for w in x.split()]))

    # remove short words
    text_col = text_col.apply(lambda x: ' '.join([w.strip() for w in x.split() if len(w.strip()) >= 3]))

    return text_col


# Load Data

In [23]:
fed_mins = FederalReserveMins(
            main_url = 'https://www.federalreserve.gov', 
            calendar_url ='https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm',
            start_year = 2015,        
            historical_split = 2017,
            verbose = True,
            thread_num = 10)

df = fed_mins.find_minutes()

Constructing links between 2015 and 2023
Extracting Federal Reserve Minutes.
Retrieving articles.
...................................................................

In [24]:
df.reset_index(inplace=True)
df.rename(columns={'Federal_Reserve_Mins': 'Text', 'index':'Date'}, inplace=True)
df['year'] = df['Date'].dt.year

In [25]:
df

Unnamed: 0,Date,Text,year
0,2015-01-28,"The Federal Reserve, the central bank of the U...",2015
1,2015-03-18,"The Federal Reserve, the central bank of the U...",2015
2,2015-04-29,"The Federal Reserve, the central bank of the U...",2015
3,2015-06-17,"The Federal Reserve, the central bank of the U...",2015
4,2015-07-29,"The Federal Reserve, the central bank of the U...",2015
...,...,...,...
62,2022-11-02,"The Federal Reserve, the central bank of the U...",2022
63,2022-12-14,"The Federal Reserve, the central bank of the U...",2022
64,2023-02-01,"The Federal Reserve, the central bank of the U...",2023
65,2023-03-22,"The Federal Reserve, the central bank of the U...",2023


In [31]:
#df['Text'].iloc[2]

In [32]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize(df['Text'].iloc[2])

['The',
 'Federal',
 'Reserve',
 'the',
 'central',
 'bank',
 'of',
 'the',
 'United',
 'States',
 'provides',
 'the',
 'nation',
 'with',
 'a',
 'safe',
 'flexible',
 'and',
 'stable',
 'monetary',
 'and',
 'financial',
 'system',
 'Federal',
 'Open',
 'Market',
 'Committee',
 'Monetary',
 'Policy',
 'Principles',
 'and',
 'Practice',
 'Policy',
 'Implementation',
 'Reports',
 'Review',
 'of',
 'Monetary',
 'Policy',
 'Strategy',
 'Tools',
 'and',
 'Communications',
 'Institution',
 'Supervision',
 'Reports',
 'Reporting',
 'Forms',
 'Supervision',
 'Regulation',
 'Letters',
 'Banking',
 'Applications',
 'Legal',
 'Developments',
 'Regulatory',
 'Resources',
 'Banking',
 'Data',
 'Structure',
 'Financial',
 'Stability',
 'Assessments',
 'Financial',
 'Stability',
 'Coordination',
 'Actions',
 'Reports',
 'Regulations',
 'Statutes',
 'Payment',
 'Policies',
 'Reserve',
 'Bank',
 'Payment',
 'Services',
 'Data',
 'Financial',
 'Market',
 'Utilities',
 'Infrastructures',
 'Research',
 'C

In [8]:
id2word = corpora.Dictionary(data_preprocessed)
id2word.filter_extremes(no_below=15, no_above=0.4, keep_n=80000)

NameError: name 'data_preprocessed' is not defined

In [13]:
#importing required libraries
import gensim
from gensim import corpora



#creating a sample corpus for demonstration purpose
txt_corpus = [

    "Find end-to-end projects at ProjectPro",

    "Stop wasting time on different online forums to get your project solutions",

    "Each of our projects solve a real business problem from start to finish",

    "All projects come with downloadable solution code and explanatory videos",

    "All our projects are designed modularly so you can rapidly learn and reuse modules"]



# Creating a set of frequent words
stoplist = set('for a of the and to in on of to are at'.split(' '))



# Lowercasing each document, using white space as delimiter and filtering out the stopwords
processed_text = [[word for word in document.lower().split() if word not in stoplist]for document in txt_corpus]



#creating a dictionary
dictionary = corpora.Dictionary(processed_text)



#displaying the dictionary
print(dictionary)

Dictionary<40 unique tokens: ['end-to-end', 'find', 'projectpro', 'projects', 'different']...>


In [14]:
stoplist

{'a', 'and', 'are', 'at', 'for', 'in', 'of', 'on', 'the', 'to'}

In [16]:
txt_corpus

['Find end-to-end projects at ProjectPro',
 'Stop wasting time on different online forums to get your project solutions',
 'Each of our projects solve a real business problem from start to finish',
 'All projects come with downloadable solution code and explanatory videos',
 'All our projects are designed modularly so you can rapidly learn and reuse modules']

In [17]:
#processed_text