In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import pos_tag

import spacy
import scattertext as st

from FedTools import FederalReserveMins

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
nltk.download("averaged_perceptron_tagger")
nltk.download("omw-1.4")
nltk.download("wordnet")

try:
    nltk.data.find("punkt")
except LookupError:
    nltk.download("punkt")

try:
    nltk.data.find("stopwords")
except LookupError:
    nltk.download("stopwords")

try:
    nltk.data.find("vader_lexicon")
except LookupError:
    nltk.download("vader_lexicon")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/dsimbandumwe/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Functions

def remove_line_number(speech):
    """
    removes the line number at the beginning of speech
    Input: str
    Output: str
    """

    pattern = "\n|^\d+.*?(\w)"
    speech = re.sub(pattern, "\n\g<1>", speech)
    pattern = "\t"
    speech = re.sub(pattern, "", speech)
    pattern = "\n\n"
    speech = re.sub(pattern, "\n", speech)
    pattern = "^\n *"
    speech = re.sub(pattern, "", speech)

    return speech



def remove_first_sentence(speech):
    """
    remove the first sentence
    """
    pattern = r"^.*?\."
    speech = re.sub(pattern, "", speech)

    return speech



In [3]:
def stem_token(token):
    """
    Stems the given token using the PorterStemmer from the nltk library
    Input: a single token
    Output: the stem of the token
    """
    ps = PorterStemmer()
    stemmed_word = ps.stem(token)
    return stemmed_word


def penn2morphy(penntag):
    """Converts Penn Treebank tags to WordNet."""
    morphy_tag = {"NN": "n", "JJ": "a", "VB": "v", "RB": "r"}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return "n"
    
    
    
    
def lemmatize_token(token):
    """
    Lemmatize the token using nltk library
    Input: a single token
    Output: the lemmatization of the token
    """
    wordnet = WordNetLemmatizer()
    token_tagged = pos_tag([token])
    tag = token_tagged[0][1]
    morphy_tag = penn2morphy(tag)
    lemmatized_word = wordnet.lemmatize(token, pos=morphy_tag)
    return lemmatized_word





def filter_common_words(words):
    common_words = [
        "first",
        "like",
        "welcome",
        "pleased",
        "let",
        "good",
        "afternoon",
        "press",
        "conference",
        "meeting",
        "would",
        "outcome",
        "going",
        "know",
        "said",
        "along",
        "together",
        "also",
        "formally",
        "meetings",
        "evening",
        "annual",
        "one",
        "two",
        "second",
        "third",
        "last",
        "next",
        "point",
        "per",
        "answer",
        "ask",
        "say",
        "said",
        "mention",
        "talk",
        "tell",
        "told",
        "suggest",
        "think",
        "wonder",
        "mean",
        "understand",
        "know",
        "maybe",
        "perhaps",
        "remain",
        "generally",
        "thus",
        "member",
        "seem",
        "see",
        "look",
        "consider",
        "regard",
        "include",
        "hear",
        "going",
        "go",
        "goes",
        "come",
        "came",
        "give",
        "use",
        "using",
        "get",
        "can",
        "could",
        "should",
        "may",
        "might",
        "way",
        "yes",
        "no",
        "lot",
        "bit",
        "also",
        "case",
        "fact",
        "like",
        "want",
        "believe",
        "feel",
        "actual",
        "well",
        "kin",
        "moment",
        "time",
        "now"
    ]
    return [word for word in words if word not in common_words]





def preprocess_speech(speech):
    """
    This function does the preprocessing
    """
    # put all characters in lower case
    speech["Text"] = speech["Text"].str.lower()
    speech["Tokens"] = speech["Text"].apply(lambda x: nltk.word_tokenize(str(x)))
    # remove stop words and non-alphabetic from all the text
    stop_word = nltk.corpus.stopwords.words("english")
    speech["Tokens"] = speech["Tokens"].apply(
        lambda x: [word for word in x if (word not in stop_word) and word.isalpha()]
    )
    # lemmatize
    speech["Tokens"] = speech["Tokens"].apply(
        lambda x: [lemmatize_token(token) for token in x]
    )
    # additional filter
    speech["Tokens"] = speech["Tokens"].apply(filter_common_words)
    speech["Joined_Tokens"] = speech["Tokens"].apply(lambda x: " ".join(x))
    speech = speech.sort_values(by="year").reset_index(drop=True)
    #speech = country_code_cleanup(speech)
    # create a scattertext object for visualization
    speech['parse'] = speech.Joined_Tokens.apply(st.whitespace_nlp_with_sentences)
    return speech



# Load Data

In [4]:
fed_mins = FederalReserveMins(
            main_url = 'https://www.federalreserve.gov', 
            calendar_url ='https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm',
            start_year = 2015,        
            historical_split = 2017,
            verbose = True,
            thread_num = 10)

df = fed_mins.find_minutes()

Constructing links between 2015 and 2023
Extracting Federal Reserve Minutes.
Retrieving articles.
...................................................................

In [5]:
df.rename(columns={'Federal_Reserve_Mins': 'Text'}, inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 67 entries, 2015-01-28 to 2023-05-03
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    67 non-null     object
dtypes: object(1)
memory usage: 3.1+ KB


In [7]:
df['Date'] = df.index


In [8]:
df['year'] = df['Date'].dt.year

In [9]:
df

Unnamed: 0,Text,Date,year
2015-01-28,"The Federal Reserve, the central bank of the U...",2015-01-28,2015
2015-03-18,"The Federal Reserve, the central bank of the U...",2015-03-18,2015
2015-04-29,"The Federal Reserve, the central bank of the U...",2015-04-29,2015
2015-06-17,"The Federal Reserve, the central bank of the U...",2015-06-17,2015
2015-07-29,"The Federal Reserve, the central bank of the U...",2015-07-29,2015
...,...,...,...
2022-11-02,"The Federal Reserve, the central bank of the U...",2022-11-02,2022
2022-12-14,"The Federal Reserve, the central bank of the U...",2022-12-14,2022
2023-02-01,"The Federal Reserve, the central bank of the U...",2023-02-01,2023
2023-03-22,"The Federal Reserve, the central bank of the U...",2023-03-22,2023


In [10]:
df = preprocess_speech(df)

IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


# Scatter Plot

In [11]:
import scattertext as st
from scattertext.termranking import AbsoluteFrequencyRanker



In [12]:
df

Unnamed: 0,Text,Date,year,Tokens,Joined_Tokens,parse
0,"the federal reserve, the central bank of the u...",2015-01-28,2015,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."
1,"the federal reserve, the central bank of the u...",2015-03-18,2015,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."
2,"the federal reserve, the central bank of the u...",2015-04-29,2015,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."
3,"the federal reserve, the central bank of the u...",2015-06-17,2015,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."
4,"the federal reserve, the central bank of the u...",2015-07-29,2015,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."
...,...,...,...,...,...,...
62,"the federal reserve, the central bank of the u...",2022-11-02,2022,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."
63,"the federal reserve, the central bank of the u...",2022-12-14,2022,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."
64,"the federal reserve, the central bank of the u...",2023-03-22,2023,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."
65,"the federal reserve, the central bank of the u...",2023-02-01,2023,"[federal, reserve, central, bank, united, stat...",federal reserve central bank united state prov...,"(federal, reserve, central, bank, united, stat..."


In [13]:
df['parse'] = df.Joined_Tokens.apply(st.whitespace_nlp_with_sentences)
df_2023 = df[df["year"] == 2023]

In [14]:
corpus = (
    st.CorpusWithoutCategoriesFromParsedDocuments(df_2023, parsed_col="parse")
    .build()
    .get_unigram_corpus()
)
corpus.remove_infrequent_words(
    minimum_term_count=6, term_ranker=AbsoluteFrequencyRanker
)
corpus.get_categories()

['_']

In [15]:
dispersion = st.Dispersion(corpus)
dispersion_df = dispersion.get_df()
dispersion_df.head(5)

Unnamed: 0,Frequency,Range,SD,VC,Juilland's D,Rosengren's S,DP,DP norm,KL-divergence
federal,185,3,8.730534,0.141576,0.933845,0.997734,0.041285,0.0595,0.006538
reserve,148,3,11.145502,0.225922,0.882404,0.993234,0.080474,0.11598,0.019789
central,13,3,1.247219,0.28782,0.840253,0.987665,0.102824,0.14819,0.035789
bank,212,3,10.338708,0.146302,0.900468,0.995096,0.065794,0.094822,0.014305
united,8,3,1.699673,0.637377,0.587252,0.911684,0.266285,0.383771,0.249716


In [16]:
dispersion_df = dispersion_df.assign(
    X=lambda df: df.Frequency,
    Xpos=lambda df: st.Scalers.log_scale(df.X),
    Y=lambda df: df["Rosengren's S"],
    Ypos=lambda df: st.Scalers.scale(df.Y),
)

In [17]:
html = st.dataframe_scattertext(
    corpus,
    plot_df=dispersion_df,
    ignore_categories=True,
    color_score_column="ColorScore",
    x_label="Log Frequency",
    y_label="Rosengren's S",
    y_axis_labels=["Less Dispersion", "Medium", "More Dispersion"],
)

open("unga_dispersion.html", "wb").write(html.encode("utf-8"))

  vec_ss = (vec_ss - vec_ss.min()) * 1. / (vec_ss.max() - vec_ss.min())


1420055