In [1]:
import pandas as pd
import numpy as np
import spacy
import re
from utils.MasterDictionary import load_masterdictionary




In [2]:
def read_data(filename: str) -> pd.DataFrame:
    return pd.read_csv(f"data/dataset/{filename}.csv",
                       index_col=0, parse_dates=True)

df_statement = read_data("FOMC/statement")
df_speech = read_data("FOMC/speech")
df_minutes = read_data("FOMC/minutes")
df_can = read_data("can_statements")


In [3]:
df_speech.shape, df_statement.shape, df_minutes.shape, df_can.shape

((1607, 4), (201, 4), (229, 4), (380, 2))

In [4]:
df_minutes["speaker"].unique()

array(['Alan Greenspan', 'Ben Bernanke', 'Janet Yellen', 'Jerome Powell',
       'other'], dtype=object)

In [5]:
df_speech.dropna(inplace=True)
df_statement.dropna(inplace=True)
df_minutes.dropna(inplace=True)
df_can.dropna(inplace=True)

In [6]:
df_speech[df_speech["speaker"].str.contains(pat='Bernanke|Powell|Yellen|Greenspan', regex=True)]

Unnamed: 0,date,contents,speaker,title
0,1996-06-13,Remarks by Chairman Alan Greenspan\nBank super...,Chairman Alan Greenspan,Bank supervision in a world economy
3,1996-09-19,Remarks by Chairman Alan Greenspan\nRegulation...,Chairman Alan Greenspan,Regulation and electronic payment systems
5,1996-10-05,Remarks by Chairman Alan Greenspan\nBank super...,Chairman Alan Greenspan,"Bank supervision, regulation, and risk"
6,1996-10-07,Remarks by Chairman Alan Greenspan\nU.S. Treas...,Chairman Alan Greenspan,U.S. Treasury securities market: Lessons from ...
9,1996-10-16,Remarks by Chairman Alan Greenspan\nTechnologi...,Chairman Alan Greenspan,Technological advances and productivity
...,...,...,...,...
1574,2022-05-24,"The Federal Reserve, the central bank of the U...",Chair Jerome H. Powell,Video
1575,2022-05-24,"The Federal Reserve, the central bank of the U...",Chair Jerome H. Powell,Welcoming Remarks
1579,2022-06-17,"The Federal Reserve, the central bank of the U...",Chair Jerome H. Powell,Welcoming Remarks
1588,2022-08-26,"The Federal Reserve, the central bank of the U...",Chair Jerome H. Powell,Monetary Policy and Price Stability


On veut conserver seulement les textes présentés par les "Governor"

In [7]:
def filter_president_only(df: pd.DataFrame) -> pd.DataFrame:
    return df[df["speaker"].str.contains(pat='Bernanke|Powell|Yellen|Greenspan', regex=True)]

In [8]:
df_speech = filter_president_only(df_speech)
df_statement = filter_president_only(df_speech)
df_minutes = filter_president_only(df_speech)

In [9]:
df_minutes["contents"].iloc[-1]

"The Federal Reserve, the central bank of the United States, provides\n          the nation with a safe, flexible, and stable monetary and financial\n          system.\n\n[SECTION]\n\nFederal Open Market Committee\n\n[SECTION]\n\nMonetary Policy Principles and Practice\n\n[SECTION]\n\nPolicy Implementation\n\n[SECTION]\n\nReports\n\n[SECTION]\n\nReview of Monetary Policy Strategy, Tools, and\n                    Communications\n\n[SECTION]\n\nInstitution Supervision\n\n[SECTION]\n\nReports\n\n[SECTION]\n\nReporting Forms\n\n[SECTION]\n\nSupervision & Regulation Letters\n\n[SECTION]\n\nBanking Applications & Legal Developments\n\n[SECTION]\n\nRegulatory Resources\n\n[SECTION]\n\nBanking & Data Structure\n\n[SECTION]\n\nFinancial Stability Assessments\n\n[SECTION]\n\nFinancial Stability Coordination & Actions\n\n[SECTION]\n\nReports\n\n[SECTION]\n\nRegulations & Statutes\n\n[SECTION]\n\nPayment Policies\n\n[SECTION]\n\nReserve Bank Payment Services & Data\n\n[SECTION]\n\nFinancial Market

In [10]:
def clean_speech_text(df: pd.DataFrame):
    df_new = df.copy()
    full_text_col = df_new["contents"].apply \
        (lambda x: x.replace('\n\n[SECTION]\n\n', '').replace('\n', ' ').replace('\r', ' ').strip())
    full_text_col = full_text_col.apply(lambda x: re.sub(r'(http)\S+(htm)(l)?', '', x))
    full_text_col = full_text_col.apply(lambda x: re.sub(r'(www.)\S+', '', x))
    full_text_col = full_text_col.apply(lambda x: re.sub(r'[\d]', '', x))
    full_text_col = full_text_col.str.replace('—', ' ')
    full_text_col = full_text_col.str.replace('-', ' ')
    full_text_col = full_text_col.apply(lambda x: re.sub(r'[^\w\s]', '', x))
    full_text_col = full_text_col.apply(lambda x: re.sub(r'([Rr]eturn to text)', '', x))
    full_text_col = full_text_col.apply(lambda x: re.sub(r'([Pp]lay [vV]ideo)', '', x))
    full_text_col = full_text_col.apply(lambda x: re.sub(r'function()', '', x))
    full_text_col = full_text_col.apply(lambda x: re.sub(r'/[^\s]{15,}/', '', x))
    df_new.drop(labels='contents', axis="columns", inplace=True)
    df_new['contents'] = full_text_col
    return df_new

In [11]:
df_minutes = clean_speech_text(df_minutes)
df_speech = clean_speech_text(df_speech)
df_statement = clean_speech_text(df_statement)



In [12]:
df_minutes["contents"].iloc[3]

'Remarks by Chairman Alan Greenspan US Treasury securities market Lessons from Alexander Hamilton At the Annual Public Service Awards Dinner of the Public Securities Association New York New York October  I thank the members of the Public Securities Association for bestowing upon me this award for distinguished public service  I am particularly honored by the company that I keep as a winner of this award as previous recipients have included Senators Daniel Patrick Moynihan Christopher Dodd and Kay Bailey Hutchison and my predecessor as Chairman of the Federal Reserve Paul VolckerI trust that everyone in this audience would agree that the US government securities market works as well as any on earth and generates widespread macroeconomic benefits  In one sense that is regrettable  The market has become so efficient in part because of the  economies of scale associated with the large volume of Treasury debt issued over the years  While the massive federal debt has allowed traders to refi

In [13]:
def find_start_speech(text: str):
    try:
        idx_start = re.search("Share", text).start()
        return text[idx_start + 5:]
    except:
        return text


In [14]:
def replace_white_spaces(text: str):
    return " ".join(text.split())

df_statement["contents"] = df_statement['contents'].apply(lambda x: replace_white_spaces(x))
df_speech["contents"] = df_speech['contents'].apply(lambda x: replace_white_spaces(x))
df_minutes["contents"] = df_minutes['contents'].apply(lambda x: replace_white_spaces(x))

In [15]:
df_statement["contents"].iloc[10]

'It is a pleasure to be with you this afternoon as you discuss some of the most fundamental issues raised by our new information and communications technologiesThe topic Senator Bennett has asked us all to address is privacy in the information age The central dilemma in these discussions almost always involves fundamental choices about how to strike prudent balances among the needs of individuals for privacy in their financial and commercial transactions as well as their personal communications the needs of commerce to bring us new products and new means to communicate and the needs of the authorities to provide for the effective administration of government and to ensure the public safety These are not easy choices I think we all need to have a healthy respect for all sides of the debate Even further we need to be aware that the balances we strike in one era may need to be reexamined as technology and circumstances changeThe dictionary defines privacy as the state of being free from u

In [16]:
df_statement["contents"] = df_statement['contents'].apply(lambda x: find_start_speech(x))
df_speech["contents"] = df_speech['contents'].apply(lambda x: find_start_speech(x))
df_minutes["contents"] = df_minutes['contents'].apply(lambda x: find_start_speech(x))



In [17]:
df_speech["contents"].iloc[10]

'It is a pleasure to be with you this afternoon as you discuss some of the most fundamental issues raised by our new information and communications technologiesThe topic Senator Bennett has asked us all to address is privacy in the information age The central dilemma in these discussions almost always involves fundamental choices about how to strike prudent balances among the needs of individuals for privacy in their financial and commercial transactions as well as their personal communications the needs of commerce to bring us new products and new means to communicate and the needs of the authorities to provide for the effective administration of government and to ensure the public safety These are not easy choices I think we all need to have a healthy respect for all sides of the debate Even further we need to be aware that the balances we strike in one era may need to be reexamined as technology and circumstances changeThe dictionary defines privacy as the state of being free from u

In [20]:
load_model = spacy.load('en_core_web_sm')

In [21]:
stopwords = load_model.Defaults.stop_words


In [22]:
def apply_lemmatization(text: str):
    doc = load_model(text)
    return " ".join([token.lemma_ for token in doc])

In [23]:
N = 100

In [24]:
df_statement["contents"] = df_statement['contents'][-N:-1].apply(lambda x: apply_lemmatization(x))
df_speech["contents"] = df_speech['contents'][-N:-1].apply(lambda x: apply_lemmatization(x))
df_minutes["contents"] = df_minutes['contents'][-N:-1].apply(lambda x: apply_lemmatization(x))