In [1]:
import pandas as pd
import numpy as np
import spacy
import re



In [2]:
def read_data(filename: str, data_dir: str = "data/dataset") -> pd.DataFrame:
    """Load one CSV dataset into a DataFrame.

    Parameters
    ----------
    filename : str
        Path of the file relative to ``data_dir``, without the ``.csv``
        extension (e.g. ``"FOMC/statement"``).
    data_dir : str, default "data/dataset"
        Directory that contains the datasets. The default reproduces the
        location that used to be hard-coded, so existing calls are unchanged.

    Returns
    -------
    pd.DataFrame
        The file contents. The first CSV column becomes the index, and
        ``parse_dates=True`` asks pandas to try parsing that index as dates.
    """
    return pd.read_csv(f"{data_dir}/{filename}.csv",
                       index_col=0, parse_dates=True)

# Load the three FOMC datasets (statements, speeches, minutes) with read_data.
df_statement, df_speech, df_minutes = map(
    read_data, ("FOMC/statement", "FOMC/speech", "FOMC/minutes")
)

In [3]:
# (rows, columns) of the speech, statement and minutes frames, in that order.
tuple(frame.shape for frame in (df_speech, df_statement, df_minutes))

((1607, 4), (201, 4), (229, 4))

In [4]:
# Distinct speaker values found in the minutes.
df_minutes.loc[:, "speaker"].unique()

array(['Alan Greenspan', 'Ben Bernanke', 'Janet Yellen', 'Jerome Powell',
       'other'], dtype=object)

In [5]:
# Drop every row that has a missing value in any column.
# Reassign instead of using inplace=True: the result is the same for these
# names, but the cell no longer mutates the loaded objects behind the reader's
# back and can be chained or re-run safely.
df_speech = df_speech.dropna()
df_statement = df_statement.dropna()
df_minutes = df_minutes.dropna()

In [6]:
# Preview the speeches whose speaker value contains "Gov".
df_speech.loc[df_speech["speaker"].str.contains("Gov")]

Unnamed: 0,date,contents,speaker,title
1,1996-06-18,"Remarks by Governor Edward W. Kelley, Jr.\nDev...","Governor Edward W. Kelley, Jr.",Developments in electronic money and banking
2,1996-09-08,Monetary Policy Objectives and Strategy\n\n[SE...,Governor Laurence H. Meyer,Monetary policy objectives and strategy
4,1996-10-02,Remarks by Governor Lawrence B. Lindsey\nAt th...,Governor Lawrence B. Lindsey,Small business is big business
7,1996-10-09,Remarks by Governor Lawrence B. Lindsey\nAt th...,Governor Lawrence B. Lindsey,Here we go again?
8,1996-10-11,Remarks by Governor Lawrence B. Lindsey\nAt th...,Governor Lawrence B. Lindsey,How to grow faster
...,...,...,...,...
1600,2022-10-06,"The Federal Reserve, the central bank of the U...",Governor Christopher J. Waller,The Economic Outlook with a Look at the Housin...
1603,2022-10-12,"The Federal Reserve, the central bank of the U...",Governor Michelle W. Bowman,Forward Guidance as a Monetary Policy Tool: Co...
1604,2022-10-14,"The Federal Reserve, the central bank of the U...",Governor Christopher J. Waller,The U.S. Dollar and Central Bank Digital Curre...
1605,2022-10-20,"The Federal Reserve, the central bank of the U...",Governor Michelle W. Bowman,Welcoming Remarks


On veut conserver seulement les textes présentés par les "Governor"

In [7]:
def filter_by_title(df: pd.DataFrame, title: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``speaker`` value contains ``title``.

    ``title`` is handed to ``Series.str.contains`` unchanged, so pandas
    interprets it as a regular expression (its default).

    Rows with a missing speaker are treated as non-matching (``na=False``).
    Without it the mask would contain NaN and boolean indexing would raise
    ``ValueError`` instead of filtering.
    """
    is_match = df["speaker"].str.contains(title, na=False)
    return df[is_match]

In [8]:
# Keep only the speeches delivered by a "Governor".
df_speech = filter_by_title(df_speech, "Governor")
# Bug fix: this cell used to assign filter_by_title(df_speech, ...) to
# df_statement and df_minutes as well, silently replacing the statements and
# the minutes with copies of the filtered speeches. They are now left as
# loaded. The minutes' speaker values displayed earlier in this notebook carry
# no "Governor" title, so filtering that frame the same way would empty it.
# NOTE(review): confirm statements/minutes should stay unfiltered; if they must
# be filtered too, filter each frame from itself, not from df_speech.

In [None]:
# Inspect the text stored in the last row of df_minutes.
df_minutes.iloc[-1]["contents"]

In [44]:
def _clean_single_text(text: str) -> str:
    """Apply every cleaning step of ``clean_speech_text`` to one document."""
    # Remove section markers, flatten line breaks, trim the ends.
    text = (text.replace('\n\n[SECTION]\n\n', '')
                .replace('\n', ' ')
                .replace('\r', ' ')
                .strip())
    # Remove links: http...htm(l) URLs and anything starting with "www.".
    # The dot is escaped so only a literal "www." prefix matches.
    text = re.sub(r'(http)\S+(htm)(l)?', '', text)
    text = re.sub(r'www\.\S+', '', text)
    # Remove digits, then turn dashes into spaces so joined words separate.
    text = re.sub(r'\d', '', text)
    text = text.replace('—', ' ').replace('-', ' ')
    # Remove the literal JavaScript remnant "function()". This must happen
    # BEFORE punctuation is stripped and with escaped parentheses; otherwise
    # the pattern degenerates to the bare word "function" and deletes it
    # everywhere (e.g. "functioning" -> "ing").
    text = re.sub(r'function\(\)', '', text)
    # Remove all remaining punctuation (anything that is not a word char or
    # whitespace).
    text = re.sub(r'[^\w\s]', '', text)
    # Remove navigation boilerplate.
    text = re.sub(r'[Rr]eturn to text', '', text)
    text = re.sub(r'[Pp]lay [vV]ideo', '', text)
    # NOTE(review): the original ended with re.sub(r'/[^\s]{15,}/', '', x).
    # Its slashes are literal characters and every "/" has already been
    # stripped above, so it could never match; it is dropped here. If long
    # junk tokens should be removed, add an explicit r'\S{N,}' step with a
    # threshold that does not hit real words (a 15-letter limit would delete
    # e.g. "macroprudential").
    return text


def clean_speech_text(df: pd.DataFrame):
    """Return a copy of ``df`` whose ``contents`` column holds cleaned text.

    Each text loses its section markers, line breaks, links, digits, dashes,
    punctuation and a few boilerplate phrases ("Return to text",
    "Play video", "function()"). Whitespace runs are not collapsed here.

    The input frame is not modified. As before, the cleaned ``contents``
    column is placed last in the returned frame.
    """
    cleaned = df["contents"].apply(_clean_single_text)
    # Drop and re-add so that "contents" ends up as the last column, matching
    # the previous implementation's column order.
    return df.drop(columns="contents").assign(contents=cleaned)

In [45]:
# Replace each frame by its cleaned copy (minutes, speeches, statements).
df_minutes, df_speech, df_statement = map(
    clean_speech_text, (df_minutes, df_speech, df_statement)
)


In [None]:
# Inspect the text stored at position 3 of df_minutes.
df_minutes.iloc[3]["contents"]

In [None]:
def find_start_speech(text: str):
    """Return the part of ``text`` that follows the first "Share" marker.

    Everything up to and including the first occurrence of the literal
    string "Share" is discarded. If the marker is absent, or ``text`` is not
    a string (e.g. a missing value), the input is returned unchanged.

    The explicit checks replace a bare ``except:``, which also swallowed
    unrelated errors such as ``KeyboardInterrupt``.
    """
    marker = "Share"
    # Best-effort behavior kept from the original: non-strings pass through.
    if not isinstance(text, str):
        return text
    idx_start = text.find(marker)
    if idx_start == -1:
        return text
    return text[idx_start + len(marker):]


In [30]:
def replace_white_spaces(text: str):
    """Collapse each run of whitespace in ``text`` into one space and trim both ends."""
    words = text.split()
    return " ".join(words)

# Normalize whitespace in the text column of all three frames.
df_statement["contents"] = df_statement["contents"].apply(replace_white_spaces)
df_speech["contents"] = df_speech["contents"].apply(replace_white_spaces)
df_minutes["contents"] = df_minutes["contents"].apply(replace_white_spaces)

In [None]:
# Inspect the text stored at position 10 of df_statement.
df_statement.iloc[10]["contents"]

In [32]:
# Cut everything up to the first "Share" marker from each text.
df_statement["contents"] = df_statement["contents"].apply(find_start_speech)
df_speech["contents"] = df_speech["contents"].apply(find_start_speech)
df_minutes["contents"] = df_minutes["contents"].apply(find_start_speech)



In [None]:
# Inspect the text stored at position 10 of df_speech.
df_speech.iloc[10]["contents"]

In [18]:
# Load the spaCy 'en_core_web_sm' pipeline. Despite its verb-like name,
# `load_model` is the loaded pipeline object; it is called directly on text
# in apply_lemmatization.
load_model = spacy.load('en_core_web_sm')

In [19]:
# Stop-word collection taken from the loaded pipeline's language defaults.
stopwords = load_model.Defaults.stop_words


In [52]:
def apply_lemmatization(text: str):
    """Return ``text`` with each token replaced by its lemma, joined by single spaces.

    The text is processed with the module-level ``load_model`` pipeline.
    """
    lemmas = [tok.lemma_ for tok in load_model(text)]
    return " ".join(lemmas)

In [None]:
# Lemmatize every statement text; the result is displayed, not stored.
df_statement["contents"].apply(apply_lemmatization)