In [1]:
import pandas as pd
import os

import numpy as np
import tqdm

import gensim
from gensim import models
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
import random

from pprint import pprint
import pickle 


import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk.stem as stemmer

from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS as stop_words


# Load the pipe-delimited ECB speeches file.
# `error_bad_lines=False` is deprecated in pandas; `on_bad_lines='warn'` keeps the
# same behaviour (malformed lines are skipped and reported) without the
# deprecation warning.
speeches = pd.read_csv('./all_ECB_speeches.csv', delimiter='|', on_bad_lines='warn')
speeches.head()



  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,date,speakers,title,subtitle,contents
0,2021-05-27,Isabel Schnabel,Societal responsibility and central bank indep...,"Keynote speech by Isabel Schnabel, Member of t...",SPEECH Societal responsibility and central...
1,2021-05-27,Luis de Guindos,Climate change and financial integration,"Keynote speech by Luis de Guindos, Vice-Presid...",SPEECH Climate change and financial integr...
2,2021-05-25,Philip R. Lane,The ECB strategy review,"Presentation by Philip R. Lane, Member of the ...",
3,2021-05-19,Fabio Panetta,At the edge of tomorrow: preparing the future ...,"Introductory remarks by Fabio Panetta, Member ...",SPEECH At the edge of tomorrow: preparing ...
4,2021-05-06,Christine Lagarde,Towards a green capital markets union for Europe,"Speech by Christine Lagarde, President of the ...",SPEECH Towards a green capital markets uni...


In [2]:
# Inspect the last row of the frame (positional index -1).
speeches.iloc[-1]

date                                               1997-02-07
speakers                                 Alexandre Lamfalussy
title       Conference organised by the Hungarian Banking ...
subtitle    Address by Alexandre Lamfalussy, President of ...
contents      Conference organised by the Hungarian Bankin...
Name: 2487, dtype: object

In [3]:
# Inspect the first row of the frame (positional index 0).
speeches.iloc[0]

date                                               2021-05-27
speakers                                      Isabel Schnabel
title       Societal responsibility and central bank indep...
subtitle    Keynote speech by Isabel Schnabel, Member of t...
contents       SPEECH  Societal responsibility and central...
Name: 0, dtype: object

In [4]:
# Drop every row that has a missing value in any column and report the row
# count before and after (28 rows are removed on the recorded run).
print(speeches.shape[0])
speeches = speeches.dropna(axis=0, how='any')
print(speeches.shape[0])
speeches.head()

2488
2460


Unnamed: 0,date,speakers,title,subtitle,contents
0,2021-05-27,Isabel Schnabel,Societal responsibility and central bank indep...,"Keynote speech by Isabel Schnabel, Member of t...",SPEECH Societal responsibility and central...
1,2021-05-27,Luis de Guindos,Climate change and financial integration,"Keynote speech by Luis de Guindos, Vice-Presid...",SPEECH Climate change and financial integr...
3,2021-05-19,Fabio Panetta,At the edge of tomorrow: preparing the future ...,"Introductory remarks by Fabio Panetta, Member ...",SPEECH At the edge of tomorrow: preparing ...
4,2021-05-06,Christine Lagarde,Towards a green capital markets union for Europe,"Speech by Christine Lagarde, President of the ...",SPEECH Towards a green capital markets uni...
6,2021-04-29,Frank Elderson,All the way to zero: guiding banks towards a c...,"Keynote speech by Frank Elderson, Vice-Chair o...",SPEECH All the way to zero: guiding banks ...


In [5]:

from langdetect import detect

def isEnglish(text):
    """Return True when ``detect`` classifies ``text`` as English ('en').

    Any exception raised by ``detect`` is reported with a short preview of the
    offending input and treated as "not English".

    NOTE(review): langdetect documents its detection as non-deterministic
    unless a seed is set on ``DetectorFactory`` -- confirm whether a fixed seed
    is wanted for reproducible filtering.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # `Exception` (not a bare `except`) so that KeyboardInterrupt and
        # SystemExit can still stop a long-running filter pass.
        # Only a truncated repr is printed so one failure cannot flood the
        # cell output with an entire speech.
        print(repr(text)[:80])
        return False

def isLongerThan(text, min_chars=500):
    """Return True when ``text`` has strictly more than ``min_chars`` characters.

    Parameters
    ----------
    text : sized
        The text whose length is checked (anything supporting ``len``).
    min_chars : int, default 500
        Exclusive lower bound on the length; the default keeps the original
        hard-coded threshold.
    """
    return len(text) > min_chars

def filter(text):
    """Return True when ``text`` is long enough and detected as English.

    The cheap length check runs first, so language detection is only
    attempted on texts that could be kept anyway. The boolean result is the
    same as checking language first; the only difference is that short texts
    no longer reach ``isEnglish`` (and so never trigger its failure print).
    Non-string input is rejected up front.

    NOTE: this name shadows the builtin ``filter`` for the rest of the
    notebook; it is kept because the filtering cell calls it by this name.
    """
    if not isinstance(text, str):
        return False
    return isLongerThan(text) and isEnglish(text)

# non_en_idx = []
# for i in range(len(speeches)):
#     if not isEnglish(speeches.iloc[i]['contents']):
#         non_en_idx.append(i)

        

# print(len(non_en_idx))
# Keep only the speeches accepted by `filter`, printing the row count before
# and after so the number of dropped speeches is visible.
print(len(speeches))
# Map over the single column instead of a row-wise DataFrame.apply(axis=1):
# same mask, far less overhead, and it still yields a boolean Series when the
# frame is empty.
speeches = speeches[speeches['contents'].map(filter).astype(bool)]
print(len(speeches))

2460
 
2269


In [6]:
# Number of distinct speakers in the filtered corpus.
len(set(speeches['speakers'])) #speakers

27

In [7]:
# Count speeches per calendar quarter for 2000-2021 (one entry per quarter,
# in chronological order), then print the counts and their mean.
# Non-capturing groups avoid pandas' "pattern has match groups" warning that
# str.contains emits for capturing groups.
quarters = ["(?:1|2|3)", "(?:4|5|6)", "(?:7|8|9)", "(?:10|11|12)"]
indices = []
for year in range(2000,2022):
    for quarter in quarters:
        # Matches e.g. "2021-05-" or "2021-5-": year, optional leading zero, month.
        in_quarter = speeches['date'].str.contains(str(year) + '-0?' + quarter + '-')
        indices.append(int(in_quarter.sum()))
# Drop the final two quarters of the last year.
# NOTE(review): presumably because the data ends in May 2021 -- confirm.
indices = indices[:-2]
print(indices)
print(sum(indices)/len(indices))

[14, 18, 17, 21, 14, 16, 13, 22, 20, 18, 8, 19, 12, 18, 10, 24, 16, 31, 14, 29, 13, 29, 13, 26, 20, 30, 16, 29, 21, 33, 25, 37, 29, 40, 28, 34, 26, 34, 20, 32, 22, 42, 23, 30, 26, 44, 12, 33, 10, 33, 16, 30, 27, 41, 25, 37, 20, 31, 23, 28, 18, 25, 17, 39, 23, 30, 20, 34, 34, 43, 29, 36, 28, 28, 20, 36, 30, 26, 17, 34, 20, 15, 20, 27, 19, 10]
24.651162790697676


  data = speeches.loc[speeches['date'].str.contains(str(year) + '-0?' + quarter + '-'), :]


In [8]:

# Alternative stop-word source (NLTK), currently disabled.
# stop_words = stopwords.words('english')
# Shared WordNet lemmatizer instance used by `lemmatize`.
lemmatizer = WordNetLemmatizer()
# Vocabulary from the NLTK `words` corpus as a set; `remove_non_english`
# tests lower-cased tokens for membership in it.
words = set(nltk.corpus.words.words())


# preprocessing functions
def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences`` with NLTK's ``word_tokenize``.

    Every item is coerced to ``str`` first; one token list is yielded per item.
    """
    yield from (nltk.tokenize.word_tokenize(str(item)) for item in sentences)

def remove_non_english(texts):
    """Drop alphabetic tokens that are not in the module-level ``words`` set.

    Each document is joined with spaces and re-tokenised with
    ``nltk.wordpunct_tokenize``. A token is kept when its lower-cased form is
    in ``words`` or when it is not purely alphabetic (numbers, punctuation).
    """
    filtered = []
    for doc in texts:
        tokens = nltk.wordpunct_tokenize(" ".join(doc))
        filtered.append(
            [tok for tok in tokens if not tok.isalpha() or tok.lower() in words]
        )
    return filtered

def remove_stopwords(texts):
    """Re-tokenise each document with ``simple_preprocess`` and drop stop words.

    ``str(doc)`` is what gets tokenised, so a token list is passed as its
    list repr; the surrounding brackets, quotes and commas are then split off
    by ``simple_preprocess``.

    NOTE(review): ``simple_preprocess`` applies its own token-length bounds;
    confirm that long tokens (e.g. joined bigrams) are not dropped
    unintentionally when this runs after phrase detection.
    """
    cleaned = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned.append([tok for tok in tokens if tok not in stop_words])
    return cleaned

def remove_word_length(texts):
    """Keep only tokens of at least four characters in every document."""
    result = []
    for doc in texts:
        kept = [tok for tok in doc if len(tok) >= 4]
        result.append(kept)
    return result

def lemmatize(texts):
    """Lemmatise every token as a verb (``pos='v'``) with the shared lemmatizer."""
    lemmatised = []
    for doc in texts:
        lemmatised.append([lemmatizer.lemmatize(tok, pos='v') for tok in doc])
    return lemmatised

def stemming(texts):
    """Stem every token of every document with NLTK's Porter stemmer.

    The previous body called ``nltk.ste.lemmatize``, which does not exist, so
    any call raised ``AttributeError``. A real stemmer is used instead.

    Parameters
    ----------
    texts : iterable of iterable of str
        Tokenised documents.

    Returns
    -------
    list of list of str
        The documents with each token replaced by its stem.
    """
    porter = nltk.stem.PorterStemmer()
    return [[porter.stem(w) for w in doc] for doc in texts]

def noun_only(texts):
    """Keep tokens whose NLTK POS tag is one of NN, NNS, NNP, JJ, JJR, JJS.

    Despite the name, adjective tags (JJ*) are retained alongside noun tags.
    """
    keep_tags = ['NN', 'JJ', 'JJR', 'JJS', 'NNP', 'NNS']
    result = []
    for doc in texts:
        result.append([token for token, tag in nltk.pos_tag(doc) if tag in keep_tags])
    return result



def preprocess(input_data):
    """Run the full token-cleaning pipeline on ``input_data.contents``.

    Steps, in order: reverse the document order, tokenise, drop tokens outside
    the word list, drop stop words, drop short tokens, merge detected bigrams,
    drop stop words again, lemmatise as verbs, and keep noun/adjective tags.

    Returns a list of token lists, one per document, in reversed input order.
    """
    raw_texts = input_data.contents.values.tolist()
    raw_texts.reverse()

    tokens = list(sent_to_words(raw_texts))
    for step in (remove_non_english, remove_stopwords, remove_word_length):
        tokens = step(tokens)

    # Higher threshold -> fewer phrases.
    phrases = gensim.models.Phrases(tokens, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(phrases)
    tokens = [bigram_mod[doc] for doc in tokens]

    for step in (remove_stopwords, lemmatize, noun_only):
        tokens = step(tokens)

    return tokens


In [9]:
# Average number of tokens per speech before any cleaning.
data = speeches['contents'].tolist()
data_words = [tokens for tokens in sent_to_words(data)]
data_len = list(map(len, data_words))
average_len = sum(data_len) / len(data_len)
print(average_len)

3501.9554869986778


In [10]:

#Regex cleaning

# Patterns are replaced by a single space, in this exact order (later patterns
# see the output of earlier ones). Raw strings are used so the regex escapes
# (`\(`, `\[`, `\.`) are not treated as invalid Python string escapes; the
# pattern values are unchanged.
cleanup_patterns = [
    r'SPEECH',
    r'\(.*?\)',                        # parenthesised text
    r'\[.*?\]',                        # bracketed text
    r'Note.*?\.',
    r'Chart .*?\..*?\.',

    r'I\..*?References',               # edge case
    r'References.*',
    r'REFERENCES.*',
    r'LITERATURE.*',
    r'BIBLIOGRAPHY.*',
    r' [0-9]\. ',                      # single-digit enumeration markers

    r'Vol.*?pp.*?\.',
    r'Vol\..*?[0-9]*,.*?No\..*?\.',

    r'op\..*?cit\..*?\.',
    r'op\..*?cit\.',

    r'See.*?\.',
    r'SEE ALSO.*',

    # Closing formulas and everything after them on the same line.
    r'Thank you\..*',
    r'Thank you for your kind attention\..*',
    r'Thank you for your attention\..*',
    r'I thank you for your attention\..*',
]

contents_clean = speeches['contents']
for pattern in cleanup_patterns:
    contents_clean = contents_clean.replace(pattern, ' ', regex=True)
speeches['contents'] = contents_clean

In [11]:
# Re-tokenise the current `contents` column and report the average number of
# tokens per speech.
data = speeches['contents'].tolist()
data_words = [tokens for tokens in sent_to_words(data)]
data_len = list(map(len, data_words))
average_len = sum(data_len) / len(data_len)
print(average_len)

3122.4583516967828


In [12]:


# Keep only tokens accepted by `remove_non_english`.
# `data_words` is overwritten, so re-running this cell applies the step again.
data_words = remove_non_english(data_words)

In [13]:
# Average tokens per document at this point in the pipeline.
data_len = list(map(len, data_words))
average_len = sum(data_len) / len(data_len)
print(average_len)

2768.321286910533


In [14]:

# Re-tokenise with `remove_stopwords` and drop stop words.
# `data_words` is overwritten, so re-running this cell applies the step again.
data_words = remove_stopwords(data_words)

In [15]:
# Average tokens per document at this point in the pipeline.
data_len = list(map(len, data_words))
average_len = sum(data_len) / len(data_len)
print(average_len)

1029.445570736007


In [16]:

# Drop tokens of three characters or fewer.
# `data_words` is overwritten, so re-running this cell applies the step again.
data_words = remove_word_length(data_words)


In [17]:
# Average tokens per document at this point in the pipeline.
data_len = list(map(len, data_words))
average_len = sum(data_len) / len(data_len)
print(average_len)

986.177170559718


In [18]:

# Fit a phrase model on the current tokens (higher threshold -> fewer phrases)
# and wrap it in a Phraser for applying it to documents.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)


def make_bigrams(texts):
    """Apply the fitted ``bigram_mod`` phrase model to every document."""
    merged = []
    for doc in texts:
        merged.append(bigram_mod[doc])
    return merged


data_words = make_bigrams(data_words)



In [19]:
# Average tokens per document at this point in the pipeline.
data_len = list(map(len, data_words))
average_len = sum(data_len) / len(data_len)
print(average_len)

963.5927721463199


In [20]:

# Second stop-word pass over the current tokens.
# `data_words` is overwritten, so re-running this cell applies both steps again.
data_words = remove_stopwords(data_words)


# Lemmatise every token as a verb.
data_words = lemmatize(data_words)


In [21]:
# Average tokens per document at this point in the pipeline.
data_len = list(map(len, data_words))
average_len = sum(data_len) / len(data_len)
print(average_len)

953.6416923754958


In [22]:

# Keep only tokens whose POS tag is accepted by `noun_only`.
# `data_words` is overwritten, so re-running this cell applies the step again.
data_words = noun_only(data_words)

In [23]:
# Average tokens per document at this point in the pipeline.
data_len = list(map(len, data_words))
average_len = sum(data_len) / len(data_len)
print(average_len)

815.5945350374615
