In [1]:
import pandas as pd
import os

speeches = pd.read_csv('./all_ECB_speeches.csv', delimiter='|', error_bad_lines=False)
speeches.head()

Unnamed: 0,date,speakers,title,subtitle,contents
0,2021-05-27,Isabel Schnabel,Societal responsibility and central bank indep...,"Keynote speech by Isabel Schnabel, Member of t...",SPEECH Societal responsibility and central...
1,2021-05-27,Luis de Guindos,Climate change and financial integration,"Keynote speech by Luis de Guindos, Vice-Presid...",SPEECH Climate change and financial integr...
2,2021-05-25,Philip R. Lane,The ECB strategy review,"Presentation by Philip R. Lane, Member of the ...",
3,2021-05-19,Fabio Panetta,At the edge of tomorrow: preparing the future ...,"Introductory remarks by Fabio Panetta, Member ...",SPEECH At the edge of tomorrow: preparing ...
4,2021-05-06,Christine Lagarde,Towards a green capital markets union for Europe,"Speech by Christine Lagarde, President of the ...",SPEECH Towards a green capital markets uni...


In [2]:
speeches.iloc[0]

date                                               2021-05-27
speakers                                      Isabel Schnabel
title       Societal responsibility and central bank indep...
subtitle    Keynote speech by Isabel Schnabel, Member of t...
contents       SPEECH  Societal responsibility and central...
Name: 0, dtype: object

In [3]:

speeches = speeches.dropna()

In [4]:
speeches = speeches.loc[speeches.subtitle.str.contains("\sPresident\s"),:]

In [5]:

speeches['contents'] = speeches['contents'].replace('SPEECH', '', regex=True)
speeches['contents'] = speeches['contents'].replace('\((.*?)\)', '', regex=True)
speeches['contents'] = speeches['contents'].replace('\[(.*?)\]', '', regex=True)
speeches['contents'] = speeches['contents'].replace('Note.*?\.', '', regex=True)
speeches['contents'] = speeches['contents'].replace('Chart .*?\..*?\.', '', regex=True)
speeches['contents'] = speeches['contents'].replace('[,\.!?]', '', regex=True)

speeches['contents'] = speeches['contents'].replace('\s[a-z]{1,2}\s', '', regex=True)

speeches['contents'] = speeches['contents'].replace('[^\x00-\x7F]+',' ', regex=True)

speeches['contents'] = speeches['contents'].replace('[^\w\s]', '', regex=True)


In [6]:
import gensim
from gensim.utils import simple_preprocess

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS as stop_words

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
# stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
words = set(nltk.corpus.words.words())

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_non_english(texts):
    return [[w for w in nltk.wordpunct_tokenize(" ".join(doc)) if w.lower() in words or not w.isalpha()] for doc in texts]

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

def lemmatize(texts):
    return [[lemmatizer.lemmatize(w) for w in doc] for doc in texts]

def noun_only(texts):
    return [[word[0] for word in nltk.pos_tag(doc) if word[1] in ['NN','JJ','JJR','JJS','NNP','NNS']] for doc in texts]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\felix\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\felix\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\felix\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\felix\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [7]:
s2004 = speeches.loc[speeches['date'].str.contains("2004"),:]

In [8]:
s2008 = speeches.loc[speeches['date'].str.contains("2008"),:]

In [9]:
s2012 = speeches.loc[speeches['date'].str.contains("2012"),:]

In [10]:
s2016 = speeches.loc[speeches['date'].str.contains("2016"),:]

In [11]:
s202021 = speeches.loc[speeches['date'].str.contains("2020|2021"),:]

In [12]:
len(s2016)

24

In [13]:
from gensim import models
import gensim.corpora as corpora
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation
import pickle 
import pyLDAvis

import pyLDAvis.gensim_models as gensimvis
import pyLDAvis.sklearn 
import numpy as np

num_topics = 5

def preprocess(input_data):
    data = input_data.contents.values.tolist()


    # data = [input_data.iloc[1].contents]

    data_words = list(sent_to_words(data))


    data_words = remove_non_english(data_words)
    
    data_words = remove_stopwords(data_words)
    data_words = lemmatize(data_words)

    data_words = remove_stopwords(data_words)

    data_words = noun_only(data_words)

    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    data_words = make_bigrams(data_words)

    return data_words

def gen_corpus(data_words):
    # Create Dictionary
    id2word = corpora.Dictionary(data_words)

    # id2word.filter_extremes( no_above=0.9, keep_n=100000)
    # Create Corpus
    texts = data_words
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    return id2word, corpus, corpus_tfidf

def lda_sklearn(data_words):

    vect = CountVectorizer(ngram_range=(1,2), stop_words='english')

    vect_tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words='english')

    docs = []

    for doc in data_words:
        docs.append(" ".join(doc))

    corpus_sklearn_bow = vect.fit_transform(docs)
    corpus_sklearn_tfidf = vect_tfidf.fit_transform(docs)

    lda = LatentDirichletAllocation(n_components=num_topics)


    lda_dtf=lda.fit_transform(corpus_sklearn_bow)

    lda_tfidf = LatentDirichletAllocation(n_components=num_topics)


    lda_dtf_tfidf=lda.fit_transform(corpus_sklearn_tfidf)

    

    # sorting=np.argsort(lda.components_)[:,::-1]

    # features=np.array(vect.get_feature_names())

    return vect, vect_tfidf, lda, lda_tfidf, corpus_sklearn_bow, corpus_sklearn_tfidf

def run_lda(id2word, corpus, data_words, k=5,  a='symmetric', b=None):
    
    # Build LDA model
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                        id2word=id2word, 
                                        workers=10, 
                                        num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    coherence_model_lda = gensim.models.coherencemodel.CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
    
    # Print the Keyword in the 10 topics
    pprint(lda_model.print_topics())
    print(coherence_model_lda.get_coherence())
    doc_lda = lda_model[corpus]

    return lda_model, coherence_model_lda.get_coherence()

def lda_mallet(id2word, corpus):
    mallet_path = 'mallet-2.0.8\\bin\\mallet'
    ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]
    
    return ldamallet

def visualize(lda_model, corpus, id2word, mode, k=5):
    # Visualize the topics
    pyLDAvis.enable_notebook()
    LDAvis_data_filepath = os.path.join('ldavis_'+ mode +'_'+str(k))

    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)
    pyLDAvis.save_html(LDAvis_prepared,'ldavis_'+ mode +'_'+ str(k) +'.html')
    LDAvis_prepared


def visualize_sklearn(lda_model, corpus, id2word, mode):
    # Visualize the topics
    pyLDAvis.enable_notebook()
    LDAvis_data_filepath = os.path.join('ldavis_'+ mode +'_'+str(num_topics))

    LDAvis_prepared = pyLDAvis.sklearn.prepare(lda_model, corpus, id2word,mds='mmds')
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)
    pyLDAvis.save_html(LDAvis_prepared,'ldavis_'+ mode +'_'+ str(num_topics) +'.html')
    LDAvis_prepared

In [14]:
data_words = preprocess(s2016)
id2word, corpus, corpus_tfidf = gen_corpus(data_words)
lda_bow, cv_score = run_lda(id2word, corpus, data_words)
lad_tfidf, cv_score = run_lda(id2word, corpus_tfidf, data_words)

  and should_run_async(code)
[(0,
  '0.021*"union" + 0.013*"trust" + 0.009*"stability" + 0.007*"monetary" + '
  '0.007*"growth" + 0.007*"today" + 0.006*"way" + 0.006*"pact" + '
  '0.006*"compliance" + 0.006*"convergence"'),
 (1,
  '0.034*"area" + 0.030*"monetary" + 0.023*"policy" + 0.022*"economic" + '
  '0.017*"market" + 0.017*"growth" + 0.014*"financial" + 0.012*"recovery" + '
  '0.011*"economy" + 0.010*"structural"'),
 (2,
  '0.026*"policy" + 0.021*"monetary" + 0.015*"income" + 0.014*"asset" + '
  '0.013*"financial" + 0.013*"area" + 0.012*"net" + 0.012*"wealth" + '
  '0.011*"real" + 0.010*"economy"'),
 (3,
  '0.001*"policy" + 0.001*"area" + 0.001*"monetary" + 0.001*"financial" + '
  '0.001*"global" + 0.001*"inflation" + 0.001*"economic" + 0.001*"market" + '
  '0.001*"economy" + 0.001*"crisis"'),
 (4,
  '0.037*"policy" + 0.028*"monetary" + 0.026*"area" + 0.022*"inflation" + '
  '0.014*"low" + 0.014*"global" + 0.014*"economic" + 0.013*"financial" + '
  '0.013*"economy" + 0.013*"growth

In [15]:

visualize(lda_bow, corpus, id2word, "bow")

  and should_run_async(code)


In [16]:

visualize(lad_tfidf, corpus, id2word, "tfidf")

  and should_run_async(code)


In [17]:
data_words = preprocess(s2016)
vect, vect_tfidf, lda, lda_tfidf, corpus_sklearn_bow, corpus_sklearn_tfidf = lda_sklearn(data_words)
visualize_sklearn(lda, corpus_sklearn_bow, vect, "bow")
# visualize_sklearn(lda_tfidf, corpus_sklearn_tfidf, vect_tfidf, "tfidf")

  and should_run_async(code)


In [18]:
data_words = preprocess(speeches)
id2word, corpus, corpus_tfidf = gen_corpus(data_words)
# lda_bow, cv_score = run_lda(id2word, corpus, data_words)
# lad_tfidf, cv_score = run_lda(id2word, corpus_tfidf, data_words)

  and should_run_async(code)


In [19]:
# # grid search

# import numpy as np
# import tqdm


# grid = {}
# grid['Validation_Set'] = {}

# # Topics range
# min_topics = 2
# max_topics = 11
# step_size = 1
# topics_range = range(min_topics, max_topics, step_size)

# # Alpha parameter
# alpha = list(np.arange(0.01, 1, 0.3))
# alpha.append('symmetric')
# alpha.append('asymmetric')

# # Beta parameter
# beta = list(np.arange(0.01, 1, 0.3))
# beta.append('symmetric')

# # Validation sets
# num_of_docs = len(corpus)
# corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
#                # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
#                gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75), 
#                corpus]
# corpus_title = ['75% Corpus', '100% Corpus']
# model_results = {'Validation_Set': [],
#                  'Topics': [],
#                  'Alpha': [],
#                  'Beta': [],
#                  'Coherence': []
#                 }

# # Can take a long time to run
# if 1 == 1:
#     pbar = tqdm.tqdm(total=540)
    
#     # iterate through validation corpuses
#     for i in range(len(corpus_sets)):
#         # iterate through number of topics
#         for k in topics_range:
#             # iterate through alpha values
#             for a in alpha:
#                 # iterare through beta values
#                 for b in beta:
#                     # get the coherence score for the given parameters
#                     model, cv = run_lda(id2word, corpus_tfidf, data_words, k=k, a=a, b=b)
#                     # compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
#                                                 #   k=k, a=a, b=b)
#                     # Save the model results
#                     model_results['Validation_Set'].append(corpus_title[i])
#                     model_results['Topics'].append(k)
#                     model_results['Alpha'].append(a)
#                     model_results['Beta'].append(b)
#                     model_results['Coherence'].append(cv)
                    
#                     pbar.update(1)
#     pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
#     pbar.close()

  and should_run_async(code)


In [20]:
# per quarter

quarters = ["(1|2|3)","(4|5|6)", "(7|8|9)","(10|11|12)"]

for year in range(1997,2022):
    for index, quarter in enumerate(quarters):
        data = speeches.loc[speeches['date'].str.contains(str(year) + '-0?' + quarter),:]
        
        print(year, " ", quarter, "("+str(len(data))+" docs)",": ")
        if len(data) == 0:
            continue
        data_words = preprocess(data)
        id2word, corpus, corpus_tfidf = gen_corpus(data_words)
        lda_bow, cv = run_lda(id2word, corpus, data_words)  
        lad_tfidf, cv_tfidf = run_lda(id2word, corpus_tfidf, data_words)
        visualize(lda_bow, corpus, id2word, "bow_"+str(year)+"_"+str(index+1))
        visualize(lad_tfidf, corpus, id2word, "tfidf_"+str(year)+"_"+str(index+1))
    

  and should_run_async(code)
  return func(self, *args, **kwargs)
1997   (1|2|3) (9 docs) : 
[(0,
  '0.001*"policy" + 0.001*"stability" + 0.001*"monetary" + 0.001*"price" + '
  '0.001*"inflation" + 0.001*"economic" + 0.001*"central" + 0.001*"rate" + '
  '0.001*"exchange" + 0.001*"area"'),
 (1,
  '0.057*"monetary" + 0.045*"policy" + 0.029*"price" + 0.027*"inflation" + '
  '0.026*"stability" + 0.024*"central" + 0.014*"rate" + 0.012*"growth" + '
  '0.011*"exchange" + 0.011*"strategy"'),
 (2,
  '0.032*"monetary" + 0.029*"policy" + 0.024*"stability" + 0.022*"price" + '
  '0.022*"inflation" + 0.019*"growth" + 0.015*"economic" + 0.013*"banking" + '
  '0.013*"financial" + 0.011*"central"'),
 (3,
  '0.036*"exchange" + 0.035*"area" + 0.034*"rate" + 0.025*"monetary" + '
  '0.025*"policy" + 0.022*"stability" + 0.020*"convergence" + 0.019*"central" '
  '+ 0.016*"economic" + 0.012*"union"'),
 (4,
  '0.001*"monetary" + 0.001*"stability" + 0.001*"policy" + 0.001*"rate" + '
  '0.001*"central" + 0.001*"

AttributeError: 'tuple' object has no attribute 'num_topics'

In [18]:
# per quarter

quarters = ["(1|2|3)","(4|5|6)", "(7|8|9)","(10|11|12)"]

for year in range(1997,2022):
    for index, quarter in enumerate(quarters):
        data = speeches.loc[speeches['date'].str.contains(str(year) + '-0?' + quarter),:]
        
        print(year, " ", quarter, "("+str(len(data))+" docs)",": ")
        if len(data) == 0:
            continue
        data_words = preprocess(data)
        vect, vect_tfidf, lda, lda_tfidf, corpus_sklearn_bow, corpus_sklearn_tfidf = lda_sklearn(data_words)
        visualize_sklearn(lda, corpus_sklearn_bow, vect, "bow_sklearn_"+str(year)+"_"+str(index+1))
       
    

  and should_run_async(code)
  return func(self, *args, **kwargs)
1997   (1|2|3) (9 docs) : 
  return func(self, *args, **kwargs)
1997   (4|5|6) (5 docs) : 
  return func(self, *args, **kwargs)
  return func(self, *args, **kwargs)
1997   (7|8|9) (1 docs) : 
1997   (10|11|12) (7 docs) : 
  return func(self, *args, **kwargs)
1998   (1|2|3) (16 docs) : 
  return func(self, *args, **kwargs)
  return func(self, *args, **kwargs)
1998   (4|5|6) (1 docs) : 
1998   (7|8|9) (5 docs) : 
  return func(self, *args, **kwargs)
1998   (10|11|12) (11 docs) : 
  return func(self, *args, **kwargs)
1999   (1|2|3) (20 docs) : 
  return func(self, *args, **kwargs)
1999   (4|5|6) (7 docs) : 
  return func(self, *args, **kwargs)
1999   (7|8|9) (4 docs) : 
  return func(self, *args, **kwargs)
1999   (10|11|12) (12 docs) : 
  return func(self, *args, **kwargs)
2000   (1|2|3) (10 docs) : 
  return func(self, *args, **kwargs)
2000   (4|5|6) (5 docs) : 
  return func(self, *args, **kwargs)
2000   (7|8|9) (6 docs) 

In [19]:
# deeper data analytics


# dataset generally a bit short

# 3 docs inconclusive -> a bit too short to work
# tried tfidf / bag of words approach 
# did tuning according to paper, cleaned text, stop words, lemmatize, and stemming.
#     extreme cleaninng of text ( remove sources, charts description, to give more coherent)
# remove extreme popular terms (did not work as well)
# ran over all the dataset and tracking changes on topics on quarterly basis from year 1997


# some word clouds for exploratory sentiment analysis (also applied similar pre-processing techniques)




  and should_run_async(code)


SyntaxError: invalid syntax (<ipython-input-19-d22fc322a253>, line 1)