In [10]:
import pandas as pd
import os

# Load the pipe-delimited ECB speeches export into a DataFrame.
# Rows that cannot be parsed are skipped with a warning instead of aborting
# the load. `on_bad_lines='warn'` is the replacement for
# `error_bad_lines=False` (deprecated in pandas 1.3, removed in pandas 2.0)
# and keeps the same skip-and-warn behaviour; it requires pandas >= 1.3.
speeches = pd.read_csv('./all_ECB_speeches.csv', sep='|', on_bad_lines='warn')
speeches.head()

Unnamed: 0,date,speakers,title,subtitle,contents
0,2021-05-27,Isabel Schnabel,Societal responsibility and central bank indep...,"Keynote speech by Isabel Schnabel, Member of t...",SPEECH Societal responsibility and central...
1,2021-05-27,Luis de Guindos,Climate change and financial integration,"Keynote speech by Luis de Guindos, Vice-Presid...",SPEECH Climate change and financial integr...
2,2021-05-25,Philip R. Lane,The ECB strategy review,"Presentation by Philip R. Lane, Member of the ...",
3,2021-05-19,Fabio Panetta,At the edge of tomorrow: preparing the future ...,"Introductory remarks by Fabio Panetta, Member ...",SPEECH At the edge of tomorrow: preparing ...
4,2021-05-06,Christine Lagarde,Towards a green capital markets union for Europe,"Speech by Christine Lagarde, President of the ...",SPEECH Towards a green capital markets uni...


In [11]:
# Show the first row as a Series to check how each field was parsed.
speeches.iloc[0]

date                                               2021-05-27
speakers                                      Isabel Schnabel
title       Societal responsibility and central bank indep...
subtitle    Keynote speech by Isabel Schnabel, Member of t...
contents       SPEECH  Societal responsibility and central...
Name: 0, dtype: object

In [12]:
# List the column labels of the loaded frame.
speeches.columns

Index(['date', 'speakers', 'title', 'subtitle', 'contents'], dtype='object')

In [13]:
# Keep only the speech text: drop the metadata columns (date, speakers,
# title, subtitle), discard rows with missing values, and cap the corpus size.
sample_size = 10      # size for an optional quick experiment: speeches.sample(sample_size)
MAX_SPEECHES = 1500   # number of non-empty speeches kept, in file order

# errors='ignore' makes this cell safe to re-run: `speeches` is overwritten
# here, so on a second run the metadata columns are already gone and a plain
# drop() would raise KeyError.
# The original row index is kept on purpose (no reset_index), so the labels
# have gaps wherever a row was dropped.
speeches = (
    speeches
    .drop(columns=['date', 'speakers', 'title', 'subtitle'], errors='ignore')
    .dropna()
    .iloc[:MAX_SPEECHES]
)
speeches.head()

Unnamed: 0,contents
0,SPEECH Societal responsibility and central...
1,SPEECH Climate change and financial integr...
3,SPEECH At the edge of tomorrow: preparing ...
4,SPEECH Towards a green capital markets uni...
6,SPEECH All the way to zero: guiding banks ...


In [14]:
#preprocessing

import re

# Ordered (pattern, replacement) pairs applied to every speech, one after the
# other. Patterns are raw strings so that `\(` and `\[` reach the regex engine
# without tripping Python's invalid-escape-sequence warning; the compiled
# patterns are the same as before. Punctuation is deliberately left intact.
CLEANUP_PATTERNS = [
    (r'SPEECH', ''),             # the literal 'SPEECH' marker seen at the start of the text
    (r'\((.*?)\)', ''),          # text in parentheses (shortest match, within one line)
    (r'\[(.*?)\]', ''),          # text in square brackets (shortest match, within one line)
    (r'Note.*?\.', ''),          # from 'Note' up to and including the next full stop
    (r'Chart .*?\..*?\.', ''),   # from 'Chart ' through the second following full stop
    # Runs of non-ASCII characters become a single space.
    # NOTE(review): this also removes accented letters, so non-English
    # speeches lose characters (e.g. 'Pol tica monetaria') - confirm intended.
    (r'[^\x00-\x7F]+', ' '),
]

contents = speeches['contents']
for pattern, replacement in CLEANUP_PATTERNS:
    contents = contents.replace(pattern, replacement, regex=True)
speeches['contents'] = contents



In [15]:
# Build the corpus as a plain list of Python strings, one per speech, in the
# same order as the rows of `speeches`.
data = [str(text) for text in speeches['contents']]

In [16]:
# Display the 'contents' column to eyeball the text after preprocessing.
speeches['contents']

0            Societal responsibility and central bank ...
1            Climate change and financial integration ...
3            At the edge of tomorrow: preparing the fu...
4            Towards a green capital markets union for...
6            All the way to zero: guiding banks toward...
                              ...                        
1522      Introductory statement at the press conferen...
1523      Address at Conferencia Buenos Aires 2008, or...
1524      Financial systems, new technologies and prod...
1525      The current state of the euro area and its f...
1526      Pol tica monetaria y gesti n de liquidez en ...
Name: contents, Length: 1500, dtype: object

In [17]:
# Preview up to the first 10,000 characters of the first document.
data[0][:10000]

"     Societal responsibility and central bank independence   Keynote speech by Isabel Schnabel, Member of the Executive Board of the ECB, at the  VIII. New Paradigm Workshop , organised by the Forum New Economy Frankfurt am Main, 27 May 2021 Central banking in times of shifting societal concerns The best contribution that central banks can make to economic prosperity is to maintain stable prices: this was the broad consensus among academic scholars and policymakers emerging in the late 1970s when inflation in many advanced economies had surged to double-digit levels, thereby eroding purchasing power and hitting the poorest in society the hardest .    This underpins the large degree of political independence that most central banks enjoy, including the ECB, which consistently ranks as one of the most independent central banks in the world .     Index values in Bodea and Hicks  refer to 2014 ; index values in Garriga  refer to 2012. The values correspond to the unweighted indices of cen

In [18]:
from bertopic import BERTopic

# Configure and fit the topic model on the speech corpus.
# The first value returned by fit_transform is kept as `topics`; the second is
# discarded.
topic_model = BERTopic(
    embedding_model="nli-roberta-base-v2",
    min_topic_size=10,
    nr_topics="auto",
)
topics, _ = topic_model.fit_transform(data)


100%|██████████| 459M/459M [01:12<00:00, 6.29MB/s]
You try to use a model that was created with version 1.1.0, however, your version is 0.4.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [19]:
# Summary table of the fitted topics: id, document count and generated name.
# Topic -1 is BERTopic's bucket for outlier documents.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,529,-1_to_of_financial_have
1,0,354,0_policy_and_monetary_financial
2,1,99,1_to_we_our_have
3,2,75,2_inflation_euro_and_policy
4,3,64,3_banks_are_for_banking
5,4,63,4_european_union_financial_euro
6,5,52,5_financial_as_have_crisis
7,6,49,6_auf_das_eine_nicht
8,7,40,7_payments_payment_retail_digital
9,8,26,8_euro_banknotes_is_it


In [20]:
# Top terms for topic 0 as (word, score) pairs.
topic_model.get_topic(0)

[('policy', 0.018708723953169762),
 ('and', 0.017974333625516887),
 ('monetary', 0.015736509567583684),
 ('financial', 0.015319727501085945),
 ('of', 0.015290084397929163),
 ('it', 0.011865278685787416),
 ('banks', 0.011140237804325446),
 ('market', 0.01062877788927638),
 ('inflation', 0.010470659220627522),
 ('bank', 0.010216841090904434)]

In [None]:
# Render BERTopic's built-in topic visualization for the fitted model.
topic_model.visualize_topics()