In [1]:
# https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [2]:
import pandas as pd
import os

# Load the pipe-delimited ECB speeches export. Malformed rows are skipped with a
# warning. `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, so
# `on_bad_lines` is tried first and the legacy keyword is only a fallback.
try:
    speeches = pd.read_csv('./all_ECB_speeches.csv', delimiter='|', on_bad_lines='warn')
except TypeError:  # older pandas does not accept `on_bad_lines`
    speeches = pd.read_csv('./all_ECB_speeches.csv', delimiter='|', error_bad_lines=False)
speeches.head()

Unnamed: 0,date,speakers,title,subtitle,contents
0,2021-05-27,Isabel Schnabel,Societal responsibility and central bank indep...,"Keynote speech by Isabel Schnabel, Member of t...",SPEECH Societal responsibility and central...
1,2021-05-27,Luis de Guindos,Climate change and financial integration,"Keynote speech by Luis de Guindos, Vice-Presid...",SPEECH Climate change and financial integr...
2,2021-05-25,Philip R. Lane,The ECB strategy review,"Presentation by Philip R. Lane, Member of the ...",
3,2021-05-19,Fabio Panetta,At the edge of tomorrow: preparing the future ...,"Introductory remarks by Fabio Panetta, Member ...",SPEECH At the edge of tomorrow: preparing ...
4,2021-05-06,Christine Lagarde,Towards a green capital markets union for Europe,"Speech by Christine Lagarde, President of the ...",SPEECH Towards a green capital markets uni...


In [3]:
# Look at the last row of the frame to see the other end of the date range.
speeches.iloc[-1]

date                                               1997-02-07
speakers                                 Alexandre Lamfalussy
title       Conference organised by the Hungarian Banking ...
subtitle    Address by Alexandre Lamfalussy, President of ...
contents      Conference organised by the Hungarian Bankin...
Name: 2487, dtype: object

In [4]:
# List the column names of the loaded frame.
speeches.columns

Index(['date', 'speakers', 'title', 'subtitle', 'contents'], dtype='object')

In [5]:
# Keep only the speech text; the metadata columns (date, speakers, title,
# subtitle) are discarded here.
sample_size = 10
num_topics = 5

# speeches = speeches.drop(columns=['date', 'speakers', 'title', 'subtitle'], axis=1).sample(sample_size)

# Selecting `contents` (instead of dropping the other columns) keeps this cell
# re-runnable: a second run no longer fails on already-removed columns.
# `.copy()` makes the result an independent frame so later column assignments
# do not write into a slice of the original data.
speeches = speeches[['contents']].dropna().iloc[0:3].copy()
speeches.head()

Unnamed: 0,contents
0,SPEECH Societal responsibility and central...
1,SPEECH Climate change and financial integr...
3,SPEECH At the edge of tomorrow: preparing ...


In [6]:


# Preprocessing: strip boilerplate and punctuation from the speech text.
# Remove punctuation
# speeches['contents'] = speeches['contents'].replace('[,\.!?]', '', regex=True)

# (pattern, replacement) pairs, applied strictly in this order. Raw strings are
# used so the regex escapes reach the regex engine unchanged and Python does
# not warn about invalid escape sequences.
CLEANUP_PATTERNS = [
    (r'SPEECH', ''),            # the literal marker "SPEECH"
    (r'\((.*?)\)', ''),         # parenthesised text
    (r'\[(.*?)\]', ''),         # bracketed text
    (r'Note.*?\.', ''),         # from "Note" up to the next period
    (r'Chart .*?\..*?\.', ''),  # from "Chart " through the second following period
    (r'[,\.!?]', ''),           # basic punctuation
    (r'[^\x00-\x7F]+', ' '),    # each run of non-ASCII characters -> one space
]

contents_clean = speeches['contents']
for pattern, replacement in CLEANUP_PATTERNS:
    contents_clean = contents_clean.replace(pattern, replacement, regex=True)

# assign() returns a new frame rather than writing into a possible slice.
speeches = speeches.assign(contents=contents_clean)



In [7]:
# from wordcloud import WordCloud
# long_string = ','.join(list(speeches['contents']))
# wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
# wordcloud.generate(long_string)
# wordcloud.to_image()

In [8]:
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS as stop_words

# Fetch the NLTK resources this notebook relies on, in a fixed order.
for resource in ('stopwords', 'wordnet', 'words', 'averaged_perceptron_tagger'):
    nltk.download(resource)
# stop_words = stopwords.words('english')

# Shared helpers used by the preprocessing functions below.
lemmatizer = WordNetLemmatizer()
words = set(nltk.corpus.words.words())

def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is coerced to ``str`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True`` (which strips accent marks
    from tokens). One token list is yielded per input item.
    """
    token_lists = (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
    yield from token_lists

def remove_non_english(texts):
    """Drop alphabetic tokens that are absent from the module-level ``words`` set.

    Each document is re-joined with spaces and re-split with
    ``nltk.wordpunct_tokenize``. Tokens that are not purely alphabetic are
    always kept; alphabetic tokens are kept only if their lowercase form is in
    ``words``.
    """
    cleaned_docs = []
    for doc in texts:
        kept = []
        for token in nltk.wordpunct_tokenize(" ".join(doc)):
            if not token.isalpha() or token.lower() in words:
                kept.append(token)
        cleaned_docs.append(kept)
    return cleaned_docs

def remove_stopwords(texts):
    """Remove stop words from every document.

    Each document is converted with ``str()`` and re-tokenized by
    ``simple_preprocess``; the resulting tokens are kept unless they appear in
    the module-level ``stop_words`` collection.
    """
    filtered_docs = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        filtered_docs.append([tok for tok in tokens if tok not in stop_words])
    return filtered_docs

def lemmatize(texts):
    """Lemmatize every token of every document with the shared ``lemmatizer``."""
    to_lemma = lemmatizer.lemmatize
    return [list(map(to_lemma, doc)) for doc in texts]



# Pipeline: tokenize -> drop non-English tokens -> lemmatize -> drop stop words.

# data = speeches.contents.values.tolist()
data = [speeches.iloc[2].contents]  # only the row at position 2 is processed

data_words = list(sent_to_words(data))
for stage in (remove_non_english, lemmatize, remove_stopwords):
    data_words = stage(data_words)

# Score candidate bigrams from the processed documents by PMI.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_documents(data_words)
# # Filter only those that occur at least 50 times
# finder.apply_freq_filter(50)
bigram_scores = finder.score_ngrams(bigram_measures.pmi)

# trigram_measures = nltk.collocations.TrigramAssocMeasures()
# finder = nltk.collocations.TrigramCollocationFinder.from_documents(data_words)

# trigram_scores = finder.score_ngrams(trigram_measures.pmi)


# One row per bigram, highest PMI first.
bigram_pmi = pd.DataFrame(bigram_scores)
bigram_pmi.columns = ['bigram', 'pmi']
bigram_pmi = bigram_pmi.sort_values('pmi', ascending=False)
# trigram_pmi = pd.DataFrame(trigram_scores)
# trigram_pmi.columns = ['trigram', 'pmi']
# trigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)


# Filter for bigrams with noun-type structures
def bigram_filter(bigram):
    """Return True when ``bigram`` should be kept as a collocation.

    A bigram is kept when all of the following hold:
      * its first token is tagged ``JJ``/``NN`` or its second token is tagged
        ``NN`` (tags come from ``nltk.pos_tag``),
      * neither of its first two tokens is in ``stop_words``,
      * it contains neither the token ``'n'`` nor ``'t'``,
      * it does not contain the token ``'PRON'``.

    NOTE(review): the POS test rejects a bigram only when *both* tokens fail
    their tag check; confirm this lenient reading is the intended one.
    """
    tags = nltk.pos_tag(bigram)
    return (
        (tags[0][1] in ('JJ', 'NN') or tags[1][1] in ('NN',))
        and not (bigram[0] in stop_words or bigram[1] in stop_words)
        and 'n' not in bigram
        and 't' not in bigram
        and 'PRON' not in bigram
    )
# # Filter for trigrams with only noun-type structures
# def trigram_filter(trigram):
#     tag = nltk.pos_tag(trigram)
#     if tag[0][1] not in ['JJ', 'NN'] and tag[1][1] not in ['JJ','NN']:
#         return False
#     if trigram[0] in stop_words or trigram[-1] in stop_words or trigram[1] in stop_words:
#         return False
#     if 'n' in trigram or 't' in trigram:
#          return False
#     if 'PRON' in trigram:
#         return False
#     return True 

# Keep the first 500 rows of bigram_pmi that pass bigram_filter and have PMI > 5.
passes_filter = bigram_pmi.apply(
    lambda row: bigram_filter(row['bigram']) and row.pmi > 5,
    axis=1,
)
filtered_bigram = bigram_pmi[passes_filter][:500]

# filtered_trigram = trigram_pmi[trigram_pmi.apply(lambda trigram: \
#                                                  trigram_filter(trigram['trigram'])\
#                                                  and trigram.pmi > 5, axis = 1)][:500]

print(filtered_bigram)

# Join each surviving pair with "_" as long as at least one token has more
# than two characters.
bigrams = [
    '_'.join(pair)
    for pair in filtered_bigram.bigram.values
    if len(pair[0]) > 2 or len(pair[1]) > 2
]
# trigrams = [' '.join(x) for x in filtered_trigram.trigram.values if len(x[0]) > 2 or len(x[1]) > 2 and len(x[2]) > 2]

print(bigrams)

# print(trigram_pmi.head())

# Concatenate n-grams
def replace_ngram(x, ngrams=None):
    """Return the tokens of ``x`` followed by the bigrams that occur in ``x``.

    Args:
        x: One document as a sequence of tokens.
        ngrams: Candidate bigrams written as ``"first_second"``. Defaults to
            the module-level ``bigrams`` list.

    Returns:
        A new list: the original tokens, then every candidate bigram (in
        candidate order, once each) whose two tokens appear next to each other
        in ``x``.

    Previously the full candidate list was appended to every document, so a
    document could receive bigrams it does not contain once more than one
    document is processed.
    """
    if ngrams is None:
        ngrams = bigrams
    # for gram in trigrams:
    #     x = " ".join(x).replace(gram, '_'.join(gram.split())).split()

    tokens = list(x)
    # All adjacent token pairs of this document, in the same "a_b" form.
    adjacent_pairs = {'_'.join(pair) for pair in zip(tokens, tokens[1:])}
    return tokens + [gram for gram in ngrams if gram in adjacent_pairs]

# Extend every document with its bigram tokens.
data_words = list(map(replace_ngram, data_words))

# print(data_words[:1][0][:30])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\felix\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\felix\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\felix\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\felix\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
                       bigram       pmi
0        (accelerated, trend)  8.189825
54           (money, privacy)  8.189825
62        (play, operational)  8.189825
60        (overly, dependent)  8.189825
58       (operational, taken)  8.189825
..                        ...       ...
198           (work, ongoing)  5.189825

In [9]:
# Spot check: is the token "wir" part of the `words` vocabulary that
# remove_non_english uses to decide which tokens to keep?
print("wir" in words)

True


In [10]:
# Preview the first 100 tokens of the first processed document.
print(data_words[:1][0][:100])

['edge', 'tomorrow', 'future', 'retail', 'introductory', 'member', 'executive', 'board', 'th', 'payment', 'forum', 'bank', 'main', 'thank', 'inviting', 'speak', 'fourteenth', 'edition', 'forum', 'bank', 'finland', 'seen', 'forerunner', 'development', 'retail', 'certainly', 'come', 'effect', 'despite', 'ongoing', 'pandemic', 'come', 'ecosystem', 'today', 'like', 'discus', 'promote', 'digital', 'instant', 'retail', 'payment', 'used', 'digital', 'indisputably', 'come', 'long', 'way', 'retail', 'landscape', 'natural', 'starting', 'retail', 'strategy', 'simply', 'way', 'pay', 'increasingly', 'paying', 'pandemic', 'accelerated', 'trend', 'facing', 'positioned', 'wait', 'attitude', 'past', 'overly', 'dependent', 'foreign', 'card', 'resulting', 'high', 'degree', 'market', 'concentration', 'increase', 'choice', 'resilience', 'competitiveness', 'ecosystem', 'stimulate', 'competition', 'innovative', 'grown', 'payment', 'reason', 'key', 'priority', 'retail', 'strategy', 'development', 'payment', '

In [11]:
# from sklearn.feature_extraction.text import CountVectorizer

# from sklearn.decomposition import LatentDirichletAllocation

# from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

# vect = CountVectorizer(ngram_range=(1,1), stop_words='english')

# from functools import reduce

# import numpy as np

# docs = []

# for doc in data_words:
#     docs.append(" ".join(doc))

# fin = vect.fit_transform(docs)



# lda = LatentDirichletAllocation(n_components=num_topics)

# lda.fit_transform(fin)

# lda_dtf=lda.fit_transform(fin)

 

# sorting=np.argsort(lda.components_)[:,::-1]

# features=np.array(vect.get_feature_names())

In [12]:
# NOTE: this scikit-learn visualisation cell is disabled. It needs `lda`, `fin`
# and `vect`, which are only created by the commented-out scikit-learn cell
# above, and it contained a stray pasted fragment (`lda_model, corpus, id2word)`)
# that made the whole cell a syntax error. The gensim-based visualisation cell
# further down produces the same artefacts from `lda_model`.

# import pickle
# import pyLDAvis
# import pyLDAvis.sklearn

# # Visualize the topics
# pyLDAvis.enable_notebook()
# LDAvis_data_filepath = os.path.join('ldavis_prepared_'+str(num_topics))
# LDAvis_prepared = pyLDAvis.sklearn.prepare(lda, fin, vect)
# with open(LDAvis_data_filepath, 'wb') as f:
#     pickle.dump(LDAvis_prepared, f)

# # load the pre-prepared pyLDAvis data from disk
# with open(LDAvis_data_filepath, 'rb') as f:
#     LDAvis_prepared = pickle.load(f)
# pyLDAvis.save_html(LDAvis_prepared, 'ldavis_prepared_'+ str(num_topics) +'.html')
# LDAvis_prepared

In [15]:
import gensim.corpora as corpora

# The processed token lists are the input texts.
texts = data_words
# Map every distinct token to an integer id.
id2word = corpora.Dictionary(texts)
# Bag-of-words form of each document: a list of (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, texts))

# Show the first 30 (token_id, count) pairs of the first document.
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 3), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 1), (26, 4), (27, 1), (28, 1), (29, 1)]
1


IndexError: list index out of range

In [14]:
# coherence = []
# for k in range(5,25):
#     print('Round: '+str(k))
#     lda_model = gensim.models.LdaMulticore(corpus=corpus,
#                                        id2word=id2word,
#                                        num_topics=k, workers=10, chunksize = 10000)
    
#     cm = gensim.models.coherencemodel.CoherenceModel(model=lda_model, texts=data_words,\
#                                                      dictionary=id2word, coherence='c_v')
#     coherence.append((k,cm.get_coherence()))

In [15]:
# import matplotlib.pyplot as plt
# x_val = [x[0] for x in coherence]
# y_val = [x[1] for x in coherence]
# plt.plot(x_val,y_val)
# plt.scatter(x_val,y_val)
# plt.title('Number of Topics vs. Coherence')
# plt.xlabel('Number of Topics')
# plt.ylabel('Coherence')
# plt.xticks(x_val)
# plt.show()


In [16]:
from pprint import pprint

# Fixed seed so the stochastic LDA fit is repeatable between runs
# (as far as the multicore trainer allows).
LDA_RANDOM_STATE = 42

# Build LDA model with `num_topics` topics
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       workers=10,
                                       chunksize=10000,
                                       random_state=LDA_RANDOM_STATE)
# Print the top keywords of each fitted topic
pprint(lda_model.print_topics())
# Per-document topic distribution for the training corpus
doc_lda = lda_model[corpus]

[(0,
  '0.011*"retail" + 0.010*"instant" + 0.010*"payment" + 0.006*"digital" + '
  '0.006*"strategy" + 0.005*"card" + 0.005*"pan" + 0.005*"bank" + 0.005*"new" '
  '+ 0.005*"future"'),
 (1,
  '0.010*"payment" + 0.009*"retail" + 0.008*"instant" + 0.005*"strategy" + '
  '0.005*"digital" + 0.004*"card" + 0.004*"pan" + 0.004*"come" + 0.004*"cost" '
  '+ 0.004*"solution"'),
 (2,
  '0.019*"instant" + 0.016*"retail" + 0.015*"payment" + 0.011*"digital" + '
  '0.009*"strategy" + 0.007*"pan" + 0.007*"work" + 0.006*"card" + '
  '0.006*"reach" + 0.006*"electronic"'),
 (3,
  '0.013*"instant" + 0.013*"retail" + 0.012*"payment" + 0.009*"strategy" + '
  '0.008*"digital" + 0.006*"card" + 0.005*"cost" + 0.005*"solution" + '
  '0.005*"pan" + 0.005*"reach"'),
 (4,
  '0.023*"instant" + 0.018*"payment" + 0.018*"retail" + 0.009*"strategy" + '
  '0.008*"digital" + 0.008*"work" + 0.006*"card" + 0.006*"release" + '
  '0.006*"pan" + 0.006*"press"')]


In [17]:
import pickle

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Base name shared by the pickle file and the HTML export.
LDAvis_data_filepath = os.path.join('ldavis_prepared_'+str(num_topics))

# Build the visualisation data from the fitted model and store it on disk.
LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
with open(LDAvis_data_filepath, 'wb') as cache_file:
    pickle.dump(LDAvis_prepared, cache_file)

# Read the stored visualisation data back from disk.
with open(LDAvis_data_filepath, 'rb') as cache_file:
    LDAvis_prepared = pickle.load(cache_file)

# Also export a standalone HTML page next to the pickle file.
html_path = LDAvis_data_filepath + '.html'
pyLDAvis.save_html(LDAvis_prepared, html_path)
LDAvis_prepared

In [None]:
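# Reference (document summarization on CNN / Daily Mail, ROUGE-L metric):
# https://paperswithcode.com/sota/document-summarization-on-cnn-daily-mail?metric=ROUGE-L
#
# Open question: abstractive or extractive text summarization?
#
# Data exploration: the speeches span 1997-02-07 to 2021-05-27.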

