In [1]:
import pandas as pd
import os

import numpy as np
import tqdm

import gensim
from gensim import models
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
import random

from pprint import pprint
import pickle 


import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer



from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS as stop_words

import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context



nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixwong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/felixwong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /Users/felixwong/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/felixwong/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/felixwong/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/felixwong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:

speeches = pd.read_csv('./all_ECB_speeches.csv', delimiter='|')
speeches.head()


Unnamed: 0,date,speakers,title,subtitle,contents
0,2021-05-27,Isabel Schnabel,Societal responsibility and central bank indep...,"Keynote speech by Isabel Schnabel, Member of t...",SPEECH Societal responsibility and central...
1,2021-05-27,Luis de Guindos,Climate change and financial integration,"Keynote speech by Luis de Guindos, Vice-Presid...",SPEECH Climate change and financial integr...
2,2021-05-25,Philip R. Lane,The ECB strategy review,"Presentation by Philip R. Lane, Member of the ...",
3,2021-05-19,Fabio Panetta,At the edge of tomorrow: preparing the future ...,"Introductory remarks by Fabio Panetta, Member ...",SPEECH At the edge of tomorrow: preparing ...
4,2021-05-06,Christine Lagarde,Towards a green capital markets union for Europe,"Speech by Christine Lagarde, President of the ...",SPEECH Towards a green capital markets uni...


In [3]:

sentiment_score = pd.read_csv('./reprocessed_financial_dictionary.csv')

sentiment_score['score']*=5
sentiment_score.head()


Unnamed: 0,word,score
0,able,4
1,abnormally,-4
2,abrupt,-4
3,absorb,4
4,absorbed,4


In [4]:

#Remove NA entries
speeches = speeches.dropna()

#Only get presidential speeches
# speeches = speeches.loc[speeches.subtitle.str.contains("\sPresident\s"),:]


#Regex cleaning
speeches['contents'] = speeches['contents'].replace('SPEECH', '', regex=True)
speeches['contents'] = speeches['contents'].replace('\((.*?)\)', '', regex=True)
speeches['contents'] = speeches['contents'].replace('\[(.*?)\]', '', regex=True)
speeches['contents'] = speeches['contents'].replace('Note.*?\.', '', regex=True)
speeches['contents'] = speeches['contents'].replace('Chart .*?\..*?\.', '', regex=True)


In [5]:
sentiment_score_dict = dict(zip(sentiment_score.word, sentiment_score.score))

In [6]:

sid = SentimentIntensityAnalyzer()

# sid.lexicon.clear()
sid.lexicon.update(sentiment_score_dict)
print(len(sid.lexicon.keys()))
print(len(sentiment_score_dict))


391
391


In [7]:

from nltk import tokenize

# pre-processing functions

def preprocess(speech):
    return tokenize.sent_tokenize(speech)


# https://blog.quantinsti.com/vader-sentiment/
# tried president only (same)
# removed neutral sentences (same)
# fss alternative index: #neg sent - #pos sent / total
def analyze_sentiment(speech):
    sentiment_score = []
    pos_score = []
    neu_score = []
    neg_score = []

    print(f"Number of Sentences: {len(speech)}")
    for sentence in speech:
        ss = sid.polarity_scores(sentence)
        sentiment_score.append(ss['compound'])
        pos_score.append(ss['pos'])
        neu_score.append(ss['neu'])
        neg_score.append(ss['neg'])

    
    numpy_sentiment_score = np.array(sentiment_score)

    numpy_pos_score = np.array(pos_score)

    numpy_neu_score = np.array(neu_score)

    numpy_neg_score = np.array(neg_score)
    # print(np.array(pos_score).mean(),np.array(neu_score).mean(), np.array(neg_score).mean())
    mean = numpy_sentiment_score.mean()
    std = numpy_sentiment_score.std()
    pos_mean = numpy_pos_score.mean()
    pos_std = numpy_pos_score.std()
    neu_mean = numpy_neu_score.mean()
    neu_std = numpy_neu_score.std()
    neg_mean = numpy_neg_score.mean()
    neg_std = numpy_neg_score.std()


    # fss = pos_mean / (pos_mean + neg_mean) - 0.5
    # fss = ( pos_mean + 1 ) / ( neg_mean + 1 ) - 1

    return mean, std, pos_mean, pos_std, neu_mean, neu_std, neg_mean, neg_std, numpy_pos_score, numpy_neu_score,numpy_neg_score
   

count = 0
def complete_sentiment(speech):
    global count
    count +=1
    print(f"Document processed: {count}")
    tokenized_speeches = preprocess(speech)
    outputs = analyze_sentiment(tokenized_speeches)
    return outputs



In [8]:
def apply_and_concat(dataframe, field, func, column_names):
    return pd.concat((
        dataframe,
        dataframe[field].apply(
            lambda cell: pd.Series(func(cell), index=column_names))), axis=1)

# speeches['mean'], speeches['std'] = speeches.apply(lambda speech: sentiment_analysis(speech.contents), axis=1)


In [9]:
ecb_with_sentiment = apply_and_concat(speeches, 'contents', complete_sentiment,  ['mean', 'std', 'pos_mean','pos_std','neu_mean','neu_std','neg_mean','neg_std',"pos_np","neu_np","neg_np"])

Document processed: 1
Number of Sentences: 151
Document processed: 2
Number of Sentences: 93
Document processed: 3
Number of Sentences: 45
Document processed: 4
Number of Sentences: 93
Document processed: 5
Number of Sentences: 127
Document processed: 6
Number of Sentences: 160
Document processed: 7
Number of Sentences: 200
Document processed: 8
Number of Sentences: 120
Document processed: 9
Number of Sentences: 53
Document processed: 10
Number of Sentences: 63
Document processed: 11
Number of Sentences: 28
Document processed: 12
Number of Sentences: 181
Document processed: 13
Number of Sentences: 58
Document processed: 14
Number of Sentences: 63
Document processed: 15
Number of Sentences: 114
Document processed: 16
Number of Sentences: 69
Document processed: 17
Number of Sentences: 136
Document processed: 18
Number of Sentences: 91
Document processed: 19
Number of Sentences: 61
Document processed: 20
Number of Sentences: 186
Document processed: 21
Number of Sentences: 137
Document pro

  mean = numpy_sentiment_score.mean()
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  pos_mean = numpy_pos_score.mean()
  neu_mean = numpy_neu_score.mean()
  neg_mean = numpy_neg_score.mean()


Document processed: 1962
Number of Sentences: 193
Document processed: 1963
Number of Sentences: 140
Document processed: 1964
Number of Sentences: 136
Document processed: 1965
Number of Sentences: 156
Document processed: 1966
Number of Sentences: 143
Document processed: 1967
Number of Sentences: 67
Document processed: 1968
Number of Sentences: 252
Document processed: 1969
Number of Sentences: 90
Document processed: 1970
Number of Sentences: 137
Document processed: 1971
Number of Sentences: 111
Document processed: 1972
Number of Sentences: 384
Document processed: 1973
Number of Sentences: 215
Document processed: 1974
Number of Sentences: 350
Document processed: 1975
Number of Sentences: 35
Document processed: 1976
Number of Sentences: 146
Document processed: 1977
Number of Sentences: 81
Document processed: 1978
Number of Sentences: 93
Document processed: 1979
Number of Sentences: 88
Document processed: 1980
Number of Sentences: 177
Document processed: 1981
Number of Sentences: 103
Docume

In [10]:
ecb_with_sentiment.columns

Index(['date', 'speakers', 'title', 'subtitle', 'contents', 'mean', 'std',
       'pos_mean', 'pos_std', 'neu_mean', 'neu_std', 'neg_mean', 'neg_std',
       'pos_np', 'neu_np', 'neg_np'],
      dtype='object')

In [11]:
len(ecb_with_sentiment)

2460

In [12]:
ecb_with_sentiment.iloc[0]

date                                               2021-05-27
speakers                                      Isabel Schnabel
title       Societal responsibility and central bank indep...
subtitle    Keynote speech by Isabel Schnabel, Member of t...
contents         Societal responsibility and central bank ...
mean                                                -0.070535
std                                                  0.368793
pos_mean                                             0.021238
pos_std                                              0.065615
neu_mean                                             0.942669
neu_std                                               0.10122
neg_mean                                             0.036079
neg_std                                              0.077886
pos_np      [0.0, 0.056, 0.154, 0.0, 0.0, 0.0, 0.0, 0.116,...
neu_np      [1.0, 0.831, 0.846, 1.0, 1.0, 1.0, 1.0, 0.767,...
neg_np      [0.0, 0.112, 0.0, 0.0, 0.0, 0.0, 0.0, 0.116, 0...
Name: 0,

In [13]:
ecb_with_sentiment.to_csv('./ecb_with_sentiment_vader.csv')

In [14]:
ecb_with_sentiment.tail()

Unnamed: 0,date,speakers,title,subtitle,contents,mean,std,pos_mean,pos_std,neu_mean,neu_std,neg_mean,neg_std,pos_np,neu_np,neg_np
2483,1997-05-13,Alexandre Lamfalussy,The European Central Bank: independent and acc...,Keynote speech delivered by Alexandre Lamfalus...,The European Central Bank: independent and a...,-0.011675,0.363651,0.021365,0.055848,0.950729,0.090324,0.027906,0.074802,"[0.0, 0.0, 0.0, 0.171, 0.0, 0.0, 0.0, 0.0, 0.0...","[1.0, 1.0, 1.0, 0.829, 1.0, 1.0, 1.0, 1.0, 1.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2484,1997-04-30,Alexandre Lamfalussy,The operation of monetary policy in stage thre...,"Address by Alexandre Lamfalussy, President of ...",The operation of monetary policy in stage th...,0.028192,0.282545,0.016128,0.050456,0.974353,0.064954,0.009519,0.0409,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2485,1997-04-22,Alexandre Lamfalussy,Convergence and the role of the European Centr...,"Remarks by Alexandre Lamfalussy, President of ...",Convergence and the role of the European Cen...,-0.04522,0.424089,0.024232,0.063632,0.940212,0.097647,0.035556,0.083802,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.951, 1.0, 1.0, 0.63, 1.0, 0.5, 0.875, 0.868...","[0.049, 0.0, 0.0, 0.37, 0.0, 0.5, 0.125, 0.132..."
2486,1997-03-10,Alexandre Lamfalussy,Securing the benefits of EMU,"Address by Alexandre Lamfalussy, President of ...",Securing the benefits of EMU Address by Al...,-0.085393,0.446717,0.028227,0.067423,0.917091,0.121311,0.054667,0.102693,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2487,1997-02-07,Alexandre Lamfalussy,Conference organised by the Hungarian Banking ...,"Address by Alexandre Lamfalussy, President of ...",Conference organised by the Hungarian Bankin...,-0.049138,0.376883,0.025006,0.077402,0.934747,0.127894,0.040241,0.100764,"[0.0, 0.208, 0.233, 0.0, 0.0, 0.057, 0.0, 0.0,...","[1.0, 0.792, 0.767, 1.0, 1.0, 0.943, 1.0, 1.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
