In [1]:
import html
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:

# Load the speeches table (one row per speech, with the sentiment columns
# shown in the preview below) and display its first five rows.
speeches = pd.read_csv(filepath_or_buffer='./ecb_with_sentiment_bert.csv')
speeches.head(n=5)



Unnamed: 0.1,Unnamed: 0,date,speakers,title,subtitle,contents,mean,std,pos_mean,pos_std,neu_mean,neu_std,neg_mean,neg_std,pos_np,neu_np,neg_np
0,0,2021-05-27,Isabel Schnabel,Societal responsibility and central bank indep...,"Keynote speech by Isabel Schnabel, Member of t...",Societal responsibility and central bank inde...,-0.029951,0.400268,0.163618,0.23412,0.642814,0.305841,0.193568,0.268448,[0.10956815 0.35029936 0.20687319 0.01825724 0...,[0.87274659 0.59361166 0.7798577 0.93721557 0...,[0.01768526 0.05608896 0.01326916 0.04452714 0...
1,1,2021-05-27,Luis de Guindos,Climate change and financial integration,"Keynote speech by Luis de Guindos, Vice-Presid...",Climate change and financial integration Keyn...,0.061016,0.530413,0.256416,0.326693,0.548183,0.369853,0.1954,0.319901,[0.09710149 0.02649051 0.04752919 0.82506913 0...,[0.88666415 0.15776695 0.26347992 0.16767293 0...,[0.01623432 0.81574255 0.68899089 0.00725802 0...
2,3,2021-05-19,Fabio Panetta,At the edge of tomorrow: preparing the future ...,"Introductory remarks by Fabio Panetta, Member ...",At the edge of tomorrow: preparing the future...,0.26995,0.425732,0.34774,0.300367,0.57447,0.314054,0.07779,0.222976,[0.20408778 0.20069633 0.8980974 0.28038397 0...,[0.78265643 0.788921 0.09119311 0.71117795 0...,[0.01325578 0.01038271 0.01070952 0.00843802 0...
3,4,2021-05-06,Christine Lagarde,Towards a green capital markets union for Europe,"Speech by Christine Lagarde, President of the ...",Towards a green capital markets union for Eur...,0.323093,0.428089,0.397903,0.310706,0.527286,0.306852,0.074811,0.205355,[0.4326309 0.0224579 0.0526524 0.08850522 0...,[0.55388117 0.28344294 0.91742045 0.89186609 0...,[0.01348789 0.69409919 0.02992714 0.01962868 0...
4,6,2021-04-29,Frank Elderson,All the way to zero: guiding banks towards a c...,"Keynote speech by Frank Elderson, Vice-Chair o...",All the way to zero: guiding banks towards a ...,0.209064,0.397128,0.291181,0.291513,0.626703,0.303493,0.082117,0.199824,[0.5190323 0.66055119 0.5339638 0.76919842 0...,[0.46750572 0.33069667 0.44952407 0.22248527 0...,[0.01346206 0.00875203 0.01651208 0.0083163 0...


In [3]:
# Number of speeches (rows) loaded.
print(speeches.shape[0])

2266


In [4]:
import nltk
from nltk import tokenize

# Sentence-tokenizer data used by tokenize.sent_tokenize.
# Older NLTK releases load the pickled 'punkt' package, while recent releases
# load 'punkt_tab' instead; fetching both keeps the notebook runnable on either.
# nltk.download is a no-op for packages that are already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')

# pre-processing functions

def preprocess(speech):
    """Split the raw text of a speech into sentences.

    Args:
        speech: Full text of one speech.

    Returns:
        The list of sentences produced by NLTK's ``sent_tokenize``.
    """
    sentences = tokenize.sent_tokenize(speech)
    return sentences

def remove_neutral(pos, neu, neg):
    """Collapse per-segment sentiment scores into one signed score each.

    ``pos`` and ``neg`` are the string form of 1-D float arrays, e.g.
    ``"[0.1 0.35 0.2]"`` (line breaks inside the brackets are fine). The
    result is ``pos - neg`` element-wise: positive values lean positive,
    negative values lean negative.

    Args:
        pos: Stringified array of positive scores.
        neu: Stringified array of neutral scores. Accepted for interface
            compatibility only; it does not take part in the result and is
            therefore not parsed.
        neg: Stringified array of negative scores.

    Returns:
        ``numpy.ndarray`` of ``pos - neg``.
    """
    # Strip surrounding whitespace and brackets explicitly rather than slicing
    # [1:-1], which silently drops real characters if the string is padded.
    pos_scores = np.fromstring(pos.strip().strip('[]').strip(), sep=' ')
    neg_scores = np.fromstring(neg.strip().strip('[]').strip(), sep=' ')
    return pos_scores - neg_scores

def get_rgb(score):
    """Map a signed sentiment score to the red and green channels of a colour.

    The blue channel is supplied by the caller. A score of 0 gives yellow
    (255, 255); negative scores fade green out towards pure red; positive
    scores fade red out towards pure green.

    Args:
        score: Sentiment score, nominally in [-1, 1]. Values outside that
            range are clamped; NaN is treated as neutral.

    Returns:
        Tuple ``(r, g)`` of ints in 0..255.
    """
    # NaN is the only value that is not equal to itself; without this guard it
    # would match none of the comparisons below.
    if score != score:
        return 255, 255

    # Clamp so the channel values can never leave the 0..255 range.
    score = max(-1.0, min(1.0, float(score)))

    if score < 0:
        return 255, round((score + 1) * 255)
    if score > 0:
        return round((1 - score) * 255), 255
    return 255, 255

def get_span(text,r,g):
    """Wrap ``text`` in a ``<span>`` whose background colour is ``rgb(r, g, 0)``.

    Args:
        text: Plain text to display; it is HTML-escaped so characters such as
            ``<``, ``>`` and ``&`` are shown literally instead of being parsed
            as markup.
        r: Red channel value.
        g: Green channel value.

    Returns:
        The HTML fragment as a string.
    """
    safe_text = html.escape(str(text))
    return f'<span style="background-color: rgb({r},{g},0)">{safe_text}</span>'

def show_sentiment(speech, scores, title, subtitle, speaker):
    """Render one speech as a standalone HTML page with coloured segments.

    Each segment is wrapped in a ``<span>`` (via ``get_span``) whose
    background colour is derived from its score (via ``get_rgb``).

    Args:
        speech: Sequence of text segments, in reading order.
        scores: Sequence of sentiment scores, one per segment.
        title: Speech title, shown as ``<h1>``.
        subtitle: Speech subtitle, shown as ``<h2>``.
        speaker: Speaker name, shown as ``<h3>``.

    Returns:
        The complete HTML document as a string.

    Raises:
        AssertionError: If ``speech`` and ``scores`` differ in length.
    """
    assert len(speech) == len(scores), (
        f"got {len(speech)} text segments but {len(scores)} scores")

    # Header fields are free text from the data set: escape them so characters
    # like '&' or '<' cannot break the markup. str() keeps non-string values
    # (e.g. a missing subtitle read as NaN) rendering as they did before.
    title, subtitle, speaker = (
        html.escape(str(field)) for field in (title, subtitle, speaker))

    # The charset declaration matches the UTF-8 encoding the notebook uses when
    # writing the pages, so non-ASCII characters display correctly.
    html_top = f"""
    <html>
    <head><meta charset="utf-8"></head>
    <body>
    <h1> {title} </h1>
    <h2> {subtitle} </h2>
    <h3> Speaker: {speaker} </h3>

    """
    html_bottom = """
    </body>
    </html>
    """

    # Build every span first and join once instead of growing a string in a
    # loop. The newline separator renders as a single space in the browser, so
    # consecutive segments no longer run into each other.
    spans = [
        get_span(segment, *get_rgb(score))
        for segment, score in zip(speech, scores)
    ]
    html_middle = "\n".join(spans)

    return html_top + html_middle + html_bottom
   
count = 0
def html_sentiment(speech):
    """Build the sentiment-coloured HTML page for one row of the speeches table.

    Reads the ``contents``, ``pos_np``, ``neu_np``, ``neg_np``, ``title``,
    ``subtitle`` and ``speakers`` fields of ``speech``. The text is split into
    sentences, the stored scores are reduced to one signed value per sentence,
    and both are handed to ``show_sentiment``.

    Side effect: increments the module-level ``count`` once per call.

    Args:
        speech: Row-like mapping providing the fields listed above.

    Returns:
        The HTML document string returned by ``show_sentiment``.
    """
    global count
    count = count + 1

    sentences = preprocess(speech["contents"])
    sentence_scores = remove_neutral(
        speech['pos_np'], speech['neu_np'], speech['neg_np'])

    return show_sentiment(
        sentences,
        sentence_scores,
        speech["title"],
        speech["subtitle"],
        speech["speakers"],
    )


def apply_and_concat(dataframe, func, column_names):
    """Append columns computed row by row to a copy of ``dataframe``.

    ``func`` is called once per row and must return a sequence with one value
    per entry in ``column_names``; those values become new columns placed to
    the right of the existing ones.

    Args:
        dataframe: Input ``pandas.DataFrame``; it is not modified.
        func: Callable taking a row (``pandas.Series``) and returning the new
            values for that row.
        column_names: Names of the new columns, in the order ``func`` returns
            their values.

    Returns:
        A new ``DataFrame`` with the original and the computed columns.
    """
    # axis=1 makes apply iterate over rows; without it pandas passes each
    # *column* to func, which is not what the row-wise lambda expects.
    new_columns = dataframe.apply(
        lambda row: pd.Series(func(row), index=column_names), axis=1)
    return pd.concat((dataframe, new_columns), axis=1)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\B\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Render every speech to an HTML page and keep it in a new 'html' column.
speeches['html'] = speeches.apply(html_sentiment, axis=1)

In [6]:
# Write one HTML file per speech, named <date>_<speaker>_<row position>.html.
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError on the first open().
os.makedirs('html', exist_ok=True)

# Iterate the three needed columns directly instead of materialising
# speeches.iloc[i] several times per row.
for i, (date, speaker, page) in enumerate(
        zip(speeches['date'], speeches['speakers'], speeches['html'])):
    with open(f'html/{date}_{speaker}_{i}.html', 'w', encoding='utf-8') as fh:
        fh.write(page)