In [None]:
# define this if you have more than 1 gpu
import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, util
from torch.utils.data import DataLoader
import numpy as np
import pickle

In [None]:
def kl_divergence(p,q):
    return np.sum(p * (np.log2(p)-np.log2(q)))

def js_divergence(p,q):
    m = 0.5 * (p + q)
    return 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m)

def one_hot(p,q):
    return 1 if p==q else 0


def sigmoid(val):
    # return val
    return 1/(1+np.exp(-17*(val-0.5)))

def identity(val):
    return val

# js ensure symmetric
def similarity(p,q,mode="js", func=identity):
    if mode == "js":
        return func(np.exp2(-js_divergence(np.array(p),np.array(q))))
    elif mode == "kl":
        return func(np.exp2(-kl_divergence(np.array(p),np.array(q))))
    elif mode == "one-hot":
        return one_hot(p,q)

def get_random_index_pairs(num_data, amount):
    return np.random.randint(num_data, size=(amount, 2))


In [None]:
# flatten to one list for all 3
with open('train_data.pickle', 'rb') as file:
    train = pickle.load(file)

with open('gpt.pickle', 'rb') as file:
    gpt = pickle.load(file)
    
with open('gpt_p2.pickle', 'rb') as file:
    gpt2 = pickle.load(file)

gpt = [item for sublist in gpt for item in sublist]
gpt2 = [item for sublist in gpt2 for item in sublist]

mixed = gpt + gpt2
test = train

In [None]:
model = SentenceTransformer('./res/sbert_v2')

In [None]:

with open('evaluation.pickle', 'rb') as file:
    eval_dict = pickle.load(file)

In [None]:
classes = ["banking","valuation","household","real estate","corporate","external","sovereign","technology", "climate", "energy", "health", "eu"]

#cosine similarity
#Compute embedding for both lists
embedded_class_dictionary = {label: [] for label in classes}


for label in classes:
    for sentence in eval_dict[label]:
        embeddings = model.encode(sentence, convert_to_tensor=True)
        embedded_class_dictionary[label].append(embeddings)

  


In [None]:
import random
import torch
import math

def rescale(dist):
    beta = torch.mean(dist[math.ceil(0.25*len(dist)):math.floor(0.75*len(dist))])
    alpha = torch.max(torch.tensor([(10+1/(torch.std(dist)/torch.mean(dist))), 400]))
    return 1/(1+torch.exp(-alpha*(dist-beta)))

def query(text, examples=10):
    scores = []
    text_vector = model.encode(text, convert_to_tensor=True)
    for label in classes:
        if label != "eu":
            examples_list = random.sample(embedded_class_dictionary[label], k=examples)
        else:
            examples_list = embedded_class_dictionary[label]
        cosine_scores = torch.tensor([util.pytorch_cos_sim(text_vector,  example) for example in examples_list])
        scores.append(torch.mean(cosine_scores))
    # torch.nn.functional.softmax(torch.tensor(scores))
    scores = torch.tensor(scores)
    scores = scores/torch.sum(scores)
    scores = rescale(scores)
    scores = scores/torch.sum(scores)
    #softmax
    return {label:score for label, score in zip(classes,scores)}, np.array(scores)


In [None]:
sent1 = "In contrast to the radical forces buffeting valuations, for most companies, 2020 was a year of “strategy lockdown."
sent2 = "Climate change is a real thing."

u = model.encode(sent1)
v = model.encode(sent2)
util.pytorch_cos_sim(u,v)

tensor([[0.8185]])

In [None]:

query("In contrast to the radical forces buffeting valuations, for most companies, 2020 was a year of “strategy lockdown.")


({'banking': tensor(0.0018),
  'valuation': tensor(0.1601),
  'household': tensor(0.0124),
  'real estate': tensor(0.0516),
  'corporate': tensor(0.1664),
  'external': tensor(0.1011),
  'sovereign': tensor(0.0059),
  'technology': tensor(0.0579),
  'climate': tensor(0.1490),
  'energy': tensor(0.0793),
  'health': tensor(0.1463),
  'eu': tensor(0.0682)},
 array([0.0018129 , 0.16008356, 0.01243954, 0.05163125, 0.1663717 ,
        0.10113119, 0.00590704, 0.0579321 , 0.14897409, 0.07926115,
        0.14625244, 0.06820304], dtype=float32))

In [None]:
query("Mortgage interest rate in selected European countries as of 4th quarter of 2019 and 2020")

({'banking': tensor(0.0034),
  'valuation': tensor(0.1556),
  'household': tensor(0.1686),
  'real estate': tensor(0.2710),
  'corporate': tensor(0.0097),
  'external': tensor(0.0700),
  'sovereign': tensor(0.0062),
  'technology': tensor(0.0042),
  'climate': tensor(0.1854),
  'energy': tensor(0.0073),
  'health': tensor(0.1020),
  'eu': tensor(0.0167)},
 array([0.00338506, 0.15561585, 0.1686111 , 0.27097803, 0.00974991,
        0.06995411, 0.0061908 , 0.00415097, 0.18535496, 0.00727346,
        0.10199468, 0.01674107], dtype=float32))

In [None]:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support,top_k_accuracy_score
def compute_metrics(labels, preds):
    best = np.argmax(preds, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, best, average='macro')
    acc = accuracy_score(labels, best)
    top3 = top_k_accuracy_score(labels, preds ,k=3)
    top2 = top_k_accuracy_score(labels, preds ,k=2)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'top3': top3,
        'top2': top2
    }

In [None]:
from tqdm import tqdm
labels = []
preds = []

for item in tqdm(test):
    labels.append(np.argmax(np.array(item["dist"])))
    preds.append(query(item["text"])[1])
preds = np.array(preds)

100%|██████████| 261/261 [00:07<00:00, 33.09it/s]


In [None]:
assert len(labels) == len(preds)

In [None]:
compute_metrics(labels,preds)


{'accuracy': 0.7662835249042146,
 'f1': 0.7614132751514595,
 'precision': 0.7893137567453573,
 'recall': 0.7917546391230602,
 'top3': 0.9386973180076629,
 'top2': 0.8735632183908046}

In [None]:
import pandas as pd
import os

import numpy as np
import tqdm


import random

from pprint import pprint
import pickle 


from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS as stop_words

import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch 




In [None]:

speeches = pd.read_csv('./all_ECB_speeches.csv', delimiter='|')
speeches.head()


Unnamed: 0,date,speakers,title,subtitle,contents
0,2021-05-27,Isabel Schnabel,Societal responsibility and central bank indep...,"Keynote speech by Isabel Schnabel, Member of t...",SPEECH Societal responsibility and central...
1,2021-05-27,Luis de Guindos,Climate change and financial integration,"Keynote speech by Luis de Guindos, Vice-Presid...",SPEECH Climate change and financial integr...
2,2021-05-25,Philip R. Lane,The ECB strategy review,"Presentation by Philip R. Lane, Member of the ...",
3,2021-05-19,Fabio Panetta,At the edge of tomorrow: preparing the future ...,"Introductory remarks by Fabio Panetta, Member ...",SPEECH At the edge of tomorrow: preparing ...
4,2021-05-06,Christine Lagarde,Towards a green capital markets union for Europe,"Speech by Christine Lagarde, President of the ...",SPEECH Towards a green capital markets uni...


In [None]:

#Remove NA entries
speeches = speeches.dropna()

#Only get presidential speeches
# speeches = speeches.loc[speeches.subtitle.str.contains("\sPresident\s"),:]


#Regex cleaning

speeches['contents'] = speeches['contents'].replace('SPEECH', ' ', regex=True)
speeches['contents'] = speeches['contents'].replace('\s+', ' ', regex=True)
speeches['contents'] = speeches['contents'].replace('\(.*?\)', ' ', regex=True)
speeches['contents'] = speeches['contents'].replace('\[.*?\]', ' ', regex=True)
speeches['contents'] = speeches['contents'].replace('Note.*?\.', ' ', regex=True)
speeches['contents'] = speeches['contents'].replace('Chart .*?\..*?\.', ' ', regex=True)

speeches['contents'] = speeches['contents'].replace('I\..*?References', ' ', regex=True) #edge caSe
speeches['contents'] = speeches['contents'].replace('References.*', ' ', regex=True)
speeches['contents'] = speeches['contents'].replace('REFERENCES.*', ' ', regex=True)
speeches['contents'] = speeches['contents'].replace('LITERATURE.*', ' ', regex=True)
speeches['contents'] = speeches['contents'].replace('BIBLIOGRAPHY.*', ' ', regex=True)
speeches['contents'] = speeches['contents'].replace(' [0-9]\. ', ' ', regex=True)


speeches['contents'] = speeches['contents'].replace('Vol.*?pp.*?\.', ' ', regex=True)

speeches['contents'] = speeches['contents'].replace('Vol\..*?[0-9]*,.*?No\..*?\.', ' ', regex=True)


speeches['contents'] = speeches['contents'].replace('op\..*?cit\..*?\.', ' ', regex=True)
speeches['contents'] = speeches['contents'].replace('op\..*?cit\.', ' ', regex=True)


speeches['contents'] = speeches['contents'].replace('See.*?\.', ' ', regex=True)


speeches['contents'] = speeches['contents'].replace('SEE ALSO.*', ' ', regex=True)

speeches['contents'] = speeches['contents'].replace('Thank you\..*', ' ', regex=True)
speeches['contents'] = speeches['contents'].replace('Thank you for your kind attention\..*', ' ', regex=True)
speeches['contents'] = speeches['contents'].replace('Thank you for your attention\..*', ' ', regex=True)
speeches['contents'] = speeches['contents'].replace('I thank you for your attention\..*', ' ', regex=True)



# speeches['contents'] = speeches['contents'].replace('[^\x00-\x7F]+', ' ', regex=True)

# can also clean more edge cases like Thank you./Thank you for your kind attention. etc. kill everything behind 

In [57]:
# remove non-english

from langdetect import detect

def isEnglish(text):
    try:
        if detect(text) == 'en':
            return True
        else:
            # print(text[:40])
            return False
    except:
        print(text)
        return False

def isLongerThan(text):
    return len(text)>500

def filter(text):
    return isEnglish(text) and isLongerThan(text)

print(len(speeches))
speeches = speeches[speeches.apply(lambda x: filter(x['contents']), axis=1)]   
print(len(speeches))

2460
 
2266


In [58]:
import nltk
from nltk import tokenize

nltk.download('punkt')

# pre-processing functions

def preprocess(speech):
    return tokenize.sent_tokenize(speech)

def join_to_fit(tokens):
    results = []
    temp = ""
    count = 0
    for token in tokens:

        if count >= 500:

            results.append(temp[:500])
            temp = temp[500:]
            count = 0 

        count += len(token)
        temp += token

    return results

# tried president only (same)
# removed neutral sentences (same)
# fss alternative index: #neg sent - #pos sent / total
def analyze_topic(speech):
    dists = []
  
    print(f"Number of Sentences: {len(speech)}")
    for index, paragraph in enumerate(speech):
        # print(f"Sentence processed:{(index+1)/len(speech)} Sentence Length:{len(paragraph)}" )
        # print(paragraph)
        out, dist = query(paragraph)
        dists.append(dist)

    return np.arrary(dists)
   
count = 0
def complete_topic(speech):
    global count
    count +=1
    print(f"Document processed: {count}")
    tokenized_speeches = preprocess(speech)
    # tokenized_speeches = join_to_fit(tokenized_speeches)
    outputs = analyze_topic(tokenized_speeches)
    return outputs



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\felix\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [59]:

def apply_and_concat(dataframe, field, func, column_names):
    return pd.concat((
        dataframe,
        dataframe[field].apply(
            lambda cell: pd.Series(func(cell), index=column_names))), axis=1)

# speeches['mean'], speeches['std'] = speeches.apply(lambda speech: sentiment_analysis(speech.contents), axis=1)


In [60]:
ecb_with_topics = apply_and_concat(speeches, 'contents', complete_topic, ["dist"])

Document processed: 1
Number of Sentences: 115


AttributeError: module 'numpy' has no attribute 'arrary'

In [None]:
ecb_with_topics.iloc[0]


In [None]:
ecb_with_topics.to_csv('./ecb_with_topics_sbert.csv')