In [1]:
import os

# Restrict this process to one GPU. Device indices follow PCI bus order, and
# only device "4" is exposed. NOTE(review): the index is machine-specific, and
# this presumably has to run before any CUDA-using library initialises.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "4",
})

In [2]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, util
from torch.utils.data import DataLoader
import numpy as np
import pickle

In [3]:
def kl_divergence(p, q):
    """Return the Kullback-Leibler divergence KL(p || q) in bits.

    Args:
        p: array-like of probabilities (the reference distribution).
        q: array-like of probabilities, broadcastable against ``p``.

    Returns:
        Sum over the support of ``p`` of ``p * (log2(p) - log2(q))``.

    Entries where ``p == 0`` contribute nothing (the ``0 * log 0 = 0``
    convention). Without the mask they evaluate to ``0 * -inf = nan`` and
    poison the whole sum, e.g. for one-hot distributions. If ``q`` is zero
    somewhere ``p`` is not, the result is ``inf``, as before.
    """
    p, q = np.broadcast_arrays(np.asarray(p), np.asarray(q))
    support = p > 0
    p_pos = p[support]
    return np.sum(p_pos * (np.log2(p_pos) - np.log2(q[support])))

def js_divergence(p, q):
    """Return the Jensen-Shannon divergence between ``p`` and ``q``.

    Computed as the average of ``kl_divergence(p, m)`` and
    ``kl_divergence(q, m)``, where ``m`` is the element-wise midpoint of the
    two inputs. ``p`` and ``q`` must support ``+`` element-wise (e.g. numpy
    arrays).
    """
    midpoint = 0.5 * (p + q)
    p_term = 0.5 * kl_divergence(p, midpoint)
    q_term = 0.5 * kl_divergence(q, midpoint)
    return p_term + q_term

def one_hot(p, q):
    """Return 1 when ``p == q`` is truthy, otherwise 0."""
    if p == q:
        return 1
    return 0

def sigmoid(val, squash=False, steepness=17, midpoint=0.5):
    """Optionally squash ``val`` with a steep logistic curve.

    By default this is the identity, which is what the function effectively
    did before: an early ``return val`` made the logistic formula below it
    unreachable. The formula is kept as an opt-in instead of dead code.

    Args:
        val: scalar or numpy array.
        squash: when True, apply ``1 / (1 + exp(-steepness * (val - midpoint)))``.
        steepness: slope of the logistic curve (previously hard-coded to 17).
        midpoint: input value mapped to 0.5 (previously hard-coded to 0.5).

    Returns:
        ``val`` unchanged, or its logistic transform when ``squash`` is True.
    """
    if not squash:
        return val
    return 1 / (1 + np.exp(-steepness * (val - midpoint)))

# "js" (Jensen-Shannon) is symmetric in p and q; "kl" is not.
def similarity(p, q, mode="js"):
    """Score how similar two label distributions are.

    Args:
        p, q: the two items to compare. For "js" and "kl" they are converted
            with ``np.array`` and treated as probability distributions; for
            "one-hot" they are compared with ``==``.
        mode: "js" (Jensen-Shannon), "kl" (Kullback-Leibler) or "one-hot".

    Returns:
        For "js"/"kl": ``sigmoid(2 ** -divergence)``, so identical
        distributions score highest. For "one-hot": 1 if equal, else 0.

    Raises:
        ValueError: if ``mode`` is not one of the supported values. This
            previously fell through and returned None silently.
    """
    if mode == "one-hot":
        return one_hot(p, q)
    if mode == "js":
        divergence = js_divergence(np.array(p), np.array(q))
    elif mode == "kl":
        divergence = kl_divergence(np.array(p), np.array(q))
    else:
        raise ValueError(
            f"unknown similarity mode {mode!r}; expected 'js', 'kl' or 'one-hot'")
    return sigmoid(np.exp2(-divergence))

def get_random_index_pairs(num_data, amount):
    """Draw ``amount`` random index pairs from ``range(num_data)``.

    Returns an integer array of shape ``(amount, 2)`` drawn from numpy's
    global random state. The two indices of a pair may coincide.
    """
    pair_shape = (amount, 2)
    return np.random.randint(0, num_data, size=pair_shape)


In [4]:
# Load the three pickled datasets; the two GPT lists are nested and get flattened into one list each.
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _flatten(nested):
    """Concatenate the items of every sub-sequence of ``nested`` into one list."""
    flat = []
    for sublist in nested:
        flat.extend(sublist)
    return flat


train = _load_pickle('train_data.pickle')
gpt = _flatten(_load_pickle('gpt.pickle'))
gpt2 = _flatten(_load_pickle('gpt_p2.pickle'))

# NOTE(review): `mixed` takes gpt2[200:] while `test` takes gpt2[:400], so
# gpt2[200:400] ends up in both splits. Confirm whether that overlap is intended.
mixed = gpt[200:] + train + gpt2[200:]
test = gpt2[:400] + gpt[:200]

In [5]:
# Load the sentence-embedding model from the local './res/sbert_v2' directory.
model = SentenceTransformer('./res/sbert_v2')

In [6]:

# Load the evaluation data. It is indexed by class label further down, where
# each entry is iterated as a collection of sentences.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('evaluation.pickle', 'rb') as file:
    eval_dict = pickle.load(file)

In [7]:
# Topic labels; their order defines the column order of every score vector.
classes = ["banking","valuation","household","real estate","corporate","external","sovereign","technology", "climate", "energy", "health", "eu"]

# Embed each reference sentence of each class once, so that later queries
# only need to encode the query text before taking cosine similarities.
embedded_class_dictionary = {
    label: [
        model.encode(sentence, convert_to_tensor=True)
        for sentence in eval_dict[label]
    ]
    for label in classes
}

  




In [8]:
import random
import torch
import math

def rescale(dist):
    """Sharpen a 1-D score tensor with a logistic curve centred on its middle.

    ``beta`` is the mean of the entries in the middle half of ``dist`` by
    position. NOTE(review): ``dist`` is not sorted first, so this is not an
    interquartile mean; confirm that is intended.
    ``alpha`` is the larger of ``10 + mean/std`` and 400, so it is never
    below 400.

    Returns a tensor with the same shape as ``dist``:
    ``1 / (1 + exp(-alpha * (dist - beta)))``.
    """
    size = len(dist)
    lower, upper = math.ceil(0.25 * size), math.floor(0.75 * size)
    beta = dist[lower:upper].mean()

    coeff_of_variation = torch.std(dist) / torch.mean(dist)
    alpha = torch.tensor([(10 + 1 / coeff_of_variation), 400]).max()

    exponent = -alpha * (dist - beta)
    return 1 / (1 + torch.exp(exponent))

def query(text, examples=10):
    """Score ``text`` against every topic class.

    For each class the text embedding is compared (cosine similarity) with a
    random sample of that class's reference embeddings and the similarities
    are averaged. The per-class means are normalised to sum to 1, sharpened
    with ``rescale`` and normalised again.

    Args:
        text: the sentence or paragraph to classify.
        examples: number of reference embeddings sampled per class. The "eu"
            class is never sampled; all of its embeddings are used.

    Returns:
        A tuple ``(scores_by_label, scores_array)``: a dict mapping each
        label in ``classes`` to its score tensor, and the same scores as a
        numpy array in ``classes`` order.

    Note:
        Sampling uses the global ``random`` state, so repeated calls can give
        different scores unless that state is seeded.
    """
    text_vector = model.encode(text, convert_to_tensor=True)
    scores = []
    for label in classes:
        pool = embedded_class_dictionary[label]
        if label != "eu":
            # Clamp the sample size: random.sample raises ValueError when
            # asked for more items than the class has reference embeddings.
            examples_list = random.sample(pool, k=min(examples, len(pool)))
        else:
            examples_list = pool
        cosine_scores = torch.tensor(
            [util.pytorch_cos_sim(text_vector, example) for example in examples_list])
        scores.append(torch.mean(cosine_scores))

    scores = torch.tensor(scores)
    scores = scores / torch.sum(scores)
    scores = rescale(scores)
    scores = scores / torch.sum(scores)
    return {label: score for label, score in zip(classes, scores)}, np.array(scores)


In [9]:
# Sanity check: cosine similarity between the embeddings of two sentences.
sent1 = "In contrast to the radical forces buffeting valuations, for most companies, 2020 was a year of “strategy lockdown."
sent2 = "Climate change is a real thing."

u = model.encode(sent1)
v = model.encode(sent2)
util.pytorch_cos_sim(u,v)

tensor([[0.5756]])

In [10]:

# Example query. Scores vary between runs because query() samples its
# reference sentences at random; the commented block below is an earlier run.
query("In contrast to the radical forces buffeting valuations, for most companies, 2020 was a year of “strategy lockdown.")
# {'banking': tensor(0.0144),
#   'valuation': tensor(0.3418),
#   'household': tensor(0.0022),
#   'real estate': tensor(0.0181),
#   'corporate': tensor(0.5484),
#   'external': tensor(0.0063),
#   'sovereign': tensor(0.0193),
#   'technology': tensor(0.0143),
#   'climate': tensor(0.0077),
#   'energy': tensor(0.0056),
#   'health': tensor(0.0210),
#   'eu': tensor(0.0008)}


({'banking': tensor(0.0219),
  'valuation': tensor(0.3215),
  'household': tensor(0.0336),
  'real estate': tensor(0.0320),
  'corporate': tensor(0.3360),
  'external': tensor(0.0153),
  'sovereign': tensor(0.0923),
  'technology': tensor(0.0119),
  'climate': tensor(0.0258),
  'energy': tensor(0.0363),
  'health': tensor(0.0422),
  'eu': tensor(0.0313)},
 array([0.02186355, 0.3215338 , 0.03357272, 0.03196084, 0.33601692,
        0.01530513, 0.09226985, 0.01192343, 0.02581508, 0.03628198,
        0.04220223, 0.03125454], dtype=float32))

In [11]:
# Second example query, on a mortgage-rate sentence.
query("Mortgage interest rate in selected European countries as of 4th quarter of 2019 and 2020")

({'banking': tensor(0.0166),
  'valuation': tensor(0.0362),
  'household': tensor(0.2336),
  'real estate': tensor(0.5644),
  'corporate': tensor(0.0096),
  'external': tensor(0.0324),
  'sovereign': tensor(0.0198),
  'technology': tensor(0.0088),
  'climate': tensor(0.0067),
  'energy': tensor(0.0258),
  'health': tensor(0.0427),
  'eu': tensor(0.0034)},
 array([0.01663781, 0.03619295, 0.23357873, 0.56437606, 0.00961803,
        0.03238049, 0.01980924, 0.00879645, 0.00666669, 0.02579491,
        0.04270546, 0.00344312], dtype=float32))

In [12]:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support,top_k_accuracy_score
def compute_metrics(labels, preds):
    """Compute classification metrics from per-class score vectors.

    Args:
        labels: true class indices, one per sample. They are assumed to index
            the columns of ``preds`` (the caller builds them with
            ``np.argmax`` over vectors in the same class order).
        preds: array-like of shape ``(n_samples, n_classes)`` with one score
            per class.

    Returns:
        dict with top-1 'accuracy', macro-averaged 'f1', 'precision' and
        'recall', plus 'top3' and 'top2' top-k accuracy.
    """
    preds = np.asarray(preds)
    # State the class ids explicitly. Without `labels=`, top_k_accuracy_score
    # infers them from the distinct values in `labels`, which raises (or
    # mis-aligns columns) whenever some class never occurs in the sample.
    class_ids = np.arange(preds.shape[1])

    best = np.argmax(preds, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, best, average='macro')
    acc = accuracy_score(labels, best)
    top3 = top_k_accuracy_score(labels, preds, k=3, labels=class_ids)
    top2 = top_k_accuracy_score(labels, preds, k=2, labels=class_ids)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'top3': top3,
        'top2': top2
    }

In [13]:
from tqdm import tqdm
# Ground truth is the arg-max of each item's stored distribution; the
# prediction is the score vector returned by query().
# NOTE: a later cell runs `import tqdm`, which rebinds the name `tqdm` to the
# module. Re-running this cell after that one fails until
# `from tqdm import tqdm` is executed again.
labels = [np.argmax(np.array(item["dist"])) for item in gpt2]
preds = np.array([query(item["text"])[1] for item in tqdm(gpt2)])

100%|██████████| 1038/1038 [00:41<00:00, 25.10it/s]


In [14]:
# Every evaluated item must have exactly one label and one prediction row.
assert len(labels) == len(preds)

In [15]:
# Metrics for the current run; the commented dicts below are results pasted from earlier runs.
compute_metrics(labels,preds)
# {'accuracy': 0.92,
#  'f1': 0.8316229467944879,
#  'precision': 0.8142551892551894,
#  'recall': 0.8727039627039627,
#  'top3': 0.9725,
#  'top2': 0.965}
# train
# {'accuracy': 0.5770712909441233,
#  'f1': 0.421304669340332,
#  'precision': 0.41728348913927044,
#  'recall': 0.44001393966077357,
#  'top3': 0.8420038535645472,
#  'top2': 0.7620423892100193}
# test
# {'accuracy': 0.930281690140845,
#  'f1': 0.8722745634176153,
#  'precision': 0.8698194847154102,
#  'recall': 0.8845044288758909,
#  'top3': 0.9852112676056338,
#  'top2': 0.9781690140845071}


# enhance train accu
# {'accuracy': 0.9478873239436619,
#  'f1': 0.9068146034650141,
#  'precision': 0.893095646375258,
#  'recall': 0.9287405747114715,
#  'top3': 0.9852112676056338,
#  'top2': 0.9795774647887324}

# test
# {'accuracy': 0.7083333333333334,
#  'f1': 0.6400212205154878,
#  'precision': 0.6549408332820279,
#  'recall': 0.6365285737194072,
#  'top3': 0.915,
#  'top2': 0.865}

{'accuracy': 0.7745664739884393,
 'f1': 0.6091591343856654,
 'precision': 0.6673696421068095,
 'recall': 0.6195302496240477,
 'top3': 0.941233140655106,
 'top2': 0.8978805394990366}

In [16]:
import pandas as pd
import os

import numpy as np
import tqdm


import random

from pprint import pprint
import pickle 


from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS as stop_words

import ssl

# SECURITY: this makes `ssl` hand out unverified HTTPS contexts by default,
# i.e. certificate checks are disabled for the whole process. Keep it only
# if a download in this notebook needs it and the network is trusted.
_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _unverified_https_context is not None:
    ssl._create_default_https_context = _unverified_https_context


from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch 




In [17]:

# Load the ECB speeches table (pipe-delimited file) and preview the first rows.
speeches = pd.read_csv('./all_ECB_speeches.csv', delimiter='|')
speeches.head()


Unnamed: 0,date,speakers,title,subtitle,contents
0,2021-05-27,Isabel Schnabel,Societal responsibility and central bank indep...,"Keynote speech by Isabel Schnabel, Member of t...",SPEECH Societal responsibility and central...
1,2021-05-27,Luis de Guindos,Climate change and financial integration,"Keynote speech by Luis de Guindos, Vice-Presid...",SPEECH Climate change and financial integr...
2,2021-05-25,Philip R. Lane,The ECB strategy review,"Presentation by Philip R. Lane, Member of the ...",
3,2021-05-19,Fabio Panetta,At the edge of tomorrow: preparing the future ...,"Introductory remarks by Fabio Panetta, Member ...",SPEECH At the edge of tomorrow: preparing ...
4,2021-05-06,Christine Lagarde,Towards a green capital markets union for Europe,"Speech by Christine Lagarde, President of the ...",SPEECH Towards a green capital markets uni...


In [18]:

# Remove rows with any missing value.
speeches = speeches.dropna()

# Only get presidential speeches
# speeches = speeches.loc[speeches.subtitle.str.contains(r"\sPresident\s"),:]

# Regex cleaning. Every match of every pattern is replaced by a single space.
# The patterns are applied in this exact order, which matters: whitespace is
# collapsed early so the `.`-based patterns can span former line breaks, and
# the more specific citation patterns run before the broader ones.
# Raw strings are used because `\s`, `\(`, `\[` and `\.` are invalid escape
# sequences in ordinary string literals and trigger warnings on recent Pythons.
CONTENT_CLEANUP_PATTERNS = [
    r'SPEECH',
    r'\s+',
    r'\(.*?\)',                       # parenthesised asides
    r'\[.*?\]',                       # bracketed asides / footnote markers
    r'Note.*?\.',
    r'Chart .*?\..*?\.',
    r'I\..*?References',              # edge case
    # Everything from a reference-list heading to the end of the text.
    r'References.*',
    r'REFERENCES.*',
    r'LITERATURE.*',
    r'BIBLIOGRAPHY.*',
    r' [0-9]\. ',                     # single-digit list numbering
    r'Vol.*?pp.*?\.',
    r'Vol\..*?[0-9]*,.*?No\..*?\.',
    r'op\..*?cit\..*?\.',
    r'op\..*?cit\.',
    r'See.*?\.',
    r'SEE ALSO.*',
    # Closing formulas: drop them and everything after.
    r'Thank you\..*',
    r'Thank you for your kind attention\..*',
    r'Thank you for your attention\..*',
    r'I thank you for your attention\..*',
]

cleaned_contents = speeches['contents']
for cleanup_pattern in CONTENT_CLEANUP_PATTERNS:
    cleaned_contents = cleaned_contents.replace(cleanup_pattern, ' ', regex=True)
speeches['contents'] = cleaned_contents



# speeches['contents'] = speeches['contents'].replace('[^\x00-\x7F]+', ' ', regex=True)

# TODO: more edge cases could be cleaned the same way, e.g. other closing formulas ("Thank you.", "Thank you for your kind attention."): drop the phrase and everything after it.

In [19]:
# remove non-english

from langdetect import detect

def isEnglish(text):
    """Return True when ``detect`` classifies ``text`` as English ('en').

    If detection raises, the text is reported and treated as non-English.
    Only ``Exception`` is caught, so Ctrl-C (KeyboardInterrupt) still stops a
    long-running filter; the previous bare ``except`` swallowed it.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # Print only a short prefix; a full speech would flood the output.
        print(str(text)[:80])
        return False

def isLongerThan(text, min_chars=500):
    """Return True when ``text`` has strictly more than ``min_chars`` characters.

    Args:
        text: any object supporting ``len``.
        min_chars: exclusive lower bound on the length; defaults to the
            previously hard-coded 500.
    """
    return len(text) > min_chars

def filter(text):
    """Keep a speech only if it is English and longer than the length cut-off.

    NOTE: this name shadows the built-in ``filter`` for the rest of the
    notebook.
    """
    if not isEnglish(text):
        return False
    return isLongerThan(text)

# non_en_idx = []
# for i in range(len(speeches)):
#     if not isEnglish(speeches.iloc[i]['contents']):
#         non_en_idx.append(i)

        

# print(len(non_en_idx))
# Report the row count before and after dropping non-English / short speeches.
print(len(speeches))
keep_mask = speeches.apply(lambda row: filter(row['contents']), axis=1)
speeches = speeches[keep_mask]
print(len(speeches))

ModuleNotFoundError: No module named 'langdetect'

In [None]:
import nltk
from nltk import tokenize

# Fetch the 'punkt' resource (requires network access on first run);
# presumably this is what sent_tokenize below relies on.
nltk.download('punkt')

# pre-processing functions

def preprocess(speech):
    """Split ``speech`` into sentences with NLTK's ``sent_tokenize``."""
    sentences = tokenize.sent_tokenize(speech)
    return sentences

def join_to_fit(tokens, max_len=500, sep=""):
    """Concatenate ``tokens`` and cut the result into chunks of ``max_len`` chars.

    Args:
        tokens: iterable of strings (e.g. sentences).
        max_len: maximum chunk length in characters (previously hard-coded
            to 500). Must be positive.
        sep: string inserted between consecutive tokens. Defaults to "" to
            keep the original plain concatenation; pass " " to keep
            sentences from being glued together.

    Returns:
        List of strings, each at most ``max_len`` long, whose concatenation
        equals ``sep.join(tokens)``.

    Raises:
        ValueError: if ``max_len`` is not positive.

    Fixes over the previous version: the trailing partial chunk is no longer
    dropped, and the buffer's real length is checked instead of a counter
    that went stale after every flush (which let the buffer grow past the
    limit).
    """
    if max_len <= 0:
        raise ValueError("max_len must be positive")

    results = []
    buffer = ""
    for position, token in enumerate(tokens):
        buffer += token if position == 0 else sep + token
        # Flush as many full chunks as the buffer currently holds.
        while len(buffer) >= max_len:
            results.append(buffer[:max_len])
            buffer = buffer[max_len:]

    if buffer:
        results.append(buffer)
    return results

# tried president only (same)
# removed neutral sentences (same)
# fss alternative index: #neg sent - #pos sent / total
def analyze_topic(speech):
    """Run ``query`` on every element of ``speech`` and stack the score vectors.

    Args:
        speech: sequence of text units (the caller passes the sentences of
            one speech).

    Returns:
        numpy array with one row per element of ``speech``, each row being
        the score array returned by ``query``. An empty input gives an empty
        array.
    """
    print(f"Number of Sentences: {len(speech)}")
    # query() returns (scores_by_label, scores_array); only the array is kept.
    dists = [query(paragraph)[1] for paragraph in speech]
    # Was `np.arrary`, which raised AttributeError on every call.
    return np.array(dists)
   
# Running number of documents processed so far; used only for progress output.
count = 0


def complete_topic(speech):
    """Tokenise one speech into sentences and score each sentence by topic.

    Increments the module-level ``count`` and prints it as a progress
    indicator, then returns whatever ``analyze_topic`` returns for the
    sentence list.
    """
    global count
    count = count + 1
    print(f"Document processed: {count}")
    # tokenized_speeches = join_to_fit(tokenized_speeches)
    return analyze_topic(preprocess(speech))



In [None]:

def apply_and_concat(dataframe, field, func, column_names):
    """Append columns computed from one field of ``dataframe``.

    ``func`` is called on every cell of ``dataframe[field]``; its result is
    turned into a ``pd.Series`` indexed by ``column_names``, so it must yield
    one value per column name. The new columns are concatenated to the right
    of the original frame, which is not modified.
    """
    def _expand(cell):
        return pd.Series(func(cell), index=column_names)

    new_columns = dataframe[field].apply(_expand)
    return pd.concat((dataframe, new_columns), axis=1)

# speeches['mean'], speeches['std'] = speeches.apply(lambda speech: sentiment_analysis(speech.contents), axis=1)


In [None]:
# complete_topic returns one array per speech. apply_and_concat feeds the
# function result to pd.Series(..., index=["dist"]), which needs exactly one
# value for the single "dist" column, so the array is wrapped in a
# one-element list. Passing it bare makes pd.Series reject it.
ecb_with_topics = apply_and_concat(
    speeches, 'contents', lambda text: [complete_topic(text)], ["dist"])

In [None]:
# Inspect the first speech together with its new "dist" column.
ecb_with_topics.iloc[0]


In [None]:
# Persist the result.
# NOTE(review): "dist" holds numpy arrays, which CSV presumably stores as
# their string form (large arrays may be abbreviated with "..."); confirm the
# file round-trips, or use a binary format such as pickle/parquet instead.
ecb_with_topics.to_csv('./ecb_with_topics_sbert.csv')