In [1]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, util
from torch.utils.data import DataLoader
import numpy as np
import pickle

In [2]:
def kl_divergence(p,q):
    return np.sum(p * (np.log2(p)-np.log2(q)))

def js_divergence(p,q):
    m = 0.5 * (p + q)
    return 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m)

def one_hot(p,q):
    return 1 if p==q else 0

def sigmoid(val):
    return 1/(1+np.exp(-17*(val-0.5)))

# js ensure symmetric
def similarity(p,q,mode="js"):
    if mode == "js":
        return sigmoid(np.exp2(-js_divergence(np.array(p),np.array(q))))
    elif mode == "kl":
        return sigmoid(np.exp2(-kl_divergence(np.array(p),np.array(q))))
    elif mode == "one-hot":
        return one_hot(p,q)

def get_random_index_pairs(num_data, amount):
    return np.random.randint(num_data, size=(amount, 2))


In [3]:
# flatten to one list for all 3
with open('train_data.pickle', 'rb') as file:
    train = pickle.load(file)

with open('gpt.pickle', 'rb') as file:
    gpt = pickle.load(file)
    
with open('gpt.pickle', 'rb') as file:
    gpt2 = pickle.load(file)

gpt = [item for sublist in gpt for item in sublist]
gpt2 = [item for sublist in gpt2 for item in sublist]

mixed = gpt[200:] + train + gpt2[200:]
test = gpt2[:200] + gpt[:200]

In [4]:
model = SentenceTransformer('./')

In [5]:

with open('evaluation.pickle', 'rb') as file:
    eval_dict = pickle.load(file)

In [6]:
classes = ["banking","valuation","household","real estate","corporate","external","sovereign","technology", "climate", "energy", "health", "eu"]
#cosine similarity
#Compute embedding for both lists
embedded_class_dictionary = {label: [] for label in classes}


for label in classes:
    for sentence in eval_dict[label]:
        embeddings = model.encode(sentence, convert_to_tensor=True)
        embedded_class_dictionary[label].append(embeddings)

  


In [36]:
import random
import torch
import math

def rescale(dist):
    beta = torch.mean(dist[math.ceil(0.25*len(dist)):math.floor(0.75*len(dist))])
    alpha = torch.max(torch.tensor([(10+1/(torch.std(dist)/torch.mean(dist))), 400]))
    return 1/(1+torch.exp(-alpha*(dist-beta)))

def query(text, examples=10):
    scores = []
    text_vector = model.encode(text, convert_to_tensor=True)
    for label in classes:
        if label != "eu":
            examples_list = random.sample(embedded_class_dictionary[label], k=examples)
        else:
            examples_list = embedded_class_dictionary[label]
        cosine_scores = torch.tensor([util.pytorch_cos_sim(text_vector,  example) for example in examples_list])
        scores.append(torch.mean(cosine_scores))
    # torch.nn.functional.softmax(torch.tensor(scores))
    scores = torch.tensor(scores)
    scores = scores/torch.sum(scores)
    scores = rescale(scores)
    scores = scores/torch.sum(scores)
    #softmax
    return {label:score for label, score in zip(classes,scores)}, np.array(scores)


In [37]:

query("In contrast to the radical forces buffeting valuations, for most companies, 2020 was a year of “strategy lockdown.")


({'banking': tensor(0.0143),
  'valuation': tensor(0.3063),
  'household': tensor(0.0026),
  'real estate': tensor(0.0196),
  'corporate': tensor(0.5824),
  'external': tensor(0.0066),
  'sovereign': tensor(0.0155),
  'technology': tensor(0.0163),
  'climate': tensor(0.0082),
  'energy': tensor(0.0059),
  'health': tensor(0.0215),
  'eu': tensor(0.0009)},
 array([0.01432589, 0.30633795, 0.00255404, 0.0195682 , 0.5823974 ,
        0.00656773, 0.01546279, 0.0163366 , 0.00815779, 0.00585703,
        0.02152549, 0.00090902], dtype=float32))

In [38]:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support,top_k_accuracy_score
def compute_metrics(labels, preds):
    best = np.argmax(preds, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, best, average='macro')
    acc = accuracy_score(labels, best)
    top3 = top_k_accuracy_score(labels, preds ,k=3)
    top2 = top_k_accuracy_score(labels, preds ,k=2)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'top3': top3,
        'top2': top2
    }

In [41]:
from tqdm import tqdm
labels = []
preds = []

for item in tqdm(test):
    labels.append(np.argmax(np.array(item["dist"])))
    preds.append(query(item["text"])[1])
preds = np.array(preds)

100%|██████████| 400/400 [00:45<00:00,  8.72it/s]


In [42]:
assert len(labels) == len(preds)

In [43]:
compute_metrics(labels,preds)

{'accuracy': 0.92,
 'f1': 0.8316229467944879,
 'precision': 0.8142551892551894,
 'recall': 0.8727039627039627,
 'top3': 0.975,
 'top2': 0.965}