In [1]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, util
from torch.utils.data import DataLoader
import numpy as np
import pickle

In [2]:
def kl_divergence(p,q):
    return np.sum(p * (np.log2(p)-np.log2(q)))

def js_divergence(p,q):
    m = 0.5 * (p + q)
    return 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m)

def one_hot(p,q):
    return 1 if p==q else 0

def sigmoid(val):
    return 1/(1+np.exp(-17*(val-0.5)))

# js ensure symmetric
def similarity(p,q,mode="js"):
    if mode == "js":
        return sigmoid(np.exp2(-js_divergence(np.array(p),np.array(q))))
    elif mode == "kl":
        return sigmoid(np.exp2(-kl_divergence(np.array(p),np.array(q))))
    elif mode == "one-hot":
        return one_hot(p,q)

def get_random_index_pairs(num_data, amount):
    return np.random.randint(num_data, size=(amount, 2))


In [3]:
# flatten to one list for all 3
with open('train_data.pickle', 'rb') as file:
    train = pickle.load(file)

with open('gpt.pickle', 'rb') as file:
    gpt = pickle.load(file)
    
with open('gpt.pickle', 'rb') as file:
    gpt2 = pickle.load(file)

gpt = [item for sublist in gpt for item in sublist]
gpt2 = [item for sublist in gpt2 for item in sublist]

mixed = gpt[200:] + train + gpt2[200:]
test = gpt2[:200] + gpt[:200]

In [4]:
from itertools import combinations
import random
all_pairs = list(combinations(range(len(mixed)),2))

random.shuffle(all_pairs)
# bert load data
data = [{"texts":[mixed[idx[0]]["text"],mixed[idx[1]]["text"]], "label": similarity(mixed[idx[0]]["dist"],mixed[idx[1]]["dist"])} for idx in all_pairs]


In [5]:
len(data)

3646350

In [6]:
train, dev = data[:450000],data[450000:500000]

In [7]:
pair1 = [item["texts"][0] for item in dev]
pair2 = [item["texts"][1] for item in dev]
scores = [float(item["label"]) for item in dev]

In [8]:
#Define your train examples. You need more than just two examples...
train_examples = [InputExample(texts=item["texts"], label=float(item["label"])) for item in train]

In [9]:


#Define the model. Either from scratch of by loading a pre-trained model
model = SentenceTransformer('all-mpnet-base-v2')
# model = SentenceTransformer('./')

evaluator = evaluation.EmbeddingSimilarityEvaluator(pair1, pair2, scores)


#Define your train dataset, the dataloader and the train loss
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(model)

#Tune the model
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=4, warmup_steps=100, evaluator=evaluator,evaluation_steps=500)

Iteration:   6%|▌         | 794/14063 [03:37<1:00:28,  3.66it/s]
Epoch:   0%|          | 0/4 [03:37<?, ?it/s]


KeyboardInterrupt: 

In [None]:
model.save("./sbert","sbert_fin")

In [None]:

with open('evaluation.pickle', 'rb') as file:
    eval_dict = pickle.load(file)

In [None]:
classes = ["banking","valuation","household","real estate","corporate","external","sovereign","technology", "climate", "energy", "health", "eu"]
#cosine similarity
#Compute embedding for both lists
embedded_class_dictionary = {label: [] for label in classes}


for label in classes:
    for sentence in eval_dict[label]:
        embeddings = model.encode(sentence, convert_to_tensor=True)
        embedded_class_dictionary[label].append(embeddings)

  


In [None]:
# with open('embedded_class_dictionary.pickle', 'wb') as file:
#     pickle.dump(embedded_class_dictionary, file)

In [None]:
import random
import torch

def query(text, examples=10):
    scores = []
    text_vector = model.encode(text, convert_to_tensor=True)
    for label in classes:
        if label != "eu":
            examples_list = random.sample(embedded_class_dictionary[label], k=examples)
        else:
            examples_list = embedded_class_dictionary[label]
        cosine_scores = torch.tensor([util.pytorch_cos_sim(text_vector,  example) for example in examples_list])
        scores.append(torch.mean(cosine_scores))
    # torch.nn.functional.softmax(torch.tensor(scores))
    scores = torch.tensor(scores)
    scores = scores/torch.sum(scores)
    #softmax
    return {label:score for label, score in zip(classes,scores)}


In [None]:

query("In contrast to the radical forces buffeting valuations, for most companies, 2020 was a year of “strategy lockdown.")


In [None]:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support,top_k_accuracy_score

In [None]:
def compute_metrics(labels, preds):
    best = 
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    top3 = top_k_accuracy_score(labels, pred.predictions,k=3)
    top2 = top_k_accuracy_score(labels, pred.predictions,k=2)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'top3': top3,
        'top2': top2
    }

In [None]:
for dictionary in test:
    


SyntaxError: unexpected EOF while parsing (Temp/ipykernel_4840/2262350480.py, line 1)