In [1]:
import typing

import pandas as pd
import json
from zipfile import ZipFile
from itertools import chain
import config
import src
import logging
import tqdm
import torch
import numpy as np
import requests

In [2]:
CFG = config.Config()

In [3]:
sentences = ["I'm going to the bank.", "I need to visit the bank today.", "I'm heading to the bank to withdraw some cash.", "The bank is where I'm off to.", "I have an appointment at the bank.", "Let's spend the day by the river bank.", "The river bank is a peaceful place to relax.", "I enjoy walking along the river bank.", "We can have a picnic by the river bank.",
"She has a cool job.", "Her job is really interesting and fun.", "She works in a creative field and loves it.", "That job of hers is so unique.", "She's lucky to have such a cool profession.", "Her workplace is always chilly.", "She works in a refrigerated environment.", "The temperature in her office is freezing.", "She needs to bundle up for her job.",
"You need to check the trunk.", "Don't forget to look in the trunk of the car.", "There might be something important in the trunk.", "Make sure to verify the contents of the trunk.", "The trunk needs to be inspected for any damage.", "Take a look at the tree trunk for any damage.", "The trunk of the old tree might have some interesting carvings.", "Check if the tree trunk needs to be treated for pests.", "See if there are any unique patterns or textures on the tree trunk.",
"I saw her duck.", "Her duck was waddling in the park.", "The duck she owns is so cute.", "I spotted her duck by the pond.", "Her duck was quacking loudly.", "She ducked to avoid the flying object.", "Her quick ducking saved her from the falling branch.", "I noticed her sudden ducking movement.", "She ducked and dodged the incoming ball.",
"She has a great figure.", "Her body shape is very flattering.", "She carries herself with grace and confidence.", "Her figure is well-proportioned and attractive.", "She knows how to dress to highlight her figure.", "The numerical figure she presented was impressive.", "Her calculations yielded a significant figure.", "The data supports a substantial figure.", "The figure she quoted was accurate and reliable."]

context = ["bank-base", "bank-money", "bank-money", "bank-money", "bank-money", "bank-river", "bank-river", "bank-river", "bank-river", "cool-base", "cool-nice", "cool-nice", "cool-nice", "cool-nice", "cool-cold", "cool-cold", "cool-cold", "cool-cold", "trunk-base", "trunk-car", "trunk-car", "trunk-car", "trunk-car", "trunk-tree", "trunk-tree", "trunk-tree", "trunk-tree", 
           "duck-base", "duck-animal", "duck-animal", "duck-animal", "duck-animal", "duck-down", "duck-down", "duck-down", "duck-down", "figure-base", "figure-body", "figure-body", "figure-body", "figure-body", "figure-number", "figure-number", "figure-number", "figure-number"]

dataset = pd.DataFrame({'Sentence': sentences, 'Context': context})

In [None]:
dataset.head()

In [4]:
MODEL: str = "llama3:70b-instruct-q6_K" # "mixtral:8x7b-instruct-v0.1-q6_K"
CFG.prompt_files.keys()

dict_keys(['political_ideology', 'embedding_semantic_rich', 'political_negativity', 'embedding_finance', 'embedding_base', 'embedding_river', 'argument'])

In [5]:
keys_to_include = ['embedding_semantic_rich', 'embedding_finance', 'embedding_base', 'embedding_river']
subset_dict = {k: CFG.prompt_files[k] for k in keys_to_include if k in CFG.prompt_files}
subset_dict

{'embedding_semantic_rich': PosixPath('data/prompts/embedding_semantic_rich'),
 'embedding_finance': PosixPath('data/prompts/embedding_finance.json'),
 'embedding_base': PosixPath('data/prompts/embedding_base.json'),
 'embedding_river': PosixPath('data/prompts/embedding_river.json')}

In [6]:
subset_dict.values()

dict_values([PosixPath('data/prompts/embedding_semantic_rich'), PosixPath('data/prompts/embedding_finance.json'), PosixPath('data/prompts/embedding_base.json'), PosixPath('data/prompts/embedding_river.json')])

In [55]:
templates = [
    {"name": "embed_base", "template": 'Please consider the following sentence in the context of a story and find sentence level embeddings. Only answer with the sentence and nothing else./n'},
    {"name": "embed_rich", "template": 'Please consider the following sentence in the context of a story and find semantically rich sentence level embeddings. Only answer with the sentence and nothing else./n'},
    {"name": "embed_money", "template": 'Please consider the following sentence in the context of a story about finance and find semantically rich sentence level embeddings. Only answer with the sentence and nothing else./n'},
    {"name": "embed_river", "template": 'Please consider the following sentence in the context of a story about a river and find semantically rich sentence level embeddings. Only answer with the sentence and nothing else./n'}
]

template_dict = pd.DataFrame(templates).to_dict(orient='records')

In [30]:
for item in template_dict:
    name = item['name']
    print(name)

embed_base
embed_rich
embed_money
embed_river


In [61]:
embed_rich: typing.Dict[str, np.ndarray] = {}
embed_money: typing.Dict[str, np.ndarray] = {}
embed_river: typing.Dict[str, np.ndarray] = {}
embed_base: typing.Dict[str, np.ndarray] = {}

In [62]:
for item in template_dict:
    name = item['name']
    template = item['template']
    for index, value in tqdm.tqdm(dataset["Sentence"].items(), total=len(dataset)):
        try: 
            embed = np.array(requests.post(
                'https://inf.cl.uni-trier.de/embed/',
                json={'model': MODEL, 'prompt': template + value}
                ).json()["response"])
        except Exception as _e:
            logging.warning(_e)
            embed = None
        
        if name=='embed_rich':
            embed_rich[index] = embed
        if name=='embed_base':
            embed_base[index] = embed
        if name=='embed_river':
            embed_river[index] = embed
        if name=='embed_money':
            embed_money[index] = embed

       

  0%|          | 0/45 [00:00<?, ?it/s]

100%|██████████| 45/45 [00:08<00:00,  5.22it/s]
100%|██████████| 45/45 [00:08<00:00,  5.18it/s]
100%|██████████| 45/45 [00:08<00:00,  5.29it/s]
100%|██████████| 45/45 [00:08<00:00,  5.24it/s]


In [64]:
dataset_w_embeds = dataset.join(pd.Series(embed_rich, name="embed_rich")).join(pd.Series(embed_base, name="embed_base")).join(pd.Series(embed_money, name="embed_money")).join(pd.Series(embed_river, name="embed_river"))
#dataset_w_embeds.to_parquet(f'{CFG.report_dir}/dataset.embeds.parquet')
dataset_w_embeds.head()

Unnamed: 0,Sentence,Context,embed_rich,embed_base,embed_money,embed_river
0,I'm going to the bank.,bank-base,"[0.0007291287183761597, -0.13166683912277222, ...","[0.18572592735290527, -0.297760933637619, -0.0...","[0.2104983925819397, -0.05846552550792694, 0.1...","[0.23099350929260254, -0.20068785548210144, 0...."
1,I need to visit the bank today.,bank-money,"[0.6171398758888245, -0.12135252356529236, -0....","[0.10708608478307724, -0.3206588923931122, -0....","[0.5208940505981445, 0.1724483221769333, -0.11...","[0.39583635330200195, 0.2690700888633728, 0.19..."
2,I'm heading to the bank to withdraw some cash.,bank-money,"[0.34433573484420776, -0.06507916003465652, -0...","[0.3088618516921997, -0.019220836460590363, 0....","[0.6007224917411804, 0.1417076140642166, 0.068...","[0.42295312881469727, 0.11269190907478333, 0.1..."
3,The bank is where I'm off to.,bank-money,"[-0.13297878205776215, -0.0911918580532074, -0...","[0.19968175888061523, -0.26484861969947815, -0...","[0.05104644224047661, -0.08790059387683868, -0...","[-0.11331242322921753, -0.04678422957658768, 0..."
4,I have an appointment at the bank.,bank-money,"[0.2728506922721863, -0.2755275070667267, -0.2...","[-0.055613771080970764, -0.39524802565574646, ...","[0.3922412693500519, -0.22213643789291382, -0....","[0.2372683882713318, -0.21723666787147522, -0...."


In [66]:
groups = {
    "base": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-base"]["embed_base"],
    "river": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-river"]["embed_base"],
    "money": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-money"]["embed_base"],
}

In [67]:
dist = torch.nn.PairwiseDistance()

results: typing.Dict[typing.Tuple[str, str], float] = {}

for label_1, c_1 in groups.items():
    for label_2, c_2 in groups.items():

        if (
            (label_1, label_2) in results.keys() or 
            (label_2, label_1) in results.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        results[(label_1, label_2)] = res

        print(f'{label_1}:{label_2}:{res.item()}')

base:base:3.1999999999999965e-05
base:river:9.081911016181294
base:money:6.908000575729087
river:river:5.967878145050999
river:money:10.10194802753676
money:money:6.2836831508767395
