In [1]:
import typing

import pandas as pd
import json
from zipfile import ZipFile
from itertools import chain
import config
import src
import logging
import tqdm
import torch
import numpy as np
import requests

In [2]:
CFG = config.Config()

In [3]:
sentences = ["I'm going to the bank.", "I need to visit the bank today.", "I'm heading to the bank to withdraw some cash.", "The bank is where I'm off to.", "I have an appointment at the bank.", "Let's spend the day by the river bank.", "The river bank is a peaceful place to relax.", "I enjoy walking along the river bank.", "We can have a picnic by the river bank.",
"She has a cool job.", "Her job is really interesting and fun.", "She works in a creative field and loves it.", "That job of hers is so unique.", "She's lucky to have such a cool profession.", "Her workplace is always chilly.", "She works in a refrigerated environment.", "The temperature in her office is freezing.", "She needs to bundle up for her job.",
"You need to check the trunk.", "Don't forget to look in the trunk of the car.", "There might be something important in the trunk.", "Make sure to verify the contents of the trunk.", "The trunk needs to be inspected for any damage.", "Take a look at the tree trunk for any damage.", "The trunk of the old tree might have some interesting carvings.", "Check if the tree trunk needs to be treated for pests.", "See if there are any unique patterns or textures on the tree trunk.",
"I saw her duck.", "Her duck was waddling in the park.", "The duck she owns is so cute.", "I spotted her duck by the pond.", "Her duck was quacking loudly.", "She ducked to avoid the flying object.", "Her quick ducking saved her from the falling branch.", "I noticed her sudden ducking movement.", "She ducked and dodged the incoming ball.",
"She has a great figure.", "Her body shape is very flattering.", "She carries herself with grace and confidence.", "Her figure is well-proportioned and attractive.", "She knows how to dress to highlight her figure.", "The numerical figure she presented was impressive.", "Her calculations yielded a significant figure.", "The data supports a substantial figure.", "The figure she quoted was accurate and reliable."]

context = ["bank-base", "bank-money", "bank-money", "bank-money", "bank-money", "bank-river", "bank-river", "bank-river", "bank-river", "cool-base", "cool-nice", "cool-nice", "cool-nice", "cool-nice", "cool-cold", "cool-cold", "cool-cold", "cool-cold", "trunk-base", "trunk-car", "trunk-car", "trunk-car", "trunk-car", "trunk-tree", "trunk-tree", "trunk-tree", "trunk-tree", 
           "duck-base", "duck-animal", "duck-animal", "duck-animal", "duck-animal", "duck-down", "duck-down", "duck-down", "duck-down", "figure-base", "figure-body", "figure-body", "figure-body", "figure-body", "figure-number", "figure-number", "figure-number", "figure-number"]

dataset = pd.DataFrame({'Sentence': sentences, 'Context': context})

In [None]:
dataset.head()

In [4]:
MODEL: str = "llama3:70b-instruct-q6_K" # "mixtral:8x7b-instruct-v0.1-q6_K"
CFG.prompt_files.keys()

dict_keys(['political_ideology', 'embedding_semantic_rich', 'political_negativity', 'embedding_finance', 'embedding_base', 'embedding_river', 'argument'])

In [5]:
keys_to_include = ['embedding_semantic_rich', 'embedding_finance', 'embedding_base', 'embedding_river']
subset_dict = {k: CFG.prompt_files[k] for k in keys_to_include if k in CFG.prompt_files}
subset_dict

{'embedding_semantic_rich': PosixPath('data/prompts/embedding_semantic_rich'),
 'embedding_finance': PosixPath('data/prompts/embedding_finance.json'),
 'embedding_base': PosixPath('data/prompts/embedding_base.json'),
 'embedding_river': PosixPath('data/prompts/embedding_river.json')}

In [6]:
subset_dict.values()

dict_values([PosixPath('data/prompts/embedding_semantic_rich'), PosixPath('data/prompts/embedding_finance.json'), PosixPath('data/prompts/embedding_base.json'), PosixPath('data/prompts/embedding_river.json')])

In [55]:
templates = [
    {"name": "embed_base", "template": 'Please consider the following sentence in the context of a story and find sentence level embeddings. Only answer with the sentence and nothing else./n'},
    {"name": "embed_rich", "template": 'Please consider the following sentence in the context of a story and find semantically rich sentence level embeddings. Only answer with the sentence and nothing else./n'},
    {"name": "embed_money", "template": 'Please consider the following sentence in the context of a story about finance and find semantically rich sentence level embeddings. Only answer with the sentence and nothing else./n'},
    {"name": "embed_river", "template": 'Please consider the following sentence in the context of a story about a river and find semantically rich sentence level embeddings. Only answer with the sentence and nothing else./n'}
]

template_dict = pd.DataFrame(templates).to_dict(orient='records')

In [30]:
for item in template_dict:
    name = item['name']
    print(name)

embed_base
embed_rich
embed_money
embed_river


In [61]:
embed_rich: typing.Dict[str, np.ndarray] = {}
embed_money: typing.Dict[str, np.ndarray] = {}
embed_river: typing.Dict[str, np.ndarray] = {}
embed_base: typing.Dict[str, np.ndarray] = {}

In [62]:
for item in template_dict:
    name = item['name']
    template = item['template']
    for index, value in tqdm.tqdm(dataset["Sentence"].items(), total=len(dataset)):
        try: 
            embed = np.array(requests.post(
                'https://inf.cl.uni-trier.de/embed/',
                json={'model': MODEL, 'prompt': template + value}
                ).json()["response"])
        except Exception as _e:
            logging.warning(_e)
            embed = None
        
        if name=='embed_rich':
            embed_rich[index] = embed
        if name=='embed_base':
            embed_base[index] = embed
        if name=='embed_river':
            embed_river[index] = embed
        if name=='embed_money':
            embed_money[index] = embed

       

  0%|          | 0/45 [00:00<?, ?it/s]

100%|██████████| 45/45 [00:08<00:00,  5.22it/s]
100%|██████████| 45/45 [00:08<00:00,  5.18it/s]
100%|██████████| 45/45 [00:08<00:00,  5.29it/s]
100%|██████████| 45/45 [00:08<00:00,  5.24it/s]


In [64]:
dataset_w_embeds = dataset.join(pd.Series(embed_rich, name="embed_rich")).join(pd.Series(embed_base, name="embed_base")).join(pd.Series(embed_money, name="embed_money")).join(pd.Series(embed_river, name="embed_river"))
#dataset_w_embeds.to_parquet(f'{CFG.report_dir}/dataset.embeds.parquet')
dataset_w_embeds.head()

Unnamed: 0,Sentence,Context,embed_rich,embed_base,embed_money,embed_river
0,I'm going to the bank.,bank-base,"[0.0007291287183761597, -0.13166683912277222, ...","[0.18572592735290527, -0.297760933637619, -0.0...","[0.2104983925819397, -0.05846552550792694, 0.1...","[0.23099350929260254, -0.20068785548210144, 0...."
1,I need to visit the bank today.,bank-money,"[0.6171398758888245, -0.12135252356529236, -0....","[0.10708608478307724, -0.3206588923931122, -0....","[0.5208940505981445, 0.1724483221769333, -0.11...","[0.39583635330200195, 0.2690700888633728, 0.19..."
2,I'm heading to the bank to withdraw some cash.,bank-money,"[0.34433573484420776, -0.06507916003465652, -0...","[0.3088618516921997, -0.019220836460590363, 0....","[0.6007224917411804, 0.1417076140642166, 0.068...","[0.42295312881469727, 0.11269190907478333, 0.1..."
3,The bank is where I'm off to.,bank-money,"[-0.13297878205776215, -0.0911918580532074, -0...","[0.19968175888061523, -0.26484861969947815, -0...","[0.05104644224047661, -0.08790059387683868, -0...","[-0.11331242322921753, -0.04678422957658768, 0..."
4,I have an appointment at the bank.,bank-money,"[0.2728506922721863, -0.2755275070667267, -0.2...","[-0.055613771080970764, -0.39524802565574646, ...","[0.3922412693500519, -0.22213643789291382, -0....","[0.2372683882713318, -0.21723666787147522, -0...."


In [88]:
#comparing effect of base versus rich embeddings: base results
results: typing.Dict[typing.Tuple[str, str], float] = {}
groups = {
    "prombase-senbase": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-base"]["embed_base"],
    "prombase-senriver": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-river"]["embed_base"],
    "prombase-senmoney": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-money"]["embed_base"]
}

In [91]:
#comparing effect of base versus rich embeddings:rich results
groups = {
    "promrich-senbase": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-base"]["embed_rich"],
    "promrich-senriver": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-river"]["embed_rich"],
    "promrich-senmoney": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-money"]["embed_rich"]
}

In [92]:
dist = torch.nn.PairwiseDistance()

for label_1, c_1 in groups.items():
    for label_2, c_2 in groups.items():

        if (
            (label_1, label_2) in results.keys() or 
            (label_2, label_1) in results.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        results[(label_1, label_2)] = res

        print(f'{label_1}:{label_2}:{res.item()}')

promrich-senbase:promrich-senbase:3.1999999999999965e-05
promrich-senbase:promrich-senriver:9.89513416690572
promrich-senbase:promrich-senmoney:7.614145734355528
promrich-senriver:promrich-senriver:7.048040232643907
promrich-senriver:promrich-senmoney:9.63994621535187
promrich-senmoney:promrich-senmoney:5.969263077751312


In [93]:
results

{('prombase-senbase',
  'prombase-senbase'): tensor(3.2000e-05, dtype=torch.float64),
 ('prombase-senbase',
  'prombase-senriver'): tensor(9.0819, dtype=torch.float64),
 ('prombase-senbase',
  'prombase-senmoney'): tensor(6.9080, dtype=torch.float64),
 ('prombase-senriver',
  'prombase-senriver'): tensor(5.9679, dtype=torch.float64),
 ('prombase-senriver',
  'prombase-senmoney'): tensor(10.1019, dtype=torch.float64),
 ('prombase-senmoney',
  'prombase-senmoney'): tensor(6.2837, dtype=torch.float64),
 ('promrich-senbase',
  'promrich-senbase'): tensor(3.2000e-05, dtype=torch.float64),
 ('promrich-senbase',
  'promrich-senriver'): tensor(9.8951, dtype=torch.float64),
 ('promrich-senbase',
  'promrich-senmoney'): tensor(7.6141, dtype=torch.float64),
 ('promrich-senriver',
  'promrich-senriver'): tensor(7.0480, dtype=torch.float64),
 ('promrich-senriver',
  'promrich-senmoney'): tensor(9.6399, dtype=torch.float64),
 ('promrich-senmoney',
  'promrich-senmoney'): tensor(5.9693, dtype=torch.f

it appears distances between sentences about money or river-banks do not increase when prompted to generate semantically rich embeddings


 we need to compare the distance from rich embeddings of the money sentences to [rich embeddings vs money embeddings] of the base sentence, similarly we need to compare the distance from rich embeddings of the river sentences to [rich vs river] embeddings of the base sentence 

the hypothesis would be that the distance between base and money sentences is smaller when the base sentence is prompted about finance than only for rich embeddings
similarly the second hypothesis would be that the distance between base and river sentences is smaller when the base sentence is prompted about river than only for rich embeddings

In [136]:
#comparing the effect of prompting context for the proximity of base sentence to context groups: river context
groups = {
    "promrich-senbase": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-base"]["embed_rich"],
    "promrich-senriver": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-river"]["embed_rich"],
    "promrich-senmoney": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-money"]["embed_rich"],
    "promriver-senbase": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-base"]["embed_river"],
    "prommoney-senbase": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-base"]["embed_money"]   
}
results3: typing.Dict[typing.Tuple[str, str], float] = {}

In [117]:
#comparing the effect of prompting context for the proximity of base sentence to context groups: river context
groups = {
    "promrich-senbase": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-base"]["embed_rich"],
    "promrich-senriver": dataset_w_embeds[dataset_w_embeds["Context"] == "bank-river"]["embed_rich"],
    }
results3: typing.Dict[typing.Tuple[str, str], float] = {}

In [137]:
dist = torch.nn.PairwiseDistance()

for label_1, c_1 in groups.items():
    for label_2, c_2 in groups.items():

        if (
            (label_1, label_2) in results3.keys() or 
            (label_2, label_1) in results3.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        results3[(label_1, label_2)] = res

        print(f'{label_1}:{label_2}:{res.item()}')

promrich-senbase:promrich-senbase:3.1999999999999965e-05
promrich-senbase:promrich-senriver:9.89513416690572
promrich-senbase:promrich-senmoney:7.614145734355528
promrich-senbase:promriver-senbase:6.663236028231872
promrich-senbase:prommoney-senbase:5.876314264484897
promrich-senriver:promrich-senriver:7.048040232643907
promrich-senriver:promrich-senmoney:9.63994621535187
promrich-senriver:promriver-senbase:10.113802779148141
promrich-senriver:prommoney-senbase:10.477796142593455
promrich-senmoney:promrich-senmoney:5.969263077751312
promrich-senmoney:promriver-senbase:9.403336166357366
promrich-senmoney:prommoney-senbase:8.599128745915003
promriver-senbase:promriver-senbase:3.1999999999999965e-05
promriver-senbase:prommoney-senbase:5.448238857335945
prommoney-senbase:prommoney-senbase:3.1999999999999965e-05


H1: promrich-senmoney:promrich-senbase > promrich-senmoney:prommoney-senbase
    promrich-senbase:promrich-senmoney:7.614145734355528 < promrich-senmoney:prommoney-senbase:8.599128745915003

H2: promrich-senriver:promrich-senbase > promrich-senriver:promriver-senbase
    promrich-senbase:promrich-senriver:9.89513416690572 < promriver-senbase:promrich-senriver:10.113802957715867 

-> both hypotheses are falsified

how can we explain this?



In [138]:
#maybe look at the output that is embedded, did the output yield correctly or are there traces of the prompt left in the output?

output_rich: typing.Dict[str, np.ndarray] = {}
output_money: typing.Dict[str, np.ndarray] = {}
output_river: typing.Dict[str, np.ndarray] = {}
output_base: typing.Dict[str, np.ndarray] = {}

for item in template_dict:
    name = item['name']
    template = item['template']
    for index, value in tqdm.tqdm(dataset["Sentence"].items(), total=len(dataset)):
        try: 
            output = np.array(requests.post(
                'https://inf.cl.uni-trier.de/',
                json={'model': MODEL, 'prompt': template + value}
                ).json()["response"])
        except Exception as _e:
            logging.warning(_e)
            output = None
        
        if name=='embed_rich':
            output_rich[index] = output
        if name=='embed_base':
            output_base[index] = output
        if name=='embed_river':
            output_river[index] = output
        if name=='embed_money':
            output_money[index] = output

100%|██████████| 45/45 [09:35<00:00, 12.80s/it]
100%|██████████| 45/45 [03:38<00:00,  4.85s/it]
100%|██████████| 45/45 [03:10<00:00,  4.24s/it]
100%|██████████| 45/45 [02:24<00:00,  3.22s/it]


In [139]:
output_base

{0: array('[-0.045, , -0.0445, 0.0333, -0.0192, 0.0214, -0.0328, 0.0161, 0.0386, -0.0087, -0.0129, -0.0177, 0.0433, 0.0055, -0.0143, 0.0352, 0.0234, -0.0338, 0.0311, -0.0216, 0.0029, 0.0047, -0.0392, 0.0164, 0.0035, -0.0113, -0.0293, 0.0064, -0.0408, -0.0182, 0.0198, -0.0259, 0.0246, 0.0011, -0.0303, -0.0035, 0.0157, -0.0089, 0.0074, 0.0115, -0.0202, 0.0264, -0.0048, 0.0103, -0.0141, -0.0279, 0.0126, 0.0136, -0.0163, -0.0067, 0.0052, 0.0364, 0.0071, 0.0228, -0.0095, -0.0319]',
       dtype='<U468'),
 1: array('I need to visit the bank today.', dtype='<U31'),
 2: array('[0.0477, (heading), 0.0241 (to), -0.0113 (the), 0.0556 (bank), 0.0445 (to), -0.0074 (withdraw), 0.0362 (some), 0.0438 (cash)]',
       dtype='<U125'),
 3: array("[0.048, (bank), 0.044 (is), 0.042 (where), 0.041 (I'm), 0.04 (off), 0.038 (to)]",
       dtype='<U79'),
 4: array('[0.015, (have), -0.144 (an), 0.235 (appointment), -0.136 (at), 0.241 (the), 0.155 (bank)]',
       dtype='<U89'),
 5: array("[0.024, (river), 0.018