In [2]:
#this notebook compares the performance of embeddings extracted by Llama3 models on a sentence similarity task
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import config
import typing
import logging
import tqdm
import torch
import numpy as np
import requests


In [3]:
CFG = config.Config()

In [3]:
#first calculate DiffCSE embeddings:
model_name = "voidism/diffcse-roberta-base-sts"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at voidism/diffcse-roberta-base-sts were not used when initializing RobertaModel: ['aux_bert.embeddings.LayerNorm.bias', 'aux_bert.embeddings.LayerNorm.weight', 'aux_bert.embeddings.position_embeddings.weight', 'aux_bert.embeddings.position_ids', 'aux_bert.embeddings.token_type_embeddings.weight', 'aux_bert.embeddings.word_embeddings.weight', 'aux_bert.encoder.layer.0.attention.output.LayerNorm.bias', 'aux_bert.encoder.layer.0.attention.output.LayerNorm.weight', 'aux_bert.encoder.layer.0.attention.output.dense.bias', 'aux_bert.encoder.layer.0.attention.output.dense.weight', 'aux_bert.encoder.layer.0.attention.self.key.bias', 'aux_bert.encoder.layer.0.attention.self.key.weight', 'aux_bert.encoder.layer.0.attention.self.query.bias', 'aux_bert.encoder.layer.0.attention.self.query.weight', 'aux_bert.encoder.layer.0.attention.self.value.bias', 'aux_bert.encoder.layer.0.attention.self.value.weight', 'aux_bert.encoder.layer.0.intermediate.dense.bias', 'aux

In [32]:
#first try procedure on extracted topic argument sets from simulation data COMPTEXT24:
LANGUAGE: str = 'English'
GROUPER: str = 'topic'
ARG_DIR: str = 'data/arguments'
MODEL: str = "llama3:70b-instruct-q6_K" # "mixtral:8x7b-instruct-v0.1-q6_K"

In [5]:
#function for encoding sentence embeddings based on the DiffCSE approach which is based on (and refers to) SimCSE/evaluation.py 
def DiffCSE_encode_argument(sentence):
    input_ids = tokenizer.encode(sentence, add_special_tokens=True, truncation=True, max_length=512, padding='max_length')
    input_ids_tensor = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension
    attention_mask = torch.ones_like(input_ids_tensor)  # Creating attention mask 
    with torch.no_grad():
            outputs = model(input_ids_tensor, output_hidden_states=True, return_dict=True, attention_mask=attention_mask)
            argument_embedding = outputs.last_hidden_state[:, 0].cpu()
            return argument_embedding.tolist()

In [20]:
# Load the arguments from the .parquet file
arguments_df = pd.read_parquet(f"{ARG_DIR}/arguments.by.{GROUPER}.{LANGUAGE}.parquet")

In [23]:
arguments_df.head()

Unnamed: 0,arguments,label,DiffCSE_embedding
0,"[""Access to affordable healthcare is a fundam...",healthcare,"[[-0.031701505184173584, 0.1383965015411377, -..."
1,"""The current healthcare system unfairly benef...",healthcare,"[[-0.041136495769023895, 0.11384233832359314, ..."
2,"""High costs of healthcare create financial bu...",healthcare,"[[0.0036838464438915253, 0.12259957194328308, ..."
3,"""Pre-existing conditions should not exclude i...",healthcare,"[[0.041562195867300034, 0.14262324571609497, -..."
4,"""Universal healthcare would ensure everyone h...",healthcare,"[[0.0069744773209095, 0.1994149088859558, -0.3..."


In [22]:
#for DiffCSE: Create embeddings for each argument and store them in a new column
arguments_df.loc[:, 'DiffCSE_embedding'] = arguments_df["arguments"].apply(DiffCSE_encode_argument)

In [28]:
#for Llama3:
embed_Llama3: typing.Dict[str, np.ndarray] = {}


In [34]:
    name = "embed_llama3"
    template = 'You help me get embeddings for a sentence. I provide you a with a context and a sentence and you reply only with that exact sentence. Context = '
    for index, row in tqdm.tqdm(arguments_df.iterrows(), total=len(arguments_df)):
        context = row["label"]
        sentence = row["arguments"]
        try: 
            embed = np.array(requests.post(
                'https://inf.cl.uni-trier.de/embed/',
                json={'model': MODEL, 'prompt': template + context + '; Sentence: ' + sentence}
                ).json()["response"])
        except Exception as _e:
            logging.warning(_e)
            embed = None
        
        if name=='embed_llama3':
            embed_Llama3[index] = embed


  0%|          | 0/206 [00:00<?, ?it/s]

100%|██████████| 206/206 [00:43<00:00,  4.78it/s]


In [36]:
dataset_w_embeds = arguments_df.join(pd.Series(embed_Llama3, name="embed_llama3"))
#dataset_w_embeds.to_parquet(f'{CFG.report_dir}/dataset.embeds.parquet')
dataset_w_embeds.head()

Unnamed: 0,arguments,label,DiffCSE_embedding,embed_llama3
0,"[""Access to affordable healthcare is a fundam...",healthcare,"[[-0.031701505184173584, 0.1383965015411377, -...","[0.5143977403640747, -0.44048359990119934, -0...."
1,"""The current healthcare system unfairly benef...",healthcare,"[[-0.041136495769023895, 0.11384233832359314, ...","[0.40946468710899353, -0.4694342613220215, -0...."
2,"""High costs of healthcare create financial bu...",healthcare,"[[0.0036838464438915253, 0.12259957194328308, ...","[0.3307845890522003, -0.531212329864502, -0.79..."
3,"""Pre-existing conditions should not exclude i...",healthcare,"[[0.041562195867300034, 0.14262324571609497, -...","[0.2711120843887329, -0.33552631735801697, -0...."
4,"""Universal healthcare would ensure everyone h...",healthcare,"[[0.0069744773209095, 0.1994149088859558, -0.3...","[0.40233278274536133, -0.5556334257125854, -0...."


In [37]:
#reshape embeddings to single dimension:
dataset_w_embeds['DiffCSE_embedding'] = dataset_w_embeds['DiffCSE_embedding'].apply(lambda x: x[0])

In [38]:
grouped_data = dataset_w_embeds.groupby('label')

In [46]:
results: typing.Dict[typing.Tuple[str, str], float] = {}

In [47]:
#for DiffCSE:
dist = torch.nn.PairwiseDistance()

for model_1, c_1 in grouped_data['DiffCSE_embedding']:
    for model_2, c_2 in grouped_data['DiffCSE_embedding']:

        if (
            (model_1, model_2) in results.keys() or 
            (model_2, model_1) in results.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        results[(model_1, model_2)] = res

        print(f'{model_1}:{model_2}:{res.item()}')

healthcare:healthcare:3.512138113613706
healthcare:ukraine:4.4383741450897745
ukraine:ukraine:4.4135212572020945


In [48]:
#for Llama3
dist = torch.nn.PairwiseDistance()
results2: typing.Dict[typing.Tuple[str, str], float] = {}
for model_1, c_1 in grouped_data['embed_llama3']:
    for model_2, c_2 in grouped_data['embed_llama3']:

        if (
            (model_1, model_2) in results2.keys() or 
            (model_2, model_1) in results2.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        results2[(model_1, model_2)] = res

        print(f'{model_1}:{model_2}:{res.item()}')

healthcare:healthcare:3.8460111482448824
healthcare:ukraine:7.980850266694351
ukraine:ukraine:3.7148092856659054


clearly adding context to Llama3 embeddings differentiates the arguments considerably, but this could be due to the embedding extraction overly relying on the context... what if the arguments had a similar context? If we compare the diversity of all healthcare arguments to all ukraine arguments there is hardly any difference, while the DiffCSE embeddings found a larger diversity amond ukraine arguments than among healthcare arguments

In [51]:
    #what if we change the context:

    #for argument context:
    embed_Llama3_arg_context: typing.Dict[str, np.ndarray] = {}

    name = "embed_llama3"
    template = 'You help me get embeddings for a sentence. I provide you a with a context and a sentence and you reply only with that exact sentence. Context = '
    for index, row in tqdm.tqdm(arguments_df.iterrows(), total=len(arguments_df)):
        context = 'arguments about ' + row["label"]
        sentence = row["arguments"]
        try: 
            embed = np.array(requests.post(
                'https://inf.cl.uni-trier.de/embed/',
                json={'model': MODEL, 'prompt': template + context + '; Sentence: ' + sentence}
                ).json()["response"])
        except Exception as _e:
            logging.warning(_e)
            embed = None
        
        if name=='embed_llama3':
            embed_Llama3_arg_context[index] = embed

  0%|          | 0/206 [00:00<?, ?it/s]

100%|██████████| 206/206 [00:42<00:00,  4.86it/s]


In [52]:
dataset_w_embeds = dataset_w_embeds.join(pd.Series(embed_Llama3_arg_context, name="embed_Llama3_arg_context"))
#dataset_w_embeds.to_parquet(f'{CFG.report_dir}/dataset.embeds.parquet')
dataset_w_embeds.head()

Unnamed: 0,arguments,label,DiffCSE_embedding,embed_llama3,embed_Llama3_arg_context
0,"[""Access to affordable healthcare is a fundam...",healthcare,"[-0.031701505184173584, 0.1383965015411377, -0...","[0.5143977403640747, -0.44048359990119934, -0....","[0.49859514832496643, -0.534142255783081, -0.9..."
1,"""The current healthcare system unfairly benef...",healthcare,"[-0.041136495769023895, 0.11384233832359314, -...","[0.40946468710899353, -0.4694342613220215, -0....","[0.40977850556373596, -0.5506035089492798, -0...."
2,"""High costs of healthcare create financial bu...",healthcare,"[0.0036838464438915253, 0.12259957194328308, -...","[0.3307845890522003, -0.531212329864502, -0.79...","[0.29485011100769043, -0.5935364365577698, -0...."
3,"""Pre-existing conditions should not exclude i...",healthcare,"[0.041562195867300034, 0.14262324571609497, -0...","[0.2711120843887329, -0.33552631735801697, -0....","[0.3479368984699249, -0.47894537448883057, -0...."
4,"""Universal healthcare would ensure everyone h...",healthcare,"[0.0069744773209095, 0.1994149088859558, -0.36...","[0.40233278274536133, -0.5556334257125854, -0....","[0.3797900080680847, -0.6202806234359741, -0.9..."


In [53]:
    #what if we don't vary the context:

    #for argument context:
    embed_Llama3_constant_context: typing.Dict[str, np.ndarray] = {}

    name = "embed_llama3"
    template = 'You help me get embeddings for a sentence. I provide you a with a context and a sentence and you reply only with that exact sentence. Context = '
    for index, row in tqdm.tqdm(arguments_df.iterrows(), total=len(arguments_df)):
        context = 'viewpoints extracted from tweets'
        sentence = row["arguments"]
        try: 
            embed = np.array(requests.post(
                'https://inf.cl.uni-trier.de/embed/',
                json={'model': MODEL, 'prompt': template + context + '; Sentence: ' + sentence}
                ).json()["response"])
        except Exception as _e:
            logging.warning(_e)
            embed = None
        
        if name=='embed_llama3':
            embed_Llama3_constant_context[index] = embed

100%|██████████| 206/206 [00:39<00:00,  5.16it/s]


In [54]:
dataset_w_embeds = dataset_w_embeds.join(pd.Series(embed_Llama3_constant_context, name="embed_Llama3_constant_context"))
#dataset_w_embeds.to_parquet(f'{CFG.report_dir}/dataset.embeds.parquet')
dataset_w_embeds.head()

Unnamed: 0,arguments,label,DiffCSE_embedding,embed_llama3,embed_Llama3_arg_context,embed_Llama3_constant_context
0,"[""Access to affordable healthcare is a fundam...",healthcare,"[-0.031701505184173584, 0.1383965015411377, -0...","[0.5143977403640747, -0.44048359990119934, -0....","[0.49859514832496643, -0.534142255783081, -0.9...","[0.5910554528236389, -0.5961346626281738, -0.7..."
1,"""The current healthcare system unfairly benef...",healthcare,"[-0.041136495769023895, 0.11384233832359314, -...","[0.40946468710899353, -0.4694342613220215, -0....","[0.40977850556373596, -0.5506035089492798, -0....","[0.4684109091758728, -0.6994882225990295, -0.6..."
2,"""High costs of healthcare create financial bu...",healthcare,"[0.0036838464438915253, 0.12259957194328308, -...","[0.3307845890522003, -0.531212329864502, -0.79...","[0.29485011100769043, -0.5935364365577698, -0....","[0.4911944568157196, -0.7441452741622925, -0.6..."
3,"""Pre-existing conditions should not exclude i...",healthcare,"[0.041562195867300034, 0.14262324571609497, -0...","[0.2711120843887329, -0.33552631735801697, -0....","[0.3479368984699249, -0.47894537448883057, -0....","[0.40695157647132874, -0.6734833121299744, -0...."
4,"""Universal healthcare would ensure everyone h...",healthcare,"[0.0069744773209095, 0.1994149088859558, -0.36...","[0.40233278274536133, -0.5556334257125854, -0....","[0.3797900080680847, -0.6202806234359741, -0.9...","[0.48512306809425354, -0.6649637818336487, -0...."


In [55]:
grouped_data = dataset_w_embeds.groupby('label')

In [56]:
#for embed_Llama3_arg_context
dist = torch.nn.PairwiseDistance()
results3: typing.Dict[typing.Tuple[str, str], float] = {}
for model_1, c_1 in grouped_data['embed_Llama3_arg_context']:
    for model_2, c_2 in grouped_data['embed_Llama3_arg_context']:

        if (
            (model_1, model_2) in results3.keys() or 
            (model_2, model_1) in results3.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        results3[(model_1, model_2)] = res

        print(f'{model_1}:{model_2}:{res.item()}')

healthcare:healthcare:3.3984666872173945
healthcare:ukraine:5.409316422610467
ukraine:ukraine:3.5329811240691726


In [57]:
#for embed_Llama3_constant_context
dist = torch.nn.PairwiseDistance()
results4: typing.Dict[typing.Tuple[str, str], float] = {}
for model_1, c_1 in grouped_data['embed_Llama3_constant_context']:
    for model_2, c_2 in grouped_data['embed_Llama3_constant_context']:

        if (
            (model_1, model_2) in results4.keys() or 
            (model_2, model_1) in results4.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        results4[(model_1, model_2)] = res

        print(f'{model_1}:{model_2}:{res.item()}')

healthcare:healthcare:3.1070970643153184
healthcare:ukraine:3.8706244450564724
ukraine:ukraine:3.188041701467152


if we make the context wording more similar, the difference between topics is smaller
if we don't vary context at all, the difference between topics is very much smaller


In [26]:
#consider testing the effect of adding irrelevant context to sentences on embedding distances:

#prompted chatGPT:can you give me 3 examples of sentences which have the exact same meaning regardless of whether they are used in the context of ukaïne or healthcare?
# Define the sentences
sentences = [
    "The situation remains critical and requires immediate attention.",
    "There is a need for more resources to effectively address the challenges.",
    "Cooperation between different sectors is essential for a successful outcome."
]

# Create a DataFrame
df = pd.DataFrame(sentences, columns=['Sentences'])

# Display the DataFrame
print(df)


                                           Sentences
0  The situation remains critical and requires im...
1  There is a need for more resources to effectiv...
2  Cooperation between different sectors is essen...


In [27]:
#prompted Llama3: can you give me 3 examples of sentences which have the exact same meaning regardless of whether they are used in the context of ukaïne or healthcare?

sentences = [
    "The situation is getting out of control.",
    "We need to take immediate action to address this issue.",
    "The lack of resources is hindering our progress."
]


#Create a dictionary to store the data

df2 = pd.DataFrame(sentences, columns=['Sentences'])
df= df.append(df2, ignore_index=True)


  df= df.append(df2, ignore_index=True)


In [28]:
df

Unnamed: 0,Sentences
0,The situation remains critical and requires im...
1,There is a need for more resources to effectiv...
2,Cooperation between different sectors is essen...
3,The situation is getting out of control.
4,We need to take immediate action to address th...
5,The lack of resources is hindering our progress.
