In [1]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel


In [2]:
model_name = "Twitter/twhin-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Some weights of BertModel were not initialized from the model checkpoint at Twitter/twhin-bert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
LANGUAGE: str = 'English'
GROUPER: str = 'retrieved_source'

In [4]:
# Function to encode a single sentence and return its embeddings
def encode_argument(sentence):
    input_ids = tokenizer.encode(sentence, add_special_tokens=True, truncation=True, max_length=512, padding='max_length')
    input_ids_tensor = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension
    attention_mask = torch.ones_like(input_ids_tensor)  # Creating attention mask
    with torch.no_grad():
        outputs = model(input_ids_tensor, attention_mask=attention_mask)
        argument_embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0)  # Mean pooling of token embeddings
    return argument_embedding.tolist()

In [36]:
# Load the arguments from the .parquet file
arguments_df = pd.read_parquet(f"arguments.by.{GROUPER}.{LANGUAGE}.parquet")



In [37]:
arguments_df

Unnamed: 0,arguments,label
0,"""The US should provide Ukraine with military aid",https://cepa.org/article/europe-slumbers-at-uk...
1,"""Continued Russian aggression towards Ukraine ...",https://cepa.org/article/europe-slumbers-at-uk...
2,"""The international community must take action ...",https://cepa.org/article/europe-slumbers-at-uk...
3,"""The US has an important role in supporting Uk...",https://cepa.org/article/europe-slumbers-at-uk...
4,"""There is no substitute for Congressional acti...",https://cepa.org/article/europe-slumbers-at-uk...
...,...,...
991,"""Economic sanctions imposed on Russia have no...",https://www.wilsoncenter.org/blog-post/state-u...
992,"""The conflict in Ukraine is a complex issue w...",https://www.wilsoncenter.org/blog-post/state-u...
993,"""The international community must take action...",https://www.wilsoncenter.org/blog-post/state-u...
994,"""Diplomatic efforts are necessary to find a p...",https://www.wilsoncenter.org/blog-post/state-u...


In [38]:
####run this block only for GROUPER=retrieved_source to condense article link id's and focus on articles with most arguments:
arguments_df['label_nr'] = pd.factorize(arguments_df['label'])[0] + 1
# Count the occurrences of each value in the column
counts = arguments_df['label_nr'].value_counts()
threshold = 17
selected_values = counts[counts > threshold].index
arguments_df = arguments_df[arguments_df['label_nr'].isin(selected_values)]
####

In [44]:
arguments_df.loc[:, 'embedding']


58     [0.056044090539216995, -0.13699206709861755, -...
59     [0.07642713189125061, -0.1521081030368805, -0....
60     [0.03913707286119461, -0.130110502243042, -0.3...
61     [0.04545748978853226, -0.1358380764722824, -0....
62     [0.06929218769073486, -0.15236812829971313, -0...
                             ...                        
965    [0.05197259783744812, -0.14843794703483582, -0...
966    [0.06756751239299774, -0.16555306315422058, -0...
967    [0.059354379773139954, -0.14634248614311218, -...
968    [0.0737944096326828, -0.1366034597158432, -0.3...
969    [0.026186656206846237, -0.11421550810337067, -...
Name: embedding, Length: 96, dtype: object

In [39]:
# Create embeddings for each argument and store them in a new column
arguments_df.loc[:, 'embedding'] = arguments_df["arguments"].apply(encode_argument)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  arguments_df["embedding"] = arguments_df["arguments"].apply(encode_argument)


In [41]:
# Save the DataFrame with embeddings to a new .parquet file
arguments_df.to_parquet(f'argument_embeddings.by.{GROUPER}.{LANGUAGE}.parquet')
arguments_df.to_csv(f'argument_embeddings.by.{GROUPER}.{LANGUAGE}.csv')
