<a href="https://colab.research.google.com/github/dtim-upc/LOKI/blob/main/Reducing-Pairs/Data_Preprocessing_Grouping_Simple.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import json
import torch
import numpy as np
from sentence_transformers import SentenceTransformer, SimilarityFunction
from tqdm.notebook import tqdm

In [3]:
# Folder layout: the dataset is read from `data_folder`, results go to `output_folder`.
data_folder, output_folder = "/content/input_data", "/content/output_data"

# Create the output directory up front; an already-existing directory is not an error.
os.makedirs(output_folder, exist_ok=True)

In [4]:
# Load the pre-trained SentenceTransformer model with the DOT_PRODUCT similarity function.
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs (more slowly) on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2", similarity_fn_name=SimilarityFunction.DOT_PRODUCT, device=device)
# Alternative model:
# model = SentenceTransformer("all-mpnet-base-v2", similarity_fn_name=SimilarityFunction.DOT_PRODUCT, device=device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
# Read the formatted dataset (a JSON document) from the input folder into `data`.
file_path = os.path.join(data_folder, 'formatted_data.json')
with open(file_path, mode='r', encoding='utf-8') as dataset_file:
    data = json.load(dataset_file)

In [15]:
# Generate one embedding per paragraph by mean-pooling its sentence embeddings.
# `embeddings[k]` and `paragraph_ids[k]` always describe the same paragraph.
embeddings = []
paragraph_ids = []
skipped_ids = []  # ids of entries that had no sentences to embed
for entry in tqdm(data, desc="Generating paragraph embeddings"):
    sentence_context = entry.get('sentence_context', [])
    paragraph_id = entry.get('id', None)

    # A bare string would be encoded as a single vector rather than a batch of
    # sentence vectors, so the mean over dim 0 below would collapse it to a scalar.
    if isinstance(sentence_context, str):
        sentence_context = [sentence_context]

    # Without sentences there is nothing to average; skip the entry (id and
    # embedding together) instead of producing an unusable paragraph vector.
    if not sentence_context:
        skipped_ids.append(paragraph_id)
        continue

    # Generate sentence embeddings and perform mean-pooling to get paragraph-level embedding
    sentence_embeddings = model.encode(sentence_context, convert_to_tensor=True)
    paragraph_embedding = sentence_embeddings.mean(dim=0)

    paragraph_ids.append(paragraph_id)
    embeddings.append(paragraph_embedding)

if skipped_ids:
    print(f"Skipped {len(skipped_ids)} entries with an empty 'sentence_context'.")

Generating paragraph embeddings:   0%|          | 0/3157 [00:00<?, ?it/s]

In [16]:
# Stack the per-paragraph embedding vectors into a single torch tensor (one row
# per paragraph). The guard makes the cell safe to re-run: once `embeddings`
# is already a tensor it is left as is instead of being stacked a second time.
if not torch.is_tensor(embeddings):
    embeddings = torch.stack(embeddings)

# Compute pairwise similarities with the similarity function configured on the model
similarities = model.similarity(embeddings, embeddings)

In [17]:
# Containers for classified groups
positive_group = []
extreme_negative_group = []
hard_negative_group = []

In [18]:
# Set the Threshold for the groups
positive_threshold = 0.7
negative_threshold = 0.3

In [19]:
# Classify every unordered paragraph pair by its similarity score:
#   score > positive_threshold -> positive_group
#   score < negative_threshold -> extreme_negative_group
#   anything in between        -> hard_negative_group

# Start from empty containers so that re-running this cell does not append
# every pair a second time.
for _group in (positive_group, extreme_negative_group, hard_negative_group):
    _group.clear()

num_paragraphs = len(paragraph_ids)

# We will only take the upper half of the matrix (diagonal excluded), which
# holds n * (n - 1) / 2 pairs; the product is always even, hence integer division.
total_pairs = num_paragraphs * (num_paragraphs - 1) // 2

# The context manager closes the progress bar even if the loop raises.
with tqdm(total=total_pairs, desc="Classifying paragraph pairs") as progress_bar:
    for idx_i in range(num_paragraphs):
        # Convert the rest of the row to Python floats in one call instead of
        # calling .item() once per pair, which is very slow for millions of pairs.
        row_scores = similarities[idx_i, idx_i + 1:num_paragraphs].tolist()

        # Start from idx_i + 1 to skip the diagonal and the lower half
        for idx_j, similarity_score in enumerate(row_scores, start=idx_i + 1):
            if similarity_score > positive_threshold:
                target_group = positive_group
            elif similarity_score < negative_threshold:
                target_group = extreme_negative_group
            else:
                target_group = hard_negative_group

            target_group.append({
                "paragraph_1": paragraph_ids[idx_i],
                "paragraph_2": paragraph_ids[idx_j],
                "similarity": round(similarity_score, 3)
            })

        # One progress update per row keeps the bar overhead negligible.
        progress_bar.update(len(row_scores))

Classifying paragraph pairs:   0%|          | 0/4981746 [00:00<?, ?it/s]

In [20]:
# Write each classified group to its own pretty-printed JSON file in the output folder.
groups_by_filename = {
    'positive_group.json': positive_group,
    'extreme_negative_group.json': extreme_negative_group,
    'hard_negative_group.json': hard_negative_group,
}

for filename, group in groups_by_filename.items():
    target_path = os.path.join(output_folder, filename)
    with open(target_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(group, f, indent=4, ensure_ascii=False)

print("Classification completed and saved to output folder.")

Classification completed and saved to output folder.
