<a href="https://colab.research.google.com/github/dtim-upc/LOKI/blob/main/Reducing-Pairs/Data_Preprocessing_Grouping_Pruned_Extreme_Negatives_Balanced.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import json
import numpy as np
from sentence_transformers import SentenceTransformer, SimilarityFunction, util
from tqdm.notebook import tqdm
import torch
import random
from collections import Counter, defaultdict

In [3]:
# Working directories: the dataset is read from data_folder, results go to output_folder
data_folder = "/content/input_data"
output_folder = "/content/output_data"

# Create the destination up front; an already existing folder is not an error
os.makedirs(output_folder, exist_ok=True)

In [4]:
# Run on the GPU when torch can see one; otherwise stay on the CPU
device = "cpu"
if torch.cuda.is_available():
  device = "cuda"

# Pre-trained sentence encoder configured to use dot product as its similarity function.
# Alternative checkpoint that can be swapped in for the model name: "all-mpnet-base-v2"
model = SentenceTransformer(
  "all-MiniLM-L6-v2",
  similarity_fn_name=SimilarityFunction.DOT_PRODUCT,
  device=device,
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Read the formatted paragraph records from the input folder
file_path = os.path.join(data_folder, 'formatted_data.json')
with open(file_path, mode='r', encoding='utf-8') as dataset_file:
  data = json.loads(dataset_file.read())

In [7]:
# Build one embedding per paragraph; paragraph_ids[k] is the id of embeddings[k]
embeddings = []
paragraph_ids = []
for entry in tqdm(data, desc="Generating paragraph embeddings"):
  paragraph_ids.append(entry.get('id', None))

  # Encode the paragraph's sentences, then average them (mean-pooling over
  # the sentence axis) to obtain a single paragraph-level vector
  sentence_embeddings = model.encode(entry.get('sentence_context', []), convert_to_tensor=True)
  embeddings.append(torch.mean(sentence_embeddings, dim=0))

Generating paragraph embeddings:   0%|          | 0/3157 [00:00<?, ?it/s]

In [8]:
# Stack the per-paragraph vectors into a single torch tensor (one row per paragraph).
# The isinstance guard keeps this cell re-runnable: after the first run `embeddings`
# is already a tensor, and torch.stack() only accepts a sequence of tensors.
if isinstance(embeddings, (list, tuple)):
  embeddings = torch.stack(embeddings)

# Score every paragraph against every other one with the model's configured
# similarity function; entry [i, j] is the score for paragraphs i and j.
similarities = model.similarity(embeddings, embeddings)

In [9]:
# Result buckets that the pair classification fills in
positive_group, hard_negative_group = [], []
# Extreme negative candidates grouped by key; an unseen key starts out as an empty list
reserved_list_dict = defaultdict(list)

In [10]:
# One label per 0.01-wide similarity range, from "-1.00 to -0.99" up to "0.29 to 0.30".
# The edges are derived from integer hundredths: np.arange with a fractional step
# accumulates floating-point error and its element count is not guaranteed to be stable.
all_possible_ranges = [
  f"{hundredths / 100:.2f} to {(hundredths + 1) / 100:.2f}"
  for hundredths in range(-100, 30)
]
# Every range starts with an empty pair list and a zero count
reserved_list_dict = {key: [] for key in all_possible_ranges}
range_counts = Counter({key: 0 for key in all_possible_ranges})

In [11]:
# Similarity cut-offs applied when classifying a paragraph pair:
#   score >  positive_threshold                        -> positive / highly similar items
#   negative_threshold <= score <= positive_threshold  -> hard negative
#   score <  negative_threshold                        -> extreme negative
positive_threshold, negative_threshold = 0.69, 0.3

In [12]:
# Classify every unordered paragraph pair (upper triangle of the similarity matrix,
# diagonal excluded) as positive, hard negative, or extreme negative.
num_paragraphs = len(paragraph_ids)

# Start from empty containers so that re-running this cell cannot append duplicate pairs
positive_group.clear()
hard_negative_group.clear()
for range_key in reserved_list_dict:
  reserved_list_dict[range_key].clear()
  range_counts[range_key] = 0

progress_bar = tqdm(total=(num_paragraphs * (num_paragraphs - 1)) // 2, desc="Classifying paragraph pairs")
for i in range(num_paragraphs):
  # Fetch the remainder of row i in one call instead of one .item() per pair;
  # tolist() yields plain Python floats, which also keeps the records JSON-serializable
  row_scores = similarities[i, i + 1:].tolist()
  for j, similarity_score in enumerate(row_scores, start=i + 1):
    pair = {
      "paragraph_1": paragraph_ids[i],
      "paragraph_2": paragraph_ids[j],
      "similarity": round(similarity_score, 3)
    }

    if similarity_score > positive_threshold:
      positive_group.append(pair)
    elif similarity_score >= negative_threshold:
      hard_negative_group.append(pair)
    else:
      # A NaN/inf score fails both comparisons above and has no range to go into
      if not np.isfinite(similarity_score):
        continue
      # Extreme negative: file the pair under the 0.01-wide range that contains it.
      # The lower edge must come from floor(); rounding to the nearest hundredth
      # pushes scores from the upper half of a range into the next one and builds
      # labels that do not exist (e.g. "0.30 to 0.31", "-0.00 to 0.01").
      bin_index = int(np.floor(similarity_score * 100))
      range_key = f"{bin_index / 100:.2f} to {(bin_index + 1) / 100:.2f}"
      if range_key in reserved_list_dict:
        reserved_list_dict[range_key].append(pair)
        range_counts[range_key] += 1
  progress_bar.update(len(row_scores))

progress_bar.close()

Classifying paragraph pairs:   0%|          | 0/4981746 [00:00<?, ?it/s]

In [13]:
# S: number of extreme negative pairs recorded across all similarity ranges
S = sum(range_counts[range_key] for range_key in range_counts)

# M: how many extreme negatives to keep, i.e. as many as the positive and
# hard negative pairs combined
M = len(positive_group) + len(hard_negative_group)
remaining_target_count = M

In [14]:
# Bookkeeping shared by the allocation steps
allocations = dict()    # range label -> number of pairs to draw from that range
capped_ranges = set()   # ranges whose allocation was cut down to the pairs available
excess_pairs = 0        # allocated pairs that their own range could not supply

In [15]:
# Step 1: give every range a share of M proportional to its share of all
# extreme negative pairs (S); with no pairs at all every share is zero
for range_key, pairs_in_range in range_counts.items():
  if S > 0:
    proportion = pairs_in_range / S
  else:
    proportion = 0
  allocations[range_key] = round(proportion * M)

In [16]:
# Step 2: a range cannot supply more pairs than it holds. Cut such allocations
# down to the available count and remember the shortfall for redistribution.
for range_key, allotted in list(allocations.items()):
  pairs_in_range = range_counts[range_key]
  shortfall = allotted - pairs_in_range
  if shortfall > 0:
    allocations[range_key] = pairs_in_range
    excess_pairs += shortfall
    capped_ranges.add(range_key)

In [17]:
# Step 3: Redistribute excess pairs over the ranges that still have unused pairs.
# Every share is computed from the same, fixed amount to place. Scaling by an excess
# that shrinks while iterating hands later ranges too little and leaves part of the
# excess unplaced even though room is available.
if excess_pairs > 0:
  # Unused pairs per range: pairs available minus pairs already allocated
  headroom = {
    range_key: range_counts[range_key] - allocations[range_key]
    for range_key in allocations
  }
  total_capacity = sum(room for room in headroom.values() if room > 0)

  # If there's no capacity left, we cannot redistribute
  if total_capacity == 0:
    excess_pairs = 0  # Cannot distribute further
  else:
    to_place = min(excess_pairs, total_capacity)
    placed = 0

    # Proportional pass: integer floor shares never exceed a range's headroom
    for range_key, room in list(headroom.items()):
      if room > 0:
        extra = (room * to_place) // total_capacity
        allocations[range_key] += extra
        headroom[range_key] = room - extra
        placed += extra

    # Remainder pass: flooring leaves fewer unplaced pairs than there are ranges
    # with headroom, so hand them out one at a time, largest headroom first
    for range_key in sorted(headroom, key=headroom.get, reverse=True):
      if placed >= to_place:
        break
      if headroom[range_key] > 0:
        allocations[range_key] += 1
        headroom[range_key] -= 1
        placed += 1

    # Whatever is left exceeds the total capacity and cannot be placed anywhere
    excess_pairs -= placed

In [18]:
# Draw the allocated number of extreme negative pairs from every similarity range.
# A dedicated, seeded generator makes the selection reproducible across kernel
# restarts; the unseeded module-level random.sample picks a different subset each run.
sampling_rng = random.Random(42)
pruned_extreme_negative_group = []
progress_bar = tqdm(total=len(allocations), desc="Pruning extreme negative pairs")

for range_key, num_pairs in allocations.items():
  if num_pairs > 0:
    available_pairs = reserved_list_dict[range_key]
    # Ensure we do not sample more pairs than available
    num_pairs = min(num_pairs, len(available_pairs))
    selected_pairs = sampling_rng.sample(available_pairs, num_pairs)
    pruned_extreme_negative_group.extend(selected_pairs)
  progress_bar.update(1)

progress_bar.close()

Pruning extreme negative pairs:   0%|          | 0/130 [00:00<?, ?it/s]

In [19]:
# Write each classified group to its own JSON file inside the output folder
output_files = (
  ('positive_group.json', positive_group),
  ('hard_negative_group.json', hard_negative_group),
  ('extreme_negative_group.json', pruned_extreme_negative_group),
)
for file_name, pairs in output_files:
  with open(os.path.join(output_folder, file_name), 'w', encoding='utf-8') as f:
    json.dump(pairs, f, indent=4, ensure_ascii=False)

print("Classification completed and saved to output folder.")

Classification completed and saved to output folder.
