In [6]:
import pandas as pd
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Location of the merged training workbook; edit this to point at your copy.
file_path = "/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/for_training/merged/aspirational_plus_batch_1_batch_2_merged.xlsx"

# Name of the column holding the text to compare; change it if the sheet
# uses a different header.
sentence_col = "sentence"

# Read the workbook into a DataFrame.
df = pd.read_excel(io=file_path)

# Function to clean text
def clean_text(text):
    """Return a normalized form of *text* for duplicate comparison.

    The value is converted with ``str()``, lowercased, stripped of every
    character that is neither a word character nor whitespace, and then
    whitespace-normalized: runs of whitespace collapse to a single space and
    leading/trailing whitespace is removed.

    Whitespace is normalized *after* punctuation removal so that text such
    as ``"hello !"`` or ``"a - b"`` does not keep a stray trailing or doubled
    space once the punctuation is gone.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', str(text).lower())
    # split()/join trims both ends and collapses internal whitespace runs.
    return ' '.join(without_punctuation.split())

# Normalize every sentence once up front so the matching loop below compares
# cleaned text only.
clean_sentences = [clean_text(sentence) for sentence in df[sentence_col]]

# Raw sentences in the same positional order as clean_sentences; used to
# report the original (uncleaned) text of each matched pair.
raw_sentences = df[sentence_col].tolist()

# Matching parameters; adjust to make detection stricter or looser.
SIMILARITY_THRESHOLD = 85  # minimum token_sort_ratio score for a reported pair
MAX_MATCHES_PER_SENTENCE = 10  # candidates requested per query (the query's own row is among them)

# Key the choices by row position. With a dict of choices, process.extract
# returns (choice, score, key) triples, so each match carries the position of
# the row it came from. Looking the position up with list.index() instead
# would always resolve to the FIRST row holding that text, which silently
# drops rows whose cleaned text is identical to the query's.
indexed_sentences = dict(enumerate(clean_sentences))

# Detect loose duplicates using fuzzy matching
duplicate_pairs = []
checked_sentences = set()

for idx, sentence in enumerate(clean_sentences):
    if sentence in checked_sentences:
        continue  # Identical cleaned text was already used as a query
    checked_sentences.add(sentence)

    matches = process.extract(
        sentence,
        indexed_sentences,
        scorer=fuzz.token_sort_ratio,
        limit=MAX_MATCHES_PER_SENTENCE,
    )

    for _match_sentence, score, match_idx in matches:
        # Skip the query's own row; every other sufficiently similar row is
        # recorded together with its similarity score.
        if score >= SIMILARITY_THRESHOLD and match_idx != idx:
            duplicate_pairs.append((raw_sentences[idx], raw_sentences[match_idx], score))

# Tabulate the collected (original, duplicate, score) tuples for review.
duplicate_df = pd.DataFrame(
    data=duplicate_pairs,
    columns=["Original Sentence", "Duplicate Sentence", "Similarity Score"],
)

# Write the table to a CSV file, omitting the DataFrame index column.
output_file = "loose_duplicates.csv"
duplicate_df.to_csv(path_or_buf=output_file, index=False)

In [None]:
# Display results
# ace_tools may not be installed in every environment (NOTE(review): it looks
# like a hosted-notebook helper — confirm). Import it optionally so this cell
# still shows the table and the summary count when the module is missing.
try:
    import ace_tools as tools
except ImportError:
    tools = None

if tools is not None:
    tools.display_dataframe_to_user(name="Loose Duplicates", dataframe=duplicate_df)
else:
    # Fallback: plain-text rendering of the DataFrame.
    print(duplicate_df)

print(f"Possible loose duplicates found: {len(duplicate_df)}")