# 2c. DATA DEDUPLICATION

This notebook handles:
1. **Exact Duplicate Detection** - Find and remove fully identical rows
2. **Near-Duplicate Detection** - Use similarity measures to find non-exact duplicates
3. **Similarity Functions** - Implement Jaccard, Levenshtein, and other measures
4. **Blocking Strategies** - Efficient candidate generation for large datasets
5. **Duplicate Resolution** - Merge or remove identified duplicates

## 2c.1 Imports and Load Data

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import re

# Widen pandas' display limits so wide frames are shown with more columns per row.
pd.options.display.max_columns = 50
pd.options.display.width = 140

In [None]:
# Load the most processed version of the dataset that exists on disk:
# cleaned -> transformed -> original export. Only a missing file triggers the
# next fallback; any other read error propagates.
for _csv_path, _loaded_message in (
    ("MILANO_cleaned.csv", "Loaded cleaned dataset"),
    ("MILANO_transformed.csv", "Loaded transformed dataset"),
):
    try:
        MILANO = pd.read_csv(_csv_path, sep=";")
    except FileNotFoundError:
        continue
    print(_loaded_message)
    break
else:
    # Neither intermediate file exists: read the original export. A missing
    # file here is not caught, so the cell fails with FileNotFoundError.
    MILANO = pd.read_csv("Comune-di-Milano-Pubblici-esercizi(in)-2.csv", sep=";")
    print("Loaded original dataset")

print(f"Shape: {MILANO.shape}")
MILANO.head()

---
# 1. EXACT DUPLICATE DETECTION

## 1.1 Find Exact Duplicates

In [None]:
# Exact duplicates are rows that match on every column.
# `n_duplicates` counts only the repeats (the first occurrence is not counted);
# `duplicates` flags every member of a duplicated group, first occurrence included.
n_duplicates = MILANO.duplicated(keep='first').sum()
duplicates = MILANO.duplicated(keep=False)

print(
    f"Total rows: {len(MILANO)}",
    f"Exact duplicate rows (excluding first): {n_duplicates}",
    f"Total rows involved in duplication: {duplicates.sum()}",
    sep="\n",
)

In [None]:
# Show the duplicated rows, sorted on all columns so identical rows sit together.
if duplicates.any():
    duplicated_rows = MILANO.loc[duplicates]
    print("Duplicate rows:")
    display(duplicated_rows.sort_values(by=MILANO.columns.tolist()))

## 1.2 Check Duplicates on Key Columns

In [None]:
# Look for rows sharing the same address, using only the address columns
# that are actually present in the frame.
address_cols = ['Codice via', 'Civico', 'ZD']
available_cols = [col for col in address_cols if col in MILANO.columns]

if available_cols:
    dups_address = MILANO.duplicated(subset=available_cols, keep=False)
    n_dups_address = dups_address.sum()
    print(f"Rows with duplicate address ({available_cols}): {n_dups_address}")

    # NOTE(review): the sample is only shown when at most 50 rows are affected,
    # even though head(20) already caps the output - confirm this is intended.
    if 0 < n_dups_address <= 50:
        print("\nSample of duplicate addresses:")
        same_address_rows = MILANO[dups_address].sort_values(by=available_cols)
        display(same_address_rows.head(20))

## 1.3 Remove Exact Duplicates

In [None]:
# Drop fully identical rows, keeping the first row of each duplicated group,
# and report how many rows were removed.
before_count = len(MILANO)
MILANO = MILANO.drop_duplicates(keep='first')
after_count = len(MILANO)

print(
    f"Rows before: {before_count}",
    f"Rows after: {after_count}",
    f"Removed: {before_count - after_count}",
    sep="\n",
)

---
# 2. SIMILARITY FUNCTIONS

Implement various similarity measures for near-duplicate detection.

## 2.1 Jaccard Similarity (Set-based)

In [None]:
def jaccard_similarity(s1, s2):
    """Token-level Jaccard similarity between two values.

    Each value is converted to a string, lower-cased and split on whitespace;
    the score is ``|A & B| / |A | B|`` over the two token sets.

    Returns:
        0.0 if either value is missing, 1.0 if both contain no tokens,
        otherwise a float in [0, 1].
    """
    if pd.isna(s1) or pd.isna(s2):
        return 0.0

    tokens_a = set(str(s1).lower().split())
    tokens_b = set(str(s2).lower().split())

    # Two token-less strings are treated as identical.
    if not tokens_a and not tokens_b:
        return 1.0

    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    # `combined` cannot be empty here: at least one side has a token.
    return len(shared) / len(combined)

# Demo: token-level Jaccard on two example pairs
print("Jaccard examples:")
accent_pair_score = jaccard_similarity('bar caffè milano', 'bar caffe milano')
print(f"  'bar caffè milano' vs 'bar caffe milano': {accent_pair_score:.3f}")
cuisine_pair_score = jaccard_similarity('ristorante italiano', 'pizzeria italiana')
print(f"  'ristorante italiano' vs 'pizzeria italiana': {cuisine_pair_score:.3f}")

## 2.2 Levenshtein Distance (Edit Distance)

In [None]:
def levenshtein_distance(s1, s2):
    """Case-insensitive Levenshtein (edit) distance between two values.

    Both values are converted to lower-cased strings. The distance is the
    minimum number of single-character insertions, deletions and
    substitutions needed to turn one string into the other.

    Returns:
        ``float('inf')`` if either value is missing, otherwise a
        non-negative int.
    """
    if pd.isna(s1) or pd.isna(s2):
        return float('inf')

    longer, shorter = str(s1).lower(), str(s2).lower()
    if len(longer) < len(shorter):
        longer, shorter = shorter, longer

    if not shorter:
        return len(longer)

    # Row-by-row dynamic programme; only the previous row is kept.
    prev = list(range(len(shorter) + 1))
    for row, ch_long in enumerate(longer, start=1):
        curr = [row]
        for col, ch_short in enumerate(shorter, start=1):
            cost = 0 if ch_long == ch_short else 1
            curr.append(min(
                prev[col] + 1,         # insertion
                curr[col - 1] + 1,     # deletion
                prev[col - 1] + cost,  # substitution (free on a match)
            ))
        prev = curr

    return prev[-1]

def levenshtein_similarity(s1, s2):
    """Compute normalized Levenshtein similarity (0-1 range).

    The score is ``1 - distance / max_len``, where ``distance`` comes from
    ``levenshtein_distance`` and ``max_len`` is the length of the longer
    string after lower-casing.

    Returns:
        0.0 if either value is missing, 1.0 if both strings are empty,
        otherwise a float in [0, 1].
    """
    if pd.isna(s1) or pd.isna(s2):
        return 0.0

    # levenshtein_distance lower-cases its inputs, and lower-casing can change
    # a string's length for some Unicode characters. Normalise before
    # measuring so the distance can never exceed max_len (which would push
    # the score below 0).
    s1, s2 = str(s1).lower(), str(s2).lower()
    max_len = max(len(s1), len(s2))

    if max_len == 0:
        return 1.0

    distance = levenshtein_distance(s1, s2)
    return 1 - (distance / max_len)

# Demo: edit distance and normalized similarity on two example pairs
print("Levenshtein examples:")
accent_distance = levenshtein_distance('caffè', 'caffe')
accent_similarity = levenshtein_similarity('caffè', 'caffe')
print(f"  'caffè' vs 'caffe': distance={accent_distance}, similarity={accent_similarity:.3f}")
typo_distance = levenshtein_distance('bar milano', 'bar milana')
typo_similarity = levenshtein_similarity('bar milano', 'bar milana')
print(f"  'bar milano' vs 'bar milana': distance={typo_distance}, similarity={typo_similarity:.3f}")

## 2.3 Q-Gram Similarity

In [None]:
def get_qgrams(s, q=2):
    """Return the overlapping character q-grams of a value, in order.

    The value is converted to a lower-cased string first. A missing value,
    or a string shorter than ``q``, yields an empty list.
    """
    if pd.isna(s):
        return []

    text = str(s).lower()
    grams = []
    for start in range(len(text) - q + 1):
        grams.append(text[start:start + q])
    return grams

def qgram_similarity(s1, s2, q=2):
    """Compute q-gram similarity as a multiset Jaccard score over q-grams.

    Each value is broken into character q-grams with ``get_qgrams``; repeated
    q-grams are counted (``Counter``), so the score is
    ``sum(min counts) / sum(max counts)``.

    Returns:
        0.0 if either value is missing (consistent with the other similarity
        functions in this notebook), otherwise a float in [0, 1]. When both
        strings are shorter than ``q`` and so have no q-grams, the result is
        1.0 only if the lower-cased strings are equal, otherwise 0.0.
    """
    # Without this guard two missing values both produce an empty q-gram bag
    # and would be scored as identical.
    if pd.isna(s1) or pd.isna(s2):
        return 0.0

    qgrams1 = Counter(get_qgrams(s1, q))
    qgrams2 = Counter(get_qgrams(s2, q))

    if len(qgrams1) == 0 and len(qgrams2) == 0:
        # Both strings are too short to yield any q-gram; empty bags carry no
        # information, so compare the normalised strings directly.
        return 1.0 if str(s1).lower() == str(s2).lower() else 0.0

    intersection = sum((qgrams1 & qgrams2).values())
    union = sum((qgrams1 | qgrams2).values())

    return intersection / union if union > 0 else 0.0

# Demo: the 2-grams of two spellings and their q-gram similarity
print("Q-gram (2-gram) examples:")
accented_grams = get_qgrams('caffè', 2)
print(f"  'caffè' -> {accented_grams}")
plain_grams = get_qgrams('caffe', 2)
print(f"  'caffe' -> {plain_grams}")
spelling_score = qgram_similarity('caffè', 'caffe', 2)
print(f"  Similarity: {spelling_score:.3f}")

---
# 3. NEAR-DUPLICATE DETECTION

## 3.1 Define Comparison Key

We'll look for near-duplicates among records that share the same address (street code and civic number) and have similar business names (`Insegna`).

In [None]:
# Create a composite key for blocking: "<street code>_<civic number>".
# Rows missing either part get no key (NaN). astype(str) would otherwise turn
# a missing value into the literal 'nan', putting every address-less record
# into one shared block where they would be compared as if co-located.
_address_known = MILANO[['Codice via', 'Civico']].notna().all(axis=1)
MILANO['block_key'] = (
    MILANO['Codice via'].astype(str) + '_' +
    MILANO['Civico'].astype(str)
).where(_address_known)

# Count how many records share the same block key
# (value_counts ignores the missing keys by default).
block_sizes = MILANO['block_key'].value_counts()
print(f"Total blocks: {len(block_sizes)}")
print(f"Blocks with multiple records: {(block_sizes > 1).sum()}")
print(f"Rows without a complete address (not blocked): {(~_address_known).sum()}")
print(f"\nLargest blocks:")
display(block_sizes.head(10))

## 3.2 Find Near-Duplicates within Blocks

In [None]:
def find_near_duplicates(df, block_col, compare_col, similarity_threshold=0.8,
                         similarity_fn=None):
    """Find near-duplicates within each block based on string similarity.

    Rows are grouped by ``block_col`` and every pair of rows inside a group is
    scored on ``compare_col``; pairs scoring at least ``similarity_threshold``
    are reported. The number of comparisons is quadratic in the block size.

    Args:
        df: DataFrame to search.
        block_col: Column whose equal values define a block. Rows whose block
            value is missing are skipped (pandas ``groupby`` drops them by
            default).
        compare_col: Column holding the values to compare.
        similarity_threshold: Minimum score for a pair to be reported.
        similarity_fn: Optional callable ``(a, b) -> float``. Defaults to
            ``levenshtein_similarity``.

    Returns:
        DataFrame with columns ``idx1, idx2, block, val1, val2, similarity``,
        one row per matching pair. The columns are present even when no pair
        matches.
    """
    if similarity_fn is None:
        similarity_fn = levenshtein_similarity

    result_columns = ['idx1', 'idx2', 'block', 'val1', 'val2', 'similarity']
    near_dups = []

    # Group by block key
    for block_key, group in df.groupby(block_col):
        if len(group) < 2:
            continue

        # Read labels and values once per block. Pairing them by position
        # avoids a df.loc lookup per comparison and stays correct when index
        # labels are not unique.
        members = list(zip(group.index.tolist(), group[compare_col].tolist()))

        # Compare all pairs within block
        for pos, (idx1, val1) in enumerate(members):
            for idx2, val2 in members[pos + 1:]:
                sim = similarity_fn(val1, val2)

                if sim >= similarity_threshold:
                    near_dups.append({
                        'idx1': idx1,
                        'idx2': idx2,
                        'block': block_key,
                        'val1': val1,
                        'val2': val2,
                        'similarity': sim
                    })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(near_dups, columns=result_columns)

In [None]:
# Search for records at the same address (same block key) whose business
# name (Insegna) is similar. Skipped entirely if the column is absent.
insegna_col = 'Insegna' if 'Insegna' in MILANO.columns else None

if insegna_col:
    near_dups = find_near_duplicates(
        MILANO, 'block_key', insegna_col, similarity_threshold=0.7
    )
    n_candidate_pairs = len(near_dups)
    print(f"Found {n_candidate_pairs} potential near-duplicate pairs")

    if n_candidate_pairs > 0:
        ranked_pairs = near_dups.sort_values('similarity', ascending=False)
        display(ranked_pairs.head(20))

## 3.3 Examine Near-Duplicate Candidates

In [None]:
# Display the complete records behind the five most similar candidate pairs.
if insegna_col and len(near_dups) > 0:
    top_pairs = near_dups.sort_values('similarity', ascending=False).head(5)

    for pair in top_pairs.itertuples(index=False):
        print(f"\n=== Similarity: {pair.similarity:.3f} ===")
        print(f"Block: {pair.block}")

        pair_labels = [pair.idx1, pair.idx2]
        display(MILANO.loc[pair_labels])

---
# 4. BLOCKING STRATEGIES

## 4.1 Standard Blocking

In [None]:
# Standard blocking on the zone column: number of records in each ZD block.
if 'ZD' in MILANO.columns:
    print("Records per Zone (ZD):")
    zone_blocks = MILANO.groupby('ZD').size()
    display(zone_blocks)

## 4.2 Sorted Neighborhood

In [None]:
def sorted_neighborhood_pairs(df, sort_col, window_size=3):
    """Generate candidate pairs using the sorted neighborhood method.

    Rows are sorted by ``sort_col`` and each row is paired with the rows that
    follow it inside a sliding window of ``window_size`` rows (the row itself
    plus the next ``window_size - 1``).

    Args:
        df: DataFrame to draw candidates from.
        sort_col: Column used as the sorting key.
        window_size: Number of consecutive sorted rows in one window.

    Returns:
        DataFrame with columns ``idx1, idx2, val1, val2``: the original index
        labels of the two rows and their ``sort_col`` values. The columns are
        present even when no pair is produced.
    """
    # Sort by the specified column. Labels and values are read directly from
    # the sorted frame rather than via reset_index(): that only creates an
    # 'index' column when the index is unnamed, and per-cell .loc lookups in
    # the loop are slow.
    ordered = df.sort_values(sort_col)
    labels = ordered.index.tolist()
    values = ordered[sort_col].tolist()
    n_rows = len(labels)

    pairs = []
    for i in range(n_rows):
        for j in range(i + 1, min(i + window_size, n_rows)):
            pairs.append({
                'idx1': labels[i],
                'idx2': labels[j],
                'val1': values[i],
                'val2': values[j]
            })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(pairs, columns=['idx1', 'idx2', 'val1', 'val2'])

# Demonstration on the first 100 rows only, sorted by street code.
if 'Codice via' in MILANO.columns:
    sample = MILANO.iloc[:100]
    sn_pairs = sorted_neighborhood_pairs(sample, 'Codice via', window_size=3)
    n_sn_pairs = len(sn_pairs)
    print(f"Sorted neighborhood candidates (sample): {n_sn_pairs} pairs")

---
# 5. DUPLICATE RESOLUTION

## 5.1 Merge Strategy for Duplicates

In [None]:
def merge_duplicates(df, dup_pairs, prefer='first'):
    """Mark duplicates for removal, keeping the preferred record of each pair.

    Args:
        df: Source DataFrame. Not read by this function; kept for interface
            compatibility.
        dup_pairs: DataFrame of duplicate pairs with ``idx1`` and ``idx2``
            columns holding index labels.
        prefer: ``'first'`` keeps ``idx1`` and marks ``idx2`` for removal; any
            other value keeps ``idx2`` and marks ``idx1``.

    Returns:
        List of unique index labels to remove, in order of first appearance.
        Nothing is dropped here; the caller decides what to do with the
        labels.
    """
    # An empty pair frame may have no columns at all, so check before indexing.
    if len(dup_pairs) == 0:
        return []

    drop_col = 'idx2' if prefer == 'first' else 'idx1'

    # Read the column directly instead of iterrows(): iterrows builds one
    # Series per row and upcasts integer labels to float when the other
    # columns are numeric. dict.fromkeys de-duplicates in a stable order.
    return list(dict.fromkeys(dup_pairs[drop_col].tolist()))

# Resolve only when near-duplicate candidates exist, and then only the
# high-confidence ones (similarity >= 0.9). Indices are reported, not dropped.
if insegna_col and len(near_dups) > 0:
    high_conf_dups = near_dups.loc[near_dups['similarity'].ge(0.9)]

    if len(high_conf_dups) == 0:
        print("No high-confidence duplicates to remove")
    else:
        indices_to_remove = merge_duplicates(MILANO, high_conf_dups, prefer='first')
        print(f"Indices marked for removal: {len(indices_to_remove)}")
        # Long lists are abbreviated to the first ten labels.
        if len(indices_to_remove) > 10:
            print(f"Indices: {indices_to_remove[:10]}...")
        else:
            print(f"Indices: {indices_to_remove}")

## 5.2 Summary of Deduplication

In [None]:
# Remove the helper blocking column (if it was created) before reporting.
MILANO = MILANO.drop(columns=['block_key'], errors='ignore')

# `near_dups` only exists when the Insegna column was available earlier.
n_near_dups = len(near_dups) if 'near_dups' in globals() else 0

print("=== DEDUPLICATION SUMMARY ===")
print(f"Final row count: {len(MILANO)}")
print("Exact duplicates removed: Yes")
print(f"Near-duplicates identified: {n_near_dups}")

---
# 6. SAVE DEDUPLICATED DATASET

In [None]:
# Write the deduplicated frame as a semicolon-separated CSV, without the index.
MILANO.to_csv(
    "MILANO_deduplicated.csv",
    sep=";",
    index=False,
)
print("Saved: MILANO_deduplicated.csv")

In [None]:
# Report the final shape, then show the first rows as the cell output.
final_shape = MILANO.shape
print(f"Final dataset shape: {final_shape}")
MILANO.head(5)