# Needle Investigation & Fix

**Goal**: 
1. Investigate why the `source_sentence` values (needles) do not exactly match the text in the `section_*` columns.
2. Fix the `source_sentence` values by replacing them with the **exact substring** found in the corresponding section.

In [None]:
import pandas as pd
import numpy as np
import difflib
import re
import os

In [None]:
# Paths: where the repaired copy will be written, and the dated
# ground-truth export it is derived from.
OUTPUT_FILE = "../../data/clean_ground_truth/cleaned_EDGAR_gt_FIXED.csv"
INPUT_FILE = "../../data/clean_ground_truth/cleaned_EDGAR_gt_02-16-2026.csv"

# Read the ground truth into memory and report how many rows came back.
df = pd.read_csv(filepath_or_buffer=INPUT_FILE)
print(f"Loaded {df.shape[0]} rows.")

In [None]:
# Split the columns into haystacks and needles.
# Haystacks: names that start with "section_" but do not end with "_section".
section_cols = list(
    filter(
        lambda name: name.startswith("section_") and not name.endswith("_section"),
        df.columns,
    )
)
# Needles: names that end with "_source_sentence".
source_cols = list(filter(lambda name: name.endswith("_source_sentence"), df.columns))

print(f"Found {len(section_cols)} section columns.")
print(f"Found {len(source_cols)} source sentence columns.")

## 1. Analysis: Find Mismatches

In [None]:
def find_exact_section(row, source_sent, sections=None):
    """Return the first section column whose text contains ``source_sent``.

    The needle is stripped of leading/trailing whitespace before the
    substring test, so a hit means the *stripped* needle occurs verbatim in
    the section text.

    Args:
        row: Record exposing ``.get(name)`` (a pandas row Series or a dict).
        source_sent: The needle to look for; may be NaN/None.
        sections: Optional iterable of section column names to search, in
            priority order. Defaults to the module-level ``section_cols``.

    Returns:
        The name of the first matching section column, or ``None`` when the
        needle is missing, blank, or not found in any section.
    """
    if pd.isna(source_sent) or str(source_sent) == "nan":
        return None

    needle = str(source_sent).strip()
    if not needle:
        # "" is a substring of every string, so a blank needle would
        # otherwise "match" whichever section happens to come first.
        return None

    columns = section_cols if sections is None else sections
    for sec in columns:
        raw = row.get(sec)
        if raw is None or pd.isna(raw):
            # str(NaN) == "nan": stringifying an empty section would let
            # needles such as "an" match text that does not exist.
            continue
        if needle in str(raw):
            return sec
    return None

# Baseline: count the needles that are not yet a substring of any section.
print("Checking for exact matches...")

total_checked = 0
mismatch_count = 0

for col in source_cols:
    # Rows that actually carry a needle in this column.
    present = df[col].notna()
    # Name of the section containing each needle (None when there is none).
    matches = df.apply(lambda row: find_exact_section(row, row[col]), axis=1)
    not_found = present & matches.isna()
    count = not_found.sum()

    total_checked += present.sum()
    mismatch_count += count

    if count > 0:
        print(f"  {col}: {count} mismatches")

print(f"\nTotal Mismatches: {mismatch_count} / {total_checked}")

## 2. Fix: Fuzzy Anchor & Replace

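For each `source_sentence` that is not already an exact substring, we find the *closest* matching substring in the sections (first ignoring whitespace differences, then falling back to the longest common run of characters) and replace the `source_sentence` with that exact text.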

In [None]:
def _whitespace_insensitive_span(text, norm_needle):
    """Return the slice of ``text`` whose non-whitespace characters equal
    ``norm_needle``, or ``None`` when there is no such slice."""
    norm_text = "".join(text.split())
    start = norm_text.find(norm_needle)
    if start == -1:
        return None

    # Position in ``text`` of every character that survived normalisation;
    # entry k is where the k-th character of ``norm_text`` came from.
    kept = [i for i, ch in enumerate(text) if not ch.isspace()]
    first = kept[start]
    last = kept[start + len(norm_needle) - 1]
    return text[first:last + 1]


def extract_best_matching_span(row, source_sent, sections=None, min_coverage=0.8):
    """Find the section text that best matches ``source_sent``.

    Two strategies are tried for every section, in order:

    1. Whitespace-insensitive match: if the needle with all whitespace
       removed occurs in the section with all whitespace removed, the
       corresponding slice of the original section text is returned
       immediately.
    2. Longest common block: the longest contiguous run of characters shared
       by the section and the needle is recorded together with its coverage
       (run length divided by needle length). The best run over all sections
       is used only if its coverage exceeds ``min_coverage``. Because only
       one contiguous run is returned, the result can be shorter than the
       needle.

    Args:
        row: Record exposing ``.get(name)`` (a pandas row Series or a dict).
        source_sent: The needle to repair; may be NaN/None.
        sections: Optional iterable of section column names to search.
            Defaults to the module-level ``section_cols``.
        min_coverage: Coverage a fuzzy match must exceed to be accepted.

    Returns:
        ``(span, section_name)`` where ``span`` is an exact substring of the
        named section, or ``(source_sent, None)`` when the needle is missing,
        blank, or no acceptable match exists.
    """
    if pd.isna(source_sent) or str(source_sent) == "nan":
        return source_sent, None

    src = str(source_sent).strip()
    if not src:
        return source_sent, None

    columns = section_cols if sections is None else sections
    norm_src = "".join(src.split())

    # autojunk=False: with the default heuristic, any needle of 200+
    # characters has its frequent characters treated as "popular" and dropped
    # from the match index, so find_longest_match stops returning the true
    # longest block. The needle is the cached second sequence; each section
    # is swapped in with set_seq1.
    matcher = difflib.SequenceMatcher(None, b=src, autojunk=False)

    best_ratio = 0.0
    best_span = source_sent  # falls back to the original when nothing fits
    best_section_name = None

    for sec in columns:
        sec_text = str(row.get(sec, ""))
        if not sec_text or sec_text == "nan":
            continue

        # 1. Whitespace-insensitive match (cheap, and exact apart from spacing).
        span = _whitespace_insensitive_span(sec_text, norm_src)
        if span is not None:
            return span, sec

        # 2. Longest common block (slower; tolerates a differing prefix/suffix).
        matcher.set_seq1(sec_text)
        match = matcher.find_longest_match(0, len(sec_text), 0, len(src))
        if match.size > 0:
            coverage = match.size / len(src)
            if coverage > best_ratio:
                best_ratio = coverage
                best_span = sec_text[match.a:match.a + match.size]
                best_section_name = sec

    if best_ratio > min_coverage:
        return best_span, best_section_name

    # No section is close enough: hand the needle back unchanged.
    return source_sent, None


print("Fixing source sentences...")
df_fixed = df.copy()

for col in source_cols:
    print(f"  Processing {col}...")

    # Each "<feature>_source_sentence" column has a companion
    # "<feature>_section" column that is updated alongside the needle.
    feature_base = col.replace("_source_sentence", "")
    section_col_name = f"{feature_base}_section"
    has_section_col = section_col_name in df_fixed.columns

    if df_fixed.empty:
        # apply(..., result_type='expand') on an empty frame yields no 0/1
        # columns to unpack, so there is nothing to rewrite for this column.
        continue

    def fix_row(row):
        """Return ``(needle, section name)`` for one row of the current column."""
        val = row[col]
        if pd.isna(val):
            return val, np.nan

        # Label already recorded for this row. It is kept whenever the needle
        # cannot be located, so a failed repair does not erase it.
        prior_section = row[section_col_name] if has_section_col else np.nan

        if isinstance(val, str) and not val.strip():
            # A blank needle cannot be anchored anywhere; leave the row as is.
            return val, prior_section

        # Check if already exact
        exact_sec = find_exact_section(row, val)
        if exact_sec:
            # The exact check compares the stripped needle, so store the
            # stripped form: that is the text that really occurs in the section.
            return (val.strip() if isinstance(val, str) else val), exact_sec

        # If not, find best match
        fixed_val, sec_name = extract_best_matching_span(row, val)
        if sec_name is None:
            # No acceptable match: keep the original needle and its label.
            return val, prior_section
        return fixed_val, sec_name

    # fix_row returns a (fixed_val, section_name) tuple; 'expand' turns the
    # tuples into a two-column frame labelled 0 and 1.
    results = df_fixed.apply(fix_row, axis=1, result_type='expand')
    df_fixed[col] = results[0]
    df_fixed[section_col_name] = results[1]

print("Done.")

In [None]:
# Verify Fixes
print("Verifying fixes...")


def _exists_verbatim(row, needle):
    """Return True when ``needle``, exactly as stored (no stripping), is a
    substring of at least one section of ``row``."""
    text = str(needle)
    if not text.strip():
        # A blank needle is trivially "contained" everywhere; it is not a
        # usable needle, so report it instead of counting it as verified.
        return False
    for sec in section_cols:
        raw = row.get(sec)
        if raw is None or pd.isna(raw):
            # Skip empty sections rather than searching the string "nan".
            continue
        if text in str(raw):
            return True
    return False


mismatch_count = 0
for col in source_cols:
    # The stored value itself is tested, not a stripped copy, so that the
    # success message below is only printed when every needle really is an
    # exact substring of its row's sections.
    count = sum(
        1
        for (_, row), needle in zip(df_fixed.iterrows(), df_fixed[col])
        if pd.notna(needle) and not _exists_verbatim(row, needle)
    )
    mismatch_count += count
    if count > 0:
        print(f"  WARNING: {col} still has {count} mismatches.")

if mismatch_count == 0:
    print("SUCCESS: All source sentences now exist exactly in sections.")
else:
    print(f"Still have {mismatch_count} mismatches.")

In [None]:
# Write the repaired frame to OUTPUT_FILE; index=False keeps the row index
# out of the CSV.
df_fixed.to_csv(path_or_buf=OUTPUT_FILE, index=False)
print(f"Saved fixed CSV to {OUTPUT_FILE}")