# Needle Investigation & Fix: Robust Header/Fuzzy Logic

**Goal**: 
1. Ensure 'source_sentence' values are EXACT substrings of the 'section_*' columns.
2. Populate a separate column '_section' indicating which section the sentence belongs to.
3. Handle "Header Injection" (e.g. "Item 1. Business..." when section only has "...") by stripping headers and finding the best sentence match.

In [17]:
import pandas as pd
import numpy as np
import difflib
import re
import os

In [18]:
# Config
INPUT_FILE = "../../data/clean_ground_truth/cleaned_EDGAR_gt_02-16-2026.csv"
OUTPUT_FILE = "../../data/clean_ground_truth/cleaned_EDGAR_gt_FIXED.csv"

if not os.path.exists(INPUT_FILE):
    print(f"ERROR: File not found: {INPUT_FILE}")
else:
    df = pd.read_csv(INPUT_FILE)
    print(f"Loaded {len(df)} rows.")

Loaded 255 rows.


In [19]:
# Identify columns
section_cols = [c for c in df.columns if c.startswith("section_") and not c.endswith("_section")]
source_cols = [c for c in df.columns if c.endswith("_source_sentence")]

print(f"Found {len(section_cols)} section columns.")
print(f"Found {len(source_cols)} source sentence columns: {source_cols}")

Found 20 section columns.
Found 9 source sentence columns: ['registrant_name_source_sentence', 'headquarters_city_source_sentence', 'headquarters_state_source_sentence', 'incorporation_state_source_sentence', 'incorporation_year_source_sentence', 'employees_count_total_source_sentence', 'employees_count_full_time_source_sentence', 'ceo_lastname_source_sentence', 'holder_record_amount_source_sentence']


## 1. Preprocessing: Clean 'nan' Strings

In [20]:
print("Cleaning 'nan' strings...")
for col in source_cols:
    # Standardize missing values
    df[col] = df[col].replace(["nan", "NaN", "None", "NULL", ""], np.nan)

print("Done. Checking non-null counts:")
print(df[source_cols].notna().sum())

Cleaning 'nan' strings...
Done. Checking non-null counts:
registrant_name_source_sentence              249
headquarters_city_source_sentence            153
headquarters_state_source_sentence           148
incorporation_state_source_sentence          186
incorporation_year_source_sentence           172
employees_count_total_source_sentence        171
employees_count_full_time_source_sentence     61
ceo_lastname_source_sentence                 194
holder_record_amount_source_sentence         161
dtype: int64


## 2. Core Logic: Robust Matcher

In [21]:
def split_into_sentences(text):
    """Simple sentence splitter."""
    # Split by period, question mark, or exclamation point followed by space or newline
    # Also handle newlines as implicit breaks if they look like list items
    if not text: return []
    # This regex is a bit simplistic but works for finding candidate chunks
    # split on .?! followed by space or end of string
    chunks = re.split(r'(?<=[.?!])\s+', text)
    # Also split by double newlines to be safe
    final_chunks = []
    for c in chunks:
        final_chunks.extend(c.split('\n\n'))
    return [c.strip() for c in final_chunks if c.strip()]

def clean_header(text):
    """Removes common EDGAR headers mostly from start."""
    # Remove "Item 1. Business.", "PART I", etc.
    text = re.sub(r'^(Item|Part)\s+\w+\.?\s*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'^(Business|Properties|Legal Proceedings)\.?\s*', '', text, flags=re.IGNORECASE)
    return text.strip()

def find_needle_in_sections(row, source_sent):
    if pd.isna(source_sent):
        return np.nan, np.nan
        
    src = str(source_sent).strip()
    if not src or src.lower() == "nan":
        return np.nan, np.nan

    # --- 1. EXACT MATCH (Fastest) ---
    for sec in section_cols:
        sec_text = str(row.get(sec, ""))
        if src in sec_text:
            return src, sec

    # --- 2. ROBUST SEARCH ---
    # Prepare candidates from source (Original & Header-Stripped)
    candidates = [src]
    cleaned = clean_header(src)
    if cleaned != src and len(cleaned) > 10:
        candidates.append(cleaned)

    best_ratio = 0
    best_span = src
    best_sec = "NOT_FOUND"
    
    for sec in section_cols:
        sec_text = str(row.get(sec, ""))
        if not sec_text or len(sec_text) < 5 or sec_text == "nan": continue
        
        # Optimization: Don't fuzzy match HUGE sections line by line if no n-gram overlap
        # But for now, just split into "phrases" or sentences
        # actually, standard difflib on the WHOLE section is surprisingly good at finding the block
        
        # Strategy A: Whole Section Fuzzy Search (Finds the best block)
        matcher = difflib.SequenceMatcher(None, sec_text, src)
        match = matcher.find_longest_match(0, len(sec_text), 0, len(src))
        
        if match.size > 0:
            # Compute score against the CANDIDATE (cleaned), not unrelated junk
            # But we only matched against 'src' so far. 
            # Let's extract the block and score it against 'cleaned'
            
            found_block = sec_text[match.a : match.a + match.size]
            
            # If the found block is essentially the cleaned source
            # calculated ratio
            
            # Let's try matching against clean source too if different
            current_ratio = match.size / len(src)
            
            # If match is long (>50 chars), it is almost certainly the right place
            if match.size > 50:
                current_ratio = max(current_ratio, 0.9)
            
            if current_ratio > best_ratio:
                best_ratio = current_ratio
                best_span = found_block
                best_sec = sec

        # Strategy B: Cleaned Source (Header Stripped) against Section
        if len(candidates) > 1:
            cleaned_src = candidates[1]
            matcher_c = difflib.SequenceMatcher(None, sec_text, cleaned_src)
            match_c = matcher_c.find_longest_match(0, len(sec_text), 0, len(cleaned_src))
            
            if match_c.size > 0:
                ratio_c = match_c.size / len(cleaned_src)
                if match_c.size > 40: ratio_c = max(ratio_c, 0.9)
                
                if ratio_c > best_ratio:
                    best_ratio = ratio_c
                    best_span = sec_text[match_c.a : match_c.a + match_c.size]
                    best_sec = sec

    # --- 3. DECISION ---
    # Threshold 0.6 is safe if we have >40 chars overlap (handled above)
    # For short strings, be stricter.
    
    threshold = 0.6
    if len(best_span) < 20: threshold = 0.8
    
    if best_ratio > threshold:
        return best_span, best_sec

    return src, "NOT_FOUND"

# Apply Logic
print("Processing rows... This may take 2-3 minutes.")
df_fixed = df.copy()

for col in source_cols:
    feature_base = col.replace("_source_sentence", "")
    new_sec_col = f"{feature_base}_section"
    print(f"  Fixing {col} -> mapping to {new_sec_col}")
    
    results = df_fixed.apply(lambda row: find_needle_in_sections(row, row[col]), axis=1, result_type='expand')
    
    df_fixed[col] = results[0]
    df_fixed[new_sec_col] = results[1]

print("Done processing.")

Processing rows... This may take 2-3 minutes.
  Fixing registrant_name_source_sentence -> mapping to registrant_name_section
  Fixing headquarters_city_source_sentence -> mapping to headquarters_city_section
  Fixing headquarters_state_source_sentence -> mapping to headquarters_state_section
  Fixing incorporation_state_source_sentence -> mapping to incorporation_state_section
  Fixing incorporation_year_source_sentence -> mapping to incorporation_year_section
  Fixing employees_count_total_source_sentence -> mapping to employees_count_total_section
  Fixing employees_count_full_time_source_sentence -> mapping to employees_count_full_time_section
  Fixing ceo_lastname_source_sentence -> mapping to ceo_lastname_section
  Fixing holder_record_amount_source_sentence -> mapping to holder_record_amount_section
Done processing.


## 3. Validation Check

In [26]:
print("Verifying matches...")
mismatches = 0

for col in source_cols:
    feature_base = col.replace("_source_sentence", "")
    sec_col_name = f"{feature_base}_section"
    
    # Check every row where source is not null
    valid_rows = df_fixed[df_fixed[col].notna()].copy()
    
    def verify_row(r):
        s_sent = r[col]
        s_sec_name = r[sec_col_name]
        
        if pd.isna(s_sec_name) or s_sec_name == "NOT_FOUND":
            return False # Should have been found ideally
            
        # The real test: Is string IN section text?
        sec_text = str(r.get(s_sec_name, ""))
        return str(s_sent) in sec_text

    results = valid_rows.apply(verify_row, axis=1)
    fails = (~results).sum()
    
    if fails > 0:
        print(f"  WARNING: {col} has {fails} mismatches after fix.")
        mismatches += fails
        
        # Show first failure diagnostic
        bad_row = valid_rows[~results].iloc[0]
        print(f"    [Diag] File: {bad_row['filename']}")
        print(f"    [Diag] Section: {bad_row[sec_col_name]}")
        print(f"    [Diag] Type: {type(bad_row[col])}")
        print(f"    [Diag] Sent: {repr(bad_row[col])}")
        
        sec_val = str(bad_row.get(bad_row[sec_col_name], ''))
        print(f"    [Diag] In Section? {str(bad_row[col]) in sec_val}")
    else:
        print(f"  {col}: ALL CLEAN.")

if mismatches == 0:
    print("\nSUCCESS: All source sentences are now perfect substrings of their assigned sections.")
else:
    print(f"\nCompleted with {mismatches} issues remaining (check warnings above).")

Verifying matches...
    [Diag] File: 738339_1993.txt
    [Diag] Section: NOT_FOUND
    [Diag] Type: <class 'str'>
    [Diag] Sent: 'American Healthcare Management, Inc. (together, unless the context otherwise requires, with its subsidiaries, "AHI" or the "Company") is a health care services company engaged in the operation of 16 general acute care hospitals in nine states, with a total of 2,028 licensed beds.'
    [Diag] In Section? False
    [Diag] File: 738339_1993.txt
    [Diag] Section: NOT_FOUND
    [Diag] Type: <class 'str'>
    [Diag] Sent: 'Also, the Company leases approximately 17,000 square feet of executive office space in Valley Forge Square, King of Prussia, Pennsylvania.'
    [Diag] In Section? False
    [Diag] File: 738339_1993.txt
    [Diag] Section: NOT_FOUND
    [Diag] Type: <class 'str'>
    [Diag] Sent: 'Also, the Company leases approximately 17,000 square feet of executive office space in Valley Forge Square, King of Prussia, Pennsylvania.'
    [Diag] In Section? 

In [None]:
# Save
if mismatches == 0:
    df_fixed.to_csv(OUTPUT_FILE, index=False)
    print(f"Saved validated file to {OUTPUT_FILE}")
else:
    print("Output file NOT saved because verification failed (safety first).")