In [None]:
import pandas as pd
import sys
from pathlib import Path

print("Starting contact comparison...")

# Workbooks being compared. Both are relative paths, so they resolve against
# the current working directory of the kernel.
tiered_file = Path('output/Two_Tier_Filtered_Family_Office_Contacts_v6.xlsx')
original_file = Path('../email-filtering-tool/output/Filtered_Out_Contacts.xlsx')

# Probe the filesystem once per workbook and report each result.
tiered_found = tiered_file.exists()
original_found = original_file.exists()
print(f"Tiered file exists: {tiered_found}")
print(f"Original file exists: {original_found}")

# This check only reports: it does not stop execution when a workbook is missing.
if not (tiered_found and original_found):
    print("One or both files not found. Please check file paths.")
else:
    print("Both files found, proceeding with comparison...")


In [None]:
# Load both workbooks into DataFrames (pandas reads the first sheet by default).
tiered_df = pd.read_excel(tiered_file)
original_df = pd.read_excel(original_file)

# Report how many rows each workbook contributed.
tiered_row_count = len(tiered_df)
original_row_count = len(original_df)
print(f"Loaded tiered output: {tiered_row_count} contacts")
print(f"Loaded original filtering: {original_row_count} contacts")

# List the column headers of each frame so the two layouts can be compared by eye.
tiered_columns = list(tiered_df.columns)
original_columns = list(original_df.columns)
print(f"\nTiered output columns: {tiered_columns}")
print(f"Original filtering columns: {original_columns}")


In [None]:
# Reduce each frame's CONTACT_ID column to a set of string keys.
# NOTE(review): the comparison is on the *text* form of each ID. If the two
# workbooks load CONTACT_ID with different dtypes (e.g. integer in one, float
# in the other), the same ID may stringify differently - confirm dtypes match.
tiered_id_strings = tiered_df['CONTACT_ID'].astype(str)
original_id_strings = original_df['CONTACT_ID'].astype(str)
tiered_ids = set(tiered_id_strings)
original_ids = set(original_id_strings)

tiered_unique_count = len(tiered_ids)
original_unique_count = len(original_ids)
print(f"Unique contact IDs in tiered output: {tiered_unique_count}")
print(f"Unique contact IDs in original filtering: {original_unique_count}")

# One-sided set differences: IDs present in one file and absent from the other.
contacts_not_in_original = tiered_ids.difference(original_ids)
contacts_not_in_tiered = original_ids.difference(tiered_ids)

only_in_tiered_count = len(contacts_not_in_original)
only_in_original_count = len(contacts_not_in_tiered)
print(f"\nContacts in tiered output but NOT in original filtering: {only_in_tiered_count}")
print(f"Contacts in original filtering but NOT in tiered output: {only_in_original_count}")


In [None]:
# Report and export the tiered-output contacts whose ID is absent from the
# original filtering file.

def _display_value(row, column):
    """Return the value of ``column`` in ``row`` formatted for the report.

    Falls back to the string 'N/A' both when the column does not exist and
    when the cell is empty (NaN / None / NaT), so blank cells are not printed
    as 'nan'.
    """
    value = row.get(column, 'N/A')
    # is_scalar guards pd.isna, which returns an array for list-like input.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return 'N/A'
    return value


# (label, column) pairs printed one per line for every contact.
_REPORT_FIELDS = (
    ("Investor", "INVESTOR"),
    ("Firm Type", "FIRM TYPE"),
    ("Name", "NAME"),
    ("Title", "TITLE"),
    ("Email", "EMAIL"),
)

if contacts_not_in_original:
    # Match on the same string form of CONTACT_ID that produced the ID sets.
    id_as_text = tiered_df['CONTACT_ID'].astype(str)
    missing_contacts = tiered_df[id_as_text.isin(contacts_not_in_original)]

    print("="*100)
    print("CONTACTS IN LATEST TIERED OUTPUT BUT NOT IN ORIGINAL FILTERING")
    print("="*100)

    # NOTE: this writes names and e-mail addresses into the cell output;
    # clear outputs before sharing the notebook.
    for _, row in missing_contacts.iterrows():
        print(f"\nContact ID: {row['CONTACT_ID']}")
        for label, column in _REPORT_FIELDS:
            print(f"{label}: {_display_value(row, column)}")
        city = _display_value(row, 'CITY')
        state = _display_value(row, 'STATE')
        print(f"Location: {city}, {state}")
        print("-" * 80)

    # Save the unmodified rows (all columns, no index) to CSV.
    output_file = Path('output/contacts_not_in_original_filtering.csv')
    missing_contacts.to_csv(output_file, index=False)
    print(f"\nResults saved to: {output_file}")
else:
    print("No contacts found in tiered output that are not in original filtering.")
