In [None]:
# Analyzing "Unknown" Businesses in NYC Consumer Complaints

**Deep Dive into Missing and Unknown Business Data**

This analysis focuses on understanding complaints where the business is listed as "unknown", missing, or unclear. We'll explore patterns, try to identify these businesses through other data points, and understand the impact on overall analysis.


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

# Configure pandas and plotting
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

print("🔍 Libraries loaded successfully!")
print("Ready to investigate unknown businesses in NYC complaints data")


In [None]:
# Read the DCWP consumer-complaints export; `df` is the raw frame used by
# every later cell.
df = pd.read_csv('DCWP_Consumer_Complaints_20250623.csv')

total_complaints = len(df)
print(f"📊 Dataset loaded: {total_complaints:,} total complaints")
print(f"Columns available: {len(df.columns)}")

# How well is the 'Business Name' column populated?
business_names = df['Business Name']
missing_names = business_names.isna().sum()

print("\n🏢 BUSINESS NAME DATA OVERVIEW:")
print("=" * 50)
print(f"Total complaints: {total_complaints:,}")
print(f"Non-null business names: {business_names.notna().sum():,}")
print(f"Null/missing business names: {missing_names:,}")
print(f"Unique business names: {business_names.nunique():,}")

# Share of complaints with no business name at all, as a percentage
missing_percentage = (missing_names / total_complaints) * 100
print(f"Percentage of complaints with missing business names: {missing_percentage:.1f}%")


In [None]:
# Identify different types of "unknown" businesses
#
# Builds one boolean mask per category, all indexed like `df_analysis`, and
# stores their union in df_analysis['is_unknown'].
# NOTE: the categories overlap (an empty string is also "very short", 'na' is
# both a keyword and <= 3 characters), so the per-category counts printed
# below do not add up to the combined total.
print("🔍 IDENTIFYING UNKNOWN BUSINESS CATEGORIES:")
print("=" * 55)

# Work on a copy so the raw frame loaded above stays untouched
df_analysis = df.copy()

# Whitespace-stripped name used by every text check below.
# Going through object dtype means the strip is applied whatever text dtype
# pandas inferred for the column (the previous `dtype == 'object'` guard
# skipped it otherwise) and keeps the .str accessor usable when the column
# was parsed as all-missing. Missing names stay missing.
df_analysis['Business Name Clean'] = (
    df_analysis['Business Name'].astype(object).str.strip()
)
name_clean = df_analysis['Business Name Clean']
name_lower = name_clean.str.lower()
name_length = name_clean.str.len()

# Category 1: Completely missing (NaN/null)
completely_missing = df_analysis['Business Name'].isna()
print(f"1. 📋 COMPLETELY MISSING (NaN/null): {completely_missing.sum():,} complaints")

# Category 2: Empty strings or whitespace only
empty_or_whitespace = (name_length == 0).fillna(False)  # missing names are not "empty"
print(f"2. ⬜ EMPTY/WHITESPACE ONLY: {empty_or_whitespace.sum():,} complaints")

# Category 3: Explicitly marked as "unknown" or similar (exact, case-insensitive match)
unknown_keywords = ['unknown', 'n/a', 'na', 'not available', 'not provided', 'none', 'unlisted', 'unnamed']
explicitly_unknown = name_lower.isin(unknown_keywords).fillna(False)
print(f"3. ❓ EXPLICITLY UNKNOWN: {explicitly_unknown.sum():,} complaints")

# Category 4: Very short names (likely incomplete)
# NOTE(review): this also flags genuine three-letter names — confirm that the
# threshold is acceptable for this dataset.
very_short = (name_length <= 3).fillna(False)
print(f"4. 📏 VERY SHORT (≤3 chars): {very_short.sum():,} complaints")

# Category 5: Generic/unhelpful names
generic_patterns = [
    r'^unlicensed.*', r'^license.*', r'^complaint.*', r'^business.*',
    r'^store$', r'^shop$', r'^company$', r'^inc$', r'^corp$', r'^llc$',
    r'^deli$', r'^restaurant$', r'^salon$', r'^market$'
]
# One alternation instead of one pass per pattern. The result is derived from
# the name column itself, so it carries df_analysis' index; the previous
# `pd.Series([False] * len(...))` seed had a fresh 0..n-1 index and only lined
# up when the frame's index happened to be 0..n-1 as well.
generic_regex = '|'.join(f'(?:{pattern})' for pattern in generic_patterns)
generic_mask = name_lower.str.match(generic_regex, na=False)

print(f"5. 🏪 GENERIC NAMES: {generic_mask.sum():,} complaints")

# Combine all "unknown" categories
all_unknown = completely_missing | empty_or_whitespace | explicitly_unknown | very_short | generic_mask
total_unknown = all_unknown.sum()
unknown_percentage = (total_unknown / len(df_analysis)) * 100

print(f"\n📊 TOTAL 'UNKNOWN' BUSINESSES:")
print(f"• Combined unknown/unclear: {total_unknown:,} complaints")
print(f"• Percentage of all complaints: {unknown_percentage:.1f}%")
print(f"• Clear business names: {len(df_analysis) - total_unknown:,} complaints")

# Store the unknown mask for further analysis
df_analysis['is_unknown'] = all_unknown


In [None]:
# Analyze patterns in unknown business complaints
print("🔎 PATTERNS IN UNKNOWN BUSINESS COMPLAINTS:")
print("=" * 55)


def _print_ranked_counts(counts, total, label_width, count_width, unit):
    """Print one numbered line per entry of `counts`.

    counts: Series mapping label -> count (e.g. a value_counts() result).
    total: denominator for the percentage column.
    label_width, count_width: minimum widths of the label and count columns.
    unit: noun printed after the count (e.g. "complaints", "cases").
    """
    for rank, (label, count) in enumerate(counts.items(), 1):
        share = (count / total) * 100
        print(f"{rank:2d}. {str(label):<{label_width}} ({count:>{count_width},} {unit}, {share:>4.1f}%)")


def _print_bulleted_counts(counts, total, label_width):
    """Print one bulleted "label: count complaints (share%)" line per entry of `counts`."""
    for label, count in counts.items():
        share = (count / total) * 100
        print(f"• {str(label):<{label_width}}: {count:>4,} complaints ({share:>4.1f}%)")


# Split into unknown / known complaints. Explicit copies: a later cell adds a
# column to `unknown_businesses`, and doing that on a bare boolean-mask slice
# triggers pandas' SettingWithCopyWarning.
unknown_businesses = df_analysis[df_analysis['is_unknown']].copy()
known_businesses = df_analysis[~df_analysis['is_unknown']].copy()

print(f"Analyzing {len(unknown_businesses):,} unknown business complaints...")

# Business categories for unknown businesses
print("\n📂 BUSINESS CATEGORIES (Unknown Businesses):")
unknown_categories = unknown_businesses['Business Category'].value_counts().head(10)
_print_ranked_counts(unknown_categories, len(unknown_businesses), 30, 4, 'complaints')

# Compare with known businesses
print("\n📂 BUSINESS CATEGORIES (Known Businesses - Top 10):")
known_categories = known_businesses['Business Category'].value_counts().head(10)
_print_ranked_counts(known_categories, len(known_businesses), 30, 4, 'complaints')

# Geographic distribution
print("\n🗽 GEOGRAPHIC DISTRIBUTION (Unknown Businesses):")
unknown_boroughs = unknown_businesses['Borough'].value_counts()
_print_bulleted_counts(unknown_boroughs, len(unknown_businesses), 12)

# Complaint types
print("\n📝 TOP COMPLAINT TYPES (Unknown Businesses):")
unknown_complaints = unknown_businesses['Complaint Code'].value_counts().head(10)
_print_ranked_counts(unknown_complaints, len(unknown_businesses), 40, 3, 'cases')

# Intake channels
print("\n📞 COMPLAINT CHANNELS (Unknown Businesses):")
unknown_channels = unknown_businesses['Intake Channel'].value_counts()
_print_bulleted_counts(unknown_channels, len(unknown_businesses), 15)


In [None]:
# Detective work: Try to identify unknown businesses using other data
print("🕵️ DETECTIVE WORK: IDENTIFYING UNKNOWN BUSINESSES")
print("=" * 60)


def _address_part(values):
    """Return `values` as stripped text, with missing entries turned into ''."""
    text = values.astype(object).where(values.notna(), '').astype(str).str.strip()
    if pd.api.types.is_float_dtype(values):
        # A float column renders whole numbers as e.g. "10001.0"; drop the ".0".
        text = text.str.replace(r'\.0$', '', regex=True)
    return text


def _shorten(value, limit):
    """Return str(value) cut to `limit` characters, with '...' only when something was cut."""
    text = str(value)
    return text if len(text) <= limit else text[:limit] + '...'


# Method 1: Group by address to find businesses at same location
print("🏠 METHOD 1: GROUPING BY ADDRESS")
print("-" * 40)

# Create a combined address field from the parts that are actually present.
# Missing parts are left out instead of being stringified to 'nan' and
# stripped afterwards: that approach also removed the letters "nan" from
# inside real street/city names and left dangling ", ," separators.
full_address = (
    _address_part(unknown_businesses['Building Nbr']) + ' ' +
    _address_part(unknown_businesses['Street1'])
).str.strip()
for part in (_address_part(unknown_businesses['City']),
             _address_part(unknown_businesses['Postcode'])):
    needs_separator = (full_address != '') & (part != '')
    full_address = full_address + (', ' + part).where(needs_separator, part)

# .assign returns a new frame, so no column is written onto a slice of
# df_analysis and re-running this cell gives the same result.
unknown_businesses = unknown_businesses.assign(Full_Address=full_address)

# Find addresses with multiple unknown business complaints.
# Rows with no address information at all are excluded; otherwise they would
# all share one blank "address" and could top the ranking.
address_counts = unknown_businesses.loc[
    unknown_businesses['Full_Address'] != '', 'Full_Address'
].value_counts()
frequent_addresses = address_counts[address_counts >= 3].head(10)

print("Top addresses with multiple unknown business complaints:")
for i, (address, count) in enumerate(frequent_addresses.items(), 1):
    print(f"{i:2d}. {address:<50} ({count} complaints)")

    # Show complaint details for this address
    addr_complaints = unknown_businesses[unknown_businesses['Full_Address'] == address]
    categories = addr_complaints['Business Category'].value_counts()
    complaint_types = addr_complaints['Complaint Code'].value_counts()

    print(f"    Categories: {', '.join([f'{cat} ({cnt})' for cat, cnt in categories.head(3).items()])}")
    print(f"    Top complaints: {', '.join([f'{_shorten(comp, 30)} ({cnt})' for comp, cnt in complaint_types.head(2).items()])}")
    print()

# Method 2: Use Business Unique ID to link complaints
print("🆔 METHOD 2: BUSINESS UNIQUE ID ANALYSIS")
print("-" * 45)

# Find unknown businesses with Business Unique IDs
unknown_with_ids = unknown_businesses[unknown_businesses['Business Unique ID'].notna()]
print(f"Unknown businesses with Business IDs: {len(unknown_with_ids):,}")

# Defined up front so both names exist even when no unknown row has an ID.
id_matches = 0
potential_identifications = []

if len(unknown_with_ids) > 0:
    # Every complaint in the full dataset that shares an ID with an unknown-name row.
    # All such IDs are examined in one grouped pass; the earlier loop only
    # looked at the first 10 IDs, so the count printed below was not a
    # dataset-wide figure.
    candidate_ids = unknown_with_ids['Business Unique ID'].unique()
    same_id_complaints = df_analysis[df_analysis['Business Unique ID'].isin(candidate_ids)]

    per_id = (
        same_id_complaints
        .assign(unknown_flag=same_id_complaints['is_unknown'].astype(int))
        .groupby('Business Unique ID')['unknown_flag']
        .agg(unknown_complaints='sum', total_complaints='size')
    )
    per_id['known_complaints'] = per_id['total_complaints'] - per_id['unknown_complaints']

    # First usable name recorded under each ID (rows not flagged as unknown)
    first_known_name = (
        same_id_complaints[~same_id_complaints['is_unknown']]
        .groupby('Business Unique ID')['Business Name']
        .first()
        .rename('known_name')
    )

    # IDs that carry both unknown and known names, most unknown entries first
    identified = (
        per_id[per_id['known_complaints'] > 0]
        .join(first_known_name)
        .sort_values('unknown_complaints', ascending=False)
    )

    potential_identifications = [
        {
            'business_id': row.Index,
            'known_name': row.known_name,
            'unknown_complaints': int(row.unknown_complaints),
            'known_complaints': int(row.known_complaints),
        }
        for row in identified.itertuples()
    ]
    id_matches = len(potential_identifications)

    print(f"Business IDs with both unknown and known names: {id_matches}")
    print("\nPotential identifications:")
    for identification in potential_identifications[:5]:
        print(f"• ID {identification['business_id']}: '{identification['known_name']}'")
        print(f"  - Unknown entries: {identification['unknown_complaints']}")
        print(f"  - Known entries: {identification['known_complaints']}")

# Method 3: Pattern analysis in complaint codes and categories
print("\n🔍 METHOD 3: PATTERN ANALYSIS")
print("-" * 35)

# Find unknown businesses that might be unlicensed operations:
# case-insensitive match on the complaint code or the business category
unlicensed_patterns = unknown_businesses[
    unknown_businesses['Complaint Code'].str.contains('Unlicensed|License', na=False, case=False) |
    unknown_businesses['Business Category'].str.contains('Unlicensed', na=False, case=False)
]

print(f"Unknown businesses with licensing issues: {len(unlicensed_patterns):,}")

if len(unlicensed_patterns) > 0:
    print("Top categories for unlicensed unknown businesses:")
    unlicensed_cats = unlicensed_patterns['Business Category'].value_counts().head(5)
    for cat, count in unlicensed_cats.items():
        print(f"• {cat}: {count} complaints")

# Show some examples of unknown business entries
print(f"\n📋 SAMPLE UNKNOWN BUSINESS ENTRIES:")
print("-" * 40)
sample_unknown = unknown_businesses[['Business Name', 'Business Category', 'Complaint Code', 'Borough', 'Street1']].head(10)
for i, row in enumerate(sample_unknown.to_dict('records'), 1):
    print(f"{i:2d}. Name: '{row['Business Name']}'")
    print(f"    Category: {row['Business Category']}")
    print(f"    Complaint: {row['Complaint Code']}")
    print(f"    Location: {row['Street1']}, {row['Borough']}")
    print()


In [None]:
# Four-panel figure summarising the unknown-business complaints


def _ranked_barh(ax, counts, max_label_len, title, label_offset):
    """Draw `counts` as horizontal bars on `ax` and annotate each bar with its value.

    Tick labels longer than `max_label_len` are cut and suffixed with '...'.
    `label_offset` is the gap (in x data units) between bar end and value text.
    Returns the bar container produced by ax.barh.
    """
    positions = range(len(counts))
    bars = ax.barh(positions, counts.values)
    ax.set_yticks(positions)
    ax.set_yticklabels([
        name[:max_label_len] + '...' if len(name) > max_label_len else name
        for name in counts.index
    ])
    ax.set_xlabel('Number of Complaints')
    ax.set_title(title, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    # Value label just past the end of every bar, vertically centred
    for bar, count in zip(bars, counts.values):
        ax.text(bar.get_width() + label_offset, bar.get_y() + bar.get_height()/2,
                f'{count:,}', va='center', ha='left', fontweight='bold')
    return bars


fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax1, ax2), (ax3, ax4) = axes

# Panel 1: share of complaints with a known vs unknown business
labels = ['Known Businesses', 'Unknown Businesses']
sizes = [len(known_businesses), len(unknown_businesses)]
colors = ['lightblue', 'lightcoral']
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)
ax1.set_title('Distribution of Known vs Unknown Businesses', fontweight='bold')

# Panel 2: eight most frequent business categories among unknown businesses
top_unknown_cats = unknown_categories.head(8)
bars2 = _ranked_barh(ax2, top_unknown_cats, 20,
                     'Top Business Categories - Unknown Businesses', 10)

# Panel 3: complaints per borough, unknown next to known
# (boroughs present in only one of the two groups get a 0 for the other)
borough_comparison = pd.DataFrame({
    'Unknown': unknown_boroughs,
    'Known': known_businesses['Borough'].value_counts()
}).fillna(0)

borough_comparison.plot(kind='bar', ax=ax3, color=['lightcoral', 'lightblue'])
ax3.set_title('Complaints by Borough: Unknown vs Known Businesses', fontweight='bold')
ax3.set_xlabel('Borough')
ax3.set_ylabel('Number of Complaints')
ax3.legend()
ax3.tick_params(axis='x', rotation=45)
ax3.grid(axis='y', alpha=0.3)

# Panel 4: eight most frequent complaint types among unknown businesses
top_unknown_complaints = unknown_complaints.head(8)
bars4 = _ranked_barh(ax4, top_unknown_complaints, 25,
                     'Top Complaint Types - Unknown Businesses', 5)

plt.tight_layout()
plt.show()

print("📊 Visualizations showing patterns in unknown business complaints")


In [None]:
# Impact analysis and recommendations
print("📈 IMPACT OF UNKNOWN BUSINESSES ON ANALYSIS")
print("=" * 55)


def _quality_status(value):
    """Map a coverage percentage to its status label (>= 90 good, >= 70 needs improvement, else poor)."""
    if value >= 90:
        return "✅ Good"
    if value >= 70:
        return "⚠️ Needs Improvement"
    return "❌ Poor"


def _top_entry(counts):
    """Return (label, count) of the first row of a value_counts() Series, or ('n/a', 0) when it is empty."""
    if len(counts) == 0:
        return 'n/a', 0
    return counts.index[0], counts.iloc[0]


# Calculate the impact on business rankings
print("🏆 IMPACT ON BUSINESS RANKINGS:")
print("-" * 35)

# Distinct names across all complaints vs. only those not flagged as unknown
all_business_counts = df_analysis['Business Name'].value_counts()
known_business_counts = known_businesses['Business Name'].value_counts()

print(f"• Total unique business names (including unknowns): {len(all_business_counts):,}")
print(f"• Known business names only: {len(known_business_counts):,}")
print(f"• 'Missing' businesses due to unknown entries: {len(all_business_counts) - len(known_business_counts):,}")

# Show how much the top businesses could change
print("\n📊 POTENTIAL RANKING CHANGES:")
print("If unknown businesses were properly identified, rankings could change significantly:")
print(f"• Unknown complaints represent {unknown_percentage:.1f}% of all data")
print(f"• This is equivalent to {total_unknown:,} complaints that can't be properly attributed")

# Data quality metrics (all expressed as a percentage of every complaint)
print("\n📋 DATA QUALITY ASSESSMENT:")
print("-" * 35)

n_complaints = len(df_analysis)
data_quality_metrics = {
    'Completeness': ((n_complaints - total_unknown) / n_complaints) * 100,
    'Business Name Coverage': (df_analysis['Business Name'].notna().sum() / n_complaints) * 100,
    'Business ID Coverage': (df_analysis['Business Unique ID'].notna().sum() / n_complaints) * 100,
    'Address Coverage': (df_analysis['Street1'].notna().sum() / n_complaints) * 100
}

for metric, value in data_quality_metrics.items():
    print(f"• {metric:<25}: {value:>5.1f}% {_quality_status(value)}")

# Resolution outcomes comparison
print("\n⚖️ RESOLUTION OUTCOME COMPARISON:")
print("-" * 40)

# Share of each 'Result' value within the unknown and the known group
unknown_outcomes = unknown_businesses['Result'].value_counts(normalize=True) * 100
known_outcomes = known_businesses['Result'].value_counts(normalize=True) * 100

print("Unknown businesses outcomes vs Known businesses:")
for outcome in unknown_outcomes.index[:5]:
    unknown_pct = unknown_outcomes.get(outcome, 0)
    known_pct = known_outcomes.get(outcome, 0)
    difference = unknown_pct - known_pct
    # Arrow only when the two groups differ by more than 5 percentage points
    if difference > 5:
        trend = "↗️"
    elif difference < -5:
        trend = "↘️"
    else:
        trend = "→"
    print(f"• {outcome:<30}: Unknown {unknown_pct:>4.1f}% | Known {known_pct:>4.1f}% {trend}")

print("\n💡 KEY INSIGHTS ABOUT UNKNOWN BUSINESSES:")
print("-" * 50)

# Guarded look-ups: indexing [0] directly raises IndexError when a
# value_counts() result is empty (e.g. no complaint was flagged as unknown).
top_category, top_category_count = _top_entry(unknown_categories)
top_borough, top_borough_count = _top_entry(unknown_boroughs)
top_complaint = _top_entry(unknown_complaints)[0]

# NOTE(review): insights 5-7 are fixed statements; they are not computed from
# the data in this cell. Confirm them against the earlier outputs before reuse.
insights = [
    f"1. {unknown_percentage:.1f}% of complaints have unclear business identification",
    f"2. Top unknown category: '{top_category}' ({top_category_count:,} complaints)",
    f"3. Most unknown complaints come from {top_borough} ({top_borough_count:,} cases)",
    f"4. Primary complaint type: '{top_complaint}'",
    "5. Many unknowns appear to be unlicensed operations",
    "6. Address clustering reveals potential repeat offenders",
    "7. Business IDs could help identify some unknown businesses"
]

for insight in insights:
    print(f"   {insight}")

print("\n🔧 RECOMMENDATIONS FOR IMPROVING DATA QUALITY:")
print("-" * 55)

recommendations = [
    "1. 📋 Mandatory business name collection at complaint intake",
    "2. 🆔 Always collect business license/permit numbers when available",
    "3. 📍 Use address matching to link unknown businesses to known ones",
    "4. 🔍 Implement fuzzy matching for business name standardization",
    "5. 📞 Follow up with complainants to get missing business information",
    "6. 🏢 Cross-reference with business registration databases",
    "7. 🤖 Use machine learning to predict likely business identities",
    "8. 📊 Regular data quality audits and cleanup processes"
]

for recommendation in recommendations:
    print(f"   {recommendation}")

print("\n✅ ANALYSIS SUMMARY:")
print(f"Successfully analyzed {total_unknown:,} unknown business complaints")
print("Identified patterns and potential methods for business identification")
print("Provided actionable recommendations for improving data quality")


In [None]:
## 🛠️ Practical Data Cleaning Example

**Code to Clean and Potentially Identify Unknown Businesses**
