# 🔍 Standalone ML Pipeline Investigation

## Completely Independent Analysis - No Project Dependencies

This notebook investigates why the ML pipeline is filtering out 81,044 out of 81,052 records.

### Investigation Goals:
1. **Load data directly from HDFS** without any project imports
2. **Check available data dates** in the gold layer
3. **Analyze data quality** - price, area, missing values
4. **Simulate pipeline filtering** step by step
5. **Identify root cause** of massive data loss

---

## 1. Set Up PySpark - No Project Dependencies

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, count, sum as spark_sum, avg, min as spark_min, max as spark_max,
    stddev, isnan, isnull, when, lit, regexp_replace, desc, asc
)
from pyspark.sql.types import DoubleType, StringType, IntegerType, FloatType
from datetime import datetime, timedelta
import pandas as pd

print("✅ Libraries imported successfully - NO PROJECT DEPENDENCIES")

In [None]:
# Initialize Spark with HDFS configuration
# Builder options, applied in the order they are listed here.
session_options = {
    "spark.ui.port": "4051",  # Different port to avoid conflicts
    "spark.driver.memory": "4g",
    "spark.executor.memory": "4g",
    "spark.hadoop.fs.defaultFS": "hdfs://namenode:9000",
    "spark.sql.adaptive.enabled": "true",
}

session_builder = SparkSession.builder.appName("StandaloneInvestigation")
for option_name, option_value in session_options.items():
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()

spark_version = spark.version
print(f"✅ Spark session initialized: {spark_version}")
default_fs = spark.conf.get('spark.hadoop.fs.defaultFS')
print(f"🌐 Default FS: {default_fs}")
spark.sparkContext.setLogLevel("WARN")  # Reduce log noise

## 2. Discover Available Data in HDFS

In [None]:
# Check HDFS connection and discover data structure
def check_hdfs_path(path):
    """Report whether ``path`` exists on the session's Hadoop file system.

    The check goes through the JVM gateway of the notebook-level ``spark``
    session and calls Hadoop's ``FileSystem.exists``.

    Args:
        path: Path string to test.

    Returns:
        The result of ``FileSystem.exists``. ``False`` is also returned when
        the check itself fails (the error is printed), so ``False`` means
        "missing or could not be checked".
    """
    try:
        context = spark.sparkContext
        hadoop_fs = context._jvm.org.apache.hadoop.fs
        file_system = hadoop_fs.FileSystem.get(context._jsc.hadoopConfiguration())
        return file_system.exists(hadoop_fs.Path(path))
    except Exception as err:
        print(f"❌ Error checking path {path}: {str(err)}")
        return False

def list_hdfs_directory(path):
    """Return the entry names directly under ``path`` on the Hadoop file system.

    Args:
        path: Directory path string to list.

    Returns:
        A list with the last path component of every entry reported by
        ``FileSystem.listStatus``. An empty list is returned when the path
        does not exist or when listing fails (the error is printed).
    """
    try:
        context = spark.sparkContext
        hadoop_fs = context._jvm.org.apache.hadoop.fs
        file_system = hadoop_fs.FileSystem.get(context._jsc.hadoopConfiguration())
        target = hadoop_fs.Path(path)

        if not file_system.exists(target):
            return []

        return [
            str(entry.getPath().getName())
            for entry in file_system.listStatus(target)
        ]
    except Exception as err:
        print(f"❌ Error listing directory {path}: {str(err)}")
        return []

# Base path for real estate data
base_path = "/data/realestate/processed/gold/unified"
print(f"🔍 Checking base path: {base_path}")

if not check_hdfs_path(base_path):
    print(f"❌ Base path does not exist: {base_path}")

    # Fall back to other candidate roots and report the first one that exists
    alternative_bases = [
        "/data/realestate/processed/gold",
        "/data/realestate/gold",
        "/data/processed/gold"
    ]

    for alt_base in alternative_bases:
        print(f"🔍 Trying alternative base: {alt_base}")
        if not check_hdfs_path(alt_base):
            continue
        print(f"✅ Found alternative base: {alt_base}")
        contents = list_hdfs_directory(alt_base)
        print(f"📁 Contents: {contents}")
        break
else:
    print("✅ Base path exists")

    # Top-level entries under the base path are reported as property types
    property_types = list_hdfs_directory(base_path)
    print(f"📁 Available property types: {property_types}")

    # Drill into the house data: year -> month -> day directories
    house_path = f"{base_path}/house"
    if not check_hdfs_path(house_path):
        print(f"❌ House data path does not exist: {house_path}")
    else:
        print(f"✅ House data path exists: {house_path}")

        years = list_hdfs_directory(house_path)
        print(f"📅 Available years: {sorted(years)}")

        # Only the 2024 / 06 partition is inspected further
        if '2024' in years:
            year_path = f"{house_path}/2024"
            months = list_hdfs_directory(year_path)
            print(f"📅 Available months in 2024: {sorted(months)}")

            if '06' in months:
                month_path = f"{year_path}/06"
                days = list_hdfs_directory(month_path)
                print(f"📅 Available days in 2024-06: {sorted(days)}")

## 3. Load Sample Data

In [None]:
# Try to load a specific day's data
test_date = "2024-06-07"
property_type = "house"

# The date is used as a nested directory fragment: 2024-06-07 -> 2024/06/07
date_dir = test_date.replace('-', '/')
gold_root = "/data/realestate/processed/gold"

# Primary location, matching the structure probed in the discovery cell
data_path = f"{gold_root}/unified/{property_type}/{date_dir}/*.parquet"
print(f"🔍 Attempting to load data from: {data_path}")

try:
    df = spark.read.parquet(data_path)
    print(f"✅ Successfully loaded data from {test_date}")
    print(f"📊 Total records: {df.count():,}")
    print(f"📊 Total columns: {len(df.columns)}")
except Exception as primary_error:
    print(f"❌ Failed to load from primary path: {str(primary_error)}")

    # Other layouts to probe, in priority order; the first that loads wins
    alternative_paths = [
        f"{gold_root}/{property_type}/{date_dir}/*.parquet",
        f"/data/realestate/gold/{property_type}/{date_dir}/*.parquet",
        f"{gold_root}/unified/{property_type}/unified_*.parquet",
        f"{gold_root}/{property_type}/unified_*.parquet"
    ]

    for alt_path in alternative_paths:
        try:
            print(f"🔍 Trying: {alt_path}")
            df = spark.read.parquet(alt_path)
            print(f"✅ SUCCESS! Loaded data from: {alt_path}")
            print(f"📊 Total records: {df.count():,}")
            print(f"📊 Total columns: {len(df.columns)}")
            data_path = alt_path
            break
        except Exception as fallback_error:
            # Only the first 100 characters of the error are shown
            print(f"❌ Failed: {str(fallback_error)[:100]}...")
    else:
        # Reached only when no candidate path could be loaded
        raise Exception("Could not find gold data at any expected path")

## 4. Analyze Data Schema and Quality

In [None]:
# Display complete schema: one numbered line per field (name and Spark type name)
print("📋 COMPLETE SCHEMA:")
print("=" * 80)
schema_lines = [
    f"  {position:2d}. {field.name:<20} : {field.dataType.typeName()}"
    for position, field in enumerate(df.schema.fields, start=1)
]
for schema_line in schema_lines:
    print(schema_line)

column_total = len(df.columns)
print(f"\n📊 TOTAL COLUMNS: {column_total}")
record_total = df.count()
print(f"📊 TOTAL RECORDS: {record_total:,}")

In [None]:
# Check for critical columns that the ML pipeline needs
required_columns = [
    'id', 'price', 'area', 'latitude', 'longitude',
    'district', 'ward', 'property_type', 'data_date'
]

print("🔍 CRITICAL COLUMNS CHECK:")
print("=" * 50)

available_columns = df.columns
column_types = dict(df.dtypes)  # column name -> Spark type string
for col_name in required_columns:
    if col_name not in available_columns:
        print(f"  ❌ {col_name:<15} : MISSING")
        continue
    print(f"  ✅ {col_name:<15} : {column_types[col_name]}")

# For every missing column, suggest existing columns whose name contains it
# (or is contained in it), ignoring case
print("\n🔍 SIMILAR COLUMN NAMES:")
import re
missing_columns = [name for name in required_columns if name not in available_columns]
for req_col in missing_columns:
    wanted = req_col.lower()
    similar = [
        candidate
        for candidate in available_columns
        if wanted in candidate.lower() or candidate.lower() in wanted
    ]
    if similar:
        print(f"  🔍 For '{req_col}' found similar: {similar}")

In [None]:
# Show sample data for key columns; when any of the preferred columns is
# absent, fall back to the first five columns of the dataset
preferred_columns = ['id', 'price', 'area', 'latitude', 'longitude']
if all(name in df.columns for name in preferred_columns):
    key_columns = preferred_columns
else:
    key_columns = df.columns[:5]

print("🔍 SAMPLE DATA (First 10 records):")
print("=" * 80)
df.select(*key_columns).show(10, truncate=False)

# Show data types for these columns
print("\n📊 KEY COLUMNS DATA TYPES:")
key_column_types = dict(df.dtypes)
for col_name in key_columns:
    print(f"  - {col_name:<15} : {key_column_types[col_name]}")

## 5. Deep Dive into PRICE Column Analysis

In [None]:
# Focus on price column since that's where most data is being filtered out
def _pct(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is 0."""
    return (part / whole * 100) if whole else 0.0


if 'price' in df.columns:
    print("💰 PRICE COLUMN DEEP ANALYSIS:")
    print("=" * 60)

    # Get price column type
    price_type = dict(df.dtypes)['price']
    print(f"Price column data type: {price_type}")

    # Total records
    total_records = df.count()
    print(f"Total records: {total_records:,}")

    # Null analysis (percentages go through _pct so an empty dataset
    # reports 0.00% instead of raising ZeroDivisionError)
    null_count = df.filter(col('price').isNull()).count()
    print(f"Null prices: {null_count:,} ({_pct(null_count, total_records):.2f}%)")

    # If price is string type, check for string issues
    if price_type == 'string':
        print("\n🔍 STRING PRICE ANALYSIS:")

        # Check for empty strings
        empty_count = df.filter(col('price') == '').count()
        print(f"Empty string prices: {empty_count:,} ({_pct(empty_count, total_records):.2f}%)")

        # Check for 'null' strings
        null_string_count = df.filter(col('price').isin(['null', 'NULL', 'None'])).count()
        print(f"'null' string prices: {null_string_count:,} ({_pct(null_string_count, total_records):.2f}%)")

        # Show sample string values (only ten are printed, so only ten are fetched)
        print("\n📝 Sample price string values:")
        sample_prices = (
            df.select('price')
            .filter(col('price').isNotNull() & (col('price') != ''))
            .limit(10)
            .collect()
        )
        for i, row in enumerate(sample_prices, 1):
            price_val = row['price']
            print(f"  {i:2d}. '{price_val}' (len: {len(str(price_val))})")

        # Try to convert to numeric and see what fails
        print("\n🔢 NUMERIC CONVERSION TEST:")

        # Strip everything except digits, '.' and '-' before casting. The
        # minus sign is kept (same pattern as optimize_data_types) so that a
        # negative price stays negative instead of being counted as valid.
        df_with_numeric = df.withColumn(
            'price_numeric',
            regexp_replace(col('price'), '[^0-9.-]', '').cast(DoubleType())
        )

        # A conversion counts as successful only when it yields a positive number
        successful_conversions = df_with_numeric.filter(
            col('price_numeric').isNotNull() &
            (col('price_numeric') > 0)
        ).count()
        failed_conversions = total_records - successful_conversions

        print(f"Successful numeric conversions: {successful_conversions:,} ({_pct(successful_conversions, total_records):.2f}%)")
        print(f"Failed conversions: {failed_conversions:,} ({_pct(failed_conversions, total_records):.2f}%)")

        # Show examples of failed conversions: non-empty text that did not
        # survive the clean-and-cast step
        print("\n❌ Examples of unconvertible prices:")
        failed_prices = df_with_numeric.filter(
            col('price').isNotNull() &
            (col('price') != '') &
            col('price_numeric').isNull()
        ).select('price').limit(10).collect()

        for i, row in enumerate(failed_prices, 1):
            print(f"  {i}. '{row['price']}'")

        # Show statistics for successful conversions
        if successful_conversions > 0:
            print("\n📊 Statistics for convertible prices:")
            stats = df_with_numeric.filter(col('price_numeric').isNotNull()).select(
                spark_min('price_numeric').alias('min_price'),
                spark_max('price_numeric').alias('max_price'),
                avg('price_numeric').alias('avg_price'),
                count('price_numeric').alias('count_price')
            ).collect()[0]

            print(f"  Min: {stats['min_price']:,.0f}")
            print(f"  Max: {stats['max_price']:,.0f}")
            print(f"  Average: {stats['avg_price']:,.0f}")
            print(f"  Count: {stats['count_price']:,}")

    else:
        # Price is already numeric
        print("\n📊 NUMERIC PRICE ANALYSIS:")

        # Get basic statistics
        price_stats = df.select(
            spark_min('price').alias('min_price'),
            spark_max('price').alias('max_price'),
            avg('price').alias('avg_price'),
            count('price').alias('count_price')
        ).collect()[0]

        # min/max/avg come back as None when there is no non-null price;
        # formatting None with ':,.2f' would raise TypeError
        if price_stats['count_price']:
            print(f"  Min: {price_stats['min_price']:,.2f}")
            print(f"  Max: {price_stats['max_price']:,.2f}")
            print(f"  Average: {price_stats['avg_price']:,.2f}")
        else:
            print("  Min/Max/Average: n/a (no non-null prices)")
        print(f"  Non-null count: {price_stats['count_price']:,}")

        # Check for problematic values
        zero_prices = df.filter(col('price') == 0).count()
        negative_prices = df.filter(col('price') < 0).count()

        print(f"\nZero prices: {zero_prices:,}")
        print(f"Negative prices: {negative_prices:,}")

else:
    print("❌ Price column not found in dataset!")
    print(f"Available columns: {df.columns}")

## 6. Simulate Pipeline Validation Logic

In [None]:
# Simulate the exact validation logic from the ML pipeline
def _pct(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is 0."""
    return (part / whole * 100) if whole else 0.0


def _present_condition(column_name, column_type):
    """Build the "value is present" filter applied to one column.

    String columns must be non-null and differ from '', 'null' and 'NULL';
    every other column type only has to be non-null.
    """
    condition = col(column_name).isNotNull()
    if column_type == 'string':
        condition = (
            condition &
            (col(column_name) != '') &
            (col(column_name) != 'null') &
            (col(column_name) != 'NULL')
        )
    return condition


print("🔍 SIMULATING PIPELINE VALIDATION LOGIC:")
print("=" * 60)

# Start with the original dataset
current_df = df
initial_count = current_df.count()
# Running row count; updated after every filter so the same DataFrame is
# never counted twice
remaining_count = initial_count
print(f"📊 Starting records: {initial_count:,}")

# Step 1: Basic data validation (similar to _validate_basic_data method)
print("\n🔍 Step 1: Basic Data Validation")

# Check what columns exist for validation
validation_columns = ['id', 'price', 'area', 'latitude', 'longitude']
available_validation_cols = [name for name in validation_columns if name in current_df.columns]
print(f"Available validation columns: {available_validation_cols}")

# Filters never change column types, so one lookup table serves every step
column_types = dict(current_df.dtypes)

# Apply validation logic step by step
if 'price' in available_validation_cols:
    # Check price column type and apply appropriate validation
    price_type = column_types['price']
    print(f"Price column type: {price_type}")

    if price_type == 'string':
        # String type validation
        print("\n💰 Validating STRING price column:")

        # Count records that would be filtered out
        null_or_empty = current_df.filter(
            col('price').isNull() |
            (col('price') == '') |
            (col('price') == 'null') |
            (col('price') == 'NULL')
        ).count()
        print(f"Records with null/empty price: {null_or_empty:,}")
    else:
        # Numeric type validation: only null values are filtered
        print(f"\n💰 Validating NUMERIC price column ({price_type}):")

        null_prices = current_df.filter(col('price').isNull()).count()
        print(f"Records with null price: {null_prices:,}")

    # Apply the price filter (the step suspected of causing the data loss)
    current_df = current_df.filter(_present_condition('price', price_type))

    after_price_validation = current_df.count()
    remaining_count = after_price_validation
    filtered_out = initial_count - after_price_validation
    print(f"Records after price validation: {after_price_validation:,}")
    print(f"Records filtered out: {filtered_out:,} ({_pct(filtered_out, initial_count):.2f}%)")

    if price_type == 'string':
        # ISSUE ANALYSIS: Check if the problem is that price values look like numbers but are stored as strings
        print("\n🔍 ANALYZING THE ISSUE:")

        # Look at the original data before any filtering
        print("Checking sample price values in original data...")
        original_sample = df.select('price').limit(10).collect()

        for i, row in enumerate(original_sample, 1):
            price_val = row['price']
            is_null = price_val is None
            is_empty = price_val == '' if not is_null else False
            is_null_string = price_val in ['null', 'NULL'] if not is_null else False

            print(f"  {i:2d}. Value: '{price_val}' | Null: {is_null} | Empty: {is_empty} | NullString: {is_null_string}")

# Continue with other validations if we still have data
if remaining_count > 0:
    print("\n🔍 Step 2: Additional Validations")

    # (column, icon, label used in the heading) for area and location checks
    follow_up_checks = [
        ('area', '🏠', 'Area'),
        ('latitude', '📍', 'latitude'),
        ('longitude', '📍', 'longitude'),
    ]

    for check_col, icon, label in follow_up_checks:
        if check_col not in available_validation_cols:
            continue

        check_type = column_types[check_col]
        print(f"\n{icon} {label} column type: {check_type}")

        before_check = remaining_count
        current_df = current_df.filter(_present_condition(check_col, check_type))
        remaining_count = current_df.count()
        print(f"Records after {check_col} validation: {remaining_count:,} (filtered: {before_check - remaining_count:,})")

print(f"\n📊 FINAL RESULT:")
final_count = remaining_count
total_filtered = initial_count - final_count
print(f"Final records: {final_count:,}")
print(f"Total filtered out: {total_filtered:,} ({_pct(total_filtered, initial_count):.2f}%)")

if total_filtered > initial_count * 0.8:  # More than 80% filtered
    print("\n🚨 CRITICAL ISSUE IDENTIFIED:")
    print("More than 80% of data is being filtered out!")
    print("This suggests a fundamental data type or validation logic problem.")

## 7. Root Cause Analysis and Recommendations

In [None]:
print("🔍 ROOT CAUSE ANALYSIS:")
print("=" * 60)

# Facts reused by both sections below
total_records = df.count()
has_price = 'price' in df.columns
price_is_string = has_price and dict(df.dtypes)['price'] == 'string'

# Check the actual data distribution in key columns
print("\n1. DATA TYPE ISSUES:")

# Check if numeric values are stored as strings
if has_price:
    price_type = dict(df.dtypes)['price']
    if price_is_string:
        print("❌ ISSUE: Price is stored as STRING but should be NUMERIC")

        # Test conversion rate with the same cleaning pattern that
        # optimize_data_types uses (digits, '.' and '-' are kept)
        df_test = df.withColumn(
            'price_as_double',
            regexp_replace(col('price'), '[^0-9.-]', '').cast(DoubleType())
        )

        convertible = df_test.filter(col('price_as_double').isNotNull()).count()
        total = total_records  # withColumn does not change the row count
        convertible_pct = (convertible / total * 100) if total else 0.0

        print(f"   Convertible to numeric: {convertible:,}/{total:,} ({convertible_pct:.1f}%)")

        if convertible > total * 0.8:  # More than 80% convertible
            print("   💡 SOLUTION: Convert string prices to numeric BEFORE validation")
    else:
        print(f"✅ Price column has correct type: {price_type}")

print("\n2. VALIDATION LOGIC ISSUES:")

# Check if the validation is too strict. The string comparisons only apply to
# a string-typed price column; a numeric column cannot hold '' or 'null'.
problematic_patterns = {
    'empty_strings': df.filter(col('price') == '').count() if price_is_string else 0,
    'null_strings': df.filter(col('price').isin(['null', 'NULL'])).count() if price_is_string else 0,
    'actual_nulls': df.filter(col('price').isNull()).count() if has_price else 0
}

# The loop variable is deliberately not called `count`: that would rebind the
# PySpark `count` function imported at the top of the notebook
for pattern, pattern_count in problematic_patterns.items():
    if pattern_count > 0:
        pct = (pattern_count / total_records) * 100
        print(f"   {pattern}: {pattern_count:,} ({pct:.2f}%)")

print("\n3. RECOMMENDATIONS:")
print("   1. 🔧 Add data type optimization BEFORE validation")
print("   2. 🔧 Convert string numeric columns to proper types")
print("   3. 🔧 Use type-aware validation logic")
print("   4. 🔧 Add better logging to track filtering steps")
print("   5. 🔧 Consider relaxing validation criteria for development")

## 8. Proposed Fix Implementation

In [None]:
print("🔧 TESTING PROPOSED FIX:")
print("=" * 50)


# Implement the fix that should be applied to the pipeline
def optimize_data_types(df):
    """Cast string-typed numeric columns of ``df`` to double.

    Only ``price``, ``area``, ``latitude``, ``longitude`` and ``price_per_m2``
    are considered, and only when they exist and are currently of type
    ``string``. Each value has every character other than digits, ``.`` and
    ``-`` removed before the cast. Progress is printed.

    Args:
        df: Input DataFrame.

    Returns:
        A DataFrame with the converted columns, or ``df`` itself when nothing
        needed converting.
    """
    numeric_columns = ['price', 'area', 'latitude', 'longitude', 'price_per_m2']

    # Decide up front which columns need work, based on the input's types
    source_types = dict(df.dtypes)
    string_columns = [
        name
        for name in numeric_columns
        if name in df.columns and source_types[name] == 'string'
    ]

    result_df = df
    for name in string_columns:
        print(f"🔄 Converting {name} from string to double")

        # Clean first, then cast
        cleaned = regexp_replace(col(name), '[^0-9.-]', '')
        result_df = result_df.withColumn(name, cleaned.cast(DoubleType()))

    print(f"✅ Applied conversions to: {string_columns}")
    return result_df

def validate_with_proper_types(df):
    """Drop rows whose price, area or coordinates are missing or implausible.

    Each of ``price``, ``area``, ``latitude`` and ``longitude`` that exists in
    ``df`` is checked in turn, and the surviving row count is printed after
    every step.

    * Numeric columns: ``price`` and ``area`` must be non-null and > 0;
      ``latitude`` must be non-null, non-zero and within [-90, 90];
      ``longitude`` must be non-null, non-zero and within [-180, 180].
    * Any other column type is validated as text: non-null, non-empty and not
      one of the literal strings 'null', 'NULL', 'None'.

    Args:
        df: DataFrame to validate.

    Returns:
        The filtered DataFrame (``df`` itself when no checked column exists).
    """
    # Type names as reported by DataFrame.dtypes (e.g. 'bigint' for longs);
    # 'integer' and 'long' are kept from the previous list for compatibility.
    numeric_type_names = {
        'double', 'float', 'int', 'integer', 'long',
        'bigint', 'smallint', 'tinyint',
    }
    # Largest valid absolute value per coordinate, in degrees
    coordinate_limits = {'latitude': 90, 'longitude': 180}
    validation_columns = ['price', 'area', 'latitude', 'longitude']

    result_df = df
    initial_count = df.count()

    print(f"Starting validation with {initial_count:,} records")

    # Running row count, so each filtered frame is counted exactly once
    remaining = initial_count

    for col_name in validation_columns:
        if col_name not in df.columns:
            continue

        col_type = dict(result_df.dtypes)[col_name]
        column = col(col_name)
        # Decimal types are reported with precision and scale, e.g. 'decimal(10,2)'
        is_numeric = col_type in numeric_type_names or col_type.startswith('decimal')

        if not is_numeric:
            # String validation - filter null, empty, and 'null' strings
            keep = (
                column.isNotNull() &
                (column != '') &
                (~column.isin(['null', 'NULL', 'None']))
            )
        elif col_name in coordinate_limits:
            # Coordinates: reject null, zero (treated as invalid) and out-of-range values
            limit = coordinate_limits[col_name]
            keep = (
                column.isNotNull() &
                (column != 0) &
                column.between(-limit, limit)
            )
        elif col_name in ('price', 'area'):
            # Price and area must be strictly positive
            keep = column.isNotNull() & (column > 0)
        else:
            # General numeric validation
            keep = column.isNotNull()

        result_df = result_df.filter(keep)
        after_count = result_df.count()
        filtered = remaining - after_count
        remaining = after_count

        print(f"  {col_name} ({col_type}): {after_count:,} records (filtered: {filtered:,})")

    final_count = remaining
    total_filtered = initial_count - final_count
    # An empty input reports 0.0% instead of dividing by zero
    filtered_pct = (total_filtered / initial_count * 100) if initial_count else 0.0

    print(f"\n📊 Validation complete:")
    print(f"  Final records: {final_count:,}")
    print(f"  Total filtered: {total_filtered:,} ({filtered_pct:.1f}%)")

    return result_df

# Apply the fix
def _pct(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is 0."""
    return (part / whole * 100) if whole else 0.0


print("\n🔧 STEP 1: Optimize Data Types")
fixed_df = optimize_data_types(df)

# Show the difference
print("\n📋 Data types after optimization:")
key_columns = ['price', 'area', 'latitude', 'longitude']
original_types = dict(df.dtypes)
fixed_types = dict(fixed_df.dtypes)
for col_name in key_columns:
    if col_name in fixed_df.columns:
        print(f"  {col_name}: {original_types[col_name]} → {fixed_types[col_name]}")

print("\n🔧 STEP 2: Apply Proper Validation")
validated_df = validate_with_proper_types(fixed_df)

# Compare with original broken validation
original_count = df.count()
fixed_count = validated_df.count()
# final_count is the result of the pipeline-simulation cell, which must have
# been run before this one
improvement = fixed_count - final_count

print(f"\n📊 COMPARISON:")
print(f"  Original data: {original_count:,} records")
print(f"  Broken pipeline result: {final_count:,} records ({_pct(final_count, original_count):.1f}% retained)")
print(f"  Fixed pipeline result: {fixed_count:,} records ({_pct(fixed_count, original_count):.1f}% retained)")
# '+' in the format spec prints the real sign, so a regression shows as
# '-N' rather than '+-N'
print(f"  Improvement: {improvement:+,} records ({_pct(improvement, original_count):.1f}% better retention)")

if fixed_count > original_count * 0.5:  # Retaining more than 50%
    print("\n✅ SUCCESS: Fix dramatically improves data retention!")
else:
    print("\n⚠️  WARNING: Still losing significant data - further investigation needed")

# Show sample of final data
if fixed_count > 0:
    print("\n🔍 Sample of cleaned data:")
    sample_cols = [
        name
        for name in ['id', 'price', 'area', 'latitude', 'longitude']
        if name in validated_df.columns
    ]
    validated_df.select(*sample_cols).show(5, truncate=False)

In [None]:
# Cleanup: stop the Spark session created in the setup cell.
# Intended as the last cell of the notebook.
spark.stop()
print("✅ Spark session stopped")