# Auto Insurance Fraud Detection - ML Preprocessing Pipeline

## Senior-Level Data Science Approach

This notebook implements a comprehensive preprocessing pipeline for auto insurance fraud detection using industry best practices.

### Pipeline Overview:
1. **Data Loading & Quality Analysis**
2. **Missing Value Handling**
3. **Duplicate Detection & Removal**
4. **Outlier Detection & Treatment**
5. **Categorical Feature Encoding**
6. **Feature Selection**
7. **Feature Normalization**
8. **Feature Engineering**
9. **Visualization & PDF Report Generation**
10. **Saving the Processed Data**

In [None]:
# Environment setup: extend the import path and load the pipeline code.
import sys
import os
sys.path.append('/Users/debabratapattnayak/web-dev/learnathon')

# Load the preprocessing pipeline into this notebook's namespace.
# NOTE(review): exec() runs the script verbatim with the notebook's full
# privileges, so it must only ever point at trusted project code. The path is
# relative, so this cell has to be run from the directory containing the script.
pipeline_script = 'comprehensive_fraud_preprocessing.py'

# The context manager closes the file handle (a bare open().read() leaves it
# open), and the explicit encoding matches Python's default source encoding
# instead of depending on the platform locale.
with open(pipeline_script, encoding='utf-8') as script_file:
    pipeline_source = script_file.read()

# Compiling with the real filename makes tracebacks point at the script
# rather than at an anonymous "<string>".
exec(compile(pipeline_source, pipeline_script, 'exec'))

# Display environment info
print("Environment Setup Complete!")
print(f"Python Version: {sys.version}")
print(f"Working Directory: {os.getcwd()}")

## Step 1: Initialize the Preprocessing Pipeline

In [None]:
# Initialize the fraud detection preprocessor.
# NOTE(review): both locations are absolute paths on one developer's machine;
# adjust them before running this notebook anywhere else.
data_path = "/Users/debabratapattnayak/web-dev/learnathon/dataset"
output_dir = "/Users/debabratapattnayak/web-dev/learnathon/ml_analysis_reports"

# FraudDetectionPreprocessor is not imported in this cell; presumably it is
# defined by the pipeline script executed during setup — TODO confirm.
preprocessor = FraudDetectionPreprocessor(data_path, output_dir)
# report_dir is read back from the preprocessor rather than echoing output_dir,
# so the printed location is whatever the preprocessor actually uses.
print(f"Preprocessor initialized with output directory: {preprocessor.report_dir}")

## Step 2: Load and Explore Data

In [None]:
# Read the datasets into the preprocessor, then report their dimensions.
preprocessor.load_data()

train_df = preprocessor.combined_train
print(f"Training Data Shape: {train_df.shape}")
print(f"Test Data Shape: {preprocessor.test_data.shape}")
print(f"\nTraining Data Columns: {[*train_df.columns]}")

In [None]:
# Preview the training data: the leading rows first, then its info() summary.
training_frame = preprocessor.combined_train

print("First 5 rows of training data:")
display(training_frame.head())

print("\nData Info:")
training_frame.info()

## Step 3: Data Quality Analysis

In [None]:
# Run the data quality analysis and summarise missing values and duplicates.
quality_analysis = preprocessor.analyze_data_quality()

print("=== DATA QUALITY ANALYSIS ===")
missing_report = quality_analysis['missing_values']
print(f"\nColumns with missing values: {len(missing_report['columns_with_missing'])}")
duplicate_report = quality_analysis['duplicates']
print(f"Total duplicates: {duplicate_report['count']} ({duplicate_report['percentage']:.2f}%)")

# Keep only the columns that actually have gaps, highest percentage first.
missing_pct = missing_report['percentages']
missing_cols = sorted(
    ((col, pct) for col, pct in missing_pct.items() if pct > 0),
    key=lambda entry: entry[1],
    reverse=True,
)

print("\nTop columns with missing values:")
for col, pct in missing_cols[:10]:
    print(f"  {col}: {pct:.2f}%")

## Step 4: Handle Missing Values

In [None]:
# Run the missing-value handling step and compare the data before and after.
print("Before missing value handling:")
frame_before = preprocessor.combined_train
print(f"Total missing values: {frame_before.isnull().sum().sum()}")
print(f"Data shape: {frame_before.shape}")

preprocessor.handle_missing_values()

# Re-read the attribute: the step may have replaced the frame object.
print("\nAfter missing value handling:")
frame_after = preprocessor.combined_train
print(f"Total missing values: {frame_after.isnull().sum().sum()}")
print(f"Data shape: {frame_after.shape}")

# Report what the step recorded about itself, if it recorded anything.
if 'missing_values_handled' in preprocessor.preprocessing_summary:
    mv_summary = preprocessor.preprocessing_summary['missing_values_handled']
    print(f"\nStrategy used: {mv_summary['strategy_used']}")
    dropped_columns = mv_summary['high_missing_dropped']
    if dropped_columns:
        print(f"Columns dropped: {dropped_columns}")

## Step 5: Handle Duplicates

In [None]:
# Run the duplicate-handling step and print the counts it recorded.
preprocessor.identify_and_handle_duplicates()

step_summary = preprocessor.preprocessing_summary
if 'duplicates_handled' in step_summary:
    dup_info = step_summary['duplicates_handled']
    print("=== DUPLICATE HANDLING RESULTS ===")
    print(f"Initial records: {dup_info['initial_count']}")
    print(f"Final records: {dup_info['final_count']}")
    removed_count = dup_info['removed_count']
    removal_pct = dup_info['removal_percentage']
    print(f"Duplicates removed: {removed_count} ({removal_pct:.2f}%)")

## Step 6: Outlier Detection and Treatment

In [None]:
# Run outlier detection and rank the columns by their IQR outlier counts.
outlier_analysis = preprocessor.detect_outliers()

print("=== OUTLIER ANALYSIS ===")
print(f"Analyzed {len(outlier_analysis)} numerical columns")

# Rows are (column, IQR outlier count, total values), most outliers first.
outlier_summary = sorted(
    (
        (col, info['iqr_outliers'], info['total_values'])
        for col, info in outlier_analysis.items()
    ),
    key=lambda row: row[1],
    reverse=True,
)

print("\nTop columns with outliers (IQR method):")
for col, outliers, total in outlier_summary[:10]:
    # A column with no values reports 0 instead of dividing by zero.
    if total > 0:
        percentage = (outliers / total) * 100
    else:
        percentage = 0
    print(f"  {col}: {outliers} outliers ({percentage:.2f}%)")

In [None]:
# Treat outliers with the 'cap' method and show what was recorded per column.
preprocessor.handle_outliers(method='cap')

# NOTE(review): the percentile bounds quoted in this message are not visible
# in this notebook; confirm them against handle_outliers.
print("Outliers handled using capping method (1st and 99th percentiles)")
step_summary = preprocessor.preprocessing_summary
if 'outliers_handled' in step_summary:
    outlier_info = step_summary['outliers_handled']
    print(f"\nColumns processed: {len(outlier_info)}")

    # Only the first five recorded columns are listed to keep the output short.
    leading_entries = list(outlier_info.items())[:5]
    for col, info in leading_entries:
        print(f"  {col}: {info['outliers_handled']} outliers capped")

## Step 7: Categorical Feature Encoding

In [None]:
# Encode the categorical columns and compare object-dtype columns before/after.
print("Before encoding:")
object_frame = preprocessor.combined_train.select_dtypes(include=['object'])
categorical_cols = object_frame.columns
print(f"Categorical columns: {len(categorical_cols)}")
print(f"Columns: {list(categorical_cols)}")

preprocessor.encode_categorical_features()

print("\nAfter encoding:")
remaining_object_frame = preprocessor.combined_train.select_dtypes(include=['object'])
categorical_cols_after = remaining_object_frame.columns
print(f"Remaining categorical columns: {len(categorical_cols_after)}")

if 'categorical_encoding' in preprocessor.preprocessing_summary:
    enc_info = preprocessor.preprocessing_summary['categorical_encoding']

    # Bucket the columns by the encoding method recorded for each of them.
    label_encoded = []
    freq_encoded = []
    for col, info in enc_info.items():
        method = info['method']
        if method == 'label_encoding':
            label_encoded.append(col)
        elif method == 'frequency_encoding':
            freq_encoded.append(col)

    print(f"\nLabel encoded columns: {len(label_encoded)}")
    print(f"Frequency encoded columns: {len(freq_encoded)}")

## Step 8: Feature Selection

In [None]:
# Ask the preprocessor for its 15 most important features and list them.
selected_features = preprocessor.select_important_features(n_features=15)

print("=== FEATURE SELECTION RESULTS ===")
# One column is the target, so it is not counted as an available feature.
available_count = len(preprocessor.combined_train.columns) - 1
print(f"Total features available: {available_count}")
print(f"Selected features: {len(selected_features)}")

print("\nTop Selected Features:")
for rank, feature in enumerate(selected_features, start=1):
    # Features without a recorded combined score are shown with a score of 0.
    score = preprocessor.feature_analysis['combined_scores'].get(feature, 0)
    print(f"{rank:2d}. {feature:<25} (Score: {score:.4f})")

## Step 9: Feature Normalization

In [None]:
# Normalize the selected features and compare statistics before and after.
preprocessor.normalize_features()

print("=== FEATURE NORMALIZATION ===")
if 'normalization' in preprocessor.preprocessing_summary:
    norm_info = preprocessor.preprocessing_summary['normalization']
    print(f"Method: {norm_info['method']}")
    print(f"Features normalized: {len(norm_info['features_normalized'])}")

    # Show before/after statistics for a few features.
    print("\nNormalization Statistics (first 5 features):")
    train_df = preprocessor.combined_train
    for feature in norm_info['features_normalized'][:5]:
        normalized_col = f"{feature}_normalized"

        # This cell assumes normalized values live in "<feature>_normalized"
        # next to the original column. Skip a feature with an explicit note
        # rather than aborting the whole cell with a KeyError if either
        # column is absent.
        if feature not in train_df.columns or normalized_col not in train_df.columns:
            print(f"  {feature}: original or '{normalized_col}' column not found, skipping")
            continue

        original_values = train_df[feature]
        normalized_values = train_df[normalized_col]

        print(f"  {feature}:")
        print(f"    Original: mean={original_values.mean():.2f}, std={original_values.std():.2f}")
        print(f"    Normalized: mean={normalized_values.mean():.2f}, std={normalized_values.std():.2f}")

## Step 10: Feature Engineering

In [None]:
# Build the engineered features and print a short profile of each one.
engineered_features = preprocessor.create_engineered_features()

print("=== FEATURE ENGINEERING RESULTS ===")
print(f"Engineered features created: {len(engineered_features)}")

print("\nNew Features:")
for i, feature in enumerate(engineered_features, start=1):
    print(f"{i}. {feature}")

    # Statistics are only shown for features present as columns; either way
    # each entry ends with a blank separator line.
    if feature not in preprocessor.combined_train.columns:
        print()
        continue

    stats = preprocessor.combined_train[feature].describe()
    print(f"   Mean: {stats['mean']:.4f}, Std: {stats['std']:.4f}")
    print(f"   Min: {stats['min']:.4f}, Max: {stats['max']:.4f}")
    print()

## Step 11: Generate Visualizations

In [None]:
# Create the preprocessing visualizations and list the PNG files on disk.
import glob

preprocessor.create_preprocessing_visualizations()

print("Visualizations created successfully!")
print(f"Saved to: {preprocessor.report_dir}")

# List the PNG files in the report directory.
# The directory part is escaped so glob metacharacters in the path ("[", "*",
# "?") are matched literally; only "*.png" acts as a pattern. Sorting makes
# the listing deterministic, since glob returns entries in arbitrary order.
# NOTE: this counts every PNG in the directory, including any left over from
# earlier runs, not only the files written by the call above.
png_pattern = os.path.join(glob.escape(str(preprocessor.report_dir)), "*.png")
viz_files = sorted(glob.glob(png_pattern))
print(f"\nVisualization files created: {len(viz_files)}")
for file in viz_files:
    print(f"  - {os.path.basename(file)}")

## Step 12: Generate PDF Reports

In [None]:
# Generate the two PDF reports, one after the other.
# Each generator's return value is kept in a notebook-level variable and
# printed as the report's location.
print("Generating PDF reports...")

# Generate preprocessing report (PDF 1)
pdf1_path = preprocessor.generate_preprocessing_pdf()
print(f"\nPreprocessing Report (PDF 1): {pdf1_path}")

# Generate feature engineering report (PDF 2)
pdf2_path = preprocessor.generate_feature_engineering_pdf()
print(f"Feature Engineering Report (PDF 2): {pdf2_path}")

# Reached only if both generator calls returned without raising.
print("\n=== PDF REPORTS GENERATED SUCCESSFULLY ===")

## Step 13: Save Processed Data

In [None]:
# Write the processed training data to CSV inside the report directory.
processed_data_path = preprocessor.report_dir / "processed_training_data.csv"
final_frame = preprocessor.combined_train
final_frame.to_csv(processed_data_path, index=False)

print(f"Processed data saved to: {processed_data_path}")
print(f"Final data shape: {final_frame.shape}")

# Summarise the columns of the final dataset.
print("\nFinal Dataset Summary:")
print(f"Total columns: {len(final_frame.columns)}")
print(f"Selected original features: {len(preprocessor.selected_features)}")
print(f"Engineered features: {len(preprocessor.engineered_features)}")
# Normalized columns are recognised by the "_normalized" marker in their name.
normalized_count = sum(1 for col in final_frame.columns if '_normalized' in col)
print(f"Normalized features: {normalized_count}")

## Final Summary and Next Steps

In [None]:
# Closing summary: where the artefacts are, how the data changed, what is next.
# Only strings that interpolate a value are f-strings; the rest are plain
# literals. The printed output is unchanged.
banner = "=" * 80

print("\n" + banner)
print("PREPROCESSING PIPELINE COMPLETED SUCCESSFULLY!")
print(banner)

# NOTE(review): the two PDF file names below are hard-coded; confirm they
# match what the report generators actually write.
print(f"\n📁 Report Directory: {preprocessor.report_dir}")
print("📄 Preprocessing Report (PDF 1): preprocessing_analysis_report.pdf")
print("📄 Feature Engineering Report (PDF 2): feature_engineering_report.pdf")
print("💾 Processed Data: processed_training_data.csv")

print("\n📊 Data Summary:")
print(f"   • Original shape: {preprocessor.preprocessing_summary['initial_train_shape']}")
print(f"   • Final shape: {preprocessor.combined_train.shape}")
print(f"   • Selected features: {len(preprocessor.selected_features)}")
print(f"   • Engineered features: {len(preprocessor.engineered_features)}")

print("\n🎯 Ready for Model Building:")
print("   • Data is cleaned and preprocessed")
print("   • Features are selected and engineered")
print("   • Normalization applied for ML algorithms")
print("   • Comprehensive documentation generated")

print("\n🚀 Next Steps:")
print("   1. Review the generated PDF reports")
print("   2. Use processed data for model training")
print("   3. Apply same preprocessing to test data")
print("   4. Build and evaluate ML models")
print("   5. Create Streamlit application")

print(banner)