In [5]:
# complete_fraud_detection_system.py
# Complete Insurance Fraud Detection System with Training and Prediction

import pandas as pd
import numpy as np
import joblib
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import xgboost as xgb
from sklearn.utils import resample
# Suppress every warning category for the whole process (this also hides
# deprecation warnings raised by the imported libraries) -- presumably to keep
# the interactive console output uncluttered.
warnings.filterwarnings('ignore')

# Set up plotting style: matplotlib's stock style sheet plus seaborn's "husl"
# palette as the default colour cycle for every figure drawn below.
plt.style.use('default')
sns.set_palette("husl")

class FraudDetectionSystem:
    def __init__(self):
        self.model = None
        self.scaler = None
        self.feature_columns = None
        self.label_encoders = {}
        
    def create_synthetic_data(self, n_samples=10000):
        """
        Create synthetic insurance fraud data for training.

        Every column is drawn independently from a fixed distribution; the
        'Fraud' label is then sampled from a per-row probability built by
        adding a weight for each rule-based risk indicator that applies.

        Args:
            n_samples: Number of claim records to generate.

        Returns:
            pandas.DataFrame with one row per claim, the raw feature columns
            and a binary 'Fraud' column (1 = fraud).
        """
        print("🔧 Creating synthetic training data...")

        # A private generator seeded with 42 produces the same stream as
        # np.random.seed(42) followed by np.random.* calls, but leaves the
        # process-wide RNG untouched for any other code that relies on it.
        rng = np.random.RandomState(42)

        # NOTE: the draw order below defines the generated values; keep it
        # stable if reproducibility against earlier runs matters.
        data = {
            'Policy_Num': [f'POL{i:06d}' for i in range(n_samples)],
            'Age': rng.randint(18, 80, n_samples),
            'Gender': rng.choice(['Male', 'Female'], n_samples, p=[0.6, 0.4]),
            'Education': rng.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_samples, p=[0.4, 0.35, 0.2, 0.05]),
            'Occupation': rng.choice(['Engineer', 'Doctor', 'Teacher', 'Student', 'Other'], n_samples, p=[0.2, 0.15, 0.15, 0.1, 0.4]),
            'Annual_Income': rng.lognormal(10.5, 0.5, n_samples).astype(int),
            'Policy_State': rng.choice(['CA', 'TX', 'NY', 'FL', 'IL'], n_samples, p=[0.3, 0.25, 0.2, 0.15, 0.1]),
            'Vehicle_Cost': rng.lognormal(10, 0.3, n_samples).astype(int),
            'Vehicle_Age': rng.randint(0, 20, n_samples),
            'Auto_Make': rng.choice(['Toyota', 'Honda', 'Ford', 'BMW', 'Mercedes'], n_samples, p=[0.3, 0.25, 0.2, 0.15, 0.1]),
            'Auto_Model': rng.choice(['Sedan', 'SUV', 'Truck', 'Coupe', 'Hatchback'], n_samples, p=[0.3, 0.25, 0.2, 0.15, 0.1]),
            'Total_Claim': rng.lognormal(8, 0.7, n_samples).astype(int),
            'Injury_Claim': rng.lognormal(7, 0.8, n_samples).astype(int),
            'Property_Claim': rng.lognormal(7.5, 0.6, n_samples).astype(int),
            'Vehicle_Claim': rng.lognormal(7.8, 0.5, n_samples).astype(int),
            'Accident_Severity': rng.choice(['Minor', 'Major', 'Total Loss'], n_samples, p=[0.6, 0.3, 0.1]),
            'Accident_Type': rng.choice(['Single Vehicle', 'Multi-Vehicle', 'Parked Car'], n_samples, p=[0.4, 0.5, 0.1]),
            'Collision_Type': rng.choice(['Front', 'Rear', 'Side', 'Other'], n_samples, p=[0.3, 0.25, 0.25, 0.2]),
            'Accident_Hour': rng.randint(0, 24, n_samples),
            'Number_of_Vehicles_Involved': rng.choice([1, 2, 3, 4], n_samples, p=[0.4, 0.4, 0.15, 0.05]),
            'Bodily_Injuries': rng.randint(0, 5, n_samples),
            'Witnesses': rng.randint(0, 4, n_samples),
            'Police_Report_Available': rng.choice(['Yes', 'No'], n_samples, p=[0.7, 0.3]),
            'authorities_contacted': rng.choice(['Police', 'Fire', 'Ambulance', 'None'], n_samples, p=[0.5, 0.1, 0.2, 0.2]),
            'Deductible': rng.choice([500, 1000, 1500, 2000], n_samples, p=[0.3, 0.4, 0.2, 0.1]),
            'Driver_Rating': rng.randint(1, 5, n_samples),
            'Days_Policy_Accident': rng.randint(1, 365, n_samples),
            'Days_Policy_Claim': rng.randint(1, 30, n_samples),
            'Past_Number_of_Claims': rng.poisson(0.5, n_samples),
            'SafeDriver': rng.choice(['Yes', 'No'], n_samples, p=[0.8, 0.2])
        }

        df = pd.DataFrame(data)

        # The night window wraps around midnight, so it has to be expressed
        # as "late OR early"; Series.between(22, 6) is always False because
        # its lower bound exceeds its upper bound.
        accident_hour = df['Accident_Hour']
        night_time = (accident_hour >= 22) | (accident_hour <= 6)

        # (indicator, weight) pairs: each indicator that holds for a row adds
        # its weight to that row's fraud probability.
        risk_indicators = [
            (df['Total_Claim'] > 50000, 0.3),
            (df['Days_Policy_Claim'] < 7, 0.25),
            (df['Police_Report_Available'] == 'No', 0.2),
            (df['Witnesses'] == 0, 0.15),
            (df['Past_Number_of_Claims'] > 2, 0.2),
            (night_time, 0.1),
            (df['SafeDriver'] == 'No', 0.15),
            (df['Accident_Severity'] == 'Total Loss', 0.2),
        ]

        fraud_probability = np.zeros(n_samples)
        for indicator, weight in risk_indicators:
            fraud_probability += indicator.to_numpy() * weight

        # Add some randomness so the label is not a pure function of the rules
        fraud_probability += rng.random_sample(n_samples) * 0.1

        # Stacked indicators can exceed 1, so clip to a valid probability
        fraud_probability = np.clip(fraud_probability, 0, 1)

        # Generate fraud labels: one Bernoulli draw per row
        df['Fraud'] = (rng.random_sample(n_samples) < fraud_probability).astype(int)

        print(f"✅ Synthetic data created: {len(df)} records")
        print(f"   Fraud cases: {df['Fraud'].sum()} ({df['Fraud'].mean()*100:.1f}%)")

        return df
    
    def preprocess_data(self, df, is_training=True):
        """
        Preprocess the data for training or prediction
        """
        print("🔧 Preprocessing data...")
        
        # Create a copy to avoid modifying original
        data = df.copy()
        
        # Handle missing values
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        categorical_cols = data.select_dtypes(include=['object']).columns
        
        # Fill numeric missing values with median
        for col in numeric_cols:
            if data[col].isnull().any():
                data[col].fillna(data[col].median(), inplace=True)
        
        # Fill categorical missing values with mode
        for col in categorical_cols:
            if data[col].isnull().any():
                data[col].fillna(data[col].mode()[0], inplace=True)
        
        # Create engineered features
        data['Claim_to_Vehicle_Ratio'] = data.get('Total_Claim', 0) / (data.get('Vehicle_Cost', 1) + 1)
        data['Age_Vehicle_Interaction'] = data.get('Age', 0) * data.get('Vehicle_Age', 0)
        data['High_Risk_Hour'] = ((data.get('Accident_Hour', 12) >= 22) | (data.get('Accident_Hour', 12) <= 6)).astype(int)
        data['Quick_Claim'] = (data.get('Days_Policy_Claim', 30) <= 7).astype(int)
        data['High_Claim_Amount'] = (data.get('Total_Claim', 0) > data.get('Total_Claim', 0).quantile(0.9) if 'Total_Claim' in data.columns else 0).astype(int)
        data['Multiple_Claims_History'] = (data.get('Past_Number_of_Claims', 0) > 1).astype(int)
        
        # Encode categorical variables
        categorical_features = ['Gender', 'Education', 'Occupation', 'Policy_State', 'Auto_Make', 
                              'Auto_Model', 'Accident_Severity', 'Accident_Type', 'Collision_Type',
                              'Police_Report_Available', 'authorities_contacted', 'SafeDriver']
        
        for col in categorical_features:
            if col in data.columns:
                if is_training:
                    # Create and store label encoder for training
                    le = LabelEncoder()
                    data[col] = le.fit_transform(data[col].astype(str))
                    self.label_encoders[col] = le
                else:
                    # Use existing label encoder for prediction
                    if col in self.label_encoders:
                        try:
                            data[col] = self.label_encoders[col].transform(data[col].astype(str))
                        except ValueError:
                            # Handle unseen categories
                            data[col] = 0
                    else:
                        data[col] = 0
        
        # Remove non-feature columns
        columns_to_remove = ['Policy_Num', 'Fraud'] if 'Policy_Num' in data.columns else []
        for col in columns_to_remove:
            if col in data.columns:
                data = data.drop(columns=[col])
        
        # Store feature columns for consistency
        if is_training:
            self.feature_columns = data.columns.tolist()
        else:
            # Ensure prediction data has same columns as training
            for col in self.feature_columns:
                if col not in data.columns:
                    data[col] = 0
            data = data[self.feature_columns]
        
        print(f"✅ Preprocessing complete: {data.shape[1]} features")
        return data
    
    def train_model(self, df=None):
        """
        Fit the XGBoost fraud classifier, report its quality, persist it and
        plot the evaluation figures.

        Args:
            df: Labelled claims DataFrame containing a 'Fraud' column. When
                omitted, synthetic data is generated and used instead.

        Returns:
            The fitted classifier (also stored on self.model).
        """
        rule = "=" * 70
        print("\n" + rule)
        print("🚀 TRAINING FRAUD DETECTION MODEL")
        print(rule)

        # Fall back to generated data when the caller supplies none
        if df is None:
            df = self.create_synthetic_data()

        # Feature matrix and target vector
        features = self.preprocess_data(df, is_training=True)
        target = df['Fraud'].values
        sample_count, feature_count = features.shape

        print("\n📊 Training Data Summary:")
        print(f"   Features: {feature_count}")
        print(f"   Samples: {sample_count}")
        print(f"   Fraud Rate: {target.mean()*100:.1f}%")

        # Stratified 80/20 hold-out split
        split = train_test_split(
            features, target, test_size=0.2, random_state=42, stratify=target
        )
        train_features, test_features, train_target, test_target = split

        # The scaler is fitted on the training portion only
        self.scaler = StandardScaler()
        train_scaled = self.scaler.fit_transform(train_features)
        test_scaled = self.scaler.transform(test_features)

        print("\n🤖 Training XGBoost model...")
        xgb_settings = {
            'n_estimators': 100,
            'max_depth': 6,
            'learning_rate': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'eval_metric': 'logloss',
        }
        self.model = xgb.XGBClassifier(**xgb_settings)
        self.model.fit(train_scaled, train_target)

        # Hold-out evaluation
        train_accuracy = self.model.score(train_scaled, train_target)
        test_accuracy = self.model.score(test_scaled, test_target)
        predicted_labels = self.model.predict(test_scaled)
        fraud_scores = self.model.predict_proba(test_scaled)[:, 1]
        holdout_auc = roc_auc_score(test_target, fraud_scores)

        print("\n📈 Model Performance:")
        print(f"   Training Accuracy: {train_accuracy:.4f}")
        print(f"   Testing Accuracy: {test_accuracy:.4f}")
        print(f"   AUC Score: {holdout_auc:.4f}")

        # 5-fold cross-validated AUC on the training portion
        fold_aucs = cross_val_score(
            self.model, train_scaled, train_target, cv=5, scoring='roc_auc'
        )
        print(f"   CV AUC: {fold_aucs.mean():.4f} (+/- {fold_aucs.std() * 2:.4f})")

        print("\n📋 Classification Report:")
        report = classification_report(
            test_target, predicted_labels, target_names=['No Fraud', 'Fraud']
        )
        print(report)

        # Persist the fitted components, then draw the evaluation figure
        self.save_model()
        self.create_training_plots(test_scaled, test_target, fraud_scores)

        print("\n✅ Model training completed successfully!")
        return self.model
    
    def save_model(self):
        """
        Write the model, scaler, feature list and label encoders to fixed
        file names in the current working directory.
        """
        # (label shown in the log line, object to persist, destination file)
        artefacts = (
            ("Model", self.model, 'fraud_detection_xgb_model.pkl'),
            ("Scaler", self.scaler, 'fraud_model_scaler.pkl'),
            ("Features", self.feature_columns, 'fraud_model_features.pkl'),
            ("Encoders", self.label_encoders, 'fraud_label_encoders.pkl'),
        )

        for label, payload, filename in artefacts:
            joblib.dump(payload, filename)
            print(f"💾 {label} saved: {filename}")
    
    def load_model(self):
        """
        Load the trained model and components.

        All four files are read before anything is assigned, so a missing
        file leaves the instance exactly as it was instead of half-loaded
        (for example a model without its scaler).

        Returns:
            True when every component was loaded, False when any of the
            files is missing.
        """
        # Attribute name -> file it is restored from.
        component_files = {
            'model': 'fraud_detection_xgb_model.pkl',
            'scaler': 'fraud_model_scaler.pkl',
            'feature_columns': 'fraud_model_features.pkl',
            'label_encoders': 'fraud_label_encoders.pkl',
        }

        # SECURITY: joblib.load unpickles the file and can therefore execute
        # arbitrary code; only load files this program wrote itself.
        try:
            loaded = {
                attribute: joblib.load(filename)
                for attribute, filename in component_files.items()
            }
        except FileNotFoundError as e:
            print(f"❌ Error loading model: {e}")
            return False

        for attribute, value in loaded.items():
            setattr(self, attribute, value)

        print("✅ Model and components loaded successfully!")
        return True
    
    def create_training_plots(self, X_test, y_test, y_pred_proba):
        """
        Create training evaluation plots.

        Draws a 2x2 figure (ROC curve, top feature importances, predicted
        probability distribution per true class, confusion matrix at a 0.5
        threshold), saves it as a timestamped PNG in the working directory
        and shows it.

        Args:
            X_test: Scaled hold-out features. Not used by the plots; kept so
                existing callers keep working.
            y_test: True binary labels of the hold-out set.
            y_pred_proba: Predicted fraud probability for each hold-out row.
        """
        # Plain arrays make the boolean masks below purely positional,
        # whatever container the caller passed in.
        y_true = np.asarray(y_test)
        scores = np.asarray(y_pred_proba)

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # ROC Curve
        roc_ax = axes[0, 0]
        fpr, tpr, _ = roc_curve(y_true, scores)
        auc_score = roc_auc_score(y_true, scores)

        roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {auc_score:.3f})')
        roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        roc_ax.set_xlabel('False Positive Rate')
        roc_ax.set_ylabel('True Positive Rate')
        roc_ax.set_title('ROC Curve')
        roc_ax.legend()
        roc_ax.grid(True)

        # Feature Importance
        if hasattr(self.model, 'feature_importances_'):
            importance_ax = axes[0, 1]
            importance_df = pd.DataFrame({
                'feature': self.feature_columns,
                'importance': self.model.feature_importances_
            }).sort_values('importance', ascending=True).tail(15)

            importance_ax.barh(importance_df['feature'], importance_df['importance'])
            importance_ax.set_title('Top 15 Feature Importances')
            importance_ax.set_xlabel('Importance')

        # Probability Distribution
        dist_ax = axes[1, 0]
        dist_ax.hist(scores[y_true == 0], bins=30, alpha=0.7, label='No Fraud', color='blue', density=True)
        dist_ax.hist(scores[y_true == 1], bins=30, alpha=0.7, label='Fraud', color='red', density=True)
        dist_ax.set_xlabel('Fraud Probability')
        dist_ax.set_ylabel('Density')
        dist_ax.set_title('Probability Distribution by Class')
        dist_ax.legend()
        dist_ax.grid(True)

        # Confusion Matrix. Passing labels explicitly keeps the matrix 2x2
        # even when one class is absent from the hold-out set.
        matrix_ax = axes[1, 1]
        y_pred_binary = (scores > 0.5).astype(int)
        cm = confusion_matrix(y_true, y_pred_binary, labels=[0, 1])

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=matrix_ax)
        matrix_ax.set_title('Confusion Matrix')
        matrix_ax.set_xlabel('Predicted')
        matrix_ax.set_ylabel('Actual')

        plt.tight_layout()

        # Save plot
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        plot_filename = f"model_evaluation_{timestamp}.png"
        plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
        print(f"📊 Training plots saved: {plot_filename}")
        plt.show()

        # Release the figure; otherwise every training run keeps another
        # figure alive inside pyplot.
        plt.close(fig)
    
    def get_user_input(self):
        """
        Get CSV file path from user input.

        Prompts until the user names an existing file or declines to retry.
        Surrounding quotes are stripped, and '.csv' is appended unless the
        path already ends with it in any letter case.

        Returns:
            The path of an existing regular file, or None when the user
            gives up.
        """
        print("\n" + "="*70)
        print("🔍 INSURANCE FRAUD PREDICTION")
        print("="*70)

        while True:
            print("\n📁 Please enter the CSV file path for fraud prediction:")
            print("   Examples:")
            print("   • Auto_Insurance_Fraud_Claims_File03.csv")
            print("   • data/claims_2024.csv")
            print("   • /full/path/to/your/file.csv")

            # Pasted or drag-and-dropped paths often arrive wrapped in quotes.
            file_path = input("\n➤ Enter file path: ").strip().strip('"\'').strip()

            if not file_path:
                print("❌ Please enter a valid file path!")
                continue

            # Add .csv extension if not present. The comparison ignores case
            # so that "CLAIMS.CSV" is not turned into "CLAIMS.CSV.csv".
            if not file_path.lower().endswith('.csv'):
                file_path += '.csv'

            # Require a regular file: a directory with a matching name would
            # pass a bare existence check and then fail when read.
            if os.path.isfile(file_path):
                print(f"✅ File found: {file_path}")
                return file_path

            print(f"❌ File not found: {file_path}")
            retry = input("   Try again? (y/n): ").strip().lower()
            if retry != 'y':
                return None
    
    def predict_fraud(self, df=None):
        """
        Predict fraud for new data.

        Scores every row, prints a summary, writes the results to a
        timestamped CSV in the working directory and draws the prediction
        figure.

        Args:
            df: Claims DataFrame to score. When omitted, the user is
                prompted for a CSV file path and the data is read from it.

        Returns:
            pandas.DataFrame with columns Claim_ID, Fraud_Prediction,
            Fraud_Probability, Risk_Level and Fraud_Label, or None when no
            data could be loaded, the data is empty, or no trained model is
            available.
        """
        # Get file path if dataframe not provided
        if df is None:
            file_path = self.get_user_input()
            if not file_path:
                print("❌ Exiting prediction.")
                return None

            try:
                print(f"\n📂 Loading data from: {file_path}")
                df = pd.read_csv(file_path)
                print(f"✅ Data loaded: {df.shape[0]} records, {df.shape[1]} columns")
            except Exception as e:
                print(f"❌ Error loading data: {str(e)}")
                return None

        # Nothing to score; the summary below would otherwise divide by zero.
        if df.empty:
            print("❌ No records to score: the input data is empty.")
            return None

        # Load the persisted components unless all of them are already in
        # memory; a model without its scaler or feature list cannot be used.
        components = (self.model, self.scaler, self.feature_columns)
        if any(component is None for component in components):
            if not self.load_model():
                print("❌ Please train the model first!")
                return None

        # Store original data for results
        original_df = df.copy()

        # Preprocess data
        X = self.preprocess_data(df, is_training=False)

        # Scale features
        X_scaled = self.scaler.transform(X)

        # Make predictions
        print("\n🤖 Making fraud predictions...")
        predictions = self.model.predict(X_scaled)
        probabilities = self.model.predict_proba(X_scaled)[:, 1]

        # Create risk levels. include_lowest closes the first bin on the
        # left so that a probability of exactly 0 is labelled "Low" instead
        # of falling outside every bin.
        risk_levels = pd.cut(probabilities,
                             bins=[0, 0.33, 0.66, 1.0],
                             labels=["Low", "Medium", "High"],
                             include_lowest=True)

        # Create results dataframe
        results_df = pd.DataFrame({
            'Claim_ID': original_df.get('Policy_Num', range(len(original_df))),
            'Fraud_Prediction': predictions,
            'Fraud_Probability': probabilities.round(4),
            'Risk_Level': risk_levels,
            'Fraud_Label': ['Fraud' if pred == 1 else 'No Fraud' for pred in predictions]
        })

        # Display results summary
        self.display_results_summary(results_df)

        # Save results
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_file = f"fraud_predictions_{timestamp}.csv"
        results_df.to_csv(output_file, index=False)
        print(f"\n💾 Results saved to: {output_file}")

        # Create prediction visualizations
        self.create_prediction_plots(results_df, original_df)

        return results_df
    
    def display_results_summary(self, results_df):
        """
        Display prediction results summary.

        Prints overall statistics, the breakdown per risk level and, when at
        least one fraud case was predicted, the five highest-probability
        claims.

        Args:
            results_df: DataFrame with the columns Claim_ID,
                Fraud_Prediction, Fraud_Probability and Risk_Level.
        """
        print("\n" + "="*70)
        print("📊 PREDICTION RESULTS SUMMARY")
        print("="*70)

        total_records = len(results_df)
        if total_records == 0:
            # Every percentage below divides by the record count.
            print("   No records to summarize.")
            return

        fraud_cases = int(results_df['Fraud_Prediction'].sum())
        fraud_percentage = (fraud_cases / total_records) * 100

        print(f"📈 Overall Statistics:")
        print(f"   Total records analyzed: {total_records:,}")
        print(f"   Predicted fraud cases: {fraud_cases:,} ({fraud_percentage:.1f}%)")
        print(f"   Average fraud probability: {results_df['Fraud_Probability'].mean():.4f}")
        print(f"   Highest fraud probability: {results_df['Fraud_Probability'].max():.4f}")

        print(f"\n🚨 Risk Level Breakdown:")
        risk_counts = results_df['Risk_Level'].value_counts()
        for level in ['High', 'Medium', 'Low']:
            if level in risk_counts.index:
                count = risk_counts[level]
                percentage = (count / total_records) * 100
                print(f"   {level} Risk: {count:,} cases ({percentage:.1f}%)")

        # Show top risk cases
        if fraud_cases > 0:
            print(f"\n⚠️  Top 5 Highest Risk Cases:")
            top_5 = results_df.nlargest(5, 'Fraud_Probability')
            for _, row in top_5.iterrows():
                print(f"   Claim {row['Claim_ID']}: {row['Fraud_Probability']:.4f} ({row['Risk_Level']} Risk)")
    
    def create_prediction_plots(self, results_df, original_df):
        """
        Create comprehensive prediction visualizations
        """
        print("\n📊 Creating prediction visualizations...")
        
        fig = plt.figure(figsize=(20, 15))
        
        # 1. Fraud Distribution Pie Chart
        plt.subplot(3, 4, 1)
        fraud_counts = results_df['Fraud_Label'].value_counts()
        colors = ['#2ecc71', '#e74c3c']
        plt.pie(fraud_counts.values, labels=fraud_counts.index, autopct='%1.1f%%', 
                colors=colors, startangle=90, explode=(0.05, 0.05))
        plt.title('Fraud vs Non-Fraud Distribution', fontsize=12, fontweight='bold')
        
        # 2. Risk Level Distribution
        plt.subplot(3, 4, 2)
        risk_counts = results_df['Risk_Level'].value_counts()
        colors_risk = ['#e74c3c', '#f39c12', '#2ecc71']
        bars = plt.bar(risk_counts.index, risk_counts.values, color=colors_risk)
        plt.title('Risk Level Distribution', fontsize=12, fontweight='bold')
        plt.ylabel('Number of Cases')
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height + 0.5,
                    f'{int(height)}', ha='center', va='bottom')
        
        # 3. Probability Distribution
        plt.subplot(3, 4, 3)
        plt.hist(results_df['Fraud_Probability'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
        plt.axvline(results_df['Fraud_Probability'].mean(), color='red', linestyle='--', 
                   label=f'Mean: {results_df["Fraud_Probability"].mean():.3f}')
        plt.title('Fraud Probability Distribution', fontsize=12, fontweight='bold')
        plt.xlabel('Fraud Probability')
        plt.ylabel('Frequency')
        plt.legend()
        plt.grid(True, alpha=0.3)
        
        # 4. Risk Level Box Plot
        plt.subplot(3, 4, 4)
        risk_order = ['Low', 'Medium', 'High']
        sns.boxplot(data=results_df, x='Risk_Level', y='Fraud_Probability', order=risk_order)
        plt.title('Fraud Probability by Risk Level', fontsize=12, fontweight='bold')
        
        # 5. Top 10 High Risk Cases
        plt.subplot(3, 4, 5)
        top_10 = results_df.nlargest(10, 'Fraud_Probability')
        colors_top = ['red' if prob > 0.8 else 'orange' if prob > 0.5 else 'yellow' 
                     for prob in top_10['Fraud_Probability']]
        plt.barh(range(len(top_10)), top_10['Fraud_Probability'], color=colors_top)
        plt.yticks(range(len(top_10)), [f"Claim {str(id)[:8]}" for id in top_10['Claim_ID']])
        plt.title('Top 10 Highest Risk Cases', fontsize=12, fontweight='bold')
        plt.xlabel('Fraud Probability')
        
        # 6-12: Additional analysis plots based on available columns
        available_plots = [
            ('Total_Claim', 'Total Claim Amount vs Fraud Probability', 6),
            ('Age', 'Age Distribution by Fraud Prediction', 7),
            ('Vehicle_Cost', 'Vehicle Cost vs Fraud Risk', 8),
            ('Annual_Income', 'Income vs Fraud Probability', 9),
            ('Past_Number_of_Claims', 'Claims History vs Fraud Risk', 10),
        ]
        
        for col, title, subplot_num in available_plots:
            plt.subplot(3, 4, subplot_num)
            if col in original_df.columns:
                if col in ['Total_Claim', 'Vehicle_Cost', 'Annual_Income']:
                    scatter_colors = ['red' if x == 1 else 'blue' for x in results_df['Fraud_Prediction']]
                    plt.scatter(original_df[col], results_df['Fraud_Probability'], 
                              c=scatter_colors, alpha=0.6, s=20)
                    plt.xlabel(col.replace('_', ' '))
                    plt.ylabel('Fraud Probability')
                elif col in ['Age', 'Past_Number_of_Claims']:
                    fraud_by_col = original_df.groupby(col)[results_df['Fraud_Prediction'] == 1].mean()
                    plt.plot(fraud_by_col.index, fraud_by_col.values, marker='o', color='purple')
                    plt.xlabel(col.replace('_', ' '))
                    plt.ylabel('Fraud Rate')
                    plt.grid(True, alpha=0.3)
                plt.title(title, fontsize=10, fontweight='bold')
            else:
                plt.text(0.5, 0.5, f'{col}\ncolumn not found', ha='center', va='center', 
                        transform=plt.gca().transAxes, fontsize=10)
                plt.title(title, fontsize=10, fontweight='bold')
        
        # Summary Statistics Plot
        plt.subplot(3, 4, 11)
        plt.axis('off')
        
        # Calculate statistics
        total_cases = len(results_df)
        fraud_cases = sum(results_df['Fraud_Prediction'])
        high_risk = sum(results_df['Risk_Level'] == 'High')
        medium_risk = sum(results_df['Risk_Level'] == 'Medium')
        low_risk = sum(results_df['Risk_Level'] == 'Low')
        avg_prob = results_df['Fraud_Probability'].mean()
        max_prob = results_df['Fraud_Probability'].max()
        
        stats_text = f"""
📊 PREDICTION SUMMARY

Total Cases: {total_cases:,}
Fraud Cases: {fraud_cases:,} ({fraud_cases/total_cases*100:.1f}%)

🚨 Risk Breakdown:
High Risk: {high_risk:,} ({high_risk/total_cases*100:.1f}%)
Medium Risk: {medium_risk:,} ({medium_risk/total_cases*100:.1f}%)
Low Risk: {low_risk:,} ({low_risk/total_cases*100:.1f}%)

📈 Probability Stats:
Average: {avg_prob:.4f}
Maximum: {max_prob:.4f}

⚡ Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
        """
        
        plt.text(0.1, 0.9, stats_text, transform=plt.gca().transAxes, fontsize=10,
                verticalalignment='top', bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8))
        
        # Probability vs Prediction Accuracy
        plt.subplot(3, 4, 12)
        fraud_probs = results_df[results_df['Fraud_Prediction'] == 1]['Fraud_Probability']
        no_fraud_probs = results_df[results_df['Fraud_Prediction'] == 0]['Fraud_Probability']
        
        if len(fraud_probs) > 0:
            plt.hist(no_fraud_probs, bins=20, alpha=0.7, label='Predicted No Fraud', color='lightblue', density=True)
            plt.hist(fraud_probs, bins=20, alpha=0.7, label='Predicted Fraud', color='salmon', density=True)
            plt.legend()
        else:
            plt.hist(results_df['Fraud_Probability'], bins=30, alpha=0.7, color='lightblue')
        
        plt.title('Probability Distribution by Prediction', fontsize=10, fontweight='bold')
        plt.xlabel('Fraud Probability')
        plt.ylabel('Density')
        
        plt.tight_layout(pad=3.0)
        
        # Save the plot
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        plot_filename = f"fraud_prediction_analysis_{timestamp}.png"
        plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
        print(f"   ✅ Prediction plots saved as: {plot_filename}")
        plt.show()


def main():
    """
    Interactive console menu: train on synthetic data, train on a
    user-supplied CSV, score a CSV, or exit.
    """
    print("🚗 INSURANCE FRAUD DETECTION SYSTEM")
    print("="*70)

    # One system instance is reused for the whole session
    system = FraudDetectionSystem()

    goodbye = "\n👋 Thank you for using the Fraud Detection System!"
    menu = "\n".join([
        "\n🔧 Choose an option:",
        "1. Train new model (creates synthetic data)",
        "2. Train model with your data",
        "3. Predict fraud on new data",
        "4. Exit",
    ])

    while True:
        print(menu)
        selection = input("\n➤ Enter your choice (1-4): ").strip()

        if selection == '4':
            print(goodbye)
            break

        if selection not in ('1', '2', '3'):
            print("❌ Invalid choice. Please enter 1, 2, 3, or 4.")
            continue

        if selection == '1':
            print("\n🎯 Training model with synthetic data...")
            system.train_model()

        elif selection == '2':
            print("\n📁 Training model with your data...")
            csv_path = system.get_user_input()
            if csv_path:
                try:
                    training_df = pd.read_csv(csv_path)
                    print(f"✅ Training data loaded: {training_df.shape}")

                    # Labels are mandatory for supervised training; go
                    # straight back to the menu when they are missing
                    if 'Fraud' not in training_df.columns:
                        print("❌ Training data must contain a 'Fraud' column with labels (0/1)")
                        continue

                    system.train_model(training_df)
                except Exception as e:
                    print(f"❌ Error loading training data: {str(e)}")

        else:
            print("\n🔍 Predicting fraud on new data...")

            # Without a persisted model, offer to train one first
            if not os.path.exists('fraud_detection_xgb_model.pkl'):
                print("❌ No trained model found!")
                answer = input("   Would you like to train a model now? (y/n): ").strip().lower()
                if answer != 'y':
                    continue
                print("🎯 Training model with synthetic data...")
                system.train_model()

            results = system.predict_fraud()

            if results is not None:
                print("\n✅ Fraud prediction completed successfully!")

                # Optional drill-down into the riskiest claims
                wants_details = input("\nWould you like to see detailed case analysis? (y/n): ").strip().lower()
                if wants_details == 'y':
                    print("\n🔍 DETAILED HIGH-RISK CASE ANALYSIS:")
                    print("-" * 50)

                    flagged = results[results['Risk_Level'] == 'High']
                    worst_cases = flagged.nlargest(10, 'Fraud_Probability')

                    if worst_cases.empty:
                        print("   No high-risk cases found.")
                    else:
                        for _, case in worst_cases.iterrows():
                            print(f"📋 Claim ID: {case['Claim_ID']}")
                            print(f"   Fraud Probability: {case['Fraud_Probability']:.4f}")
                            print(f"   Risk Level: {case['Risk_Level']}")
                            print(f"   Prediction: {case['Fraud_Label']}")
                            print()

        # Reached only after an operation (1-3) ran to its end
        again = input("\nWould you like to perform another operation? (y/n): ").strip().lower()
        if again != 'y':
            print(goodbye)
            break


if __name__ == "__main__":
    # Script entry point: run the interactive menu and turn an interrupt or
    # an unexpected failure into a readable message instead of a raw crash.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\n⚠️  Operation cancelled by user.")
        print("👋 Thank you for using the Fraud Detection System!")
    except Exception as e:
        import traceback

        print(f"\n❌ An unexpected error occurred: {str(e)}")
        print("Please try again or contact support.")
        # The bare message does not say where the failure happened; print
        # the traceback so the failing line can be located.
        traceback.print_exc()

🚗 INSURANCE FRAUD DETECTION SYSTEM

🔧 Choose an option:
1. Train new model (creates synthetic data)
2. Train model with your data
3. Predict fraud on new data
4. Exit



➤ Enter your choice (1-4):  3



🔍 Predicting fraud on new data...

🔍 INSURANCE FRAUD PREDICTION

📁 Please enter the CSV file path for fraud prediction:
   Examples:
   • Auto_Insurance_Fraud_Claims_File03.csv
   • data/claims_2024.csv
   • /full/path/to/your/file.csv



➤ Enter file path:  Auto_Insurance_Fraud_Claims_File03.csv


✅ File found: Auto_Insurance_Fraud_Claims_File03.csv

📂 Loading data from: Auto_Insurance_Fraud_Claims_File03.csv
✅ Data loaded: 10000 records, 52 columns
✅ Model and components loaded successfully!
🔧 Preprocessing data...

❌ An unexpected error occurred: 'bool' object has no attribute 'astype'
Please try again or contact support.
