In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, classification_report, confusion_matrix,
                             roc_auc_score, roc_curve)
import joblib
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the income-verification dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/income-verify/incomeverify_data.csv')

# Basic dimensions; every column except the target counts as a feature.
n_samples, n_columns = df.shape
print(f"\nDataset shape: {df.shape}")
print(f"Features: {n_columns - 1}")
print(f"Samples: {n_samples}")

# Numbered (1-based) listing of all column names.
print(f"\nColumns ({n_columns}):")
print("\n".join(f"  {i}. {col}" for i, col in enumerate(df.columns, 1)))

print("\nFirst 5 rows:")
print(df.head())

# Target balance: 0 is the fraud class, 1 is the genuine class.
target = df['income_verified']
print("\nTarget variable: income_verified")
print(f"  Class 0 (Fraud): {target.eq(0).sum()}")
print(f"  Class 1 (Genuine): {target.eq(1).sum()}")

print("\nDataset loaded successfully!")


In [None]:
# Take the column index once and reuse it for every report line below.
all_columns = df.columns
print(f"\nTotal columns: {len(all_columns)}")
print("\nAll columns in your dataset:")
for position, column_name in enumerate(all_columns, start=1):
    print(f"  {position:2d}. {column_name}")

print(f"\nDataset shape: {df.shape}")
target_counts = df['income_verified'].value_counts().to_dict()
print(f"\nTarget column: income_verified = {target_counts}")

# Keep a plain Python list of the column names for later use.
available_columns = all_columns.tolist()
print("\n Column list saved!")


In [None]:
# Exploratory summary: class balance, income statistics and per-class averages.

# Class distribution (absolute counts, then percentage shares)
print("\nClass Distribution:")
print(df['income_verified'].value_counts())
print("\nPercentage:")
print(df['income_verified'].value_counts(normalize=True) * 100)

# Income statistics over the whole dataset
income_series = df['stated_income']
print("\nIncome Statistics:")
print(f"  Minimum: ‚Çπ{income_series.min():,}")
print(f"  Maximum: ‚Çπ{income_series.max():,}")
print(f"  Mean: ‚Çπ{income_series.mean():,.0f}")
print(f"  Median: ‚Çπ{income_series.median():,.0f}")

print(f"\nMissing values: {df.isnull().sum().sum()}")

# Separate classes (1 = genuine, 0 = fraudulent); later cells reuse both frames
genuine = df[df['income_verified'] == 1]
fraud = df[df['income_verified'] == 0]

# Same report for both classes; optional columns are only reported when present
for heading, subset in (("GENUINE (1)", genuine), ("FRAUDULENT (0)", fraud)):
    print(f"\n{heading}: {len(subset)} samples")
    print(f"  Mean income: ‚Çπ{subset['stated_income'].mean():,.0f}")

    if 'credit_score' in df.columns:
        print(f"  Mean credit score: {subset['credit_score'].mean():.0f}")

    if 'median_monthly_balance' in df.columns:
        print(f"  Mean balance: ‚Çπ{subset['median_monthly_balance'].mean():,.0f}")

# The cell used to end with a bare `print`, which calls nothing and only echoes the
# function object as cell output; emit the intended completion message instead.
print("\nAnalysis complete!")

In [None]:
# Exploratory summary followed by a 2x2 grid of per-class histograms.

def _print_class_profile(heading, subset):
    """Print the sample count and the mean of the key columns for one class subset."""
    print(f"\n{heading}: {len(subset)} samples")
    print(f"  Mean income: ‚Çπ{subset['stated_income'].mean():,.0f}")
    # Credit score and balance are optional columns: report them only when present.
    if 'credit_score' in df.columns:
        print(f"  Mean credit score: {subset['credit_score'].mean():.0f}")
    if 'median_monthly_balance' in df.columns:
        print(f"  Mean balance: ‚Çπ{subset['median_monthly_balance'].mean():,.0f}")


def _plot_class_histograms(ax, column, bins, xlabel, title):
    """Overlay the genuine (green) and fraudulent (red) histograms of `column` on `ax`."""
    for subset, legend_label, colour in ((genuine, 'Genuine', 'green'),
                                         (fraud, 'Fraudulent', 'red')):
        ax.hist(subset[column], bins=bins, alpha=0.6, label=legend_label,
                color=colour, edgecolor='black')
    ax.set_xlabel(xlabel, fontsize=12, fontweight='bold')
    ax.set_ylabel('Frequency', fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend(fontsize=11)
    ax.grid(alpha=0.3)


# Class distribution: counts first, then percentage shares.
print("\nClass Distribution:")
print(df['income_verified'].value_counts())
print("\nPercentage:")
print(df['income_verified'].value_counts(normalize=True) * 100)

# Income statistics over the whole dataset.
print("\nIncome Statistics:")
print(f"  Minimum: ‚Çπ{df['stated_income'].min():,}")
print(f"  Maximum: ‚Çπ{df['stated_income'].max():,}")
print(f"  Mean: ‚Çπ{df['stated_income'].mean():,.0f}")
print(f"  Median: ‚Çπ{df['stated_income'].median():,.0f}")

print(f"\nMissing values: {df.isnull().sum().sum()}")

# Split by target value: 1 = genuine, 0 = fraudulent.
genuine = df[df['income_verified'] == 1]
fraud = df[df['income_verified'] == 0]

_print_class_profile("GENUINE (1)", genuine)
_print_class_profile("FRAUDULENT (0)", fraud)

print("\nAnalysis complete!")

# VISUALIZATIONS - No avg_bank_balance


print("\nCreating visualizations...")

# 2x2 plot grid
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Top-left panel: stated income is plotted unconditionally.
_plot_class_histograms(axes[0, 0], 'stated_income', 40,
                       'Stated Income (‚Çπ)', 'Income Distribution by Class')

# Remaining panels depend on optional columns; a placeholder text is drawn instead
# when the column is missing.
optional_panels = (
    (axes[0, 1], 'credit_score', 30,
     'Credit Score', 'Credit Score Distribution',
     'Credit Score\nNot Available'),
    (axes[1, 0], 'median_monthly_balance', 40,
     'Median Monthly Balance (‚Çπ)', 'Balance Distribution',
     'Balance Data\nNot Available'),
    (axes[1, 1], 'monthly_electricity_bill', 40,
     'Monthly Electricity Bill (‚Çπ)', 'Electricity Bill Distribution',
     'Bill Data\nNot Available'),
)
for panel_ax, column, bins, xlabel, title, placeholder in optional_panels:
    if column in df.columns:
        _plot_class_histograms(panel_ax, column, bins, xlabel, title)
    else:
        panel_ax.text(0.5, 0.5, placeholder,
                      ha='center', va='center', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

print("Visualizations created!")

In [None]:
# Candidate columns for the correlation heatmap. The balance is listed under both
# names: other cells of this notebook read it from 'median_monthly_balance', so a
# list containing only 'avg_bank_balance' would silently drop the balance from the
# heatmap once the existence filter below has run.
key_features = ['stated_income', 'monthly_electricity_bill', 'monthly_water_bill',
                'avg_bank_balance', 'median_monthly_balance', 'monthly_debt_payment',
                'credit_score', 'vacations_annually', 'income_verified']

# Filter only existing columns
key_features = [col for col in key_features if col in df.columns]

# Pairwise correlation of the selected columns, drawn as an annotated heatmap
plt.figure(figsize=(12, 10))
correlation = df[key_features].corr()
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm',
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix - Key Features', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

print(" Correlation matrix created!")


In [None]:
# Target vector and feature matrix (every column except the target).
y = df['income_verified']
X = df.drop(columns=['income_verified'])

n_rows, n_features = X.shape
print(f"\nFeatures (X): {X.shape}")
print(f"Target (y): {y.shape}")
print(f"\nFeature columns: {n_features}")
print(f"Total samples: {n_rows}")

# The prediction code rebuilds its input row in exactly this column order.
feature_columns = list(X.columns)
print(f"\n Feature columns saved: {len(feature_columns)} features")


In [None]:
# 80/20 split, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Size of each partition, absolute and as a share of the full dataset.
total_rows = len(X)
train_share = len(X_train) / total_rows * 100
test_share = len(X_test) / total_rows * 100
print(f"\nTraining set: {len(X_train)} samples ({train_share:.1f}%)")
print(f"Testing set: {len(X_test)} samples ({test_share:.1f}%)")

# Class counts per partition.
print("\nTraining set class distribution:")
print(y_train.value_counts())
print("\nTesting set class distribution:")
print(y_test.value_counts())

print("\n Data split complete!")


In [None]:
# Scale features using StandardScaler. The scaler is fitted on the training split
# only and then reused on the test split, so the test data does not influence the
# scaling parameters.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nStandardScaler applied:")
print("  - Mean: 0")
print("  - Standard Deviation: 1")

print(f"\nOriginal feature ranges (before scaling):")
print(f"  Income: ‚Çπ{X_train['stated_income'].min():,.0f} - ‚Çπ{X_train['stated_income'].max():,.0f}")
# The exploratory cells treat credit_score as an optional column; apply the same
# guard here so a dataset without it does not abort the cell with a KeyError after
# the scaler has already been fitted.
if 'credit_score' in X_train.columns:
    print(f"  Credit Score: {X_train['credit_score'].min():.0f} - {X_train['credit_score'].max():.0f}")

# Global min/max over the whole scaled training matrix (all features together)
print(f"\nScaled feature ranges (after scaling):")
print(f"  Min: {X_train_scaled.min():.2f}")
print(f"  Max: {X_train_scaled.max():.2f}")

print("\n Feature scaling complete!")


In [None]:
# Fit a class-weighted logistic regression on the standardised training split.

print("\nTraining Logistic Regression model...")

logreg_settings = dict(
    max_iter=2000,
    C=1.0,
    class_weight='balanced',
    random_state=42,
    solver='lbfgs',
    n_jobs=-1,
)
model = LogisticRegression(**logreg_settings)
model.fit(X_train_scaled, y_train)

print("Model training complete!")

# Echo the hyper-parameters as stored on the fitted estimator.
print("\nModel parameters:")
for caption, value in (
    ("Max iterations", model.max_iter),
    ("Regularization (C)", model.C),
    ("Class weight", model.class_weight),
    ("Solver", model.solver),
):
    print(f"  {caption}: {value}")


In [None]:
# Hard predictions for both splits, class probabilities for the test split.
y_pred_proba = model.predict_proba(X_test_scaled)
y_test_pred = model.predict(X_test_scaled)
y_train_pred = model.predict(X_train_scaled)

# Accuracy on each split and the absolute gap between the two.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
accuracy_gap = abs(train_accuracy - test_accuracy)

# Test-set metrics; ROC-AUC is scored on column 1 of the probability matrix.
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])


print("\nAccuracy:")
print(f"  Training: {train_accuracy*100:.2f}%")
print(f"  Testing:  {test_accuracy*100:.2f}%")
print(f"  Overfitting: {accuracy_gap*100:.2f}%")

print("\nTesting Set Metrics:")
print(f"  Precision: {precision*100:.2f}%")
print(f"  Recall:    {recall*100:.2f}%")
print(f"  F1-Score:  {f1*100:.2f}%")
print(f"  ROC-AUC:   {roc_auc:.4f}")

print("\n Model evaluation complete!")

In [None]:
# Per-class precision / recall / F1 on the test split, to four decimals.
print("\n")
class_labels = ['Fraudulent (0)', 'Genuine (1)']
report_text = classification_report(
    y_test,
    y_test_pred,
    target_names=class_labels,
    digits=4,
)
print(report_text)


In [None]:
# Confusion Matrix: rows are the actual class, columns the predicted class.
# labels=[0, 1] pins the layout so index 0 is Fraudulent and index 1 is Genuine.
cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])

print(cm)

# Genuine (1) is the positive class here, so:
#   cm[0, 1] = actual fraud predicted genuine  -> false positive
#   cm[1, 0] = actual genuine predicted fraud  -> false negative
# The previous descriptions of these two cells were swapped.
tn, fp, fn, tp = cm.ravel()

print(f"\nBreakdown:")
print(f"  True Negatives (TN):  {tn} - Correctly identified fraud")
print(f"  False Positives (FP): {fp} - Fraud marked as genuine (Type I Error)")
print(f"  False Negatives (FN): {fn} - Genuine marked as fraud (Type II Error)")
print(f"  True Positives (TP):  {tp} - Correctly verified genuine")


In [None]:
# Model performance figure: confusion-matrix heatmap (left) and ROC curve (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
cm_ax, roc_ax = axes
class_names = ['Fraudulent', 'Genuine']

# Left panel: annotated confusion matrix
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=cm_ax,
    xticklabels=class_names,
    yticklabels=class_names,
    cbar_kws={'label': 'Count'},
    annot_kws={'size': 14, 'weight': 'bold'},
)
cm_ax.set_title('Confusion Matrix', fontsize=16, fontweight='bold', pad=15)
cm_ax.set_ylabel('Actual', fontsize=13, fontweight='bold')
cm_ax.set_xlabel('Predicted', fontsize=13, fontweight='bold')

# Right panel: ROC curve from the class-1 probabilities, with the chance diagonal
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:, 1])
roc_ax.plot(fpr, tpr, color='darkorange', lw=3,
            label=f'ROC Curve (AUC = {roc_auc:.4f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
            label='Random Classifier')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate', fontsize=13, fontweight='bold')
roc_ax.set_ylabel('True Positive Rate', fontsize=13, fontweight='bold')
roc_ax.set_title('ROC Curve', fontsize=16, fontweight='bold', pad=15)
roc_ax.legend(loc="lower right", fontsize=11)
roc_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("Performance visualizations created!")


In [None]:
# Rank features by the magnitude of their logistic-regression coefficient.
coefficients = model.coef_[0]
feature_importance = pd.DataFrame({
    'Feature': feature_columns,
    'Coefficient': coefficients,
    'Abs_Coefficient': np.abs(coefficients),
})
feature_importance = feature_importance.sort_values('Abs_Coefficient', ascending=False)


# Text listing of the 15 largest coefficients; a negative sign is labelled as
# pointing towards fraud, anything else as pointing towards genuine.
print("\n")
leading_rows = feature_importance.head(15)
for feature_name, coefficient in zip(leading_rows['Feature'], leading_rows['Coefficient']):
    impact = "Fraud Indicator" if coefficient < 0 else "Genuine Indicator"
    print(f"{feature_name:<35} {coefficient:>8.4f}  ({impact})")

# Horizontal bar chart of the top 12 features (red = negative, green = otherwise)
top_features = feature_importance.head(12)
colors = ['red' if value < 0 else 'green' for value in top_features['Coefficient']]

importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
importance_ax.barh(top_features['Feature'], top_features['Coefficient'],
                   color=colors, alpha=0.7, edgecolor='black', linewidth=1.5)
importance_ax.set_xlabel('Coefficient Value', fontsize=13, fontweight='bold')
importance_ax.set_title('Top 12 Feature Importance (Logistic Regression Coefficients)',
                        fontsize=16, fontweight='bold', pad=15)
importance_ax.axvline(x=0, color='black', linestyle='--', linewidth=2)
importance_ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

print("\nFeature importance analysis complete!")


In [None]:
# Prediction function for 7 user inputs
def predict_fraud_from_7_inputs(
    stated_income,
    monthly_electricity_bill,
    monthly_water_bill,
    vacations_annually,
    monthly_debt_payment,
    employment_status,
    avg_bank_balance
):
    """Score one applicant for income fraud from seven self-reported inputs.

    The fitted model expects every column in the module-level ``feature_columns``.
    Columns that are not among the seven inputs are filled with rule-based
    estimates derived from those inputs; columns without a rule are set to 0.
    The completed row is passed through the module-level ``scaler`` and ``model``.

    Args:
        stated_income: Stated monthly income; must be greater than 0 because all
            ratios divide by it.
        monthly_electricity_bill: Monthly electricity bill.
        monthly_water_bill: Monthly water bill.
        vacations_annually: Number of vacations per year.
        monthly_debt_payment: Monthly debt payment (EMI).
        employment_status: Employment code; 1 is treated as full-time, any other
            value as not full-time.
        avg_bank_balance: Average bank balance.

    Returns:
        dict with keys ``status`` ('FRAUDULENT' when the model predicts class 0,
        else 'GENUINE'), ``fraud_score`` and ``genuine_score`` (class
        probabilities in percent, rounded to one decimal), ``risk_level``,
        ``recommendation`` and ``indicators`` (the estimated ratios, risk score,
        credit score and monthly deposit).

    Raises:
        ValueError: If ``stated_income`` is not greater than 0.
    """
    if stated_income <= 0:
        raise ValueError("stated_income must be greater than 0 to compute income ratios")

    income = stated_income
    bills_ratio = (monthly_electricity_bill + monthly_water_bill) / income
    balance_ratio = avg_bank_balance / income
    debt_burden = (monthly_debt_payment / income) * 100

    # Estimate missing features intelligently

    # Verified deposit (key fraud indicator): very low bills or a thin balance
    # relative to the stated income are treated as suspicious.
    if bills_ratio < 0.015 or balance_ratio < 0.5:
        deposit = int(income * 0.55)  # Suspicious
    else:
        deposit = int(income * 0.98)  # Normal

    # Credit score estimation
    if debt_burden > 30 or balance_ratio < 0.5:
        credit = 520
    elif debt_burden > 20:
        credit = 650
    else:
        credit = 720 if income > 80000 else 680

    # Total debt estimation
    if debt_burden > 25:
        total_debt = int(income * 7)
    else:
        total_debt = int(monthly_debt_payment * 20)

    full_time = employment_status == 1

    # One estimate per known column, computed up front so the values do not depend
    # on the order in which the training columns happen to appear. The rule set
    # mirrors the inline scoring cell of this notebook, including the alternative
    # balance column name 'median_monthly_balance'.
    estimates = {
        'stated_income': income,
        'monthly_electricity_bill': monthly_electricity_bill,
        'monthly_water_bill': monthly_water_bill,
        'vacations_annually': vacations_annually,
        'monthly_debt_payment': monthly_debt_payment,
        'employment_status': employment_status,
        'avg_bank_balance': avg_bank_balance,
        'median_monthly_balance': avg_bank_balance,
        'verified_monthly_deposit': deposit,
        'has_regular_payroll': 1 if full_time else 0,
        'tenure_months': 48 if full_time else 12,
        'balance_volatility': 0.3 if full_time else 0.6,
        'credit_score': credit,
        'delinquencies': 0 if credit > 700 else 3,
        'total_outstanding_debt': total_debt,
        'savings_avg_balance': int(avg_bank_balance * 1.5),
        'missed_bills_12m': 0 if credit > 700 else 5,
        'rent_mortgage_payment': int(income * 0.30),
        'local_income_ratio': 1.0,
        'high_value_txns_12m': max(0, vacations_annually),
        'utility_bill_ratio': 1.0,
        'annual_travel_spend': int(vacations_annually * 40000),
        'job_seniority_score': 1.0 if full_time else 0.5,
        'deposit_trend_slope': 1.0 if bills_ratio > 0.015 else 0.8,
        # +1 keeps the ratio finite when the estimated deposit is 0
        'income_to_deposit_ratio': income / (deposit + 1),
        'debt_to_income_ratio': total_debt / income,
        'bills_to_income_ratio': bills_ratio,
        'debt_burden_score': debt_burden,
        'balance_to_income_ratio': balance_ratio,
        'savings_to_income_ratio': (avg_bank_balance * 1.5) / income,
    }

    # Build complete feature set (match training data columns); any column
    # without an estimate falls back to 0.
    features = {col: estimates.get(col, 0) for col in feature_columns}

    # Composite risk score over the columns that are actually part of the feature
    # set (absent columns use the neutral defaults below). It is computed after
    # the rest of the row is complete, so it no longer depends on where
    # 'fraud_risk_score' sits in feature_columns.
    risk_score = (
        (features.get('delinquencies', 0) > 2) * 3 +
        (features.get('missed_bills_12m', 0) > 3) * 2 +
        (features.get('balance_volatility', 0) > 0.5) * 2 +
        (features.get('has_regular_payroll', 0) == 0) * 2 +
        (features.get('credit_score', 700) < 600) * 3 +
        (features.get('income_to_deposit_ratio', 1) > 1.5) * 4 +
        (features.get('debt_to_income_ratio', 3) > 4.0) * 2
    )
    if 'fraud_risk_score' in features:
        features['fraud_risk_score'] = risk_score

    # Create DataFrame with correct column order
    df_input = pd.DataFrame([features])[feature_columns]

    # Scale and predict
    scaled = scaler.transform(df_input)
    prediction = model.predict(scaled)[0]
    probabilities = model.predict_proba(scaled)[0]

    # Column 0 is the fraud class, column 1 the genuine class
    fraud_score = probabilities[0] * 100
    genuine_score = probabilities[1] * 100

    # Determine risk and recommendation
    if fraud_score > 75:
        risk = 'VERY HIGH'
        action = 'REJECT'
    elif fraud_score > 55:
        risk = 'HIGH'
        action = 'REJECT'
    elif fraud_score > 35:
        risk = 'MEDIUM'
        action = 'MANUAL REVIEW'
    elif fraud_score > 15:
        risk = 'LOW'
        action = 'APPROVE'
    else:
        risk = 'VERY LOW'
        action = 'APPROVE'

    # Indicators are reported from the estimates rather than from `features`, so
    # the report works even when a ratio column is not part of the trained set.
    return {
        'status': 'FRAUDULENT' if prediction == 0 else 'GENUINE',
        'fraud_score': round(fraud_score, 1),
        'genuine_score': round(genuine_score, 1),
        'risk_level': risk,
        'recommendation': action,
        'indicators': {
            'income_to_deposit_ratio': round(estimates['income_to_deposit_ratio'], 2),
            'debt_to_income_ratio': round(estimates['debt_to_income_ratio'], 2),
            'fraud_risk_score': int(risk_score),
            'estimated_credit_score': credit,
            'estimated_monthly_deposit': deposit
        }
    }
#prediction function created

In [None]:
# Interactive scoring: collect seven inputs, estimate the remaining model features
# from them, run the fitted scaler and model, and print a formatted report.
print("INCOME FRAUD DETECTION - USER INPUT")


# Get user inputs
print("\nPlease enter the following 7 parameters:\n")

# Input 1: Stated Income
stated_income = int(input("1. Stated Monthly Income (‚Çπ): "))
if stated_income <= 0:
    # Every ratio computed below divides by the stated income.
    raise ValueError("Stated monthly income must be greater than 0")

# Input 2: Monthly Electricity Bill
monthly_electricity_bill = int(input("2. Monthly Electricity Bill (‚Çπ): "))

# Input 3: Monthly Water Bill
monthly_water_bill = int(input("3. Monthly Water Bill (‚Çπ): "))

# Input 4: Vacations Annually
vacations_annually = int(input("4. Vacations per Year (0-6): "))

# Input 5: Monthly Debt Payment (EMI)
monthly_debt_payment = int(input("5. Monthly Debt Payment/EMI (‚Çπ): "))

# Input 6: Employment Status
employment_labels = {1: 'Full-time', 2: 'Part-time', 3: 'Self-employed'}
print("\n6. Employment Status:")
print("   1 = Full-time")
print("   2 = Part-time")
print("   3 = Self-employed")
employment_status = int(input("   Enter (1/2/3): "))
if employment_status not in employment_labels:
    # A list lookup would show an empty or wrong label for 0 and negative codes
    # and fail with IndexError above 3; reject anything outside the prompt.
    raise ValueError("Employment status must be 1, 2 or 3")

# Input 7: Average Bank Balance
avg_bank_balance = int(input("7. Average Bank Balance (‚Çπ): "))

print("\n" + "="*80)
print("USER INPUTS RECEIVED")
print("="*80)

# Display inputs
print(f"\n1. Stated Income: ‚Çπ{stated_income:,}")
print(f"2. Electricity Bill: ‚Çπ{monthly_electricity_bill:,}")
print(f"3. Water Bill: ‚Çπ{monthly_water_bill:,}")
print(f"4. Vacations: {vacations_annually}")
print(f"5. Monthly EMI: ‚Çπ{monthly_debt_payment:,}")
print(f"6. Employment: {employment_labels[employment_status]}")
print(f"7. Bank Balance: ‚Çπ{avg_bank_balance:,}")

# ============================================================================
# PREDICT FRAUD SCORE
# ============================================================================

print("\n" + "="*80)
print("PROCESSING PREDICTION...")
print("="*80)

# Calculate derived features
income = stated_income
bills_ratio = (monthly_electricity_bill + monthly_water_bill) / income
balance_ratio = avg_bank_balance / income
debt_burden = (monthly_debt_payment / income) * 100

# Estimate missing features intelligently
# Verified deposit (key fraud indicator)
if bills_ratio < 0.015 or balance_ratio < 0.5:
    deposit = int(income * 0.55)  # Suspicious
else:
    deposit = int(income * 0.98)  # Normal

# Credit score estimation
if debt_burden > 30 or balance_ratio < 0.5:
    credit = 520
elif debt_burden > 20:
    credit = 650
else:
    credit = 720 if income > 80000 else 680

# Total debt estimation
if debt_burden > 25:
    total_debt = int(income * 7)
else:
    total_debt = int(monthly_debt_payment * 20)

full_time = employment_status == 1

# One estimate per known column, computed up front so no value depends on the
# order of the training columns.
estimates = {
    'stated_income': income,
    'monthly_electricity_bill': monthly_electricity_bill,
    'monthly_water_bill': monthly_water_bill,
    'vacations_annually': vacations_annually,
    'monthly_debt_payment': monthly_debt_payment,
    'employment_status': employment_status,
    'verified_monthly_deposit': deposit,
    'has_regular_payroll': 1 if full_time else 0,
    'tenure_months': 48 if full_time else 12,
    'balance_volatility': 0.3 if full_time else 0.6,
    'credit_score': credit,
    'delinquencies': 0 if credit > 700 else 3,
    'total_outstanding_debt': total_debt,
    # The balance may be stored under either column name
    'median_monthly_balance': avg_bank_balance,
    'avg_bank_balance': avg_bank_balance,
    'savings_avg_balance': int(avg_bank_balance * 1.5),
    'missed_bills_12m': 0 if credit > 700 else 5,
    'rent_mortgage_payment': int(income * 0.30),
    'local_income_ratio': 1.0,
    'high_value_txns_12m': max(0, vacations_annually),
    'utility_bill_ratio': 1.0,
    'annual_travel_spend': int(vacations_annually * 40000),
    'job_seniority_score': 1.0 if full_time else 0.5,
    'deposit_trend_slope': 1.0 if bills_ratio > 0.015 else 0.8,
    # +1 keeps the ratio finite when the estimated deposit is 0
    'income_to_deposit_ratio': income / (deposit + 1),
    'debt_to_income_ratio': total_debt / income,
    'bills_to_income_ratio': bills_ratio,
    'debt_burden_score': debt_burden,
    'balance_to_income_ratio': balance_ratio,
    'savings_to_income_ratio': (avg_bank_balance * 1.5) / income,
}

# Build complete feature set matching training data; any column without an
# estimate falls back to 0.
features = {col: estimates.get(col, 0) for col in feature_columns}

# Composite risk score over the columns that are part of the feature set (absent
# columns use the neutral defaults below). It is computed once the rest of the
# row is complete, so it does not depend on the position of 'fraud_risk_score'
# within feature_columns.
risk_score = (
    (features.get('delinquencies', 0) > 2) * 3 +
    (features.get('missed_bills_12m', 0) > 3) * 2 +
    (features.get('balance_volatility', 0) > 0.5) * 2 +
    (features.get('has_regular_payroll', 0) == 0) * 2 +
    (features.get('credit_score', 700) < 600) * 3 +
    (features.get('income_to_deposit_ratio', 1) > 1.5) * 4 +
    (features.get('debt_to_income_ratio', 3) > 4.0) * 2
)
if 'fraud_risk_score' in features:
    features['fraud_risk_score'] = risk_score

# Create DataFrame with correct column order
df_input = pd.DataFrame([features])[feature_columns]

# Scale and predict
scaled = scaler.transform(df_input)
prediction = model.predict(scaled)[0]
probabilities = model.predict_proba(scaled)[0]

# Column 0 is the fraud class, column 1 the genuine class
fraud_score = probabilities[0] * 100
genuine_score = probabilities[1] * 100

# Determine risk and recommendation
if fraud_score > 75:
    risk = 'VERY HIGH'
    action = 'REJECT'
    color = 'üî¥'
elif fraud_score > 55:
    risk = 'HIGH'
    action = 'REJECT'
    color = 'üü†'
elif fraud_score > 35:
    risk = 'MEDIUM'
    action = 'MANUAL REVIEW'
    color = 'üü°'
elif fraud_score > 15:
    risk = 'LOW'
    action = 'APPROVE'
    color = 'üü¢'
else:
    risk = 'VERY LOW'
    action = 'APPROVE'
    color = 'üü¢'

#RESULTS

print(f"{color} PREDICTION RESULT {color}")

status = 'FRAUDULENT' if prediction == 0 else 'GENUINE'
print(f"\n{'='*80}")
print(f"STATUS: {status}")
print(f"{'='*80}")

print(f"\nüìä FRAUD SCORE: {fraud_score:.1f}%")
print(f"‚úÖ GENUINE SCORE: {genuine_score:.1f}%")

print(f"\n‚ö†Ô∏è  RISK LEVEL: {risk}")
print(f"üí° RECOMMENDATION: {action}")

# Indicators are read from the estimates rather than from `features`, so the
# report also works when a ratio column is not part of the trained feature set.
income_to_deposit = estimates['income_to_deposit_ratio']
debt_to_income = estimates['debt_to_income_ratio']

print(f"\n" + "-"*80)
print("KEY INDICATORS:")
print("-"*80)
print(f"  ‚Ä¢ Income-to-Deposit Ratio: {income_to_deposit:.2f}")
print(f"    {'‚úÖ Normal (close to 1.0)' if income_to_deposit < 1.3 else '‚ö†Ô∏è Suspicious (much higher than 1.0)'}")

print(f"\n  ‚Ä¢ Debt-to-Income Ratio: {debt_to_income:.2f}")
print(f"    {'‚úÖ Healthy (< 4.0)' if debt_to_income < 4.0 else '‚ö†Ô∏è High debt burden (> 4.0)'}")

# 18 is the sum of all weights in the risk score above (3+2+2+2+3+4+2)
print(f"\n  ‚Ä¢ Fraud Risk Score: {int(risk_score)}/18")
print(f"    {'‚úÖ Low risk (< 8)' if risk_score < 8 else '‚ö†Ô∏è High risk (‚â• 8)'}")

print(f"\n  ‚Ä¢ Estimated Credit Score: {credit}")
print(f"    {'‚úÖ Good (> 700)' if credit > 700 else '‚ö†Ô∏è Needs improvement (< 700)'}")

print(f"\n  ‚Ä¢ Estimated Monthly Deposit: ‚Çπ{deposit:,}")
print(f"    {'‚úÖ Matches income' if abs(income_to_deposit - 1.0) < 0.3 else '‚ö†Ô∏è Does not match income'}")

# Summary box. It is driven by the recommendation chosen above, so the closing
# advice can never contradict the RECOMMENDATION line (separate 50/30 cut-offs
# previously could, e.g. "APPROVE" followed by "Manual review recommended").
print(f"\n" + "="*80)
if action == 'REJECT':
    print("‚ö†Ô∏è  WARNING: High probability of income fraud detected!")
    print("   Recommendation: Income statement requires verification.")
elif action == 'MANUAL REVIEW':
    print("‚ö†Ô∏è  CAUTION: Moderate fraud indicators present.")
    print("   Recommendation: Manual review recommended before approval.")
else:
    print("‚úÖ LEGITIMATE: Income statement appears genuine.")
    print("   Recommendation: Safe to proceed with application.")
print("="*80)

print("\n‚úÖ Prediction complete!")