# Insurance Claim Approval ML

This notebook walks through:
- Loading and exploring the dataset
- Preprocessing and encoding features
- Training a classification model
- Flagging claims for manual review when the model is uncertain (predicted approval probability close to 0.5)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


In [None]:
# Read the claims CSV (path is relative to the notebook) and preview the first rows
df = pd.read_csv(filepath_or_buffer='../data/enhanced_health_insurance_claims.csv')
df.head(n=5)

## Basic Cleaning + Target Setup

In [None]:
# Keep only rows whose ClaimStatus is 'Approved' or 'Denied'; rows with any
# other status are discarded. The .copy() makes the filtered frame independent
# so adding a column below does not write into a slice of the original.
decided_mask = df['ClaimStatus'].isin(['Approved', 'Denied'])
df = df.loc[decided_mask].copy()

# Binary label: 1 when the claim was approved, 0 when it was denied
df['Target'] = df['ClaimStatus'].eq('Approved').astype(int)

# Labels first, then the feature matrix: every column except the ID columns,
# the claim date, the raw status, and the label itself
y = df['Target']
X = df.drop(columns=['ClaimID', 'PatientID', 'ProviderID', 'ClaimStatus', 'Target', 'ClaimDate'])

## Split and Preprocess

In [None]:
# Hold out 20% of the rows for evaluation, keeping the class balance of y in
# both partitions; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42,
)

# Columns treated as numeric; every other column of X is treated as categorical
num_feats = ['ClaimAmount', 'PatientAge', 'PatientIncome']
cat_feats = [name for name in X.columns if name not in num_feats]

# Standardize the numeric columns and one-hot encode the rest; categories not
# seen while fitting are ignored at transform time instead of raising
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_feats),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_feats),
])

# Chain preprocessing and the classifier so both are fitted on training data only
pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Fit the whole pipeline on the training partition
pipeline.fit(X_train, y_train)

## Evaluate + Triage Logic

In [None]:
# Score the held-out rows: column 1 of predict_proba is the probability of
# the second class in pipeline.classes_ (Target == 1 for a 0/1 label)
y_proba = pipeline.predict_proba(X_test)[:, 1]
y_pred = pipeline.predict(X_test)

# Per-class precision/recall/F1 on the hard predictions, then ROC AUC on the
# probabilities
report_text = classification_report(y_test, y_pred)
print(report_text)
auc_value = roc_auc_score(y_test, y_proba)
print("ROC AUC:", auc_value)

# Triage tagging
def triage_flag(prob, *, low=0.4, high=0.6):
    """Label a predicted probability as needing manual review or not.

    A probability inside the closed band ``[low, high]`` is treated as too
    uncertain to act on automatically.

    Args:
        prob: Predicted probability of the positive class.
        low: Lower edge of the uncertainty band (inclusive). Defaults to 0.4.
        high: Upper edge of the uncertainty band (inclusive). Defaults to 0.6.

    Returns:
        ``'Needs Review'`` when ``low <= prob <= high``, otherwise
        ``'Confident'``.

    Raises:
        ValueError: If ``low`` is greater than ``high``; such a band would be
            empty and silently mark every claim as confident.
    """
    if low > high:
        raise ValueError(f"low ({low}) must not exceed high ({high})")
    return 'Needs Review' if low <= prob <= high else 'Confident'

# One row per scored claim, in the same order as y_pred / y_proba: the hard
# prediction, the positive-class probability, and the triage label for it
triage = pd.DataFrame(
    data={
        'Predicted': y_pred,
        'Prob_Approved': y_proba,
        'TriageFlag': list(map(triage_flag, y_proba)),
    }
)

# Count how many claims fall under each triage label
triage.loc[:, 'TriageFlag'].value_counts()