<a href="https://colab.research.google.com/github/fjadidi2001/Insurance/blob/main/ClaimYN_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import shap
import matplotlib.pyplot as plt
import seaborn as sns

def load_and_preprocess_data(file_path):
    """Read the insurance CSV and derive the binary ``ClaimYN`` target.

    A row is labelled 1 when it has at least one claim (``NB_Claim >= 1``)
    and a claim amount strictly above 1000 (``AMT_Claim > 1000``); otherwise
    it is labelled 0. The two source columns are removed afterwards so they
    cannot be used as features.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        DataFrame with the remaining columns followed by ``ClaimYN``.
    """
    raw = pd.read_csv(file_path)

    # Build the label from its two conditions before combining them.
    has_claim = raw['NB_Claim'] >= 1
    above_threshold = raw['AMT_Claim'] > 1000
    raw['ClaimYN'] = (has_claim & above_threshold).astype(int)

    # The label is a function of these columns, so they must not remain.
    return raw.drop(columns=['NB_Claim', 'AMT_Claim'])

def encode_features(df):
    """Label-encode categorical columns and standardize numeric columns.

    Columns of dtype ``object`` are treated as categorical and each one is
    integer-encoded with its own ``LabelEncoder``. Every other column except
    the ``ClaimYN`` target is standardized with a single ``StandardScaler``.
    The input frame itself is not modified.

    Args:
        df: DataFrame of features, optionally including a ``ClaimYN`` column.

    Returns:
        A ``(df_encoded, encoders, scaler)`` tuple: the transformed copy of
        ``df``, a dict mapping each categorical column name to its fitted
        ``LabelEncoder``, and the ``StandardScaler`` (left unfitted when
        ``df`` has no numeric feature columns).

    Note:
        Both transformers are fitted on every row of ``df``. If the result is
        split into train/test afterwards, the scaling statistics will have
        been computed from the test rows as well.
    """
    # Separate categorical and numerical columns
    cat_cols = df.select_dtypes(include=['object']).columns
    num_cols = df.select_dtypes(exclude=['object']).columns
    # The target is a label, not a feature, so it is never scaled.
    num_cols = num_cols.drop('ClaimYN') if 'ClaimYN' in num_cols else num_cols

    # Initialize encoders and scaler
    encoders = {}
    scaler = StandardScaler()

    # Encode categorical variables
    df_encoded = df.copy()
    for col in cat_cols:
        encoders[col] = LabelEncoder()
        df_encoded[col] = encoders[col].fit_transform(df[col])

    # Scale numerical variables. StandardScaler rejects a zero-column input
    # with ValueError, so skip scaling when there is nothing numeric to scale.
    if len(num_cols) > 0:
        df_encoded[num_cols] = scaler.fit_transform(df[num_cols])

    return df_encoded, encoders, scaler

def handle_class_imbalance(X, y):
    """Oversample with SMOTE and return the resampled features and labels.

    Args:
        X: Feature matrix to resample.
        y: Class labels aligned with ``X``.

    Returns:
        ``(X_balanced, y_balanced)`` as produced by ``SMOTE.fit_resample``.
    """
    # Fixed seed so repeated runs generate the same synthetic samples.
    sampler = SMOTE(random_state=42)
    resampled_features, resampled_labels = sampler.fit_resample(X, y)
    return resampled_features, resampled_labels

def train_model(X_train, y_train):
    """Fit a random forest classifier on the given training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # Hyperparameters gathered in one place; the seed keeps runs repeatable.
    forest_params = {
        'n_estimators': 100,
        'max_depth': 10,
        'random_state': 42,
        'class_weight': 'balanced',
    }
    classifier = RandomForestClassifier(**forest_params)
    classifier.fit(X_train, y_train)
    return classifier

def evaluate_model(model, X_test, y_test):
    """Print classification metrics and display a confusion-matrix heatmap.

    Prints the classification report and the ROC-AUC score (computed from
    column index 1 of ``predict_proba``), then shows the confusion matrix as
    an annotated heatmap. Nothing is returned.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Features to evaluate on.
        y_test: True labels aligned with ``X_test``.
    """
    # Hard labels feed the report and matrix; scores feed ROC-AUC.
    predicted_labels = model.predict(X_test)
    class_one_scores = model.predict_proba(X_test)[:, 1]

    print("\nClassification Report:")
    report_text = classification_report(y_test, predicted_labels)
    print(report_text)

    print("\nROC-AUC Score:", roc_auc_score(y_test, class_one_scores))

    # Confusion matrix rendered with integer cell annotations.
    plt.figure(figsize=(8, 6))
    matrix = confusion_matrix(y_test, predicted_labels)
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

def _positive_class_shap_values(shap_values):
    """Return the 2-D SHAP matrix for class index 1 from either output format."""
    # Older SHAP releases return one (samples, features) array per class.
    if isinstance(shap_values, list):
        return shap_values[1]
    # Newer SHAP releases return a single (samples, features, classes) array.
    if getattr(shap_values, "ndim", 2) == 3:
        return shap_values[:, :, 1]
    # Already a single 2-D matrix.
    return shap_values


def analyze_feature_importance(model, X_train):
    """Plot a SHAP bar chart of the most influential features.

    Computes SHAP values for ``model`` on ``X_train`` with a tree explainer
    and shows the mean absolute impact of up to 20 features for class
    index 1.

    Args:
        model: Fitted tree-based model supported by ``shap.TreeExplainer``.
        X_train: Feature matrix to explain; also supplies the feature names
            shown on the plot.
    """
    # Calculate SHAP values
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_train)

    # Plot feature importance
    plt.figure(figsize=(12, 8))
    shap.summary_plot(
        _positive_class_shap_values(shap_values),
        X_train,
        plot_type="bar",
        max_display=20,
        # Keep the figure size chosen above instead of SHAP's auto-sizing.
        plot_size=None,
        # Defer rendering so the title below is drawn on this figure rather
        # than on a fresh, empty one created after the plot was shown.
        show=False,
    )
    plt.title('Feature Importance (SHAP Values)')
    plt.show()

def main(file_path='/content/drive/My Drive/telematics_syn.csv'):
    """Run the ClaimYN pipeline end to end and return the fitted artifacts.

    Steps: load the CSV and derive the target, encode and scale the
    features, make a stratified 80/20 train/test split, oversample the
    training split only, train the classifier, report metrics on the
    untouched test split, and plot SHAP feature importance for the test
    split.

    Args:
        file_path: Location of the dataset CSV. The default is the original
            Colab Google Drive path (presumably only present once Drive is
            mounted); pass another path to run the pipeline elsewhere.

    Returns:
        ``(model, encoders, scaler)``: the trained classifier plus the
        encoders and scaler returned by ``encode_features``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Load and preprocess data
    df = load_and_preprocess_data(file_path)

    # Encode features
    df_encoded, encoders, scaler = encode_features(df)

    # Split features and target
    X = df_encoded.drop('ClaimYN', axis=1)
    y = df_encoded['ClaimYN']

    # Split into train and test sets; stratify keeps the class ratio equal
    # in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Handle class imbalance on the training split only, so the test split
    # contains no synthetic rows.
    X_train_balanced, y_train_balanced = handle_class_imbalance(X_train, y_train)

    # Train model
    model = train_model(X_train_balanced, y_train_balanced)

    # Evaluate model
    evaluate_model(model, X_test, y_test)

    # Analyze feature importance on the held-out rows
    analyze_feature_importance(model, X_test)

    return model, encoders, scaler

# Run the pipeline only when this file is executed directly (not on import)
# and bind the returned model, encoders and scaler to module-level names.
if __name__ == "__main__":
    model, encoders, scaler = main()

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/telematics_syn.csv'