<a href="https://colab.research.google.com/github/fjadidi2001/Insurance/blob/main/newVersionNov11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
# Mount Google Drive
from google.colab import drive

In [2]:
# Mount Google Drive inside the Colab runtime so files stored there become
# readable under /content/drive. `drive` comes from `google.colab`, so this
# cell only runs on Colab.
drive.mount('/content/drive')
# Location of the telematics CSV inside the mounted Drive.
# NOTE(review): hard-coded, user-specific path — adjust when running elsewhere.
file_path = '/content/drive/My Drive/telematics_syn.csv'
# Load the CSV into the notebook-level DataFrame `df`.
df = pd.read_csv(file_path)

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# 1. Exploratory Data Analysis (EDA)
def perform_eda(df):
    """Perform exploratory data analysis on the insurance dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``NB_Claim`` and ``AMT_Claim``.

    Returns
    -------
    analysis : dict
        ``'basic_stats'['numeric']`` - ``describe()`` of the float64/int64
        columns present when the function was called;
        ``'correlations'`` - correlation matrix of those columns plus
        ``Risk_Category``;
        ``'feature_importance'`` - absolute correlation of every column in
        that matrix with ``Risk_Category``, sorted descending. The entry for
        ``Risk_Category`` itself (always 1.0) is included.
    df : pandas.DataFrame
        The same object that was passed in, now carrying ``Risk_Category``.

    Notes
    -----
    * ``df`` is modified in place: a ``Risk_Category`` column is added (or
      overwritten). Pass ``df.copy()`` to keep the original untouched.
    * ``NB_Claim`` and ``AMT_Claim`` define the target, so their high
      "importance" is target leakage rather than predictive signal.

    Raises
    ------
    KeyError
        If ``NB_Claim`` or ``AMT_Claim`` is missing.
    """
    analysis = {
        'basic_stats': {},
        'correlations': {},
        'feature_importance': {}
    }

    # Basic statistics
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    analysis['basic_stats']['numeric'] = df[numeric_cols].describe()

    # Calculate risk category based on claims; the first matching rule wins.
    # NOTE(review): the medium-risk rule compares AMT_Claim against 2 - confirm
    # that a claim *amount* threshold of 2 is really intended.
    no_claim = (df['NB_Claim'] <= 0) & (df['AMT_Claim'] <= 0)
    small_claim = (df['NB_Claim'] <= 2) & (df['AMT_Claim'] <= 2)
    df['Risk_Category'] = np.select(
        [no_claim, small_claim],
        [0, 1],      # 0 = low risk, 1 = medium risk
        default=2    # 2 = high risk
    )

    # Correlation analysis.
    # `numeric_cols` was captured before Risk_Category existed, so the target
    # has to be added explicitly; otherwise the lookup below raises KeyError
    # on a fresh DataFrame.
    corr_cols = [col for col in numeric_cols if col != 'Risk_Category']
    corr_cols.append('Risk_Category')
    correlations = df[corr_cols].corr()
    analysis['correlations'] = correlations

    # Feature importance based on correlation with Risk_Category
    feature_importance = correlations['Risk_Category'].abs().sort_values(ascending=False)
    analysis['feature_importance'] = feature_importance

    return analysis, df

# 2. Data Preprocessing
def preprocess_data(df):
    """Preprocess the insurance dataset for modeling.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``NB_Claim``, ``AMT_Claim`` and ``Risk_Category``.
        It is not modified; the features are returned in a new frame.

    Returns
    -------
    X : pandas.DataFrame
        Features with float64/int64 columns standardised and object/bool
        columns replaced by integer codes. Columns of any other dtype are
        passed through unchanged.
    y : pandas.Series
        The ``Risk_Category`` column.
    numerical_cols, categorical_cols : pandas.Index
        Names of the columns that were scaled / encoded.

    Notes
    -----
    The scaler and encoders are fitted on every row passed in and are not
    returned. If the caller splits into train/test afterwards, test-set
    statistics have already influenced the scaling.

    Raises
    ------
    KeyError
        If any of the three claim/target columns is missing.
    """
    # Separate features and target
    X = df.drop(['NB_Claim', 'AMT_Claim', 'Risk_Category'], axis=1)
    y = df['Risk_Category']

    # Split categorical and numerical columns
    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
    numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns

    # Scale numerical features. StandardScaler rejects an input with zero
    # feature columns, so skip the step when nothing numeric was selected.
    if len(numerical_cols) > 0:
        scaler = StandardScaler()
        X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    # Encode categorical features as integer codes, one encoder per column.
    for col in categorical_cols:
        X[col] = LabelEncoder().fit_transform(X[col])

    return X, y, numerical_cols, categorical_cols

# 3. Combined TabNet and XGBoost Model
class TabNetXGBoostEnsemble:
    """NumPy-only ensemble of two simplified learners.

    This is *not* the real TabNet or XGBoost. It keeps the structure of the
    original sketch (random feature masks with attenuation, random-feature
    stumps split at the feature mean) but actually learns from ``y`` and
    stores what it learned, so ``predict`` is deterministic after ``fit``.

    * "TabNet-like" branch: ``n_decision_steps`` sequential steps. Each step
      draws a random subset of ``feature_mask_size`` features, fits a linear
      map (with bias) by least squares to the one-hot residual left by the
      previous steps, and then attenuates the selected features by
      ``_ATTENUATION``. The step outputs are summed.
    * "XGBoost-like" branch: ``n_trees`` decision stumps. Each stump splits a
      randomly chosen feature at its training mean; leaf values are the
      L2-regularised Newton step ``-G / (H + lambda)`` of the softmax
      cross-entropy loss, applied with ``learning_rate``.

    The final score is the plain average of both branches.

    Parameters
    ----------
    n_features : int
        Number of feature columns expected in ``X``.
    n_classes : int
        Width of the score matrix; must be at least the number of distinct
        labels seen in ``fit``.
    n_decision_steps : int, default 5
    feature_mask_size : int or None, default None
        Features used per decision step; defaults to
        ``max(1, n_features // 2)``.
    n_trees : int, default 100
    learning_rate : float, default 0.1
        Shrinkage applied to every stump.
    random_state : int or None, default None
        Seed for a private ``RandomState``. ``None`` uses NumPy's global
        random state, so ``np.random.seed`` still controls the result.

    Attributes
    ----------
    feature_transformers : list of (mask, weights)
        One entry per fitted decision step.
    trees : list of dict
        One entry per fitted stump with keys ``split_feature``,
        ``split_value``, ``left_pred`` and ``right_pred``.
    classes_ : numpy.ndarray or None
        Sorted distinct training labels; ``None`` before ``fit``.
    final_output : numpy.ndarray or None
        Combined scores on the training data, set by ``fit``.
    decision_steps : list
        Kept for backward compatibility; never populated.
    """

    _ATTENUATION = 0.8   # factor applied to features already used by a step
    _LEAF_L2 = 1.0       # lambda in the leaf formula -G / (H + lambda)

    def __init__(self, n_features, n_classes,
                 n_decision_steps=5, feature_mask_size=None,
                 n_trees=100, learning_rate=0.1, random_state=None):
        self.n_features = n_features
        self.n_classes = n_classes
        self.n_decision_steps = n_decision_steps
        # Keep at least one feature per step; n_features // 2 is 0 when
        # there is a single feature.
        self.feature_mask_size = feature_mask_size or max(1, n_features // 2)
        self.n_trees = n_trees
        self.learning_rate = learning_rate
        self.random_state = random_state

        # Initialize TabNet components
        self.feature_transformers = []
        self.decision_steps = []

        # Initialize XGBoost-like components
        self.trees = []

        # Fitted state
        self.classes_ = None
        self.final_output = None

    def _as_feature_matrix(self, X):
        """Return a private float copy of X with shape (n_samples, n_features)."""
        # np.array copies, and also accepts DataFrames, which do not support
        # the positional X[:, cols] indexing used throughout this class.
        features = np.array(X, dtype=float)
        if features.ndim != 2 or features.shape[1] != self.n_features:
            raise ValueError(
                "X must be 2-D with %d feature columns, got shape %s"
                % (self.n_features, features.shape)
            )
        if not np.isfinite(features).all():
            raise ValueError("X contains NaN or infinite values")
        return features

    def _fit_tabnet(self, X, targets, rng):
        """Fit the masked linear decision steps; return their summed training output."""
        features = X.copy()
        residual = targets.copy()
        output = np.zeros_like(targets)
        bias = np.ones((len(X), 1))

        self.feature_transformers = []
        for _ in range(self.n_decision_steps):
            # Feature selection mask
            mask = rng.choice(self.n_features,
                              size=self.feature_mask_size,
                              replace=False)

            # Least-squares map from the selected features (plus bias) to
            # whatever the previous steps have not explained yet.
            design = np.hstack([features[:, mask], bias])
            weights = np.linalg.lstsq(design, residual, rcond=None)[0]
            step_output = design @ weights

            output += step_output
            residual -= step_output
            self.feature_transformers.append((mask, weights))

            # Attenuate selected features so later steps rely on them less.
            features[:, mask] *= self._ATTENUATION

        return output

    def _tabnet_forward(self, X):
        """Apply the fitted decision steps to X and return (n_samples, n_classes) scores."""
        # Copy: the attenuation below must not leak into the caller's array.
        features = np.array(X, dtype=float)
        output = np.zeros((len(features), self.n_classes))
        bias = np.ones((len(features), 1))

        for mask, weights in self.feature_transformers:
            output += np.hstack([features[:, mask], bias]) @ weights
            features[:, mask] *= self._ATTENUATION

        return output

    def _fit_xgboost(self, X, targets, rng):
        """Fit the boosted stumps; return their raw training scores."""
        scores = np.zeros_like(targets)

        self.trees = []
        for _ in range(self.n_trees):
            # Random split feature, split at its training mean.
            split_feature = int(rng.randint(0, self.n_features))
            column = X[:, split_feature]
            split_value = float(column.mean())
            left_mask = column <= split_value
            right_mask = ~left_mask

            # Softmax probabilities (shifted by the row max for stability)
            # and the gradient / Hessian diagonal of the cross-entropy loss.
            shifted = scores - scores.max(axis=1, keepdims=True)
            probs = np.exp(shifted)
            probs /= probs.sum(axis=1, keepdims=True)
            grad = probs - targets
            hess = probs * (1.0 - probs)

            # Regularised Newton step per leaf; an empty leaf yields zeros.
            left_pred = -grad[left_mask].sum(axis=0) / (
                hess[left_mask].sum(axis=0) + self._LEAF_L2)
            right_pred = -grad[right_mask].sum(axis=0) / (
                hess[right_mask].sum(axis=0) + self._LEAF_L2)

            scores[left_mask] += self.learning_rate * left_pred
            scores[right_mask] += self.learning_rate * right_pred

            self.trees.append({
                'split_feature': split_feature,
                'split_value': split_value,
                'left_pred': left_pred,
                'right_pred': right_pred
            })

        return scores

    def _xgboost_forward(self, X):
        """Apply the fitted stumps to X and return (n_samples, n_classes) scores."""
        X = np.asarray(X, dtype=float)
        predictions = np.zeros((len(X), self.n_classes))

        for tree in self.trees:
            left_mask = X[:, tree['split_feature']] <= tree['split_value']
            predictions[left_mask] += self.learning_rate * tree['left_pred']
            predictions[~left_mask] += self.learning_rate * tree['right_pred']

        return predictions

    def fit(self, X, y):
        """Train both branches on (X, y) and return ``self``.

        Any previously fitted state is discarded. ``X`` may be a NumPy array
        or a DataFrame of numeric columns; ``y`` may hold arbitrary labels.

        Raises
        ------
        ValueError
            If X has the wrong shape, contains non-finite values, is empty,
            disagrees with y in length, or y has more distinct labels than
            ``n_classes``.
        """
        features = self._as_feature_matrix(X)
        labels = np.asarray(y).ravel()
        if len(features) == 0:
            raise ValueError("cannot fit on an empty dataset")
        if len(labels) != len(features):
            raise ValueError(
                "X and y have different lengths: %d vs %d"
                % (len(features), len(labels))
            )

        classes, label_idx = np.unique(labels, return_inverse=True)
        label_idx = label_idx.ravel()
        if len(classes) > self.n_classes:
            raise ValueError(
                "y has %d distinct labels but n_classes=%d"
                % (len(classes), self.n_classes)
            )
        self.classes_ = classes

        # One-hot targets; column i corresponds to classes_[i].
        targets = np.zeros((len(features), self.n_classes))
        targets[np.arange(len(features)), label_idx] = 1.0

        # np.random (the module) exposes the same choice/randint API as
        # RandomState and honours a global np.random.seed call.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        tabnet_output = self._fit_tabnet(features, targets, rng)
        xgb_output = self._fit_xgboost(features, targets, rng)

        # Combine predictions (simple average)
        self.final_output = 0.5 * (tabnet_output + xgb_output)

        return self

    def predict(self, X):
        """Return the predicted label for every row of X.

        The result contains the original label values seen in ``fit``; for
        labels ``0..k-1`` these coincide with the arg-max column indices.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If X has the wrong shape or contains non-finite values.
        """
        if self.classes_ is None:
            raise RuntimeError("TabNetXGBoostEnsemble.predict called before fit")

        features = self._as_feature_matrix(X)
        combined_pred = 0.5 * (self._tabnet_forward(features)
                               + self._xgboost_forward(features))

        # Only columns backed by a label seen during training can win.
        best = np.argmax(combined_pred[:, :len(self.classes_)], axis=1)
        return self.classes_[best]

# 4. Main execution function
def run_insurance_analysis(data):
    """Run the complete insurance analysis pipeline.

    Steps: EDA (which also derives ``Risk_Category``), preprocessing, an
    80/20 train/test split with a fixed seed, model training, and prediction
    on the held-out part.

    Parameters
    ----------
    data : pandas.DataFrame or anything ``pandas.DataFrame`` accepts
        Raw records containing at least ``NB_Claim`` and ``AMT_Claim``.
        When a DataFrame is passed, ``perform_eda`` adds a ``Risk_Category``
        column to it in place.

    Returns
    -------
    dict
        ``'eda_results'`` - output of ``perform_eda``;
        ``'model'`` - the fitted ``TabNetXGBoostEnsemble``;
        ``'predictions'`` - predicted classes for the test split;
        ``'true_values'`` - the matching ``Risk_Category`` values.
    """
    # Convert data to DataFrame if needed
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)

    # 1. Perform EDA
    eda_results, data_with_risk = perform_eda(data)

    # 2. Preprocess data
    X, y, numerical_cols, categorical_cols = preprocess_data(data_with_risk)

    # 3. Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # The model indexes its input positionally (X[:, cols]), which a
    # DataFrame does not support, so hand it plain float arrays.
    X_train_arr = np.asarray(X_train, dtype=float)
    X_test_arr = np.asarray(X_test, dtype=float)

    # Size the score matrix by the largest label rather than by the number
    # of distinct labels: Risk_Category is 0/1/2, and if one level never
    # occurs (e.g. only {0, 2}) a count-based size would make arg-max
    # indices disagree with the label values.
    n_classes = int(np.max(y)) + 1

    # 4. Train model
    model = TabNetXGBoostEnsemble(
        n_features=X.shape[1],
        n_classes=n_classes,
        n_decision_steps=5,
        n_trees=100,
        learning_rate=0.1
    )

    model.fit(X_train_arr, y_train)

    # 5. Make predictions
    predictions = model.predict(X_test_arr)

    return {
        'eda_results': eda_results,
        'model': model,
        'predictions': predictions,
        'true_values': y_test
    }