<a href="https://colab.research.google.com/github/fjadidi2001/DataScienceJourney/blob/master/ensemble_GBM_insurance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Load the dataset
from google.colab import drive
import pandas as pd
import numpy as np

# Mount Google Drive so the CSV stored there is reachable from the Colab VM
drive.mount('/content/drive')

# Specify file path
file_path = '/content/drive/My Drive/telematics_syn.csv'

# Read the CSV file
data = pd.read_csv(file_path)

# Step 2: Explore the data
print("First few rows of the dataset:")
print(data.head())
print("\nDataset information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()
print("\nChecking for missing values:")
print(data.isnull().sum())
print("\nBasic statistics:")
print(data.describe())

# Step 3: Create target variable ClaimYN
# ClaimYN is 1 only when a row has at least one claim (NB_Claim >= 1) AND the
# claimed amount is at least 1000 (AMT_Claim >= 1000); every other row is 0.
data['ClaimYN'] = ((data['NB_Claim'] >= 1) & (data['AMT_Claim'] >= 1000)).astype(int)

# Display distribution of ClaimYN (percentages first, then raw counts)
print("\nDistribution of ClaimYN:")
print(data['ClaimYN'].value_counts(normalize=True) * 100)
print("\nAbsolute counts:")
print(data['ClaimYN'].value_counts())
 # Import required libraries
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd
import numpy as np

# Report the width of the raw frame before any columns are removed
print("Initial number of columns:", data.shape[1])

# Build the feature matrix: the target is dropped together with NB_Claim and
# AMT_Claim, because ClaimYN was derived directly from those two columns
features = data.drop(columns=['ClaimYN', 'NB_Claim', 'AMT_Claim'])
print("\nNumber of columns after dropping target and claim columns:", features.shape[1])

# Show the distinct levels of every categorical column before encoding
print("\nUnique values in categorical columns:")
categorical_columns = ['Insured.sex', 'Marital', 'Car.use', 'Region']
for col in categorical_columns:
    print("{}: {}".format(col, features[col].unique()))

# One-hot encode the categorical columns so every feature is numeric/boolean
features = pd.get_dummies(features, columns=categorical_columns)
print("\nColumns after one-hot encoding:", features.columns.tolist(), sep="\n")
print("\nTotal number of columns after encoding:", features.shape[1])

X, y = features, data['ClaimYN']

# Stratified 80/20 split, then oversample the training portion only with SMOTE
# (the test portion is left untouched)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Summarise the resulting shapes
print("\nFinal dataset shapes:")
print(
    f"Original features: {X.shape}",
    f"X_train_balanced: {X_train_balanced.shape}",
    f"X_test: {X_test.shape}",
    sep="\n",
)
 # Import required libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np

# Feature Engineering
def create_aggregated_features(df):
    """Return a copy of ``df`` extended with aggregated driving features.

    The input frame is never modified; all new columns are written to a copy.

    Added columns:
        Agg_Accel           row-wise mean of the six ``Accel.*miles`` columns
        Agg_Brake           row-wise mean of the six ``Brake.*miles`` columns
        Agg_Left_Turn       row-wise mean of ``Left.turn.intensity08``-``12``
        Agg_Right_Turn      row-wise mean of ``Right.turn.intensity08``-``12``
        Harsh_Driving_Score mean of the four aggregates above
        Rush_Hour_Ratio     (``Pct.drive.rush am`` + ``Pct.drive.rush pm``)
                            divided by ``Total.miles.driven``; 0.0 where
                            ``Total.miles.driven`` is exactly zero

    Parameters:
        df: pandas DataFrame containing every source column listed above.

    Returns:
        A new DataFrame with the original columns plus the six derived ones.

    Raises:
        KeyError: if any of the required source columns is missing.
    """
    # Work on a copy: callers may pass a slice of a larger frame (e.g. one
    # half of a train/test split), and writing new columns into it would
    # mutate their data and can trigger SettingWithCopyWarning.
    df = df.copy()

    accel_cols = [
        'Accel.06miles', 'Accel.08miles', 'Accel.09miles',
        'Accel.11miles', 'Accel.12miles', 'Accel.14miles'
    ]
    brake_cols = [
        'Brake.06miles', 'Brake.08miles', 'Brake.09miles',
        'Brake.11miles', 'Brake.12miles', 'Brake.14miles'
    ]
    left_turn_cols = [
        'Left.turn.intensity08', 'Left.turn.intensity09', 'Left.turn.intensity10',
        'Left.turn.intensity11', 'Left.turn.intensity12'
    ]
    right_turn_cols = [
        'Right.turn.intensity08', 'Right.turn.intensity09', 'Right.turn.intensity10',
        'Right.turn.intensity11', 'Right.turn.intensity12'
    ]

    # Per-row means of each event family
    df['Agg_Accel'] = df[accel_cols].mean(axis=1)
    df['Agg_Brake'] = df[brake_cols].mean(axis=1)
    df['Agg_Left_Turn'] = df[left_turn_cols].mean(axis=1)
    df['Agg_Right_Turn'] = df[right_turn_cols].mean(axis=1)

    # Overall harsh driving score: unweighted mean of the four aggregates
    df['Harsh_Driving_Score'] = (
        df['Agg_Accel'] + df['Agg_Brake'] +
        df['Agg_Left_Turn'] + df['Agg_Right_Turn']
    ) / 4

    # Rush hour driving ratio. A zero denominator would produce inf (or NaN
    # for 0/0), which the downstream scaler rejects, so zero-mileage rows are
    # assigned 0.0 explicitly. Rows whose inputs are genuinely missing stay NaN.
    rush_total = df['Pct.drive.rush am'] + df['Pct.drive.rush pm']
    miles = df['Total.miles.driven']
    ratio = rush_total / miles.where(miles != 0)
    df['Rush_Hour_Ratio'] = ratio.mask(miles == 0, 0.0)

    return df

# Standardization function
def standardize_features(X_train, X_test):
    """Scale the float64/int64 columns of both frames with one StandardScaler.

    The scaler is fitted on ``X_train`` only and then applied to ``X_test``.
    Columns of any other dtype are copied through unchanged, and neither
    input frame is modified.

    Returns:
        (scaled training frame, scaled test frame, fitted scaler)
    """
    # Columns to scale are chosen from the training frame's dtypes
    numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

    scaler = StandardScaler()
    train_out, test_out = X_train.copy(), X_test.copy()

    # Learn mean/variance from the training data, then reuse them for the test data
    train_out[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    test_out[numerical_cols] = scaler.transform(X_test[numerical_cols])

    return train_out, test_out, scaler

# Apply preprocessing pipeline
def preprocess_data(X_train, X_test):
    """Run feature engineering followed by standardization on both splits.

    Progress and a summary of the resulting feature set are printed along
    the way.

    Returns:
        (processed training frame, processed test frame, fitted scaler)
    """
    # Stage 1: derive the aggregated features for the train split, then the test split
    print("Applying feature engineering...")
    X_train, X_test = (
        create_aggregated_features(split) for split in (X_train, X_test)
    )

    # Stage 2: scale both splits with a scaler fitted on the train split
    print("Standardizing features...")
    train_scaled, test_scaled, fitted_scaler = standardize_features(X_train, X_test)

    # Summary of what came out of the pipeline
    print("\nFinal feature set:")
    print(f"Training set shape: {train_scaled.shape}")
    print(f"Test set shape: {test_scaled.shape}")
    print("\nFeatures included:", train_scaled.columns.tolist(), sep="\n")

    return train_scaled, test_scaled, fitted_scaler

# Run the pipeline on the SMOTE-balanced training split and the untouched test split
X_train_processed, X_test_processed, scaler = preprocess_data(
    X_train_balanced,
    X_test,
)

# Summary statistics of the processed training data, as a quick sanity check
print(
    "\nSample statistics after preprocessing:",
    X_train_processed.describe().round(2),
    sep="\n",
)


Mounted at /content/drive
First few rows of the dataset:
   Duration  Insured.age Insured.sex  Car.age  Marital  Car.use  Credit.score  \
0       366           45        Male       -1  Married  Commute         609.0   
1       182           44      Female        3  Married  Commute         575.0   
2       184           48      Female        6  Married  Commute         847.0   
3       183           71        Male        6  Married  Private         842.0   
4       183           84        Male       10  Married  Private         856.0   

  Region  Annual.miles.drive  Years.noclaims  ...  Left.turn.intensity10  \
0  Urban             6213.71              25  ...                    1.0   
1  Urban            12427.42              20  ...                   58.0   
2  Urban            12427.42              14  ...                    0.0   
3  Urban             6213.71              43  ...                    0.0   
4  Urban             6213.71              65  ...                    2.0   


In [2]:
# Step 4: Ensemble Method with GBM, RF, and XGBoost

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# Hyper-parameter search spaces, one per base learner.
# Each key is a constructor argument of the corresponding estimator and each
# value is the list of candidates that the randomized search samples from.

# GradientBoostingClassifier
gb_param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# RandomForestClassifier
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# XGBClassifier
xgb_param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    min_child_weight=[1, 5, 10],
    subsample=[0.7, 0.8, 0.9],
)

# Function to perform RandomizedSearchCV
def perform_random_search(model, param_grid, X, y, n_iter=10, cv=3,
                          scoring=None, random_state=42, n_jobs=-1):
    """Tune ``model`` with RandomizedSearchCV and return the best estimator.

    The search budget used to be hard-coded; it is now exposed through
    keyword arguments whose defaults reproduce the original behaviour, so
    existing four-argument calls are unaffected. Lowering ``n_iter`` or
    ``cv`` is the simplest way to shorten a search that takes too long.

    Parameters:
        model: unfitted estimator to tune.
        param_grid: dict mapping parameter names to candidate values; passed
            to RandomizedSearchCV as ``param_distributions``.
        X, y: training features and labels the search is fitted on.
        n_iter: number of parameter settings sampled (default 10).
        cv: cross-validation splitting strategy / number of folds (default 3).
        scoring: scoring passed through to RandomizedSearchCV; ``None``
            (default) leaves the choice to the estimator's own ``score``.
        random_state: seed for the parameter sampling (default 42).
        n_jobs: number of parallel jobs; -1 (default) uses all processors.

    Returns:
        ``best_estimator_`` of the fitted search, i.e. the estimator refitted
        on all of ``X``/``y`` with the best parameter setting found.
    """
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    random_search.fit(X, y)
    return random_search.best_estimator_

# Tune each base learner on the processed, balanced training data
print("Tuning Gradient Boosting Machine...")
gb_best = perform_random_search(
    GradientBoostingClassifier(random_state=42),
    gb_param_grid,
    X_train_processed,
    y_train_balanced,
)

print("Tuning Random Forest...")
rf_best = perform_random_search(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    X_train_processed,
    y_train_balanced,
)

print("Tuning XGBoost...")
xgb_best = perform_random_search(
    XGBClassifier(random_state=42),
    xgb_param_grid,
    X_train_processed,
    y_train_balanced,
)

# Combine the three tuned learners into a soft-voting ensemble
ensemble = VotingClassifier(
    voting='soft',
    estimators=[('gb', gb_best), ('rf', rf_best), ('xgb', xgb_best)],
)

# Train the ensemble on the same training data used for tuning
print("Fitting the ensemble model...")
ensemble.fit(X_train_processed, y_train_balanced)

# Positive-class probabilities (column 1) and hard labels for the held-out test set
y_pred_proba = ensemble.predict_proba(X_test_processed)[:, 1]
y_pred = ensemble.predict(X_test_processed)

# Label-based metrics use the hard predictions; AUC-ROC uses the probabilities
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
auc_roc = roc_auc_score(y_test, y_pred_proba)

# Report test-set performance
print("\nEnsemble Model Performance:")
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-score: {f1:.4f}",
    f"AUC-ROC: {auc_roc:.4f}",
    sep="\n",
)

# Dump the full parameter set of each tuned base learner
print("\nBest parameters for Gradient Boosting Machine:", gb_best.get_params(), sep="\n")
print("\nBest parameters for Random Forest:", rf_best.get_params(), sep="\n")
print("\nBest parameters for XGBoost:", xgb_best.get_params(), sep="\n")

Tuning Gradient Boosting Machine...


KeyboardInterrupt: 