In [4]:
import pandas as pd
import numpy as np

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import chi2_contingency

In [2]:
df = pd.read_excel('Dataset.xlsx')

## Clean Data

In [3]:
# Get the list of columns to convert to categorical
categorical_columns = df.select_dtypes(include='int64').columns.tolist()

# Convert the selected columns to categorical
df[categorical_columns] = df[categorical_columns].astype('category')

numeric_data = df.select_dtypes(include=[np.number])

# Calculate MAD for each column
mad = numeric_data.mad()

# Choose a threshold multiplier
k = 3

# Calculate the threshold value
threshold = k * mad

# Identify outliers
outliers = (np.abs(numeric_data - numeric_data.median()) > threshold)

# Apply logarithm to the specified columns
outlier_columns = ['CommissionSacrificePercentage', 'BonusCommissionPercentage']
for column in outlier_columns:
    df[column] = np.log1p(df[column])

numeric_data = df.select_dtypes(include=[np.number])

# Check that all outliers have been handled
outliers = (np.abs(numeric_data - numeric_data.median()) > threshold)

# Count the number of outliers in each column
outlier_counts = outliers.sum()

# Create a new DataFrame from outlier_counts
outlier_table = pd.DataFrame({'Column': outlier_counts.index, 'Outlier Count': outlier_counts.values})

# Display the table
print(outlier_table)

                          Column  Outlier Count
0  CommissionSacrificePercentage              0
1      BonusCommissionPercentage              0


## Feature Importance Techniques

### Random Forest Feature Importance Scoring

In [6]:
# Define features and target variable
features = ['Product', 'ProductGroup', 'ProductType', 'Agency', 'WorkflowStatus', 'Indexation', 'NoOfLives',
            'CommDateProvided', 'PaymentFreq', 'UWDecision', 'ComissionSacrifice', 'CommissionSacrificeType',
            'RenewalSacrificeType', 'CommissionTerms', 'Discount', 'BonusCommission', 'FreeCover',
            'SeriousIllnessType', 'SignedDecReceived']
target = 'PolicyIssued'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

# Train a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Get feature importances
feature_importances = pd.Series(rf_classifier.feature_importances_, index=features).sort_values(ascending=False)

# Print feature importances
print("Feature Importances:")
print(feature_importances)

Feature Importances:
Agency                     0.559780
WorkflowStatus             0.173468
UWDecision                 0.118432
CommDateProvided           0.033788
SignedDecReceived          0.013722
NoOfLives                  0.013316
ProductGroup               0.011579
SeriousIllnessType         0.011124
BonusCommission            0.011008
Product                    0.010811
CommissionTerms            0.010571
Indexation                 0.009274
ProductType                0.006298
RenewalSacrificeType       0.004066
CommissionSacrificeType    0.003736
ComissionSacrifice         0.003247
Discount                   0.002656
PaymentFreq                0.002633
FreeCover                  0.000490
dtype: float64


In [12]:
# Select the top-n features based on importance
n = 4 
selected_features = feature_importances[:n].index.tolist()

In [13]:
# Create a new dataset with the selected features
selected_data = df[selected_features + [target]]

In [14]:
# Conduct experiments by training and evaluating the model on different feature combinations
for feature_combination in range(1, n + 1):
    selected_features_combination = selected_features[:feature_combination]
    X_train_combination = X_train[selected_features_combination]
    X_test_combination = X_test[selected_features_combination]

    rf_classifier_combination = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier_combination.fit(X_train_combination, y_train)
    y_pred_combination = rf_classifier_combination.predict(X_test_combination)

    accuracy = accuracy_score(y_test, y_pred_combination)
    report = classification_report(y_test, y_pred_combination)
    print(f"\nExperiment with {feature_combination} features:")
    print(f"Selected Features: {selected_features_combination}")
    print(f"Accuracy: {accuracy}")
    print(f"Classification Report:\n{report}")


Experiment with 1 features:
Selected Features: ['Agency']
Accuracy: 0.779639462054494
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.03      0.06      6973
           1       0.78      0.99      0.88     24480

    accuracy                           0.78     31453
   macro avg       0.67      0.51      0.47     31453
weighted avg       0.73      0.78      0.69     31453


Experiment with 2 features:
Selected Features: ['Agency', 'WorkflowStatus']
Accuracy: 0.7996057609766953
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.35      0.44      6973
           1       0.83      0.93      0.88     24480

    accuracy                           0.80     31453
   macro avg       0.71      0.64      0.66     31453
weighted avg       0.78      0.80      0.78     31453


Experiment with 3 features:
Selected Features: ['Agency', 'WorkflowStatus', 'UWDecision']
Accuracy: 0.8020

#### Try different test sizes and different n estimators

In [19]:
# Define different test sizes and n estimator values to test
test_sizes = [0.2, 0.25, 0.3, 0.33]
n_estimators_values = [50, 100, 150, 200]
n_values = [4,5,6,7]

# Iterate through each test size and n estimator value
for test_size in test_sizes:
    for n_estimators_value in n_estimators_values:
        # Split the data into training and testing sets with the current test size
        X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=test_size, random_state=42)

        # Train a Random Forest classifier with the current n estimator value
        rf_classifier = RandomForestClassifier(n_estimators=n_estimators_value, random_state=42)
        rf_classifier.fit(X_train, y_train)

        # Get feature importances
        feature_importances = pd.Series(rf_classifier.feature_importances_, index=features).sort_values(ascending=False)

        # Print feature importances for the current test size and n estimator value
        print(f"\nTest Size: {test_size}, n_estimators: {n_estimators_value}")
        print("Feature Importances:")
        print(feature_importances)

        # Select the top-n features based on importance
        for n in n_values:
        #n = 4
            selected_features = feature_importances[:n].index.tolist()

            # Create a new dataset with the selected features
            selected_data = df[selected_features + [target]]

            # Train and evaluate the model with the current feature combination
            X_train_selected = X_train[selected_features]
            X_test_selected = X_test[selected_features]

            rf_classifier_selected = RandomForestClassifier(n_estimators=n_estimators_value, random_state=42)
            rf_classifier_selected.fit(X_train_selected, y_train)
            y_pred_selected = rf_classifier_selected.predict(X_test_selected)

            accuracy = accuracy_score(y_test, y_pred_selected)
            report = classification_report(y_test, y_pred_selected)
            print(f"Selected Features: {selected_features}")
            print(f"Accuracy: {accuracy}")
            print(f"Classification Report:\n{report}")


Test Size: 0.2, n_estimators: 50
Feature Importances:
Agency                     0.560283
WorkflowStatus             0.171679
UWDecision                 0.117542
CommDateProvided           0.036348
NoOfLives                  0.014011
SignedDecReceived          0.013423
ProductGroup               0.012310
Product                    0.011039
BonusCommission            0.010791
SeriousIllnessType         0.010695
CommissionTerms            0.010358
Indexation                 0.009120
ProductType                0.005754
RenewalSacrificeType       0.003982
CommissionSacrificeType    0.003621
ComissionSacrifice         0.003282
Discount                   0.002636
PaymentFreq                0.002599
FreeCover                  0.000528
dtype: float64
Selected Features: ['Agency', 'WorkflowStatus', 'UWDecision', 'CommDateProvided']
Accuracy: 0.8017359234413252
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.37      0.45      6973
   

Selected Features: ['Agency', 'WorkflowStatus', 'UWDecision', 'CommDateProvided']
Accuracy: 0.8017995103805678
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.37      0.45      6973
           1       0.84      0.93      0.88     24480

    accuracy                           0.80     31453
   macro avg       0.71      0.65      0.66     31453
weighted avg       0.78      0.80      0.78     31453

Selected Features: ['Agency', 'WorkflowStatus', 'UWDecision', 'CommDateProvided', 'NoOfLives']
Accuracy: 0.8006867389438209
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.37      0.45      6973
           1       0.84      0.92      0.88     24480

    accuracy                           0.80     31453
   macro avg       0.71      0.65      0.67     31453
weighted avg       0.78      0.80      0.78     31453

Selected Features: ['Agency', 'WorkflowStatus', 'UWDecision', 'Co

Selected Features: ['Agency', 'WorkflowStatus', 'UWDecision', 'CommDateProvided', 'NoOfLives']
Accuracy: 0.8013277037338488
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.39      0.46      8650
           1       0.84      0.92      0.88     30666

    accuracy                           0.80     39316
   macro avg       0.71      0.65      0.67     39316
weighted avg       0.78      0.80      0.79     39316

Selected Features: ['Agency', 'WorkflowStatus', 'UWDecision', 'CommDateProvided', 'NoOfLives', 'SignedDecReceived']
Accuracy: 0.7971055041204599
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.39      0.46      8650
           1       0.84      0.91      0.88     30666

    accuracy                           0.80     39316
   macro avg       0.70      0.65      0.67     39316
weighted avg       0.78      0.80      0.78     39316

Selected Features: ['Agency', '

Selected Features: ['Agency', 'WorkflowStatus', 'UWDecision', 'CommDateProvided', 'SignedDecReceived', 'NoOfLives']
Accuracy: 0.7963288751351236
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.40      0.46     10413
           1       0.84      0.91      0.87     36766

    accuracy                           0.80     47179
   macro avg       0.70      0.65      0.67     47179
weighted avg       0.78      0.80      0.78     47179

Selected Features: ['Agency', 'WorkflowStatus', 'UWDecision', 'CommDateProvided', 'SignedDecReceived', 'NoOfLives', 'BonusCommission']
Accuracy: 0.7916021958922402
Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.40      0.46     10413
           1       0.84      0.90      0.87     36766

    accuracy                           0.79     47179
   macro avg       0.69      0.65      0.67     47179
weighted avg       0.77      0.79      0.78   

Selected Features: ['Agency', 'WorkflowStatus', 'UWDecision', 'CommDateProvided', 'SignedDecReceived', 'NoOfLives', 'ProductGroup']
Accuracy: 0.7922037882729253
Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.40      0.46     11494
           1       0.84      0.90      0.87     40403

    accuracy                           0.79     51897
   macro avg       0.69      0.65      0.67     51897
weighted avg       0.77      0.79      0.78     51897


Test Size: 0.33, n_estimators: 100
Feature Importances:
Agency                     0.564858
WorkflowStatus             0.175017
UWDecision                 0.108815
CommDateProvided           0.033528
SignedDecReceived          0.013987
NoOfLives                  0.013651
SeriousIllnessType         0.011698
BonusCommission            0.011642
ProductGroup               0.011488
Product                    0.011156
CommissionTerms            0.010968
Indexation                 0.009743


The feature importance scoring built into random forest appears to be suggesting that the same four independent variables as identifed with Cramer's V are the most important predictors of Policy Issuance.