In [155]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import concurrent.futures
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

The goal of the project is to predict policy holders who are likely to change their insurer to other companies. This is a classification problem with binary response variable "convert_ind", where 0 indicates the policy holder continues to use Travelers. 

In [144]:
df = pd.read_csv("C:\\Users\\joonw\\trav\\trav_dataset1.csv")

### Spliting dataset into training set and testing set

In [160]:
# test_set = df[df['split']=='Test']  # all 0 for test data to prevent cheating

# test_set = df.sample(frac=0.2, random_state=24)
# test_mask = df.index.isin(test_set.index)
# rain_set = df.loc[~test_mask,:].reset_index(drop=True)
# test_set = test_set.reset_index(drop=True)

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, stratify=df['convert_ind'], random_state=24)

train_set = train_set.drop(columns=['split'])
test_set = test_set.drop(columns=['split'])

# Separate features and target from the entire training set
train_y = train_set['convert_ind']
train_x = train_set.drop(columns=['convert_ind'])

test_y = test_set['convert_ind']
test_x = test_set.drop(columns=['convert_ind'])

### Parameter Opitmization

It seems grid search is more efficient than RandomizedSearch

Grid search

In [None]:
def evaluate_model(params, train_x, train_y, skf):
    """Evaluate a model with the given parameters using cross-validation."""
    train_set = lgb.Dataset(train_x, label=train_y)
    
    # Perform cross-validation
    cv_results = lgb.cv(
        params,
        train_set,
        num_boost_round=2000,
        folds=skf.split(train_x, train_y),
        metrics='auc',
        seed=42
    )
    
    # Get the best AUC score
    score = max(cv_results['valid auc-mean'])
    return score

# Parameter grid for hyperparameter tuning
param_grid = {
    'num_leaves': np.arange(10, 20),
    'learning_rate': [0.005, 0.01, 0.02, 0.03],
    'feature_fraction': [0.75, 0.8, 0.85],
    'min_data_in_leaf': [10, 20, 30],
    'lambda_l1': [0, 0.1, 1],
    'lambda_l2': [0, 0.1, 1],
    'early_stopping_rounds':50
}

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
best_score = 0
best_params = None

# Create a list of all parameter combinations
param_combinations = [
    {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'min_data_in_leaf': min_data_in_leaf,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'random_state': 42,
        'is_unbalance': True
    }
    for num_leaves in param_grid['num_leaves']
    for learning_rate in param_grid['learning_rate']
    for feature_fraction in param_grid['feature_fraction']
    for min_data_in_leaf in param_grid['min_data_in_leaf']
    for lambda_l1 in param_grid['lambda_l1']
    for lambda_l2 in param_grid['lambda_l2']
]

# Evaluate parameter combinations in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
    futures = [
        executor.submit(evaluate_model, params, train_x, train_y, skf)
        for params in param_combinations
    ]
    for future, params in zip(concurrent.futures.as_completed(futures), param_combinations):
        score = future.result()
        print(f"Params: {params}, AUC: {score}")
        if score > best_score:
            best_score = score
            best_params = params

Params: {'objective': 'binary', 'boosting_type': 'gbdt', 'metric': 'auc', 'num_leaves': np.int64(10), 'learning_rate': 0.005, 'feature_fraction': 0.75, 'min_data_in_leaf': 10, 'lambda_l1': 0, 'lambda_l2': 0, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'random_state': 42, 'is_unbalance': True}, AUC: 0.6755706985074343
Params: {'objective': 'binary', 'boosting_type': 'gbdt', 'metric': 'auc', 'num_leaves': np.int64(10), 'learning_rate': 0.005, 'feature_fraction': 0.75, 'min_data_in_leaf': 10, 'lambda_l1': 0, 'lambda_l2': 0.1, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'random_state': 42, 'is_unbalance': True}, AUC: 0.6752097797607799
Params: {'objective': 'binary', 'boosting_type': 'gbdt', 'metric': 'auc', 'num_leaves': np.int64(10), 'learning_rate': 0.005, 'feature_fraction': 0.75, 'min_data_in_leaf': 10, 'lambda_l1': 0, 'lambda_l2': 1, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'random_state': 42, 'is_unbalance': True}, AUC: 0.675680437448145
Params: {'objective': 'binary', 'boosting_

### previous code

In [None]:
param_grid = {
    'num_leaves': [15, 31, 63],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.6, 0.8, 1.0]
}

best_score = 0
best_params = None

for num_leaves in param_grid['num_leaves']:
    for learning_rate in param_grid['learning_rate']:
        for feature_fraction in param_grid['feature_fraction']:
            # Update parameters
            params = {
                'objective': 'binary',
                'boosting_type': 'gbdt',
                'metric': 'auc',
                'num_leaves': num_leaves,
                'learning_rate': learning_rate,
                'feature_fraction': feature_fraction,
                'bagging_fraction': 0.8,
                'bagging_freq': 5,
                'random_state': 42,
                'early_stopping_round': 50  # Include early stopping here
            }
            
            # Perform cross-validation
            cv_results = lgb.cv(
                params,
                train_set,
                num_boost_round=1000,
                nfold=10,
                metrics='auc',
                seed=42
            )
            
            # Get the best score
            # print(cv_results.keys())
            score = max(cv_results['valid auc-mean'])  # Access the mean AUC
            if score > best_score:
                best_score = score
                best_params = params

# Print the best parameters and score
print("Best Parameters:", best_params)
print("Best CV AUC Score:", best_score)



Best Parameters: {'objective': 'binary', 'boosting_type': 'gbdt', 'metric': 'auc', 'num_leaves': 15, 'learning_rate': 0.01, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'random_state': 42, 'early_stopping_round': 50}
Best CV AUC Score: 0.677994801447962

### Modeling after Optimizing Tuning Parameters

In [None]:
# Define the best hyperparameters from cross-validation
best_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'num_leaves': 15,  # Example: From the grid search
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'random_state': 42,
    'early_stopping_rounds': 50,
}

# Create the full LightGBM dataset
full_train_dataset = lgb.Dataset(train_x, label=train_y)

# Define a logging callback
callbacks = [lgb.log_evaluation(period=50)]

# Train the model on the full training set
final_model = lgb.train(
    best_params,
    full_train_dataset,
    num_boost_round=2000,  # Use a high value to allow full convergence
    valid_sets=[full_train_dataset],
    valid_names=['train'],
    callbacks=callbacks  # Use callbacks for logging
)

# Save the model for future use
final_model.save_model('final_model.txt')

# Evaluate on the test set
test_y = test_set['convert_ind']
test_x = test_set.drop(columns=['convert_ind'])

test_pred = final_model.predict(test_x)
test_auc = roc_auc_score(test_y, test_pred)
print(f"Test AUC: {test_auc:.4f}")

### Results

Confusion matrix

In [102]:
# view confusion-matrix
# Print the Confusion Matrix and slice it into four pieces

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test_y, test_pred)
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', cm[0,0])
print('\nTrue Negatives(TN) = ', cm[1,1])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])

Confusion matrix

 [[3573    0]
 [ 360    0]]

True Positives(TP) =  3573

True Negatives(TN) =  0

False Positives(FP) =  0

False Negatives(FN) =  360


Classification matrix

In [None]:
print(classification_report(test_y, test_pred))

              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95      3573
         1.0       0.00      0.00      0.00       360

    accuracy                           0.91      3933
   macro avg       0.45      0.50      0.48      3933
weighted avg       0.83      0.91      0.86      3933



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
