# imports

In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from category_encoders import TargetEncoder
# from openfe import OpenFE, transform
import warnings
from sklearn.utils import resample
import gc


warnings.filterwarnings("ignore")
import optuna

In [2]:
# Read the three input tables: the reduced train/test sets and the sample submission.
train_data, test_data, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/despacitoinsur/trainreduced.csv",
        "/kaggle/input/despacitoinsur/testreduced.csv",
        "/kaggle/input/playground-series-s4e7/sample_submission.csv",
    )
)

In [13]:
# Display the (rows, columns) dimensions of the training frame.
train_data.shape

(11504798, 11)

In [4]:
# Target-encode the selected columns, bucket the encoded values, and de-duplicate the training rows.
features_to_encode = ['Vintage', 'Annual_Premium', 'Region_Code', 'Policy_Sales_Channel']
columns_to_convert = list(features_to_encode)  # the same columns are cast to str before encoding
target = 'Response'
round_precision = 0.05

# Cast the columns to strings in both frames before fitting the encoder.
for frame in (train_data, test_data):
    frame[columns_to_convert] = frame[columns_to_convert].astype(str)

# Fit on the training frame (uses the target), then apply the fitted encoder to the test frame.
# NOTE(review): the encoder is fitted on every training row, so cross-validation run on
# this frame later scores rows whose own target fed their encoding — confirm this is acceptable.
encoder = TargetEncoder(cols=features_to_encode)
train_data[features_to_encode] = encoder.fit_transform(train_data[features_to_encode], train_data[target])
test_data[features_to_encode] = encoder.transform(test_data[features_to_encode])

# Snap every encoded value to the nearest multiple of round_precision.
for frame in (train_data, test_data):
    frame[features_to_encode] = frame[features_to_encode].div(round_precision).round().mul(round_precision)

# Drop exact duplicate rows from the training frame.
train_data = train_data.drop_duplicates()

In [5]:
# Display the training frame's dimensions after encoding, rounding and de-duplication.
train_data.shape

(515198, 11)

# encoded-data

In [8]:
# Desired dtype per column; a column is only converted in a frame that actually contains it.
# NOTE(review): astype(bool) maps every non-empty string to True — confirm that Gender,
# Vehicle_Damage, etc. are already numeric/boolean in the loaded CSVs.
type_mappings = {
    'Gender': bool,
    'Age': 'int32',
    'Region_Code': str,
    'Vehicle_Age': 'int32',
    'Vintage': 'int32',
    'Driving_License': bool,
    'Previously_Insured': bool,
    'Response': bool,  # Only in train_data
    'Vehicle_Damage': bool,
    'Policy_Sales_Channel': str
}

# Cast the mapped columns, one frame at a time.
for frame in (train_data, test_data):
    present = [name for name in type_mappings if name in frame.columns]
    for name in present:
        frame[name] = frame[name].astype(type_mappings[name])

# Expand categorical columns into dummy/indicator columns (each frame independently).
train_data, test_data = pd.get_dummies(train_data), pd.get_dummies(test_data)

In [9]:
# Display the training frame's dimensions (column count grows after get_dummies).
train_data.shape

(100000, 175)

# under-sampling

In [None]:
# Size of the rarer class; both classes are cut down to this many rows.
sample_size = train_data['Response'].value_counts().min()

# Draw sample_size rows without replacement from each class (positives first, then negatives).
balanced_parts = [
    resample(
        train_data[train_data['Response'] == label],
        replace=False,
        n_samples=sample_size,
        random_state=42,
    )
    for label in (1, 0)
]

# Stack the two halves, shuffle the rows, and renumber the index.
train_data = pd.concat(balanced_parts).sample(frac=1, random_state=42).reset_index(drop=True)

# Release the per-class pieces before collecting garbage.
del balanced_parts
gc.collect()

In [None]:
# Dimensions of the balanced training frame. The under-sampling step stores its result
# in train_data; no variable named df_balanced is ever defined in this notebook.
train_data.shape

In [None]:
# Take a 10,000-row subset of each frame (presumably for quick iteration — confirm).
# A fixed random_state makes the subset reproducible, matching the seed used elsewhere.
# NOTE(review): test_data is overwritten with its sample, so predictions made afterwards
# cover only these rows — skip this line for a full submission run.
train = train_data.sample(n=10000, random_state=42)
test_data = test_data.sample(n=10000, random_state=42)

In [7]:
# Partition column names into train-only, test-only, and shared between both frames.
train_columns, test_columns = train_data.columns, test_data.columns

train_feats = [name for name in train_columns if name not in test_columns]
test_feats = [name for name in test_columns if name not in train_columns]
# Shared columns, listed in test_data's column order.
features = [name for name in test_columns if name in train_columns]

print(test_feats)
print(train_feats)

[]
['Response']


In [None]:
# Align test_data's columns with the training frame.
# errors="ignore" keeps the cell re-runnable once the test-only columns are already gone.
test_data = test_data.drop(columns=test_feats, errors="ignore")

# Exclude the target by rebuilding the list: list.remove raises ValueError when this cell
# is executed a second time, or when "Response" is not among the train-only columns.
train_feats = [feature for feature in train_feats if feature != "Response"]

# Columns that exist only in the training frame are added to test_data, filled with 0.
for feature in train_feats:
    test_data[feature] = 0

In [None]:
def feature_engineering(df):
    """Add interaction, squared and ratio columns to ``df`` in place.

    Reads the columns ``Age``, ``Vehicle_Age``, ``Previously_Insured``,
    ``Vehicle_Damage`` and ``Annual_Premium`` and appends seven derived
    columns. The frame passed in is modified and the same object is returned.
    """
    age = df['Age']
    vehicle_age = df['Vehicle_Age']
    insured = df['Previously_Insured']
    damage = df['Vehicle_Damage']

    # Pairwise products.
    df['Age_Vehicle_Age'] = age * vehicle_age
    df['Age_Previously_Insured'] = age * insured
    df['Vehicle_Age_Damage'] = vehicle_age * damage
    df['Previously_Insured_Damage'] = insured * damage

    # Squared terms.
    df['Age_squared'] = age ** 2
    df['Vehicle_Age_squared'] = vehicle_age ** 2

    # Premium relative to age; the +1 keeps the denominator non-zero for Age == 0.
    df['Annual_Premium_per_Age'] = df['Annual_Premium'] / (age + 1)
    return df

In [None]:
# Add the engineered columns to both frames. feature_engineering assigns the new columns
# on the frame it receives, so the return values are not reassigned here; the result of
# the last call is what the cell displays.
feature_engineering(train)
feature_engineering(test_data)

In [None]:
# Class balance of the working sample: absolute counts and share of all rows in percent.
response_counts = train['Response'].value_counts()
response_percentages = response_counts.div(len(train)).mul(100)

# Heading and table are printed on separate lines.
print("Response value counts:", response_counts, sep="\n")
print("\nResponse value percentages:", response_percentages, sep="\n")

## Hyperparameter tuning

In [14]:
# Tune on a 100,000-row subset. The fixed random_state makes the subset (and therefore the
# cross-validation score computed on it) reproducible across kernel restarts.
train_data = train_data.sample(n=100000, random_state=42)

In [15]:
# Display the dimensions of the sampled training frame used for tuning.
train_data.shape

(100000, 11)

In [16]:
# Fixed XGBoost configuration, scored with shuffled 5-fold cross-validation on ROC AUC.
# NOTE(review): both learning_rate and eta are passed; XGBoost documents eta as an alias
# of learning_rate — confirm which of the two values actually takes effect.
xgb_params = dict(
    n_estimators=709,
    learning_rate=0.013166529236809608,
    eta=0.05,
    alpha=6.428394209273172,
    subsample=0.9719577127113721,
    colsample_bytree=0.6314469581598212,
    max_depth=10,
    min_child_weight=5,
    gamma=0.0017688666476104672,
    eval_metric='auc',
    max_bin=262143,
    tree_method='gpu_hist',
)
model = XGBClassifier(**xgb_params)

# Separate the target from the feature matrix.
y_train = train_data['Response']
X_train = train_data.drop('Response', axis=1)
print("training")
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=kf)

# Mean AUC over the five folds is the cell's displayed result.
cv_scores.mean()

training


0.8721466432683871

In [None]:
# Feature matrix and target read by objective() through module-level names.
X_train = train.drop(columns=['Response'])
y_train = train['Response']

def objective(trial):
    """Optuna objective: mean 5-fold ROC AUC of an XGBClassifier.

    Only ``n_estimators`` and ``learning_rate`` are sampled from ``trial``;
    every other hyperparameter is fixed. The module-level ``X_train`` and
    ``y_train`` are used as the data.

    Returns the mean of the five per-fold ROC AUC scores.
    """
    # NOTE(review): eta is fixed while learning_rate is searched; XGBoost documents eta as
    # an alias of learning_rate — confirm the fixed value does not override the search.
    model = XGBClassifier(
        n_estimators = trial.suggest_int("n_estimators", 700, 1300),
        learning_rate = trial.suggest_float("learning_rate", 0.001, 0.9, log = True),
        eta = 0.05,
        alpha =  6.428394209273172,
        subsample = 0.9719577127113721,
        colsample_bytree = 0.6314469581598212,
        max_depth = 10,
        min_child_weight = 5,
        gamma = 0.0017688666476104672,
        eval_metric = 'auc',
        max_bin = 262143,
        tree_method = 'gpu_hist'
    )

    # No model.fit() on the full data here: cross_val_score fits its own clone of the
    # estimator for each fold, so a prior full fit only added one wasted training per trial.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=kf)

    return np.mean(cv_scores)

In [None]:
# Run an Optuna study that maximizes the value returned by objective over 100 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

In [None]:
# Display the parameter values of the best trial found by the study.
study.best_params

## training

In [None]:
# Every feature is handed to CatBoost as categorical below; Annual_Premium is cast to str first.
# NOTE(review): other float-typed columns listed in cat_features presumably need the same
# cast before Pool() will accept them — confirm against the actual column dtypes.
train["Annual_Premium"] = train["Annual_Premium"].astype(str)
test_data["Annual_Premium"] = test_data["Annual_Premium"].astype(str)

cat_aucs = []   # validation AUC of each fold
cat_preds = []  # test-set positive-class probabilities of each fold

# Fold indices from skf.split are positional, so .loc below needs a fresh 0..n-1 index.
train = train.reset_index(drop=True)

all_features = [c for c in train.columns if c not in ['Response']]
cat_features = all_features

# The test pool is identical for every fold, so it is built once instead of per fold.
X_test = test_data[all_features]
X_test_pool = Pool(X_test, cat_features=cat_features)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['Response'])):
    print(f'### Fold {fold+1} Training ###')

    X_train = train.loc[train_idx, all_features]
    y_train = train.loc[train_idx, 'Response']
    X_valid = train.loc[valid_idx, all_features]
    y_valid = train.loc[valid_idx, 'Response']

    X_train_pool = Pool(X_train, y_train, cat_features=cat_features)
    X_valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

    model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        learning_rate=0.05,
        iterations=5000,
        depth=9,
        random_strength=0,
        l2_leaf_reg=0.5,
        task_type='GPU',
        random_seed=42,
        verbose=False
    )

    # The validation pool drives early stopping (200 rounds without improvement).
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=1000, early_stopping_rounds=200)

    pred_valid = model.predict_proba(X_valid_pool)[:, 1]
    cat_preds.append(model.predict_proba(X_test_pool)[:, 1])

    auc = roc_auc_score(y_valid, pred_valid)
    cat_aucs.append(auc)

    print(f'Fold {fold+1} AUC: {auc:.5f}\n')

print(f'\nOverall AUC: {np.mean(cat_aucs):.5f} +/- {np.std(cat_aucs):.5f}')

In [None]:
# Take reproducible 10,000-row subsets of the train and test frames.
train = train.sample(n=10000, random_state=42)
test_data = test_data.sample(n=10000, random_state=42)
# Select the submission rows that belong to the sampled test rows, in the same order.
# Sampling submission independently would pair ids with predictions for other rows.
# NOTE(review): assumes submission and test_data share the row index they were loaded
# with (same row order in both CSVs) — confirm.
submission = submission.loc[test_data.index]

In [None]:
# Display the (rows, columns) dimensions of the test frame.
test_data.shape

In [None]:
# Per-fold validation AUCs and per-fold test-set probabilities.
xgb_aucs, xgb_preds = [], []

# Train on train_data with a fresh 0..n-1 index so positional fold indices work with .loc.
train = train_data.reset_index(drop=True)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Every column except the target is a model input.
feature_cols = [c for c in train.columns if c not in ['Response']]

# Hyperparameters shared by every fold's model.
# NOTE(review): both learning_rate and eta are passed; XGBoost documents eta as an alias
# of learning_rate — confirm which of the two values actually takes effect.
fold_params = dict(
    n_estimators=709,
    learning_rate=0.013166529236809608,
    eta=0.05,
    alpha=6.428394209273172,
    subsample=0.9719577127113721,
    colsample_bytree=0.6314469581598212,
    max_depth=10,
    min_child_weight=5,
    gamma=0.0017688666476104672,
    eval_metric='auc',
    max_bin=262143,
    tree_method='gpu_hist',
)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['Response'])):
    print(f'### Fold {fold+1} Training ###')

    X_train, y_train = train.loc[train_idx, feature_cols], train.loc[train_idx, 'Response']
    X_valid, y_valid = train.loc[valid_idx, feature_cols], train.loc[valid_idx, 'Response']
    X_test = test_data[feature_cols]
    print("training")

    model = XGBClassifier(**fold_params)
    model.fit(X=X_train, y=y_train)

    # Positive-class probabilities for the held-out fold and for the test frame.
    pred_valid = model.predict_proba(X_valid)[:, 1]
    xgb_preds.append(model.predict_proba(X_test)[:, 1])

    auc = roc_auc_score(y_valid, pred_valid)
    xgb_aucs.append(auc)

    print(f'Fold {fold+1} AUC: {auc:.5f}\n')
    gc.collect()
print(f'\nOverall AUC: {np.mean(xgb_aucs):.5f} +/- {np.std(xgb_aucs):.5f}')

### Fold 1 Training ###
training
Fold 1 AUC: 0.87951

### Fold 2 Training ###
training


In [None]:
# Average the per-fold test-set probabilities (xgb_preds), not the per-fold AUC scores.
# np.mean(xgb_aucs, axis=0) is a single scalar and would give every row the same value.
submission['Response'] = np.mean(xgb_preds, axis=0)
submission.to_csv('submission.csv', index=False)