In [3]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, roc_curve, auc, ConfusionMatrixDisplay

# Load datasets
orig = pd.read_csv("train.csv")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_sub = pd.read_csv("sample_submission.csv")

orig_filtered = orig[orig['Response'] == 1]

# Merge the filtered orig dataframe with the train dataframe
train = pd.concat([train, orig_filtered], axis=0).reset_index(drop=True)


# Convert less unique columns to category
less = [col for col in train.columns if train[col].nunique() < 4 and col in test.columns]
for col in less:
    train[col] = train[col].astype("category")

# Convert 'Vehicle_Age' to ordered categorical
from pandas.api.types import CategoricalDtype
new_categories = ['< 1 Year', '1-2 Year', '> 2 Years']
new_dtype = CategoricalDtype(categories=new_categories, ordered=True)
train['Vehicle_Age'] = train['Vehicle_Age'].astype(new_dtype)

# Binning and encoding additional features
train['Not_Insured and Damaged'] = train['Vehicle_Damage'] * (1 - train['Previously_Insured'])
def add_feat(df):
    df['Age_Vehicle_Age'] = df['Age'] * df['Vehicle_Age']
    df['Age_Annual_Premium'] = df['Age'] * df['Annual_Premium']
    df['Vehicle_Damage_Annual_Premium'] = df['Vehicle_Damage'] * df['Annual_Premium']
    return df
train = add_feat(train)
train['Region_8'] = np.where(train['Region_Code'] == 8, 1, 0).astype(np.int8)
train['Region_28'] = np.where(train['Region_Code'] == 28, 1, 0).astype(np.int8)
train['Channel_26'] = np.where(train['Policy_Sales_Channel'] == 26, 1, 0).astype(np.int8)
train['Channel_124'] = np.where(train['Policy_Sales_Channel'] == 124, 1, 0).astype(np.int8)
train['Channel_152'] = np.where(train['Policy_Sales_Channel'] == 152, 1, 0).astype(np.int8)

# Drop unused columns
cols_to_drop = ['id', 'Age', 'Region_Code', 'Policy_Sales_Channel','Vintage','Age_Bin','Policy_Sales_Channel_Bin','Region_Code_Bin','Vintage_Bin']
X = train.drop(columns=cols_to_drop + ['Response'], axis=1)
y = train['Response']

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Define the ColumnTransformer
coltrans = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['Driving_License', 'Gender', 'Previously_Insured', 'Vehicle_Damage']),
        ('ordinal', OrdinalEncoder(categories=[['< 1 Year', '1-2 Year', '> 2 Years']]), ['Vehicle_Age']),
        ('robust', RobustScaler(), ['Annual_Premium']),
        ('standard', StandardScaler(), ['Age_Class', 'PSC_Class', 'Vintage_Class', 'Region_Code_Class'])
    ],
    remainder='passthrough'
)

# Apply transformations
X_train_trans = coltrans.fit_transform(X_train)
X_val_trans = coltrans.transform(X_val)

# Preprocess test data in the same way
for col in less:
    test[col] = test[col].astype("category")
test['Not_Insured and Damaged'] = test['Vehicle_Damage'] * (1 - test['Previously_Insured'])
test = add_feat(test)
test['Region_8'] = np.where(test['Region_Code'] == 8, 1, 0).astype(np.int8)
test['Region_28'] = np.where(test['Region_Code'] == 28, 1, 0).astype(np.int8)
test['Channel_26'] = np.where(test['Policy_Sales_Channel'] == 26, 1, 0).astype(np.int8)
test['Channel_124'] = np.where(test['Policy_Sales_Channel'] == 124, 1, 0).astype(np.int8)
test['Channel_152'] = np.where(test['Policy_Sales_Channel'] == 152, 1, 0).astype(np.int8)

# Transform the test data
test_trans = coltrans.transform(test.drop(columns=cols_to_drop, axis=1))

# Define evaluation function
def evaluate_model(model, X_test, y_test):
    y_hat_test = model.predict(X_test)
    print('             Classification Report')
    print('---------------------------------------------')
    print(classification_report(y_test, y_hat_test))
    fig, axes = plt.subplots(figsize=(12, 8), ncols=2)
    cm = ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, normalize='true', cmap='Blues', ax=axes[0])
    axes[0].set_title('Confusion Matrix')
    y_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    axes[1].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    axes[1].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    axes[1].set_xlim([0.0, 1.0])
    axes[1].set_ylim([0.0, 1.05])
    axes[1].set_xlabel('False Positive Rate')
    axes[1].set_ylabel('True Positive Rate')
    axes[1].set_title('ROC-AUC Curve')
    axes[1].legend(loc='lower right')
    axes[1].grid()
    fig.tight_layout()
    plt.show()

# Define the model and perform grid search
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [4, 5]
}

TypeError: unsupported operand type(s) for -: 'int' and 'Categorical'

In [None]:
from sklearn.metrics import rocauc_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
model = LGBMClassifier(

                        boosting_type = 'gbdt',
                        objective = 'binary',
                        metric = 'auc',
                        verbosity = -1,
                        n_estimators = 1000,
                        max_depth = 5,
                        random_state = 42
)


model.fit(
    X_train_trans,
    y_train,
    eval_set=[(X_val_trans, y_val)],
)

y_pred = model.predict(X_val_trans)
y_pred_prob = model.predict_proba(X_val_trans)[:,1]
auc = rocauc_score(y_val,y_pred_prob)
print(f" AUC: {auc:.5f}")

In [None]:
importances = model.featureimportances
feature_names = coltrans.get_feature_names_out()

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})


importance_df = importance_df.sort_values(by='Importance', ascending=False)


plt.figure(figsize=(12, 8))
plt.title('Feature Importances')
plt.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
plt.xlabel('Relative Importance')
plt.gca().invert_yaxis()
plt.show()

In [None]:
print(importance_df)

In [None]:
evaluate_model(model, X_val_trans, y_val)

In [None]:
test_preds = model.predict_proba(test_trans)[:, 1] 
test_preds

In [None]:
sample_sub['Response'] = test_preds.astype(np.float32)
sample_sub['id'] = sample_sub['id'].astype(np.int32)
sample_sub.to_csv('submission3.csv', index=False)
sample_sub