In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import warnings
warnings.simplefilter('ignore')

# Load the train and test datasets
train = pd.read_csv("/kaggle/input/playground-series-s4e7/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s4e7/test.csv")

# Stack train on top of test so the feature engineering below is applied to
# both in a single pass. ignore_index=True gives the combined frame a unique
# 0..N-1 index: both CSVs are read with default 0-based row labels, so a plain
# concat would leave duplicate labels, and label-aligned operations on such a
# frame can fail. Later code splits the frame back by position, so row order
# (train first, then test) is what matters, not the labels.
df = pd.concat([train, test], ignore_index=True)

# Swap the string labels of the categorical columns for integer codes
category_codes = {
    'Gender': {'Male': 0, 'Female': 1},
    'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
    'Vehicle_Damage': {'No': 0, 'Yes': 1},
}
for column, codes in category_codes.items():
    df[column] = df[column].replace(codes).astype(int)

# Cast the remaining columns straight to integer dtype
for column in ('Region_Code', 'Annual_Premium', 'Policy_Sales_Channel'):
    df[column] = df[column].astype(int)

# Build interaction features: the string form of Previously_Insured is
# concatenated with the string form of each partner column, and every distinct
# combined string is then mapped to an integer label by a fresh LabelEncoder.
insured_as_text = df['Previously_Insured'].astype(str)
for partner in ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage'):
    combined_text = insured_as_text + df[partner].astype(str)
    df[f'Previously_Insured_{partner}'] = LabelEncoder().fit_transform(combined_text.values)

# Create binary (0/1) indicator columns for specific regions and sales channels.
# Each entry is (new flag column, source column, code that sets the flag to 1).
indicator_specs = (
    ('Region_8', 'Region_Code', 8),
    ('Region_28', 'Region_Code', 28),
    ('Channel_26', 'Policy_Sales_Channel', 26),
    ('Channel_124', 'Policy_Sales_Channel', 124),
    ('Channel_152', 'Policy_Sales_Channel', 152),
)
for flag_column, source_column, code in indicator_specs:
    # Compare on the raw array so the assignment is purely positional, then
    # store every flag with the same compact dtype (a 0/1 value fits in int8).
    df[flag_column] = (df[source_column].to_numpy() == code).astype(np.int8)

# Split the combined frame back into train and test by position.
# The train row count is captured first because `train` is rebound on the
# very next line.
n_train = len(train)
train = df.iloc[:n_train].copy()
test = df.iloc[n_train:].copy()

# Separate the model inputs from the identifier and target columns
non_feature_columns = ["id", "Response"]
X = train.drop(columns=non_feature_columns)
y = train["Response"]
X_test = test.drop(columns=non_feature_columns)

# Define the LightGBM classifier with specified parameters
params_lgb = {
    'verbose': -1,
    'objective': 'binary',
    # LightGBM's model parameter for the evaluation metric is `metric`.
    # `eval_metric` is an argument of fit() in the scikit-learn wrapper, not a
    # model parameter, so passing it to the constructor has no effect.
    'metric': 'auc',
    'device': 'cpu',
    'n_estimators': 2350,
    'learning_rate': 0.54,
    'max_depth': 17,
    'min_data_in_leaf': 20,
    'reg_alpha': 6.0,
    'reg_lambda': 1.5,
    'max_bin': 54000,
    'num_leaves': 60,
    'min_data_in_bin': 2,
    'colsample_bytree': 0.3,
    'random_state': 0,  # fixed seed for reproducible runs
}
estimator = LGBMClassifier(**params_lgb)

# Chain feature standardization with the classifier
scaler = StandardScaler()
pipeline = make_pipeline(scaler, estimator)

# Fit on the entire training set (no hold-out split is made here)
pipeline.fit(X, y)

# Keep the second predict_proba column (index 1) as the test prediction
class_probabilities = pipeline.predict_proba(X_test)
y_pred_test = class_probabilities[:, 1]

# Pair each test id with its predicted probability and write the CSV
submission = test[['id']].assign(Response=y_pred_test)
submission.to_csv("submission_lightgbm.csv", index=False)
