In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e11/sample_submission.csv
/kaggle/input/playground-series-s4e11/train.csv
/kaggle/input/playground-series-s4e11/test.csv


# Libraries

In [2]:
# Core Libraries
import pandas as pd
import numpy as np
from scipy import stats
import random
import warnings

# Visualization Libraries
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
import squarify
import plotly.graph_objects as go
import plotly.express as px
%matplotlib inline

# Machine Learning Libraries
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier

# GPU

In [3]:
import torch

def check_gpu_status():
    if torch.cuda.is_available():
        print(f"可用GPU数量: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
            print(f"显存使用情况: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB 已用, {torch.cuda.memory_reserved(i) / 1024**2:.2f} MB 总分配")
            print(f"显存空余: {torch.cuda.memory_reserved(i) - torch.cuda.memory_allocated(i):.2f} 字节\n")
    else:
        print("未检测到 GPU！")

check_gpu_status()

可用GPU数量: 2
GPU 0: Tesla T4
显存使用情况: 0.00 MB 已用, 0.00 MB 总分配
显存空余: 0.00 字节

GPU 1: Tesla T4
显存使用情况: 0.00 MB 已用, 0.00 MB 总分配
显存空余: 0.00 字节



# IMPORT DATA

In [4]:
df_train = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv")
df_test = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv")

In [5]:
df_train.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


# DATA Processing

In [6]:
# Drop 'id' column
df_train = df_train.drop(['id'], axis=1)

# Define the target column
target_column = 'Depression'

# Select categorical columns
categorical_columns = df_train.select_dtypes(include=['object']).columns

# Select numerical columns, excluding the target column 
numerical_columns = df_train.select_dtypes(exclude=['object']).columns.drop(target_column)

# Print out the lists of columns
print("Target Column:", target_column)
print("\nCategorical Columns:", categorical_columns.tolist())
print("\nNumerical Columns:", numerical_columns.tolist())

Target Column: Depression

Categorical Columns: ['Name', 'Gender', 'City', 'Working Professional or Student', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']

Numerical Columns: ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress']


In [7]:
# Define features and target
X_train = df_train.drop('Depression', axis=1)
y_train = df_train['Depression']

# Define preprocessing pipelines
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), 
    ('scaler', StandardScaler()),
    ('convert_to_float32', FunctionTransformer(lambda x: x.astype(np.float32)))
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), 
    ('ordinal', OrdinalEncoder(dtype=np.int32, handle_unknown='use_encoded_value', unknown_value=-1))
])

# Combine the numerical and categorical pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_columns),
        ('cat', categorical_pipeline, categorical_columns)
    ]
)

# Apply the transformations to the training and test sets
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(df_test) 

# Apply Isolation Forest for outlier detection on the training data
isolation_forest = IsolationForest(contamination=0.04, random_state=10)
outlier_labels = isolation_forest.fit_predict(X_train_preprocessed)

# Filter out outliers from both X_train_preprocessed and y_train
non_outliers_mask = outlier_labels != -1
X_train_preprocessed = X_train_preprocessed[non_outliers_mask]
y_train = y_train[non_outliers_mask]

# XGBoost

In [8]:
pip install xgboost


Note: you may need to restart the kernel to use updated packages.


In [9]:
pip install xgboost --upgrade


Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.0.3
    Uninstalling xgboost-2.0.3:
      Successfully uninstalled xgboost-2.0.3
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.2
Note: you may need to restart the kernel to use updated packages.


In [10]:
import optuna
import joblib
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# 定义目标函数
def objective(trial):
    # 定义要优化的超参数
    param_grid = {
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1.0, 10.0),
    }

    # 初始化模型，启用 GPU 加速
    model = XGBClassifier(**param_grid, use_label_encoder=False, random_state=10, tree_method='hist', device='cuda')

    # 交叉验证
    cv_scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5, scoring='accuracy')
    mean_cv_score = cv_scores.mean()

    # 返回目标值（越大越好）
    return mean_cv_score

def optimize_hyperparameters():
    """运行 Optuna 超参数优化并返回最优参数和分数"""
    study = optuna.create_study(direction='maximize')  # 最大化准确率
    study.optimize(objective, n_trials=5)  # 增加试验次数

    print("Best parameters:", study.best_params)
    print("Best cross-validation accuracy:", study.best_value)
    return study.best_params, study.best_value

def train_best_model(params):
    """使用最优参数训练模型并保存"""
    best_model = XGBClassifier(**params, use_label_encoder=False, random_state=10, tree_method='hist', device='cuda')
    best_model.fit(X_train_preprocessed, y_train)  # 在整个训练集上训练模型
    joblib.dump(best_model, "best_model_xgb.pkl")  # 保存模型
    print("Best model saved as 'best_model_xgb.pkl'")
    return best_model

# 检查输入数据
if X_train_preprocessed is None or y_train is None:
    raise ValueError("输入数据未定义，请确保 `X_train_preprocessed` 和 `y_train` 已正确加载。")

# 运行超参数优化
best_params, best_score = optimize_hyperparameters()

# 使用最优参数训练模型
best_model_XGB = train_best_model(best_params)


[I 2024-11-10 02:38:52,112] A new study created in memory with name: no-name-2e9cc1ec-9b06-4c62-a33d-61cccd0411c8
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  print(smsg)
[I 2024-11-10 02:38:56,662] Trial 0 finished with value: 0.9424158952629235 and parameters: {'colsample_bytree': 0.6127323747908503, 'learning_rate': 0.17493052828757943, 'max_depth': 11, 'min_child_weight': 8, 'n_estimators': 475, 'subsample': 0.770673036737211, 'gamma': 3.65461659416078, 'reg_lambda': 4.720250112802938}. Best is trial 0 with value: 0.9424158952629235.
[I 2024-11-10 02:39:00,021] Trial 1 finished with value: 0.9429045291409212 and parameters: {'colsample_bytree': 0.3726943154571929, 'learning_rate': 0.06279118666171293, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 351, 'subsample': 0.7138395542343512, 'gamma': 0.3371228047477004, 'reg_lambda': 1.042732336848554}. Best is trial 1 w

Best parameters: {'colsample_bytree': 0.13688906826576172, 'learning_rate': 0.1943270114137025, 'max_depth': 8, 'min_child_weight': 2, 'n_estimators': 480, 'subsample': 0.8858072864068489, 'gamma': 3.4140449018902608, 'reg_lambda': 1.9657108414548234}
Best cross-validation accuracy: 0.9435264220452538
Best model saved as 'best_model_xgb.pkl'


In [11]:
# Fit the model 
best_model_XGB.fit(X_train_preprocessed, y_train)  # 在整个训练集上训练模型
# Make predictions 
test_preds =best_model_XGB.predict(X_test_preprocessed)
# Create a DataFrame to hold the submission results
output = pd.DataFrame({'id': df_test['id'],
                       'class': test_preds})

# Save the output DataFrame to a CSV file
output.to_csv('submission_XGB.csv', index=False)

output.head()

Unnamed: 0,id,class
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0


# Catboost

In [12]:
pip install catboost --upgrade


Note: you may need to restart the kernel to use updated packages.


In [13]:
import optuna
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier

# 定义目标函数
def objective(trial):
    # 定义要优化的超参数
    param_grid = {
        'iterations': trial.suggest_int('iterations', 100, 500),
        'depth': trial.suggest_int('depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
    }

    # 初始化模型，启用 GPU
    model = CatBoostClassifier(
        **param_grid,
        random_seed=10,
        task_type='GPU',  # 启用 GPU 加速
        devices='0',  # 指定使用 GPU 设备，可以根据需要更改为 '0,1' 来使用多个 GPU
        verbose=0  # 关闭训练过程的输出
    )

    # 交叉验证
    cv_scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5, scoring='accuracy')
    mean_cv_score = cv_scores.mean()

    # 返回目标值（越大越好）
    return mean_cv_score

# 创建 Optuna study
study = optuna.create_study(direction='maximize')  # 最大化准确率
study.optimize(objective, n_trials=5)  # 运行 50 次试验

# 输出最优参数
print("Best parameters:", study.best_params)
print("Best cross-validation accuracy:", study.best_value)

# 使用最优参数初始化模型
best_model_Catboost = CatBoostClassifier(
    **study.best_params,
    random_seed=10,
    task_type='GPU',  # 启用 GPU 加速
    devices='0',  # 使用指定的 GPU 设备
    verbose=0
)
best_model_Catboost.fit(X_train_preprocessed, y_train)  # 在整个训练集上训练模型

# 保存模型
import joblib
joblib.dump(best_model_Catboost, "best_model_catboost.pkl")
print("Best model saved as 'best_model_catboost.pkl'")


[I 2024-11-10 02:39:24,450] A new study created in memory with name: no-name-f9548041-eb22-4c0a-a657-3fc57ebfb600
[I 2024-11-10 02:40:24,788] Trial 0 finished with value: 0.9422900485748364 and parameters: {'iterations': 365, 'depth': 11, 'learning_rate': 0.04362811352108447, 'l2_leaf_reg': 2.7796423876154654, 'border_count': 151, 'bagging_temperature': 0.07061698559866769}. Best is trial 0 with value: 0.9422900485748364.
[I 2024-11-10 02:40:39,474] Trial 1 finished with value: 0.9428453068960028 and parameters: {'iterations': 480, 'depth': 3, 'learning_rate': 0.141429622898294, 'l2_leaf_reg': 7.677696630462661, 'border_count': 215, 'bagging_temperature': 0.2952287498431104}. Best is trial 1 with value: 0.9428453068960028.
[I 2024-11-10 02:41:41,195] Trial 2 finished with value: 0.9420753387845139 and parameters: {'iterations': 374, 'depth': 11, 'learning_rate': 0.0314289568361383, 'l2_leaf_reg': 3.930193786797469, 'border_count': 135, 'bagging_temperature': 0.3878448640625678}. Best i

Best parameters: {'iterations': 480, 'depth': 3, 'learning_rate': 0.141429622898294, 'l2_leaf_reg': 7.677696630462661, 'border_count': 215, 'bagging_temperature': 0.2952287498431104}
Best cross-validation accuracy: 0.9428453068960028
Best model saved as 'best_model_catboost.pkl'


In [14]:
# Fit the model 
best_model_Catboost.fit(X_train_preprocessed, y_train)  # 在整个训练集上训练模型
# Make predictions 
test_preds =best_model_Catboost.predict(X_test_preprocessed)
# Create a DataFrame to hold the submission results
output = pd.DataFrame({'id': df_test['id'],
                       'class': test_preds})

# Save the output DataFrame to a CSV file
output.to_csv('submission_Catboost.csv', index=False)

output.head()

Unnamed: 0,id,class
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0


# LightGBM

In [15]:
pip install lightgbm --install-option=--gpu



Usage:   
  /opt/conda/bin/python -m pip install [options] <requirement specifier> [package-index-options] ...
  /opt/conda/bin/python -m pip install [options] -r <requirements file> [package-index-options] ...
  /opt/conda/bin/python -m pip install [options] [-e] <vcs project url> ...
  /opt/conda/bin/python -m pip install [options] [-e] <local project path> ...
  /opt/conda/bin/python -m pip install [options] <archive url/path> ...

no such option: --install-option
Note: you may need to restart the kernel to use updated packages.


In [16]:
import optuna
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier

# 定义目标函数
def objective(trial):
    # 定义要优化的超参数
    param_grid = {
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', -1, 15),  # -1 表示不限制深度
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'device': 'gpu',  # 启用 GPU 加速
    }

    # 初始化模型，启用 GPU
    model = LGBMClassifier(**param_grid, random_state=10)

    # 交叉验证
    cv_scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5, scoring='accuracy')
    mean_cv_score = cv_scores.mean()

    # 打印当前试验的参数和结果
    print(f"Trial params: {param_grid}")
    print(f"Trial accuracy: {mean_cv_score:.4f}\n")

    # 返回目标值（越大越好）
    return mean_cv_score

# 创建 Optuna study
study = optuna.create_study(direction='maximize')  # 最大化准确率
study.optimize(objective, n_trials=5, show_progress_bar=False)  # 运行 50 次试验

# 输出最优参数
print("Best parameters:", study.best_params)
print("Best cross-validation accuracy:", study.best_value)

# 使用最优参数初始化模型
best_model_LGBM = LGBMClassifier(**study.best_params, random_state=10)
best_model_LGBM.fit(X_train_preprocessed, y_train)  # 在整个训练集上训练模型

# 保存模型
import joblib
joblib.dump(best_model_LGBM, "best_model_lgbm.pkl")
print("Best model saved as 'best_model_lgbm.pkl'")


[I 2024-11-10 02:45:31,343] A new study created in memory with name: no-name-b90fd708-a055-47f4-aef9-0f6aa83751a9


[LightGBM] [Info] Number of positive: 18660, number of negative: 89397
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 639
[LightGBM] [Info] Number of data points in the train set: 108057, number of used features: 18
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 14 dense feature groups (1.65 MB) transferred to GPU in 0.002816 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.172687 -> initscore=-1.566705
[LightGBM] [Info] Start training from score -1.566705
[LightGBM] [Info] Number of positive: 18660, number of negative: 89397
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 634
[LightGBM] [Info] Number of data points in the train set: 108057, number of used features: 18
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 14 dense feature groups (1.65 MB) transferred to GPU in 0.002770 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.172687 -> initscore=-1.5

[I 2024-11-10 02:45:47,429] Trial 0 finished with value: 0.9426009929715977 and parameters: {'num_leaves': 29, 'max_depth': 9, 'learning_rate': 0.1297465702692319, 'n_estimators': 265, 'min_child_samples': 12, 'subsample': 0.7088454295594289, 'colsample_bytree': 0.9910916877324435, 'reg_alpha': 0.25725693641989955, 'reg_lambda': 9.58133706097607}. Best is trial 0 with value: 0.9426009929715977.


Trial params: {'num_leaves': 29, 'max_depth': 9, 'learning_rate': 0.1297465702692319, 'n_estimators': 265, 'min_child_samples': 12, 'subsample': 0.7088454295594289, 'colsample_bytree': 0.9910916877324435, 'reg_alpha': 0.25725693641989955, 'reg_lambda': 9.58133706097607, 'device': 'gpu'}
Trial accuracy: 0.9426

[LightGBM] [Info] Number of positive: 18660, number of negative: 89397
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 639
[LightGBM] [Info] Number of data points in the train set: 108057, number of used features: 18
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 14 dense feature groups (1.65 MB) transferred to GPU in 0.002774 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.172687 -> initscore=-1.566705
[LightGBM] [Info] Start t

[I 2024-11-10 02:45:56,974] Trial 1 finished with value: 0.942889723100097 and parameters: {'num_leaves': 76, 'max_depth': 4, 'learning_rate': 0.29252178190000777, 'n_estimators': 411, 'min_child_samples': 69, 'subsample': 0.9638831662353424, 'colsample_bytree': 0.7966573518881499, 'reg_alpha': 9.528342066666108, 'reg_lambda': 7.677860161950094}. Best is trial 1 with value: 0.942889723100097.


Trial params: {'num_leaves': 76, 'max_depth': 4, 'learning_rate': 0.29252178190000777, 'n_estimators': 411, 'min_child_samples': 69, 'subsample': 0.9638831662353424, 'colsample_bytree': 0.7966573518881499, 'reg_alpha': 9.528342066666108, 'reg_lambda': 7.677860161950094, 'device': 'gpu'}
Trial accuracy: 0.9429

[LightGBM] [Info] Number of positive: 18660, number of negative: 89397
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 639
[LightGBM] [Info] Number of data points in the train set: 108057, number of used features: 18
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 14 dense feature groups (1.65 MB) transferred to GPU in 0.003168 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.172687 -> initscore=-1.566705
[LightGBM] [Info] Start t

[I 2024-11-10 02:46:10,940] Trial 2 finished with value: 0.9418458372041746 and parameters: {'num_leaves': 96, 'max_depth': 10, 'learning_rate': 0.24536180466391344, 'n_estimators': 369, 'min_child_samples': 8, 'subsample': 0.999494397820488, 'colsample_bytree': 0.810849248444893, 'reg_alpha': 9.839480212158179, 'reg_lambda': 1.9273424051353183}. Best is trial 1 with value: 0.942889723100097.


Trial params: {'num_leaves': 96, 'max_depth': 10, 'learning_rate': 0.24536180466391344, 'n_estimators': 369, 'min_child_samples': 8, 'subsample': 0.999494397820488, 'colsample_bytree': 0.810849248444893, 'reg_alpha': 9.839480212158179, 'reg_lambda': 1.9273424051353183, 'device': 'gpu'}
Trial accuracy: 0.9418

[LightGBM] [Info] Number of positive: 18660, number of negative: 89397
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 639
[LightGBM] [Info] Number of data points in the train set: 108057, number of used features: 18
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 14 dense feature groups (1.65 MB) transferred to GPU in 0.002754 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.172687 -> initscore=-1.566705
[LightGBM] [Info] Start tr

[I 2024-11-10 02:46:20,733] Trial 3 finished with value: 0.9412165368945663 and parameters: {'num_leaves': 75, 'max_depth': 13, 'learning_rate': 0.2939954235703352, 'n_estimators': 134, 'min_child_samples': 84, 'subsample': 0.7481091588674107, 'colsample_bytree': 0.5415200011173993, 'reg_alpha': 0.10066929319039408, 'reg_lambda': 6.04025307314204}. Best is trial 1 with value: 0.942889723100097.


Trial params: {'num_leaves': 75, 'max_depth': 13, 'learning_rate': 0.2939954235703352, 'n_estimators': 134, 'min_child_samples': 84, 'subsample': 0.7481091588674107, 'colsample_bytree': 0.5415200011173993, 'reg_alpha': 0.10066929319039408, 'reg_lambda': 6.04025307314204, 'device': 'gpu'}
Trial accuracy: 0.9412

[LightGBM] [Info] Number of positive: 18660, number of negative: 89397
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 639
[LightGBM] [Info] Number of data points in the train set: 108057, number of used features: 18
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 14 dense feature groups (1.65 MB) transferred to GPU in 0.002826 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.172687 -> initscore=-1.566705
[LightGBM] [Info] Start 

[I 2024-11-10 02:46:26,456] Trial 4 finished with value: 0.9428008841146127 and parameters: {'num_leaves': 51, 'max_depth': 3, 'learning_rate': 0.2100680673446642, 'n_estimators': 198, 'min_child_samples': 96, 'subsample': 0.6535841282872177, 'colsample_bytree': 0.7931225220601941, 'reg_alpha': 9.322156535264796, 'reg_lambda': 4.91812014102135}. Best is trial 1 with value: 0.942889723100097.


Trial params: {'num_leaves': 51, 'max_depth': 3, 'learning_rate': 0.2100680673446642, 'n_estimators': 198, 'min_child_samples': 96, 'subsample': 0.6535841282872177, 'colsample_bytree': 0.7931225220601941, 'reg_alpha': 9.322156535264796, 'reg_lambda': 4.91812014102135, 'device': 'gpu'}
Trial accuracy: 0.9428

Best parameters: {'num_leaves': 76, 'max_depth': 4, 'learning_rate': 0.29252178190000777, 'n_estimators': 411, 'min_child_samples': 69, 'subsample': 0.9638831662353424, 'colsample_bytree': 0.7966573518881499, 'reg_alpha': 9.528342066666108, 'reg_lambda': 7.677860161950094}
Best cross-validation accuracy: 0.942889723100097
[LightGBM] [Info] Number of positive: 23325, number of negative: 111747
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033533 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 654
[LightGBM] [Info] Number of data points in the train set: 135072, number of used features: 18
[Ligh

In [17]:
# Fit the model 
best_model_LGBM.fit(X_train_preprocessed, y_train)

# Make predictions 
test_preds = best_model_LGBM.predict(X_test_preprocessed)
# Create a DataFrame to hold the submission results
output = pd.DataFrame({'id': df_test['id'],
                       'class': test_preds})

# Save the output DataFrame to a CSV file
output.to_csv('submission_LGBM.csv', index=False)
output.head()

[LightGBM] [Info] Number of positive: 23325, number of negative: 111747
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033994 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 654
[LightGBM] [Info] Number of data points in the train set: 135072, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.172686 -> initscore=-1.566712
[LightGBM] [Info] Start training from score -1.566712


Unnamed: 0,id,class
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0


ENSEMBLE

In [18]:
import numpy as np

# 假设 X_test_preprocessed 是测试集的特征
# 获取每个模型的预测概率
probs_XGB = best_model_XGB.predict_proba(X_test_preprocessed)
probs_CatBoost = best_model_Catboost.predict_proba(X_test_preprocessed)
probs_LGBM = best_model_LGBM.predict_proba(X_test_preprocessed)

# 设置每个模型的权重（这里给定相同的权重 1，表示平等融合）
w1, w2, w3 = 1, 1, 1

# 加权平均融合概率
final_probs = (w1 * probs_XGB + w2 * probs_CatBoost + w3 * probs_LGBM) / (w1 + w2 + w3)

# 得到最终预测类别
final_preds = np.argmax(final_probs, axis=1)

# 创建提交文件
output = pd.DataFrame({'id': df_test['id'], 'class': final_preds})
output.to_csv('submission_ensemble3.csv', index=False)

# 显示提交文件的前几行
print(output.head())

       id  class
0  140700      0
1  140701      0
2  140702      0
3  140703      1
4  140704      0
