In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from google.colab import auth
auth.authenticate_user()

In [3]:
!pip install -U polars optuna catboost >> /dev/null

In [4]:
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd


In [5]:
from itertools import combinations
from pathlib import Path
import joblib
import os
import gc
import numpy as np
import pandas as pd
import polars as pl

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score
import xgboost as xgb
import lightgbm as lgb

import seaborn as sns
import optuna

# カラム数の制限を解除
pd.set_option('display.max_columns', 200)

# 行数の制限を解除
pd.set_option('display.max_rows', 200)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [6]:
BASE_PATH = Path('/content/drive/MyDrive/kaggle/isic2024')
OUTPUT_DIR = BASE_PATH / 'output'
DATA_PATH = BASE_PATH / 'data'
TRAIN_PATH = DATA_PATH /'train-metadata.csv'

TARGET_COL = 'target'
ERR = 1e-5
N_SPLITS = 5
SPLIT_SEED = 42


In [7]:
# register settings like hyper prams to CFG class from Google Spread Sheet
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.auth import default
creds, _ = default()
gc_auth = gspread.authorize(creds)

# Please set your Google Spread Sheet url below.
url = ""
ss = gc_auth.open_by_url(url)
param_sheet = ss.worksheet("tree_model_params_v3")
df_param = get_as_dataframe(param_sheet)


In [8]:
from pydantic import BaseModel
from typing import Optional


# Final submission Patterns
# all_img_cols = [
#     'pred_sub_71',
#     'pred_sub_73',
#     'pred_sub_75',
#     'pred_sub_77',
#     'pred_tsuma_eva_nes',
#     'pred_tsuma_conv_nes',
#     'pred_sub_71,pred_sub_73,pred_sub_75,pred_tsuma_eva_nes',
#     'pred_sub_77,pred_sub_75,pred_tsuma_conv_nes',
#     'pred_sub_77,pred_sub_75,pred_tsuma_conv_nes,pred_tsuma_eva_nes',
#     'pred_sub_71,pred_sub_73,pred_sub_75,pred_sub_77,pred_tsuma_eva,pred_tsuma_conv_nes'
# ]
# all_attribution_flags = [0, 1]
# all_feature_select_flags = [0, 1]
# all_lesion_id_weights = [0.0]
# all_Indeterminate_weights = [0.0, 0.5]


# Test patterns
all_img_cols = [
    'pred_sub_71,pred_sub_73,pred_sub_75,pred_sub_77,pred_tsuma_eva,pred_tsuma_conv_nes',
    'pred_sub_71,pred_sub_73,pred_sub_75,pred_sub_77,pred_tsuma_eva',
]
all_attribution_flags = [0]
all_feature_select_flags = [1]
all_lesion_id_weights = [0.0]
all_Indeterminate_weights = [0]

class CFG(BaseModel):
    exp_ver: Optional[str] = "32"
    img_cols: Optional[str] = "pred_sub_71,pred_sub_73,pred_sub_75"
    attribution_flag: Optional[int] = 1
    feature_select_flag: Optional[int] = 1
    lesion_id_weight: Optional[float] = 0.0
    Indeterminate_weight: Optional[float] = 0.5


all_config_list = []
counter = 0
start_from = 1
restart_from = 1
for img_col in all_img_cols:
    for attribution_flag in all_attribution_flags:
        for feature_select_flag in all_feature_select_flags:
            for lesion_id_weight in all_lesion_id_weights:
                for Indeterminate_weight in all_Indeterminate_weights:
                    cfg = CFG()
                    cfg.exp_ver = start_from+counter
                    if cfg.exp_ver < restart_from:
                        continue
                    cfg.img_cols = img_col.split(',')
                    cfg.attribution_flag = int(attribution_flag)
                    cfg.feature_select_flag = int(feature_select_flag)
                    cfg.lesion_id_weight = float(lesion_id_weight)
                    cfg.Indeterminate_weight = float(Indeterminate_weight)

                    param_sheet.update_acell(f"A{cfg.exp_ver+1}", cfg.exp_ver)
                    param_sheet.update_acell(f"B{cfg.exp_ver+1}", img_col)
                    param_sheet.update_acell(f"C{cfg.exp_ver+1}", attribution_flag)
                    param_sheet.update_acell(f"D{cfg.exp_ver+1}", feature_select_flag)
                    param_sheet.update_acell(f"E{cfg.exp_ver+1}", lesion_id_weight)
                    param_sheet.update_acell(f"F{cfg.exp_ver+1}", Indeterminate_weight)
                    all_config_list.append(cfg)
                    counter += 1
for cfg in all_config_list:
    print(cfg)


exp_ver=1 img_cols=['pred_sub_71', 'pred_sub_73', 'pred_sub_75', 'pred_sub_77', 'pred_tsuma_eva', 'pred_tsuma_conv_nes'] attribution_flag=0 feature_select_flag=1 lesion_id_weight=0.0 Indeterminate_weight=0.0
exp_ver=2 img_cols=['pred_sub_71', 'pred_sub_73', 'pred_sub_75', 'pred_sub_77', 'pred_tsuma_eva'] attribution_flag=0 feature_select_flag=1 lesion_id_weight=0.0 Indeterminate_weight=0.0


In [9]:
def custom_metric(estimator, X, y_true):
    y_hat = estimator.predict_proba(X)[:, 1]
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)

    v_gt = abs(y_true - 1)
    v_pred = np.array([1.0 - x for x in y_hat])

    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc


def score(solution: np.ndarray, submission: np.ndarray, min_tpr: float=0.80) -> float:
    v_gt = abs(solution-1)
    v_pred = np.array([1.0 - x for x in submission])
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return(partial_auc)


In [10]:
num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).+
    'tbp_lv_A',                          # A inside  lesion.+
    'tbp_lv_Aext',                       # A outside lesion.+
    'tbp_lv_B',                          # B inside  lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB',                    #
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity',               # Eccentricity.+
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL',                       # Standard deviation of L inside  lesion.+
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.+
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.+
]

new_num_cols = [
    'lesion_size_ratio',             # tbp_lv_minorAxisMM      / clin_size_long_diam_mm
    'lesion_size_ratio_with_area',   # tbp_lv_minorAxisMM      / clin_size_long_diam_mm * tbp_lv_areaMM2
    'lesion_shape_index',            # tbp_lv_areaMM2          / tbp_lv_perimeterMM **2
    'hue_contrast',                  # tbp_lv_H                - tbp_lv_Hext              abs
    'luminance_contrast',            # tbp_lv_L                - tbp_lv_Lext              abs
    'lesion_color_difference',       # tbp_lv_deltaA **2       + tbp_lv_deltaB **2 + tbp_lv_deltaL **2  sqrt
    'border_complexity',             # tbp_lv_norm_border      + tbp_lv_symm_2axis
    'color_uniformity',              # tbp_lv_color_std_mean   / tbp_lv_radial_color_std_max

    'position_distance_3d',          # tbp_lv_x **2 + tbp_lv_y **2 + tbp_lv_z **2  sqrt
    'perimeter_to_area_ratio',       # tbp_lv_perimeterMM      / tbp_lv_areaMM2
    'area_to_perimeter_ratio',       # tbp_lv_areaMM2          / tbp_lv_perimeterMM
    'lesion_visibility_score',          # tbp_lv_deltaLBnorm      + tbp_lv_norm_color
    'symmetry_border_consistency',   # tbp_lv_symm_2axis       * tbp_lv_norm_border
    'consistency_symmetry_border',   # tbp_lv_symm_2axis       * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',             # tbp_lv_stdL             / tbp_lv_Lext
    'consistency_color',             # tbp_lv_stdL*tbp_lv_Lext / tbp_lv_stdL + tbp_lv_Lext
    'size_age_interaction',          # clin_size_long_diam_mm  * age_approx
    'hue_color_std_interaction',     # tbp_lv_H                * tbp_lv_color_std_mean
    'lesion_severity_index',         # tbp_lv_norm_border      + tbp_lv_norm_color + tbp_lv_eccentricity / 3
    'shape_complexity_index',        # border_complexity       + lesion_shape_index
    'color_contrast_index',          # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',               # tbp_lv_areaMM2          + 1  np.log
    'normalized_lesion_size',        # clin_size_long_diam_mm  / age_approx
    'mean_hue_difference',           # tbp_lv_H                + tbp_lv_Hext    / 2
    'std_dev_contrast',              # tbp_lv_deltaA **2 + tbp_lv_deltaB **2 + tbp_lv_deltaL **2   / 3  np.sqrt
    'color_shape_composite_index',   # tbp_lv_color_std_mean   + bp_lv_area_perim_ratio + tbp_lv_symm_2axis   / 3
    'lesion_orientation_3d',         # tbp_lv_y                , tbp_lv_x  np.arctan2
    'overall_color_difference',      # tbp_lv_deltaA           + tbp_lv_deltaB + tbp_lv_deltaL   / 3

    'symmetry_perimeter_interaction',# tbp_lv_symm_2axis       * tbp_lv_perimeterMM
    'comprehensive_lesion_index',    # tbp_lv_area_perim_ratio + tbp_lv_eccentricity + bp_lv_norm_color + tbp_lv_symm_2axis   / 4
    'color_variance_ratio',          # tbp_lv_color_std_mean   / tbp_lv_stdLExt
    'border_color_interaction',      # tbp_lv_norm_border      * tbp_lv_norm_color
    'border_color_interaction_2',
    'size_color_contrast_ratio',     # clin_size_long_diam_mm  / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',# tbp_lv_nevi_confidence  / age_approx
    'age_normalized_nevi_confidence_2',
    'color_asymmetry_index',         # tbp_lv_symm_2axis       * tbp_lv_radial_color_std_max

    'volume_approximation_3d',       # tbp_lv_areaMM2          * sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'color_range',                   # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',       # tbp_lv_eccentricity     * tbp_lv_color_std_mean
    'border_length_ratio',           # tbp_lv_perimeterMM      / pi * sqrt(tbp_lv_areaMM2 / pi)
    'age_size_symmetry_index',       # age_approx              * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',       # age_approx              * tbp_lv_areaMM2 * tbp_lv_symm_2axis

    'border_mul_perimeter',
]

cat_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple', 'attribution', 'copyright_license']

norm_cols1 = [f'{col}_patient_norm' for col in num_cols + new_num_cols]
norm_cols2 = [f'{col}_attribution_norm' for col in num_cols + new_num_cols]
norm_cols3 = [f'{col}_patient_lv_location_norm' for col in num_cols + new_num_cols]
norm_cols4 = [f'{col}_attribution_lv_location_norm' for col in num_cols + new_num_cols]
norm_cols5 = [f'{col}_lv_location_norm' for col in num_cols + new_num_cols]
norm_cols6 = [f'{col}_attribution_lv_location_tile_type_norm' for col in num_cols + new_num_cols]
norm_cols7 = [f'{col}_patient_lv_location_tile_type_norm' for col in num_cols + new_num_cols]

agg_list = ['mean', 'max', 'sum', 'std']
additinal_features1 = [f'{col}_patient_{agg}' for col in num_cols + new_num_cols for agg in agg_list]
additinal_features2 = [f'{col}_patient_lv_location_{agg}' for col in num_cols + new_num_cols for agg in agg_list]
additinal_features3 = [f'{col}_patient_lv_location_tile_type_{agg}' for col in num_cols + new_num_cols for agg in agg_list]
additinal_features4 = [f'{col}_attribution_{agg}' for col in num_cols + new_num_cols for agg in agg_list]
additinal_features5 = [f'{col}_attribution_lv_location_{agg}' for col in num_cols + new_num_cols for agg in agg_list]

cnt_features = ['isic_id_count', 'isic_id_count_patient_lv_location', 'isic_id_count_attribution', 'isic_id_count_attribution_lv_location']
age_features = ['unique_age_approx_count', 'age_min_diff', 'age_max_diff', 'age_min_max', 'age_range', 'age_phase']

pivot_cnt_features = [
 'anatom_site_general_anterior torso_count',
 'anatom_site_general_upper extremity_count',
 'anatom_site_general_lower extremity_count',
 'anatom_site_general_posterior torso_count',
 'anatom_site_general_head/neck_count',
 'anatom_site_general__count',
 'combined_anatomical_site_lower extremity_Left Leg - Lower_count',
 'combined_anatomical_site_anterior torso_Torso Front Top Half_count',
 'combined_anatomical_site_upper extremity_Right Arm - Lower_count',
 'combined_anatomical_site_upper extremity_Left Arm - Lower_count',
 'combined_anatomical_site_head/neck_Head & Neck_count',
 'combined_anatomical_site_anterior torso_Torso Front Bottom Half_count',
 'combined_anatomical_site_lower extremity_Right Leg - Lower_count',
 'combined_anatomical_site_posterior torso_Torso Back Bottom Third_count',
 'combined_anatomical_site_upper extremity_Right Arm - Upper_count',
 'combined_anatomical_site_lower extremity_Right Leg - Upper_count',
 'combined_anatomical_site_posterior torso_Torso Back Top Third_count',
 'combined_anatomical_site_upper extremity_Left Arm - Upper_count',
 'combined_anatomical_site_lower extremity_Left Leg - Upper_count',
 'combined_anatomical_site_posterior torso_Torso Back Middle Third_count',
 'combined_anatomical_site_upper extremity_Left Arm_count',
 'combined_anatomical_site_lower extremity_Right Leg_count',
 'combined_anatomical_site_lower extremity_Left Leg_count',
 'combined_anatomical_site_upper extremity_Right Arm_count',
 'combined_anatomical_site__Unknown_count',
 'combined_anatomical_site_posterior torso_Torso Back_count',
 'combined_anatomical_site_anterior torso_Torso Front_count',
 'tbp_lv_location_Left Leg - Upper_count',
 'tbp_lv_location_Right Arm - Lower_count',
 'tbp_lv_location_Right Leg - Upper_count',
 'tbp_lv_location_Torso Back Bottom Third_count',
 'tbp_lv_location_Left Arm - Upper_count',
 'tbp_lv_location_Torso Front Top Half_count',
 'tbp_lv_location_Torso Back Middle Third_count',
 'tbp_lv_location_Torso Front Bottom Half_count',
 'tbp_lv_location_Torso Back Top Third_count',
 'tbp_lv_location_Right Arm - Upper_count',
 'tbp_lv_location_Head & Neck_count',
 'tbp_lv_location_Left Arm_count',
 'tbp_lv_location_Left Arm - Lower_count',
 'tbp_lv_location_Right Leg - Lower_count',
 'tbp_lv_location_Left Leg - Lower_count',
 'tbp_lv_location_Torso Front_count',
 'tbp_lv_location_Left Leg_count',
 'tbp_lv_location_Right Arm_count',
 'tbp_lv_location_Right Leg_count',
 'tbp_lv_location_Unknown_count',
 'tbp_lv_location_Torso Back_count'
]

feature_cols = (
    num_cols +
    new_num_cols +
    cat_cols +
    norm_cols1 +
    norm_cols2 +
    norm_cols3 +
    norm_cols4 +
    norm_cols5 +
    norm_cols6 +
    additinal_features1 +
    additinal_features2 +
    additinal_features4 +
    additinal_features5 +
    cnt_features +
    age_features +
    pivot_cnt_features
    #
    # +image_cols
)


def read_data(path):
    return (
        pl.read_csv(path)
        .with_columns(
            pl.col('age_approx').cast(pl.Utf8).replace('NA', np.nan).cast(pl.Float64),
        )
        .with_columns(
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()), # You may want to impute test data with train
        )
        .with_columns(
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_size_ratio_with_area    = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_areaMM2'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            # hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            # luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')), # not use abs
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')), # not use abs
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
            border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
            border_mul_perimeter          =  pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_perimeterMM'),
            color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + ERR),
        )
        .with_columns(
            position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
            combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
            symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
            consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        )
        .with_columns(
            color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
            lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
            shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
        )
        .with_columns(
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
            lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
            comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
            color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
            border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
            age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
            color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
            age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
            index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            [
                pl.concat_str([pl.col(col) for col in combo], separator="_").alias("_".join(combo))
                for r in range(2, 4)
                for combo in combinations(['sex', 'tbp_tile_type', 'tbp_lv_location', 'attribution', 'copyright_license'], r)
            ]
        )
        # The degree to which each lesion is separated from the relevant lesion in each patient (with normalization).
        .with_columns(
            (
                (pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + ERR)
            ).alias(f'{col}_patient_norm')
            for col in (num_cols + new_num_cols)
        )
        # The degree to which the relevant lesion is separated from others within each hospital's lesions (with normalization).
        .with_columns(
            (
                (pl.col(col) - pl.col(col).mean().over(['attribution'])) / (pl.col(col).std().over(['attribution']) + ERR)
            ).alias(f'{col}_attribution_norm')
            for col in (num_cols + new_num_cols)
        )
        # The degree to which the relevant lesion is separated from others within each tbp_lv_location's lesions (with normalization).
        .with_columns(
            (
                (pl.col(col) - pl.col(col).mean().over(['tbp_lv_location'])) / (pl.col(col).std().over(['tbp_lv_location']) + ERR)
            ).alias(f'{col}_lv_location_norm')
            for col in (num_cols + new_num_cols)
        )
        .with_columns(
            (
                (pl.col(col) - pl.col(col).mean().over(['patient_id', 'tbp_lv_location'])) / (pl.col(col).std().over(['patient_id', 'tbp_lv_location']) + ERR)
            ).alias(f'{col}_patient_lv_location_norm')
            for col in (num_cols + new_num_cols)
        )
        # The degree to which the relevant lesion is separated from others within each hospital and site's lesions (with normalization).
        # 各病院・部位のlesion中で該当lesionがどの程度かけ離れているか
        .with_columns(
            (
                (pl.col(col) - pl.col(col).mean().over(['attribution', 'tbp_lv_location'])) / (pl.col(col).std().over(['attribution', 'tbp_lv_location']) + ERR)
            ).alias(f'{col}_attribution_lv_location_norm')
            for col in (num_cols + new_num_cols)
        )
        # 各病院・部位・tile_typeのlesion中で該当lesionがどの程度かけ離れているか
        .with_columns(
            (
                (pl.col(col) - pl.col(col).mean().over(['attribution', 'tbp_lv_location', 'tbp_tile_type'])) / (pl.col(col).std().over(['attribution', 'tbp_lv_location', 'tbp_tile_type']) + ERR)
            ).alias(f'{col}_attribution_lv_location_tile_type_norm')
            for col in (num_cols + new_num_cols)
        )
        .with_columns(
            pl.col(cat_cols).cast(pl.Categorical),
        )
    )

def feature_engineering_patient_id_pl(df: pl.DataFrame, num_cols: list[str]) -> pl.DataFrame:
    # 数値系変数の集計特徴量を計算
    agg_numeric = (
        df.group_by('patient_id')
        .agg([
            pl.col(col).mean().alias(f'{col}_patient_mean') for col in num_cols
        ] + [
            pl.col(col).std().alias(f'{col}_patient_std') for col in num_cols
        ] + [
            pl.col(col).max().alias(f'{col}_patient_max') for col in num_cols
        ] + [
            pl.col(col).sum().alias(f'{col}_patient_sum') for col in num_cols
        ] + [
            pl.count('isic_id').alias('isic_id_count')
        ])
    )

    agg_numeric2 = (
        df.group_by(['patient_id', 'tbp_lv_location'])
        .agg([
            pl.col(col).mean().alias(f'{col}_patient_lv_location_mean') for col in num_cols
        ] + [
            pl.col(col).std().alias(f'{col}_patient_lv_location_std') for col in num_cols
        ] + [
            pl.col(col).max().alias(f'{col}_patient_lv_location_max') for col in num_cols
        ] + [
            pl.col(col).sum().alias(f'{col}_patient_lv_location_sum') for col in num_cols
        ] + [
            pl.count('isic_id').alias('isic_id_count_patient_lv_location')
        ])
    )

    agg_numeric4 = (
        df.group_by(['attribution'])
        .agg([
            pl.col(col).mean().alias(f'{col}_attribution_mean') for col in num_cols
        ] + [
            pl.col(col).std().alias(f'{col}_attribution_std') for col in num_cols
        ] + [
            pl.col(col).max().alias(f'{col}_attribution_max') for col in num_cols
        ] + [
            pl.col(col).sum().alias(f'{col}_attribution_sum') for col in num_cols
        ] + [
            pl.count('isic_id').alias('isic_id_count_attribution')
        ])
    )

    agg_numeric5 = (
        df.group_by(['attribution', 'tbp_lv_location'])
        .agg([
            pl.col(col).mean().alias(f'{col}_attribution_lv_location_mean') for col in num_cols
        ] + [
            pl.col(col).std().alias(f'{col}_attribution_lv_location_std') for col in num_cols
        ] + [
            pl.col(col).max().alias(f'{col}_attribution_lv_location_max') for col in num_cols
        ] + [
            pl.col(col).sum().alias(f'{col}_attribution_lv_location_sum') for col in num_cols
        ] + [
            pl.count('isic_id').alias('isic_id_count_attribution_lv_location')
        ])
    )

    categorical_columns = ['anatom_site_general', 'combined_anatomical_site', "tbp_lv_location"]
    for col in categorical_columns:
        counts = (
            df.group_by(['patient_id', col])
            .agg(pl.count().alias(f'{col}_count'))
            .pivot(
                index='patient_id',
                columns=col,
                values=f'{col}_count',
                aggregate_function='sum'
            )
            .fill_null(0)
        )
        col_names = [f'{col}_{val}_count' for val in counts.columns if val != 'patient_id']
        counts = counts.rename({old: new for old, new in zip(counts.columns, ['patient_id'] + col_names)})
        df = df.join(counts, on='patient_id', how='left')


    df = df.join(agg_numeric, on='patient_id', how='left')
    df = df.join(agg_numeric2, on=['patient_id', 'tbp_lv_location'], how='left')
    df = df.join(agg_numeric4, on=['attribution'], how='left')
    df = df.join(agg_numeric5, on=['attribution', 'tbp_lv_location'], how='left')

    return df

def feature_engineering_age_pl(df: pl.DataFrame) -> pl.DataFrame:
    unique_counts = (
        df.group_by('patient_id')
        .agg(
            pl.col('age_approx').n_unique().alias('unique_age_approx_count')
        )
    )
    df = df.join(unique_counts, on='patient_id', how='left')
    df = df.with_columns(
        age_min_diff = pl.col('age_approx') - pl.col('age_approx').min().over('patient_id'),
        age_max_diff = pl.col('age_approx') - pl.col('age_approx').max().over('patient_id'),
        age_min_max = (pl.col('age_approx') - pl.col('age_approx').min().over('patient_id')) +
                         (pl.col('age_approx') - pl.col('age_approx').max().over('patient_id')),
        age_range = pl.col('age_approx').max().over('patient_id') - pl.col('age_approx').min().over('patient_id'),
        age_phase = pl.col('age_approx') / (pl.col('age_approx').max().over('patient_id') - pl.col('age_approx').min().over('patient_id') + ERR),
    )

    return df

def process_metadata(past_meta, meta, num_cols):
    # datatype optimization
    def optimize_dataframe(df):
        df = df.with_columns([
            pl.col(pl.Float64).cast(pl.Float32),
            pl.col(pl.Int64).cast(pl.Int32)
        ])
        return df

    # skin_type from past metadata
    skin_map = {
        'I': 0,
        'II': 1,
        'III': 2,
        'IV': 3,
    }
    mapped_col = (
        pl.when(pl.col('fitzpatrick_skin_type') == 'I').then(0)
        .when(pl.col('fitzpatrick_skin_type') == 'II').then(1)
        .when(pl.col('fitzpatrick_skin_type') == 'III').then(2)
        .when(pl.col('fitzpatrick_skin_type') == 'IV').then(3)
        .otherwise(None).alias('fitzpatrick_skin_type_mapped')
    )
    past_meta = past_meta.with_columns([mapped_col])
    skin_df = (
        past_meta.group_by('patient_id')
        .agg(pl.col('fitzpatrick_skin_type_mapped').min().alias('fitzpatrick_skin_type_mapped'))
        .filter(pl.col('fitzpatrick_skin_type_mapped').is_not_null())
    )
    meta = meta.join(skin_df, on='patient_id', how='left')
    skin_df_cols = skin_df.columns
    del skin_df
    gc.collect()

    # ピボットとグループ化の関数
    def pivot_group_count(df, groupby_cols, pivot_index, pivot_column, count_column):
        group_count = (
            df.group_by(groupby_cols)
            .agg(pl.col(count_column).count().alias(count_column))
        )
        pivot_df = group_count.pivot(
            index=pivot_index,
            columns=pivot_column,
            values=count_column
        )
        return pivot_df

    pivot_df_1 = pivot_group_count(
        df=past_meta,
        groupby_cols=['patient_id', 'anatom_site_general', 'age_approx',  'benign_malignant'],
        pivot_index=['patient_id', 'anatom_site_general', 'age_approx'],
        pivot_column='benign_malignant',
        count_column='isic_id'
    )

    # データ型を一致させる
    pivot_df_1 = pivot_df_1.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
        pl.col('age_approx').cast(pl.Float64)
    ])

    meta = meta.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
        pl.col('age_approx').cast(pl.Float64)
    ])

    meta = meta.join(pivot_df_1, on=['patient_id', 'anatom_site_general', 'age_approx'], how='left')
    pivot_df_1_cols = pivot_df_1.columns
    del pivot_df_1
    gc.collect()

    pivot_df_2 = pivot_group_count(
        df=past_meta,
        groupby_cols=['patient_id', 'anatom_site_general', 'benign_malignant'],
        pivot_index=['patient_id', 'anatom_site_general'],
        pivot_column='benign_malignant',
        count_column='isic_id'
    )

    # データ型を一致させる
    pivot_df_2 = pivot_df_2.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
        # pl.col('age_approx').cast(pl.Float64)
    ])

    meta = meta.join(pivot_df_2, on=['patient_id', 'anatom_site_general'], how='left')
    pivot_df_2_cols = pivot_df_2.columns
    del pivot_df_2
    gc.collect()


    benign_malignant_map = {
        'benign': 0,
        'indeterminate': 0,
        'indeterminate/benign': 0,
        'indeterminate/malignant': 1,
        'malignant': 1,
    }
    mapped_target_col = (
        pl.when(pl.col('benign_malignant') == 'benign').then(0)
        .when(pl.col('benign_malignant') == 'indeterminate').then(0)
        .when(pl.col('benign_malignant') == 'indeterminate/benign').then(0)
        .when(pl.col('benign_malignant') == 'indeterminate/malignant').then(1)
        .when(pl.col('benign_malignant') == 'malignant').then(1)
        .otherwise(None).alias('binary_target')
    )
    past_meta = past_meta.with_columns([mapped_target_col])

    def calculate_group_aggregates(df, groupby_columns, columns_to_aggregate, aggregations, suffix):
        agg_df = df.group_by(groupby_columns).agg([
            getattr(pl.col(column), agg)().alias(f'{alias}_{agg}_{suffix}')
            for column, alias in columns_to_aggregate.items()
            for agg in aggregations
        ])
        agg_df = optimize_dataframe(agg_df)
        return agg_df, agg_df.columns

    columns_to_aggregate = {
        'binary_target': 'binary_target_agg',
        'clin_size_long_diam_mm': 'clin_size_long_diam_mm_agg'
    }
    aggregations = ['mean', 'max']

    patient_aggregates, patient_aggregates_cols = calculate_group_aggregates(
        past_meta,
        ['patient_id'],
        columns_to_aggregate,
        aggregations,
        '_p'
    )
    meta = meta.join(patient_aggregates, on=['patient_id'], how='left')
    del patient_aggregates
    gc.collect()

    patient_age_aggregates, patient_age_aggregates_cols = calculate_group_aggregates(
        past_meta,
        ['patient_id', 'anatom_site_general'],
        columns_to_aggregate,
        aggregations,
        '_ps',
    )
    patient_age_aggregates = patient_age_aggregates.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
    ])
    meta = meta.join(patient_age_aggregates, on=['patient_id', 'anatom_site_general'], how='left')
    del patient_age_aggregates
    gc.collect()

    patient_age_site_aggregates, patient_age_site_aggregates_cols = calculate_group_aggregates(
        past_meta,
        ['patient_id', 'anatom_site_general', 'age_approx'],
        columns_to_aggregate,
        aggregations,
        '_psa',
    )
    patient_age_site_aggregates = patient_age_site_aggregates.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
        pl.col('age_approx').cast(pl.Float64)
    ])
    meta = meta.join(patient_age_site_aggregates, on=['patient_id', 'anatom_site_general', 'age_approx'], how='left')
    del patient_age_site_aggregates
    gc.collect()

    # Shift Features
    # num features from the past and future for the same patient and anatom_site_general.
    # But in real world we shouldn't user 'future one...'
    aggregations = ['mean', 'max']
    meta_g = (
        meta.group_by(['patient_id', 'anatom_site_general', 'age_approx'], maintain_order=True)
        .agg([
            getattr(pl.col(col), agg)().alias(f'{col}_{agg}')
            for col in num_cols
            for agg in aggregations
        ])
    )
    meta_g = optimize_dataframe(meta_g)

    shift_range = 5
    new_feature_names = []
    for shift in range(1, shift_range + 1):
        for agg in aggregations:
            for col in num_cols:
                base_col = f'{col}_{agg}'

                # In real world we shouldn't use 'next' shift features.
                prev_col = f'{base_col}_prev_{shift}'
                next_col = f'{base_col}_next_{shift}'

                meta_g = meta_g.with_columns([
                    pl.col(base_col).shift(shift).over(['patient_id', 'anatom_site_general']).alias(prev_col),
                    pl.col(base_col).shift(-shift).over(['patient_id', 'anatom_site_general']).alias(next_col)
                ])
                meta_g = meta_g.with_columns([
                    (pl.col(base_col) - pl.col(prev_col)).alias(f'{base_col}_diff_prev_{shift}'),
                    (pl.col(base_col) - pl.col(next_col)).alias(f'{base_col}_diff_next_{shift}')
                ])

                new_feature_names.extend([prev_col, next_col, f'{base_col}_diff_prev_{shift}', f'{base_col}_diff_next_{shift}'])

    meta_g = meta_g.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
        pl.col('age_approx').cast(pl.Float64)
    ])
    meta = meta.join(meta_g, on=['patient_id', 'anatom_site_general', 'age_approx'], how='left')

    del meta_g
    gc.collect()

    # shift features across old and new metadata.
    meta_selected = meta.select([
        'patient_id',
        'anatom_site_general',
        'age_approx',
        pl.lit(0).cast(pl.Int64).alias("binary_target"),
    ])
    past_meta_selected = past_meta.select([
        'patient_id',
        'anatom_site_general',
        'age_approx',
        pl.col('binary_target')
    ])
    past_meta_selected = past_meta_selected.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
        pl.col('age_approx').cast(pl.Float64),
        pl.col('binary_target').cast(pl.Int64),
    ])
    combined_df = pl.concat([meta_selected, past_meta_selected])
    combined_df = combined_df.with_columns([
        pl.col('age_approx').cast(pl.Float64)
    ])
    del meta_selected, past_meta_selected
    gc.collect()

    target_columns = ['binary_target']
    aggregations=['mean']
    agg_exprs = [
        getattr(pl.col(col), agg)().alias(f'{col}_{agg}')
        for col in target_columns
        for agg in aggregations
    ]

    shift_features = []
    for agg_cols in [
        ['patient_id', 'anatom_site_general'],
        ['patient_id'],
    ]:
        agg_df = combined_df.group_by(agg_cols + ['age_approx'], maintain_order=True).agg(agg_exprs)
        for shift in range(1, shift_range + 1):
            shift_exprs = []
            diff_exprs = []

            for col in target_columns:
                for agg in aggregations:
                    base_col = f'{col}_{agg}'

                    # In real world we shouldn't use 'next' shift features.
                    prev_col = f'{base_col}_prev_{shift}'
                    next_col = f'{base_col}_next_{shift}'

                    shift_exprs.extend([
                        pl.col(base_col).shift(shift).over(agg_cols).alias(prev_col),
                        pl.col(base_col).shift(-shift).over(agg_cols).alias(next_col)
                    ])

                    diff_exprs.extend([
                        (pl.col(base_col) - pl.col(prev_col)).alias(f'{base_col}_diff_prev_{shift}'),
                        (pl.col(base_col) - pl.col(next_col)).alias(f'{base_col}_diff_next_{shift}')
                    ])

                    shift_features.extend([prev_col, next_col, f'{base_col}_diff_prev_{shift}', f'{base_col}_diff_next_{shift}'])

            agg_df = agg_df.with_columns(shift_exprs)
            agg_df = agg_df.with_columns(diff_exprs)

        meta = meta.join(agg_df, on=agg_cols + ['age_approx'], how='left')
        del agg_df
        gc.collect()


    join_keys = {'patient_id', 'age_approx', 'anatom_site_general'}
    not_use_cols = {'benign', 'indeterminate', 'indeterminate/benign', 'indeterminate/malignant', 'malignant'}
    all_new_feature_names = [
        col for col in (
            list(skin_df_cols[1:]) +
            list(pivot_df_1_cols[1:]) +
            list(pivot_df_2_cols[1:]) +
            list(patient_aggregates_cols) +
            list(patient_age_aggregates_cols) +
            list(patient_age_site_aggregates_cols) +
            new_feature_names +
            shift_features
        ) if col not in join_keys | not_use_cols
    ]

    return meta, all_new_feature_names



In [11]:
class Trainer(object):
    def __init__(self, X, y, X_valid, y_valid, params, predictors):
        self.X = X
        self.y = y
        self.X_valid = X_valid
        self.y_valid = y_valid
        self.params = params
        self.predictors = predictors

    def fit(self):
        oof_result = []
        X_train, X_valid = self.X, self.X_valid
        y_train, y_valid = self.y, self.y_valid

        sample_weight_train = X_train['sample_weight']
        sample_weight_valid = X_valid['sample_weight']

        dtrain = lgb.Dataset(
            X_train[self.predictors],
            label=y_train,
            feature_name=self.predictors,
            weight=sample_weight_train,
        )
        dvalid = lgb.Dataset(
            X_valid[self.predictors],
            label=y_valid,
            feature_name=self.predictors,
            weight=sample_weight_valid,
        )
        del X_train, X_valid
        gc.collect()

        callbacks = [
            lgb.log_evaluation(100),
            lgb.early_stopping(200),
        ]

        clf = lgb.train(
            self.params,
            dtrain,
            valid_sets=[dtrain, dvalid],
            num_boost_round=100000,
            categorical_feature=[],
            callbacks=callbacks,
        )
        oof_result.append([x for x in clf.best_score["valid_1"].values()][0])
        self.clf = clf
        self.oof_result = oof_result
        return clf, oof_result

    def fit_xgb(self):
        oof_result = []
        X_train, X_valid = self.X, self.X_valid
        y_train, y_valid = self.y, self.y_valid

        X_train = X_train.replace([np.inf, -np.inf], np.nan)
        X_valid = X_valid.replace([np.inf, -np.inf], np.nan)

        X_train = X_train.fillna(0)
        X_valid = X_valid.fillna(0)

        sample_weight_train = X_train['sample_weight']
        sample_weight_valid = X_valid['sample_weight']

        X_train[self.predictors] = X_train[self.predictors].astype(np.float32)
        X_valid[self.predictors] = X_valid[self.predictors].astype(np.float32)

        dtrain = xgb.DMatrix(X_train[self.predictors], label=y_train, weight=sample_weight_train)
        dvalid = xgb.DMatrix(X_valid[self.predictors], label=y_valid, weight=sample_weight_valid)
        del X_train, X_valid
        gc.collect()

        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

        clf = xgb.train(
            self.params,
            dtrain,
            10000,
            early_stopping_rounds=200,
            evals=watchlist,
            verbose_eval=50,
        )

        self.clf = clf
        self.oof_result = oof_result
        return clf, oof_result


def read_img_oof(train_df):
    oof_df_tsuma_eva_nes = pl.read_parquet(DATA_PATH / 'preds_eva_nes.parquet', columns=['isic_id', 'pred']).rename({'pred': 'pred_tsuma_eva_nes'})
    oof_df_tsuma_conv_nes = pl.read_parquet(DATA_PATH / 'preds_conv_nes.parquet', columns=['isic_id', 'pred']).rename({'pred': 'pred_tsuma_conv_nes'})
    oof_df_sub_71 = pl.concat([pl.read_csv(DATA_PATH / f'sub71/test_results_fold_{fold}.csv') for fold in range(5)]).select(['isic_id', 'pred']).rename({'pred': 'pred_sub_71'})
    oof_df_sub_73 = pl.concat([pl.read_csv(DATA_PATH / f'sub73/test_results_fold_{fold}.csv') for fold in range(5)]).select(['isic_id', 'pred']).rename({'pred': 'pred_sub_73'})
    oof_df_sub_75 = pl.concat([pl.read_csv(DATA_PATH / f'sub75/test_results_fold_{fold}.csv') for fold in range(5)]).select(['isic_id', 'pred']).rename({'pred': 'pred_sub_75'})
    oof_df_sub_77 = pl.concat([pl.read_csv(DATA_PATH / f'sub77/test_results_fold_{fold}.csv') for fold in range(5)]).select(['isic_id', 'pred']).rename({'pred': 'pred_sub_77'})

    # Merge the data
    train_df = train_df.join(oof_df_tsuma_eva_nes, on='isic_id', how='left')
    train_df = train_df.join(oof_df_tsuma_conv_nes, on='isic_id', how='left')
    train_df = train_df.join(oof_df_sub_71, on='isic_id', how='left')
    train_df = train_df.join(oof_df_sub_73, on='isic_id', how='left')
    train_df = train_df.join(oof_df_sub_75, on='isic_id', how='left')
    train_df = train_df.join(oof_df_sub_77, on='isic_id', how='left')

    return train_df


params_xgb = {
    'objective':  'binary:logistic',
    'eval_metric': 'logloss',
    "nthread": -1,
    "learning_rate" : 0.03,
    'colsample_bytree': 0.5,
    'subsample': 0.6,
    'max_depth': 7,
    'lambda': 5,
    'tree_method':"hist",
    'scale_pos_weight':8,

    'tree_method': 'hist',
    'device': 'cuda',
}

params = {
    'objective': 'binary',
    "boosting_type": "gbdt",
    "verbosity": -1,
    'learning_rate': 0.005,
    'bagging_freq': 2,
    'feature_fraction': 0.6,
    'subsample': 0.6,
    'lambda_l1': 1.5,
    'lambda_l2': 5.5,
    'num_leaves': 32,
    "min_data_in_leaf": 20,
    'scale_pos_weight': 5,
    'device': 'gpu',  # use GPU
    'max_bin': 127,
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
}


In [12]:
train_df = read_data(TRAIN_PATH)
train_df = feature_engineering_patient_id_pl(train_df, num_cols + new_num_cols)
train_df = feature_engineering_age_pl(train_df)
train_df = train_df.select([col for col in train_df.columns if not train_df[col].is_null().all()])

past_meta = pl.read_csv(DATA_PATH / 'past_metadata.csv')
train_df, new_feature_names = process_metadata(
    past_meta,
    train_df,
    num_cols=num_cols+new_num_cols,
)
train_df = train_df.select([col for col in train_df.columns if not train_df[col].is_null().all()])
train_df = read_img_oof(train_df)
train_df = train_df.to_pandas()

feature_cols += new_feature_names
feature_cols = [
    col for col in feature_cols
    if col not in ['benign', 'indeterminate', 'indeterminate/benign', 'indeterminate/malignant', 'malignant']
]
feature_cols = sorted(list(set(feature_cols)))
feature_cols = [col for col in feature_cols if col in train_df.columns]


  .agg(pl.count().alias(f'{col}_count'))
  df.group_by(['patient_id', col])
  pivot_df = group_count.pivot(
  meta = meta.join(pivot_df_1, on=['patient_id', 'anatom_site_general', 'age_approx'], how='left')
  meta = meta.join(pivot_df_2, on=['patient_id', 'anatom_site_general'], how='left')
  meta = meta.join(patient_age_aggregates, on=['patient_id', 'anatom_site_general'], how='left')
  meta = meta.join(patient_age_site_aggregates, on=['patient_id', 'anatom_site_general', 'age_approx'], how='left')
  past_meta_selected = past_meta_selected.with_columns([
  meta = meta.join(agg_df, on=agg_cols + ['age_approx'], how='left')


In [13]:
le_dict = {}
for c in cat_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[c]]))
    train_df[c] = le.transform(train_df[c])
    le.classes_ = np.append(le.classes_, '<unknown>')
    le_dict[c] = le


for CFG in all_config_list:
    print(CFG)
    feature_cols_with_img = feature_cols + CFG.img_cols

    if CFG.attribution_flag:
        feature_cols_with_img = [col for col in feature_cols_with_img if 'attribution' not in col]

    if CFG.feature_select_flag:
        imp_df = pd.read_csv('/content/drive/MyDrive/kaggle/isic2024/lgb_importance.csv')
        imp_df_gp = imp_df.groupby('feature_name')[['imporatance_gain']].mean().reset_index()
        drop_features = imp_df_gp[imp_df_gp.imporatance_gain == 0].feature_name.values
        feature_cols_with_img = [col for col in feature_cols_with_img if col not in drop_features]
        print(len(feature_cols_with_img))

    os.makedirs(OUTPUT_DIR / f"exp{CFG.exp_ver}", exist_ok=True)
    joblib.dump(le_dict, OUTPUT_DIR / f"exp{CFG.exp_ver}" / "labelEncoder.joblib")

    gkf = StratifiedGroupKFold(n_splits=N_SPLITS, random_state=SPLIT_SEED, shuffle=True)
    train_df["fold"] = -1
    for fold, (train_idx, val_idx) in enumerate(
        gkf.split(train_df, train_df['target'], groups=train_df["patient_id"])
    ):
        train_df.loc[val_idx, "fold"] = fold

    xgb_clfs = []
    for fold in range(5):
        X = train_df[train_df["fold"] != fold]
        y = X[TARGET_COL].astype(float)

        X_valid = train_df[train_df["fold"] == fold]
        y_valid = X_valid[TARGET_COL].astype(float)

        X['sample_weight'] = 1.0
        X_valid['sample_weight'] = 1.0

        if CFG.lesion_id_weight > 0:
            X.loc[X['lesion_id'].notnull() & (X['target'] == 0), 'sample_weight'] = 0.1
            X.loc[X['lesion_id'].notnull(), 'target'] = 1
            X_valid.loc[X_valid['lesion_id'].notnull() & (X_valid['target'] == 0), 'sample_weight'] = 0.1
            X_valid.loc[X_valid['lesion_id'].notnull(), 'target'] = 1

        if CFG.Indeterminate_weight > 0:
            X.loc[X['iddx_1'] == 'Indeterminate', 'target'] = 1
            X.loc[X['iddx_1'] == 'Indeterminate', 'sample_weight'] = 0.5
            X_valid.loc[X_valid['iddx_1'] == 'Indeterminate', 'target'] = 1
            X_valid.loc[X_valid['iddx_1'] == 'Indeterminate', 'sample_weight'] = 0.5

        trainer = Trainer(X, y, X_valid, y_valid, params_xgb, feature_cols_with_img)
        trainer.fit_xgb()
        xgb_clfs.append(trainer.clf)
        trainer.clf.save_model(OUTPUT_DIR / f'exp{CFG.exp_ver}' / f"xgb_fold_{fold}.json")

    xgb_oof_df = train_df[['isic_id', TARGET_COL]].copy()
    xgb_oof_scores = []
    for fold in range(5):
        X = train_df[train_df["fold"] != fold]
        X_valid = train_df[train_df["fold"] == fold].copy()

        X_valid = X_valid.replace([np.inf, -np.inf], np.nan)
        X_valid = X_valid.fillna(0)

        dvalid = xgb.DMatrix(X_valid[feature_cols_with_img])

        tmp_xgb_clf = xgb.Booster()
        tmp_xgb_clf.load_model(OUTPUT_DIR / f'exp{CFG.exp_ver}' / f"xgb_fold_{fold}.json")

        tmp = tmp_xgb_clf.predict(dvalid)
        print(score(X_valid[TARGET_COL], tmp))
        xgb_oof_scores.append(score(X_valid[TARGET_COL], tmp))

        xgb_oof_df.loc[X_valid.index, 'pred'] = tmp

    print(np.mean(xgb_oof_scores))
    param_sheet.update_acell(f"H{CFG.exp_ver+1}", np.mean(xgb_oof_scores))
    print(score(xgb_oof_df[TARGET_COL], xgb_oof_df['pred']))
    param_sheet.update_acell(f"I{CFG.exp_ver+1}", score(xgb_oof_df[TARGET_COL], xgb_oof_df['pred']))


exp_ver=1 img_cols=['pred_sub_71', 'pred_sub_73', 'pred_sub_75', 'pred_sub_77', 'pred_tsuma_eva', 'pred_tsuma_conv_nes'] attribution_flag=0 feature_select_flag=1 lesion_id_weight=0.0 Indeterminate_weight=0.0
1834


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.12836	eval-logloss:0.12891
[50]	train-logloss:0.02990	eval-logloss:0.03204
[100]	train-logloss:0.00849	eval-logloss:0.01131
[150]	train-logloss:0.00322	eval-logloss:0.00656
[200]	train-logloss:0.00170	eval-logloss:0.00539
[250]	train-logloss:0.00112	eval-logloss:0.00510
[300]	train-logloss:0.00079	eval-logloss:0.00502
[350]	train-logloss:0.00058	eval-logloss:0.00505
[400]	train-logloss:0.00046	eval-logloss:0.00513
[450]	train-logloss:0.00037	eval-logloss:0.00522
[500]	train-logloss:0.00031	eval-logloss:0.00530
[502]	train-logloss:0.00030	eval-logloss:0.00530


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.12870	eval-logloss:0.12851
[50]	train-logloss:0.03012	eval-logloss:0.03142
[100]	train-logloss:0.00862	eval-logloss:0.01057
[150]	train-logloss:0.00333	eval-logloss:0.00576
[200]	train-logloss:0.00179	eval-logloss:0.00461
[250]	train-logloss:0.00120	eval-logloss:0.00431
[300]	train-logloss:0.00086	eval-logloss:0.00423
[350]	train-logloss:0.00064	eval-logloss:0.00425
[400]	train-logloss:0.00050	eval-logloss:0.00429
[450]	train-logloss:0.00041	eval-logloss:0.00436
[500]	train-logloss:0.00034	eval-logloss:0.00443
[528]	train-logloss:0.00031	eval-logloss:0.00447


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.12886	eval-logloss:0.12838
[50]	train-logloss:0.03037	eval-logloss:0.03111
[100]	train-logloss:0.00886	eval-logloss:0.01020
[150]	train-logloss:0.00353	eval-logloss:0.00523
[200]	train-logloss:0.00194	eval-logloss:0.00392
[250]	train-logloss:0.00129	eval-logloss:0.00351
[300]	train-logloss:0.00091	eval-logloss:0.00335
[350]	train-logloss:0.00067	eval-logloss:0.00331
[400]	train-logloss:0.00052	eval-logloss:0.00330
[450]	train-logloss:0.00042	eval-logloss:0.00334
[500]	train-logloss:0.00035	eval-logloss:0.00338
[550]	train-logloss:0.00029	eval-logloss:0.00343
[572]	train-logloss:0.00027	eval-logloss:0.00344


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.12842	eval-logloss:0.12871
[50]	train-logloss:0.02991	eval-logloss:0.03182
[100]	train-logloss:0.00844	eval-logloss:0.01114
[150]	train-logloss:0.00321	eval-logloss:0.00640
[200]	train-logloss:0.00169	eval-logloss:0.00523
[250]	train-logloss:0.00108	eval-logloss:0.00490
[300]	train-logloss:0.00078	eval-logloss:0.00485
[350]	train-logloss:0.00058	eval-logloss:0.00489
[400]	train-logloss:0.00045	eval-logloss:0.00496
[450]	train-logloss:0.00036	eval-logloss:0.00503
[498]	train-logloss:0.00030	eval-logloss:0.00512


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.12852	eval-logloss:0.12863
[50]	train-logloss:0.03016	eval-logloss:0.03122
[100]	train-logloss:0.00866	eval-logloss:0.01036
[150]	train-logloss:0.00341	eval-logloss:0.00555
[200]	train-logloss:0.00186	eval-logloss:0.00442
[250]	train-logloss:0.00122	eval-logloss:0.00414
[300]	train-logloss:0.00086	eval-logloss:0.00412
[350]	train-logloss:0.00064	eval-logloss:0.00418
[400]	train-logloss:0.00050	eval-logloss:0.00427
[450]	train-logloss:0.00040	eval-logloss:0.00436
[479]	train-logloss:0.00036	eval-logloss:0.00440
0.1764551657764271
0.18243941361029964
0.1927567614014736
0.1755059634002985
0.19128666989086976
0.18368879481587372
0.182934145563667
exp_ver=2 img_cols=['pred_sub_71', 'pred_sub_73', 'pred_sub_75', 'pred_sub_77', 'pred_tsuma_eva'] attribution_flag=0 feature_select_flag=1 lesion_id_weight=0.0 Indeterminate_weight=0.0
1833


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.12836	eval-logloss:0.12891
[50]	train-logloss:0.02994	eval-logloss:0.03208
[100]	train-logloss:0.00847	eval-logloss:0.01130
[150]	train-logloss:0.00321	eval-logloss:0.00658
[200]	train-logloss:0.00169	eval-logloss:0.00537
[250]	train-logloss:0.00111	eval-logloss:0.00505
[300]	train-logloss:0.00080	eval-logloss:0.00495
[350]	train-logloss:0.00059	eval-logloss:0.00499
[400]	train-logloss:0.00046	eval-logloss:0.00504
[450]	train-logloss:0.00037	eval-logloss:0.00512
[500]	train-logloss:0.00031	eval-logloss:0.00520
[504]	train-logloss:0.00030	eval-logloss:0.00520


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.12870	eval-logloss:0.12850
[50]	train-logloss:0.03017	eval-logloss:0.03136
[100]	train-logloss:0.00865	eval-logloss:0.01053
[150]	train-logloss:0.00335	eval-logloss:0.00574
[200]	train-logloss:0.00179	eval-logloss:0.00460
[250]	train-logloss:0.00118	eval-logloss:0.00431
[300]	train-logloss:0.00084	eval-logloss:0.00426
[350]	train-logloss:0.00063	eval-logloss:0.00428
[400]	train-logloss:0.00050	eval-logloss:0.00433
[450]	train-logloss:0.00040	eval-logloss:0.00441
[500]	train-logloss:0.00033	eval-logloss:0.00450
[522]	train-logloss:0.00031	eval-logloss:0.00454


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.12888	eval-logloss:0.12841
[50]	train-logloss:0.03043	eval-logloss:0.03117
[100]	train-logloss:0.00887	eval-logloss:0.01022
[150]	train-logloss:0.00351	eval-logloss:0.00523
[200]	train-logloss:0.00192	eval-logloss:0.00390
[250]	train-logloss:0.00128	eval-logloss:0.00350
[300]	train-logloss:0.00090	eval-logloss:0.00334
[350]	train-logloss:0.00068	eval-logloss:0.00329
[400]	train-logloss:0.00053	eval-logloss:0.00328
[450]	train-logloss:0.00043	eval-logloss:0.00331
[500]	train-logloss:0.00035	eval-logloss:0.00333
[550]	train-logloss:0.00029	eval-logloss:0.00337
[560]	train-logloss:0.00028	eval-logloss:0.00338


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.12844	eval-logloss:0.12872
[50]	train-logloss:0.02992	eval-logloss:0.03180
[100]	train-logloss:0.00843	eval-logloss:0.01106
[150]	train-logloss:0.00318	eval-logloss:0.00634
[200]	train-logloss:0.00166	eval-logloss:0.00521
[250]	train-logloss:0.00107	eval-logloss:0.00490
[300]	train-logloss:0.00076	eval-logloss:0.00485
[350]	train-logloss:0.00057	eval-logloss:0.00490
[400]	train-logloss:0.00045	eval-logloss:0.00496
[450]	train-logloss:0.00036	eval-logloss:0.00504
[485]	train-logloss:0.00032	eval-logloss:0.00510


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.12852	eval-logloss:0.12863
[50]	train-logloss:0.03016	eval-logloss:0.03119
[100]	train-logloss:0.00868	eval-logloss:0.01035
[150]	train-logloss:0.00339	eval-logloss:0.00553
[200]	train-logloss:0.00183	eval-logloss:0.00437
[250]	train-logloss:0.00122	eval-logloss:0.00409
[300]	train-logloss:0.00086	eval-logloss:0.00407
[350]	train-logloss:0.00064	eval-logloss:0.00413
[400]	train-logloss:0.00050	eval-logloss:0.00422
[450]	train-logloss:0.00040	eval-logloss:0.00430
[479]	train-logloss:0.00036	eval-logloss:0.00436
0.17716424991478408
0.1843440276778196
0.19276664303666943
0.1736660474243875
0.1906764855082317
0.18372349071237845
0.18299619682846377


In [16]:
le_dict = {}
for c in cat_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[c]]))
    train_df[c] = le.transform(train_df[c])
    le.classes_ = np.append(le.classes_, '<unknown>')
    le_dict[c] = le

for CFG in all_config_list:
    print(CFG)
    feature_cols_with_img = feature_cols + CFG.img_cols
    # 分岐追加：attribution_flagが有効な場合、特定の列を除外
    if CFG.attribution_flag:
        feature_cols_with_img = [col for col in feature_cols_with_img if 'attribution' not in col]

    if CFG.feature_select_flag:
        imp_df = pd.read_csv('/content/drive/MyDrive/kaggle/isic2024/lgb_importance.csv')
        imp_df_gp = imp_df.groupby('feature_name')[['imporatance_gain']].mean().reset_index()
        drop_features = imp_df_gp[imp_df_gp.imporatance_gain == 0].feature_name.values
        feature_cols_with_img = [col for col in feature_cols_with_img if col not in drop_features]
        print(len(feature_cols_with_img))

    os.makedirs(OUTPUT_DIR / f"exp{CFG.exp_ver}", exist_ok=True)
    joblib.dump(le_dict, OUTPUT_DIR / f"exp{CFG.exp_ver}" / "labelEncoder.joblib")

    gkf = StratifiedGroupKFold(n_splits=N_SPLITS, random_state=SPLIT_SEED, shuffle=True)
    train_df["fold"] = -1
    for fold, (train_idx, val_idx) in enumerate(gkf.split(train_df, train_df['target'], groups=train_df["patient_id"])):
        train_df.loc[val_idx, "fold"] = fold

    clfs = []
    for fold in range(5):
        X = train_df[train_df["fold"] != fold]
        y = X[TARGET_COL].astype(float)
        X_valid = train_df[train_df["fold"] == fold]
        y_valid = X_valid[TARGET_COL].astype(float)

        # Indeterminate condition weight adjustment
        X['sample_weight'] = 1.0
        X_valid['sample_weight'] = 1.0
        if CFG.lesion_id_weight > 0:
            X.loc[X['lesion_id'].notnull() & (X['target'] == 0), 'sample_weight'] = CFG.lesion_id_weight
            X_valid.loc[X_valid['lesion_id'].notnull() & (X_valid['target'] == 0), 'sample_weight'] = CFG.lesion_id_weight

        if CFG.Indeterminate_weight > 0:
            X.loc[X['iddx_1'] == 'Indeterminate', 'target'] = 1
            X.loc[X['iddx_1'] == 'Indeterminate', 'sample_weight'] = 0.5
            X_valid.loc[X_valid['iddx_1'] == 'Indeterminate', 'target'] = 1
            X_valid.loc[X_valid['iddx_1'] == 'Indeterminate', 'sample_weight'] = 0.5

        lgbm_trainer = Trainer(X, y, X_valid, y_valid, params, feature_cols_with_img)
        lgbm_trainer.fit()
        clfs.append(lgbm_trainer.clf)
        lgbm_trainer.clf.save_model(OUTPUT_DIR / f'exp{CFG.exp_ver}' / f"lgb_fold_{fold}.json")

    # OOF (Out of Fold) predictions
    oof_df = train_df[['isic_id', TARGET_COL]].copy()
    oof_scores = []
    for fold in range(5):
        X = train_df[train_df["fold"] != fold]
        X_valid = train_df[train_df["fold"] == fold]
        tmp_lgb_clf = lgb.Booster(model_file=OUTPUT_DIR / f'exp{CFG.exp_ver}' / f"lgb_fold_{fold}.json")
        tmp = tmp_lgb_clf.predict(X_valid[feature_cols_with_img])
        print(score(X_valid[TARGET_COL], tmp))
        oof_scores.append(score(X_valid[TARGET_COL], tmp))
        oof_df.loc[X_valid.index, 'pred'] = tmp

    print(np.mean(oof_scores))
    param_sheet.update_acell(f"J{int(CFG.exp_ver)+1}", np.mean(oof_scores))
    print(score(oof_df[TARGET_COL], oof_df['pred']))
    param_sheet.update_acell(f"K{int(CFG.exp_ver)+1}", score(oof_df[TARGET_COL], oof_df['pred']))


exp_ver=1 img_cols=['pred_sub_71', 'pred_sub_73', 'pred_sub_75', 'pred_sub_77', 'pred_tsuma_eva', 'pred_tsuma_conv_nes'] attribution_flag=0 feature_select_flag=1 lesion_id_weight=0.0 Indeterminate_weight=0.0
1834


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.0036065	valid_1's binary_logloss: 0.00609867
[200]	training's binary_logloss: 0.00292092	valid_1's binary_logloss: 0.00589104
[300]	training's binary_logloss: 0.0025067	valid_1's binary_logloss: 0.00578578
[400]	training's binary_logloss: 0.00221231	valid_1's binary_logloss: 0.00570062
[500]	training's binary_logloss: 0.00197092	valid_1's binary_logloss: 0.00561969
[600]	training's binary_logloss: 0.00176492	valid_1's binary_logloss: 0.00554408
[700]	training's binary_logloss: 0.00158276	valid_1's binary_logloss: 0.00546754
[800]	training's binary_logloss: 0.00143268	valid_1's binary_logloss: 0.00541131
[900]	training's binary_logloss: 0.00129956	valid_1's binary_logloss: 0.00536958
[1000]	training's binary_logloss: 0.00118133	valid_1's binary_logloss: 0.00533133
[1100]	training's binary_logloss: 0.00107458	valid_1's binary_logloss: 0.00530128
[1200]	training's binary_logloss: 0.000984217	v

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.00377259	valid_1's binary_logloss: 0.00511677
[200]	training's binary_logloss: 0.00304029	valid_1's binary_logloss: 0.00484684
[300]	training's binary_logloss: 0.00261039	valid_1's binary_logloss: 0.00471026
[400]	training's binary_logloss: 0.00230591	valid_1's binary_logloss: 0.00462485
[500]	training's binary_logloss: 0.00204966	valid_1's binary_logloss: 0.00455322
[600]	training's binary_logloss: 0.00184075	valid_1's binary_logloss: 0.00450175
[700]	training's binary_logloss: 0.00166071	valid_1's binary_logloss: 0.00446687
[800]	training's binary_logloss: 0.0015049	valid_1's binary_logloss: 0.00443138
[900]	training's binary_logloss: 0.00136619	valid_1's binary_logloss: 0.00440509
[1000]	training's binary_logloss: 0.00124762	valid_1's binary_logloss: 0.00438777
[1100]	training's binary_logloss: 0.00114214	valid_1's binary_logloss: 0.00437442
[1200]	training's binary_logloss: 0.00104298	v

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.00394473	valid_1's binary_logloss: 0.00458286
[200]	training's binary_logloss: 0.00322949	valid_1's binary_logloss: 0.00448626
[300]	training's binary_logloss: 0.00281233	valid_1's binary_logloss: 0.00441639
[400]	training's binary_logloss: 0.00249865	valid_1's binary_logloss: 0.00434211
[500]	training's binary_logloss: 0.0022394	valid_1's binary_logloss: 0.00425442
[600]	training's binary_logloss: 0.00201641	valid_1's binary_logloss: 0.00417757
[700]	training's binary_logloss: 0.00181926	valid_1's binary_logloss: 0.00409166
[800]	training's binary_logloss: 0.00164185	valid_1's binary_logloss: 0.00401008
[900]	training's binary_logloss: 0.00149117	valid_1's binary_logloss: 0.00395098
[1000]	training's binary_logloss: 0.0013633	valid_1's binary_logloss: 0.00390037
[1100]	training's binary_logloss: 0.00124786	valid_1's binary_logloss: 0.00385546
[1200]	training's binary_logloss: 0.00114081	va

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.00360648	valid_1's binary_logloss: 0.00604384
[200]	training's binary_logloss: 0.00290663	valid_1's binary_logloss: 0.00572621
[300]	training's binary_logloss: 0.0024991	valid_1's binary_logloss: 0.00559161
[400]	training's binary_logloss: 0.00220482	valid_1's binary_logloss: 0.00549287
[500]	training's binary_logloss: 0.00196358	valid_1's binary_logloss: 0.005419
[600]	training's binary_logloss: 0.00176081	valid_1's binary_logloss: 0.00535009
[700]	training's binary_logloss: 0.00158371	valid_1's binary_logloss: 0.00529768
[800]	training's binary_logloss: 0.00143163	valid_1's binary_logloss: 0.00525463
[900]	training's binary_logloss: 0.00129738	valid_1's binary_logloss: 0.00522305
[1000]	training's binary_logloss: 0.00118073	valid_1's binary_logloss: 0.00520458
[1100]	training's binary_logloss: 0.00107778	valid_1's binary_logloss: 0.00518852
[1200]	training's binary_logloss: 0.000988354	va

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.00376805	valid_1's binary_logloss: 0.00543226
[200]	training's binary_logloss: 0.00306777	valid_1's binary_logloss: 0.0050006
[300]	training's binary_logloss: 0.00264911	valid_1's binary_logloss: 0.00479254
[400]	training's binary_logloss: 0.00234001	valid_1's binary_logloss: 0.00465063
[500]	training's binary_logloss: 0.00209116	valid_1's binary_logloss: 0.00454752
[600]	training's binary_logloss: 0.00187743	valid_1's binary_logloss: 0.0044795
[700]	training's binary_logloss: 0.00169569	valid_1's binary_logloss: 0.00442366
[800]	training's binary_logloss: 0.00153149	valid_1's binary_logloss: 0.00438556
[900]	training's binary_logloss: 0.00138818	valid_1's binary_logloss: 0.00436105
[1000]	training's binary_logloss: 0.00126259	valid_1's binary_logloss: 0.00434436
[1100]	training's binary_logloss: 0.00115288	valid_1's binary_logloss: 0.00434191
[1200]	training's binary_logloss: 0.00105791	va

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.00358963	valid_1's binary_logloss: 0.00606453
[200]	training's binary_logloss: 0.00289368	valid_1's binary_logloss: 0.00583597
[300]	training's binary_logloss: 0.00248622	valid_1's binary_logloss: 0.00573338
[400]	training's binary_logloss: 0.0021967	valid_1's binary_logloss: 0.00566628
[500]	training's binary_logloss: 0.00196136	valid_1's binary_logloss: 0.00559075
[600]	training's binary_logloss: 0.0017555	valid_1's binary_logloss: 0.0055094
[700]	training's binary_logloss: 0.00157594	valid_1's binary_logloss: 0.00544191
[800]	training's binary_logloss: 0.001427	valid_1's binary_logloss: 0.00538771
[900]	training's binary_logloss: 0.00129566	valid_1's binary_logloss: 0.00534407
[1000]	training's binary_logloss: 0.00118001	valid_1's binary_logloss: 0.0053107
[1100]	training's binary_logloss: 0.00107272	valid_1's binary_logloss: 0.00527972
[1200]	training's binary_logloss: 0.000980897	valid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.00375692	valid_1's binary_logloss: 0.00512309
[200]	training's binary_logloss: 0.00302724	valid_1's binary_logloss: 0.00482814
[300]	training's binary_logloss: 0.00261094	valid_1's binary_logloss: 0.00468305
[400]	training's binary_logloss: 0.00231047	valid_1's binary_logloss: 0.00460275
[500]	training's binary_logloss: 0.002059	valid_1's binary_logloss: 0.00453035
[600]	training's binary_logloss: 0.00185473	valid_1's binary_logloss: 0.00448216
[700]	training's binary_logloss: 0.00167484	valid_1's binary_logloss: 0.00444018
[800]	training's binary_logloss: 0.00151773	valid_1's binary_logloss: 0.0044148
[900]	training's binary_logloss: 0.0013788	valid_1's binary_logloss: 0.00439788
[1000]	training's binary_logloss: 0.00125889	valid_1's binary_logloss: 0.00438689
[1100]	training's binary_logloss: 0.0011566	valid_1's binary_logloss: 0.00437905
[1200]	training's binary_logloss: 0.0010577	valid_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.00393941	valid_1's binary_logloss: 0.00452808
[200]	training's binary_logloss: 0.00322241	valid_1's binary_logloss: 0.0044231
[300]	training's binary_logloss: 0.00280622	valid_1's binary_logloss: 0.00434212
[400]	training's binary_logloss: 0.00249398	valid_1's binary_logloss: 0.00426205
[500]	training's binary_logloss: 0.00223589	valid_1's binary_logloss: 0.00418742
[600]	training's binary_logloss: 0.00201353	valid_1's binary_logloss: 0.00410992
[700]	training's binary_logloss: 0.00182152	valid_1's binary_logloss: 0.00404117
[800]	training's binary_logloss: 0.00164714	valid_1's binary_logloss: 0.00396966
[900]	training's binary_logloss: 0.00149538	valid_1's binary_logloss: 0.00390856
[1000]	training's binary_logloss: 0.00136542	valid_1's binary_logloss: 0.00385355
[1100]	training's binary_logloss: 0.00124632	valid_1's binary_logloss: 0.00380158
[1200]	training's binary_logloss: 0.00114156	v

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.00360558	valid_1's binary_logloss: 0.00600712
[200]	training's binary_logloss: 0.00292229	valid_1's binary_logloss: 0.00569643
[300]	training's binary_logloss: 0.00252496	valid_1's binary_logloss: 0.00554714
[400]	training's binary_logloss: 0.00222599	valid_1's binary_logloss: 0.00543679
[500]	training's binary_logloss: 0.00198509	valid_1's binary_logloss: 0.005356
[600]	training's binary_logloss: 0.00177776	valid_1's binary_logloss: 0.00529212
[700]	training's binary_logloss: 0.00160033	valid_1's binary_logloss: 0.00523313
[800]	training's binary_logloss: 0.00144519	valid_1's binary_logloss: 0.00518945
[900]	training's binary_logloss: 0.00130876	valid_1's binary_logloss: 0.00515652
[1000]	training's binary_logloss: 0.00119276	valid_1's binary_logloss: 0.00513357
[1100]	training's binary_logloss: 0.00108919	valid_1's binary_logloss: 0.00512324
[1200]	training's binary_logloss: 0.000998593	v

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.00376371	valid_1's binary_logloss: 0.00541127
[200]	training's binary_logloss: 0.00308323	valid_1's binary_logloss: 0.00500658
[300]	training's binary_logloss: 0.002667	valid_1's binary_logloss: 0.00478941
[400]	training's binary_logloss: 0.00235789	valid_1's binary_logloss: 0.00465151
[500]	training's binary_logloss: 0.00210875	valid_1's binary_logloss: 0.00453855
[600]	training's binary_logloss: 0.00189185	valid_1's binary_logloss: 0.00445748
[700]	training's binary_logloss: 0.00170612	valid_1's binary_logloss: 0.00440258
[800]	training's binary_logloss: 0.00153979	valid_1's binary_logloss: 0.00436614
[900]	training's binary_logloss: 0.00139689	valid_1's binary_logloss: 0.00434174
[1000]	training's binary_logloss: 0.00126911	valid_1's binary_logloss: 0.00432719
[1100]	training's binary_logloss: 0.00116109	valid_1's binary_logloss: 0.0043301
[1200]	training's binary_logloss: 0.00106487	val