In [1]:
# !pip install polars pyarrow xgboost lightgbm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install -U polars optuna >> /dev/null
# !pip install lightgbm --install-option=--gpu

In [4]:
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd


In [5]:
from pathlib import Path
import gc
import numpy as np
import pandas as pd
import polars as pl
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score
import xgboost as xgb
import lightgbm as lgb

import seaborn as sns
import optuna

# カラム数の制限を解除
pd.set_option('display.max_columns', 200)

# 行数の制限を解除
pd.set_option('display.max_rows', 200)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [6]:
base_path = Path('/content/drive/MyDrive/kaggle/isic2024')

In [7]:
df = pd.read_csv(base_path / "train-metadata.csv")

  df = pd.read_csv(base_path / "train-metadata.csv")


In [8]:
df.shape

(401059, 55)

In [9]:
# df.isnull().sum()
df['sample_weight'] = 1.0

# lesion_idがnullではないかつtargetが0のサンプルのsample_weightを0.3に設定
df.loc[df['lesion_id'].notnull() & (df['target'] == 0), 'sample_weight'] = 0.3

df.loc[df['iddx_1'] == 'Indeterminate', 'target'] = 1
df.loc[df['iddx_1'] == 'Indeterminate', 'sample_weight'] = 0.3

In [10]:
# df['file_exists'] = df['file_path'].apply(lambda x: os.path.exists(x))
# df = df[df['file_exists']].drop(columns=['file_exists'])
df["target"] = df["target"].astype('int32')
df["has_lesion_id"] = ~df["lesion_id"].isnull().astype('int32')


In [11]:
# base_path = Path('./')
base_path = Path('/content/drive/MyDrive/kaggle/isic2024')
output_dir = base_path / 'output'

train_path = base_path / 'train-metadata.csv'
test_path = base_path / 'test-metadata.csv'
subm_path = base_path / 'sample_submission.csv'

id_col = 'isic_id'
target_col = 'target'
group_col = 'patient_id'

err = 1e-5
sampling_ratio = 0.01
seed = 42

num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).+
    'tbp_lv_A',                          # A inside  lesion.+
    'tbp_lv_Aext',                       # A outside lesion.+
    'tbp_lv_B',                          # B inside  lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB',                    #
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity',               # Eccentricity.+
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL',                       # Standard deviation of L inside  lesion.+
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.+
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.+
]

new_num_cols = [
    'lesion_size_ratio',             # tbp_lv_minorAxisMM      / clin_size_long_diam_mm
    'lesion_size_ratio_with_area',   # tbp_lv_minorAxisMM      / clin_size_long_diam_mm * tbp_lv_areaMM2
    'lesion_shape_index',            # tbp_lv_areaMM2          / tbp_lv_perimeterMM **2
    'hue_contrast',                  # tbp_lv_H                - tbp_lv_Hext              abs
    'luminance_contrast',            # tbp_lv_L                - tbp_lv_Lext              abs
    'lesion_color_difference',       # tbp_lv_deltaA **2       + tbp_lv_deltaB **2 + tbp_lv_deltaL **2  sqrt
    'border_complexity',             # tbp_lv_norm_border      + tbp_lv_symm_2axis
    'color_uniformity',              # tbp_lv_color_std_mean   / tbp_lv_radial_color_std_max

    'position_distance_3d',          # tbp_lv_x **2 + tbp_lv_y **2 + tbp_lv_z **2  sqrt
    'perimeter_to_area_ratio',       # tbp_lv_perimeterMM      / tbp_lv_areaMM2
    'area_to_perimeter_ratio',       # tbp_lv_areaMM2          / tbp_lv_perimeterMM
    'lesion_visibility_score',       # tbp_lv_deltaLBnorm      + tbp_lv_norm_color
    'symmetry_border_consistency',   # tbp_lv_symm_2axis       * tbp_lv_norm_border
    'consistency_symmetry_border',   # tbp_lv_symm_2axis       * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',             # tbp_lv_stdL             / tbp_lv_Lext
    'consistency_color',             # tbp_lv_stdL*tbp_lv_Lext / tbp_lv_stdL + tbp_lv_Lext
    'size_age_interaction',          # clin_size_long_diam_mm  * age_approx
    'hue_color_std_interaction',     # tbp_lv_H                * tbp_lv_color_std_mean
    'lesion_severity_index',         # tbp_lv_norm_border      + tbp_lv_norm_color + tbp_lv_eccentricity / 3
    'shape_complexity_index',        # border_complexity       + lesion_shape_index
    'color_contrast_index',          # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',               # tbp_lv_areaMM2          + 1  np.log
    'normalized_lesion_size',        # clin_size_long_diam_mm  / age_approx
    'mean_hue_difference',           # tbp_lv_H                + tbp_lv_Hext    / 2
    'std_dev_contrast',              # tbp_lv_deltaA **2 + tbp_lv_deltaB **2 + tbp_lv_deltaL **2   / 3  np.sqrt
    'color_shape_composite_index',   # tbp_lv_color_std_mean   + bp_lv_area_perim_ratio + tbp_lv_symm_2axis   / 3
    'lesion_orientation_3d',         # tbp_lv_y                , tbp_lv_x  np.arctan2
    'overall_color_difference',      # tbp_lv_deltaA           + tbp_lv_deltaB + tbp_lv_deltaL   / 3

    'symmetry_perimeter_interaction',# tbp_lv_symm_2axis       * tbp_lv_perimeterMM
    'comprehensive_lesion_index',    # tbp_lv_area_perim_ratio + tbp_lv_eccentricity + bp_lv_norm_color + tbp_lv_symm_2axis   / 4
    'color_variance_ratio',          # tbp_lv_color_std_mean   / tbp_lv_stdLExt
    'border_color_interaction',      # tbp_lv_norm_border      * tbp_lv_norm_color
    'border_color_interaction_2',
    'size_color_contrast_ratio',     # clin_size_long_diam_mm  / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',# tbp_lv_nevi_confidence  / age_approx
    'age_normalized_nevi_confidence_2',
    'color_asymmetry_index',         # tbp_lv_symm_2axis       * tbp_lv_radial_color_std_max

    'volume_approximation_3d',       # tbp_lv_areaMM2          * sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'color_range',                   # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',       # tbp_lv_eccentricity     * tbp_lv_color_std_mean
    'border_length_ratio',           # tbp_lv_perimeterMM      / pi * sqrt(tbp_lv_areaMM2 / pi)
    'age_size_symmetry_index',       # age_approx              * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',       # age_approx              * tbp_lv_areaMM2 * tbp_lv_symm_2axis
]

cat_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple', 'attribution']

image_cols = [
    # 'pred_fyk',
    # 'pred_tsuma_image',
    # 'pred_tsuma_eff',
    # 'pred_tsuma_eva',
    # 'pred_hatry',
    # 'pred_sub_67',
    # 'pred_sub_68',
    # 'pred_sub_69',
    'pred_sub_71',
    # 'pred_sub_72',
    'pred_sub_73',
    'pred_sub_75',
]
# num_cols = num_cols + image_cols

norm_cols1 = [f'{col}_patient_norm' for col in num_cols + new_num_cols]
norm_cols2 = [f'{col}_attribution_norm' for col in num_cols + new_num_cols]
norm_cols3 = [f'{col}_patient_lv_location_norm' for col in num_cols + new_num_cols]
norm_cols4 = [f'{col}_attribution_lv_location_norm' for col in num_cols + new_num_cols]
norm_cols5 = [f'{col}_lv_location_norm' for col in num_cols + new_num_cols]
norm_cols6 = [f'{col}_attribution_lv_location_tile_type_norm' for col in num_cols + new_num_cols]
norm_cols7 = [f'{col}_patient_lv_location_tile_type_norm' for col in num_cols + new_num_cols]

# agg_list = ['mean', 'std', 'max', 'min', 'skew']
agg_list = ['mean', 'max', 'sum', 'std']
additinal_features1 = [f'{col}_patient_{agg}' for col in num_cols + new_num_cols for agg in agg_list]
additinal_features2 = [f'{col}_patient_lv_location_{agg}' for col in num_cols + new_num_cols for agg in agg_list]
additinal_features3 = [f'{col}_patient_lv_location_tile_type_{agg}' for col in num_cols + new_num_cols for agg in agg_list]
additinal_features4 = [f'{col}_attribution_{agg}' for col in num_cols + new_num_cols for agg in agg_list]
additinal_features5 = [f'{col}_attribution_lv_location_{agg}' for col in num_cols + new_num_cols for agg in agg_list]

cnt_features = ['isic_id_count', 'isic_id_count_patient_lv_location', 'isic_id_count_attribution', 'isic_id_count_attribution_lv_location']
age_features = ['unique_age_approx_count', 'age_min_diff', 'age_max_diff', 'age_min_max']

pivot_cnt_features = ['anatom_site_general_anterior torso_count',
 'anatom_site_general_upper extremity_count',
 'anatom_site_general_lower extremity_count',
 'anatom_site_general_posterior torso_count',
 'anatom_site_general_head/neck_count',
 'anatom_site_general__count',
 'combined_anatomical_site_lower extremity_Left Leg - Lower_count',
 'combined_anatomical_site_anterior torso_Torso Front Top Half_count',
 'combined_anatomical_site_upper extremity_Right Arm - Lower_count',
 'combined_anatomical_site_upper extremity_Left Arm - Lower_count',
 'combined_anatomical_site_head/neck_Head & Neck_count',
 'combined_anatomical_site_anterior torso_Torso Front Bottom Half_count',
 'combined_anatomical_site_lower extremity_Right Leg - Lower_count',
 'combined_anatomical_site_posterior torso_Torso Back Bottom Third_count',
 'combined_anatomical_site_upper extremity_Right Arm - Upper_count',
 'combined_anatomical_site_lower extremity_Right Leg - Upper_count',
 'combined_anatomical_site_posterior torso_Torso Back Top Third_count',
 'combined_anatomical_site_upper extremity_Left Arm - Upper_count',
 'combined_anatomical_site_lower extremity_Left Leg - Upper_count',
 'combined_anatomical_site_posterior torso_Torso Back Middle Third_count',
 'combined_anatomical_site_upper extremity_Left Arm_count',
 'combined_anatomical_site_lower extremity_Right Leg_count',
 'combined_anatomical_site_lower extremity_Left Leg_count',
 'combined_anatomical_site_upper extremity_Right Arm_count',
 'combined_anatomical_site__Unknown_count',
 'combined_anatomical_site_posterior torso_Torso Back_count',
 'combined_anatomical_site_anterior torso_Torso Front_count',
 'tbp_lv_location_Left Leg - Upper_count',
 'tbp_lv_location_Right Arm - Lower_count',
 'tbp_lv_location_Right Leg - Upper_count',
 'tbp_lv_location_Torso Back Bottom Third_count',
 'tbp_lv_location_Left Arm - Upper_count',
 'tbp_lv_location_Torso Front Top Half_count',
 'tbp_lv_location_Torso Back Middle Third_count',
 'tbp_lv_location_Torso Front Bottom Half_count',
 'tbp_lv_location_Torso Back Top Third_count',
 'tbp_lv_location_Right Arm - Upper_count',
 'tbp_lv_location_Head & Neck_count',
 'tbp_lv_location_Left Arm_count',
 'tbp_lv_location_Left Arm - Lower_count',
 'tbp_lv_location_Right Leg - Lower_count',
 'tbp_lv_location_Left Leg - Lower_count',
 'tbp_lv_location_Torso Front_count',
 'tbp_lv_location_Left Leg_count',
 'tbp_lv_location_Right Arm_count',
 'tbp_lv_location_Right Leg_count',
 'tbp_lv_location_Unknown_count',
 'tbp_lv_location_Torso Back_count'
]

feature_cols = (
    num_cols +
    new_num_cols +
    cat_cols +
    norm_cols1 +
    norm_cols2 +
    norm_cols3 +
    norm_cols4 +
    norm_cols5 +
    norm_cols6 +
    # norm_cols7 +
    additinal_features1 +
    additinal_features2 +
    # additinal_features3 +
    additinal_features4 +
    additinal_features5 +
    cnt_features +
    age_features +
    pivot_cnt_features
    + image_cols
)


In [12]:
def read_data(path):
    return (
        pl.read_csv(path)
        .with_columns(
            pl.col('age_approx').cast(pl.Utf8).replace('NA', np.nan).cast(pl.Float64),
        )
        .with_columns(
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()), # You may want to impute test data with train
        )
        .with_columns(
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_size_ratio_with_area    = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_areaMM2'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            # hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            # luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')),
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')),
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
            border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
            color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
        )
        .with_columns(
            position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
            combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
            symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
            consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        )
        .with_columns(
            color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
            lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
            shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
        )
        .with_columns(
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
            lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
            comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
            color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
            border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
            age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
            color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
            age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
            index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        )
        # 各患者のlesion中で該当lesionがどの程度かけ離れているか
        .with_columns(
            (
                (pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + err)
            ).alias(f'{col}_patient_norm')
            for col in (num_cols + new_num_cols)
        )
        # 各病院のlesion中で該当lesionがどの程度かけ離れているか
        .with_columns(
            (
                (pl.col(col) - pl.col(col).mean().over(['attribution'])) / (pl.col(col).std().over(['attribution']) + err)
            ).alias(f'{col}_attribution_norm')
            for col in (num_cols + new_num_cols)
        )
        .with_columns(
            (
                (pl.col(col) - pl.col(col).mean().over(['tbp_lv_location'])) / (pl.col(col).std().over(['tbp_lv_location']) + err)
            ).alias(f'{col}_lv_location_norm')
            for col in (num_cols + new_num_cols)
        )
        # 各患者・部位のlesion中で該当lesionがどの程度かけ離れているか
        .with_columns(
            (
                (pl.col(col) - pl.col(col).mean().over(['patient_id', 'tbp_lv_location'])) / (pl.col(col).std().over(['patient_id', 'tbp_lv_location']) + err)
            ).alias(f'{col}_patient_lv_location_norm')
            for col in (num_cols + new_num_cols)
        )
        # .with_columns(
        #     (
        #         (pl.col(col) - pl.col(col).mean().over(['patient_id', 'tbp_lv_location', 'tbp_tile_type'])) / (pl.col(col).std().over(['patient_id', 'tbp_lv_location', 'tbp_tile_type']) + err)
        #     ).alias(f'{col}_patient_lv_location_tile_type_norm')
        #     for col in (num_cols + new_num_cols)
        # )
        # 各病院・部位のlesion中で該当lesionがどの程度かけ離れているか
        .with_columns(
            (
                (pl.col(col) - pl.col(col).mean().over(['attribution', 'tbp_lv_location'])) / (pl.col(col).std().over(['attribution', 'tbp_lv_location']) + err)
            ).alias(f'{col}_attribution_lv_location_norm')
            for col in (num_cols + new_num_cols)
        )
        # TODO tbp_tile_type系集約特徴量を増やす
        .with_columns(
            (
                (pl.col(col) - pl.col(col).mean().over(['attribution', 'tbp_lv_location', 'tbp_tile_type'])) / (pl.col(col).std().over(['attribution', 'tbp_lv_location', 'tbp_tile_type']) + err)
            ).alias(f'{col}_attribution_lv_location_tile_type_norm')
            for col in (num_cols + new_num_cols)
        )
        .with_columns(
            pl.col(cat_cols).cast(pl.Categorical),
        )
    )


In [13]:
def feature_engineering_patient_id_pl(df: pl.DataFrame, num_cols: list[str]) -> pl.DataFrame:
    # 数値系変数の集計特徴量を計算
    agg_numeric = (
        df.group_by('patient_id')
        .agg([
            pl.col(col).mean().alias(f'{col}_patient_mean') for col in num_cols
        ] + [
            pl.col(col).std().alias(f'{col}_patient_std') for col in num_cols
        ] + [
            pl.col(col).max().alias(f'{col}_patient_max') for col in num_cols
        ] + [
            pl.col(col).sum().alias(f'{col}_patient_sum') for col in num_cols
        ] + [
#             pl.col(col).min().alias(f'{col}_patient_min') for col in num_cols
        ] + [
#             pl.col(col).skew().alias(f'{col}_patient_skew') for col in num_cols
        ] + [
            pl.count('isic_id').alias('isic_id_count')
        ])
    )

    agg_numeric2 = (
        df.group_by(['patient_id', 'tbp_lv_location'])
        .agg([
            pl.col(col).mean().alias(f'{col}_patient_lv_location_mean') for col in num_cols
        ] + [
            pl.col(col).std().alias(f'{col}_patient_lv_location_std') for col in num_cols
        ] + [
            pl.col(col).max().alias(f'{col}_patient_lv_location_max') for col in num_cols
        ] + [
            pl.col(col).sum().alias(f'{col}_patient_lv_location_sum') for col in num_cols
        ] + [
#             pl.col(col).min().alias(f'{col}_patient_lv_location_min') for col in num_cols
        ] + [
#             pl.col(col).skew().alias(f'{col}_patient_lv_location_skew') for col in num_cols
        ] + [
            pl.count('isic_id').alias('isic_id_count_patient_lv_location')
        ])
    )

    # agg_numeric3 = (
    #     df.group_by(['patient_id', 'tbp_lv_location', 'tbp_tile_type'])
    #     .agg([
    #         pl.col(col).mean().alias(f'{col}_patient_lv_location_tile_type_mean') for col in num_cols
    #     ] + [
    #         # pl.col(col).std().alias(f'{col}_patient_lv_location_tile_type_std') for col in num_cols
    #     ] + [
    #         pl.col(col).max().alias(f'{col}_patient_lv_location_tile_type_max') for col in num_cols
    #     ] + [
    #         pl.col(col).sum().alias(f'{col}_patient_lv_location_tile_type_sum') for col in num_cols
    #     ] + [
    #         # pl.col(col).min().alias(f'{col}_patient_lv_location_tile_type_min') for col in num_cols
    #     ] + [
    #         # pl.col(col).skew().alias(f'{col}_patient_lv_location_tile_type_skew') for col in num_cols
    #     ] + [
    #         pl.count('isic_id').alias('isic_id_count_patient_lv_location_tile_type')
    #     ])
    # )

    agg_numeric4 = (
        df.group_by(['attribution'])
        .agg([
            pl.col(col).mean().alias(f'{col}_attribution_mean') for col in num_cols
        ] + [
            pl.col(col).std().alias(f'{col}_attribution_std') for col in num_cols
        ] + [
            pl.col(col).max().alias(f'{col}_attribution_max') for col in num_cols
        ] + [
            pl.col(col).sum().alias(f'{col}_attribution_sum') for col in num_cols
        ] + [
#             pl.col(col).min().alias(f'{col}_attribution_min') for col in num_cols
        ] + [
#             pl.col(col).skew().alias(f'{col}_attribution_skew') for col in num_cols
        ] + [
            pl.count('isic_id').alias('isic_id_count_attribution')
        ])
    )

    agg_numeric5 = (
        df.group_by(['attribution', 'tbp_lv_location'])
        .agg([
            pl.col(col).mean().alias(f'{col}_attribution_lv_location_mean') for col in num_cols
        ] + [
            pl.col(col).std().alias(f'{col}_attribution_lv_location_std') for col in num_cols
        ] + [
            pl.col(col).max().alias(f'{col}_attribution_lv_location_max') for col in num_cols
        ] + [
            pl.col(col).sum().alias(f'{col}_attribution_lv_location_sum') for col in num_cols
        ] + [
#             pl.col(col).min().alias(f'{col}_attribution_lv_location_min') for col in num_cols
        ] + [
#             pl.col(col).skew().alias(f'{col}_attribution_lv_location_skew') for col in num_cols
        ] + [
            pl.count('isic_id').alias('isic_id_count_attribution_lv_location')
        ])
    )


    # カテゴリ変数をリストで指定
    categorical_columns = ['anatom_site_general', 'combined_anatomical_site', "tbp_lv_location"]

    # カテゴリごとの出現回数を別カラムとして定義
    for col in categorical_columns:
        counts = (
            df.group_by(['patient_id', col])
            # .agg(pl.len().alias(f'{col}_count'))
            .agg(pl.count().alias(f'{col}_count'))
            .pivot(
                index='patient_id',
                columns=col,
                values=f'{col}_count',
                aggregate_function='sum'  # aggregate_function を使用
            )
            .fill_null(0)
        )
        # カラム名を動的に生成
        col_names = [f'{col}_{val}_count' for val in counts.columns if val != 'patient_id']

        # カラム名を更新
        counts = counts.rename({old: new for old, new in zip(counts.columns, ['patient_id'] + col_names)})

        # 元のデータフレームと結合
        df = df.join(counts, on='patient_id', how='left')

    # 数値集計結果を元のDataFrameに結合
    df = df.join(agg_numeric, on='patient_id', how='left')
    df = df.join(agg_numeric2, on=['patient_id', 'tbp_lv_location'], how='left')
    # df = df.join(agg_numeric3, on=['patient_id', 'tbp_lv_location', 'tbp_tile_type'], how='left')
    df = df.join(agg_numeric4, on=['attribution'], how='left')
    df = df.join(agg_numeric5, on=['attribution', 'tbp_lv_location'], how='left')

    return df


In [14]:
def feature_engineering_age_pl(df: pl.DataFrame) -> pl.DataFrame:
    # Calculate the number of unique 'age_approx' per 'patient_id'
    unique_counts = (
        df.group_by('patient_id')
        .agg(
            pl.col('age_approx').n_unique().alias('unique_age_approx_count')
        )
    )

    # Join the unique counts back to the original DataFrame
    df = df.join(unique_counts, on='patient_id', how='left')

    # Calculate min, max, and differences for 'age_approx'
    df = df.with_columns(
        age_min_diff = pl.col('age_approx') - pl.col('age_approx').min().over('patient_id'),
        age_max_diff = pl.col('age_approx') - pl.col('age_approx').max().over('patient_id'),
        age_min_max  = (pl.col('age_approx') - pl.col('age_approx').min().over('patient_id')) +
                        (pl.col('age_approx') - pl.col('age_approx').max().over('patient_id'))
    )

    return df


In [15]:
train_df = read_data(train_path)
train_df = feature_engineering_patient_id_pl(train_df, num_cols + new_num_cols)
train_df = feature_engineering_age_pl(train_df)
train_df = train_df.select([col for col in train_df.columns if not train_df[col].is_null().all()])
train_df.head()


  .agg(pl.count().alias(f'{col}_count'))
  df.group_by(['patient_id', col])


isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,tbp_lv_Cext,tbp_lv_H,tbp_lv_Hext,tbp_lv_L,tbp_lv_Lext,tbp_lv_areaMM2,tbp_lv_area_perim_ratio,tbp_lv_color_std_mean,tbp_lv_deltaA,tbp_lv_deltaB,tbp_lv_deltaL,tbp_lv_deltaLB,tbp_lv_deltaLBnorm,tbp_lv_eccentricity,tbp_lv_location,tbp_lv_location_simple,tbp_lv_minorAxisMM,tbp_lv_nevi_confidence,tbp_lv_norm_border,tbp_lv_norm_color,tbp_lv_perimeterMM,tbp_lv_radial_color_std_max,tbp_lv_stdL,…,lesion_visibility_score_attribution_lv_location_sum,symmetry_border_consistency_attribution_lv_location_sum,consistency_symmetry_border_attribution_lv_location_sum,color_consistency_attribution_lv_location_sum,consistency_color_attribution_lv_location_sum,size_age_interaction_attribution_lv_location_sum,hue_color_std_interaction_attribution_lv_location_sum,lesion_severity_index_attribution_lv_location_sum,shape_complexity_index_attribution_lv_location_sum,color_contrast_index_attribution_lv_location_sum,log_lesion_area_attribution_lv_location_sum,normalized_lesion_size_attribution_lv_location_sum,mean_hue_difference_attribution_lv_location_sum,std_dev_contrast_attribution_lv_location_sum,color_shape_composite_index_attribution_lv_location_sum,lesion_orientation_3d_attribution_lv_location_sum,overall_color_difference_attribution_lv_location_sum,symmetry_perimeter_interaction_attribution_lv_location_sum,comprehensive_lesion_index_attribution_lv_location_sum,color_variance_ratio_attribution_lv_location_sum,border_color_interaction_attribution_lv_location_sum,border_color_interaction_2_attribution_lv_location_sum,size_color_contrast_ratio_attribution_lv_location_sum,age_normalized_nevi_confidence_attribution_lv_location_sum,age_normalized_nevi_confidence_2_attribution_lv_location_sum,color_asymmetry_index_attribution_lv_location_sum,volume_approximation_3d_attribution_lv_location_sum,color_range_attribution_lv_location_sum,shape_color_consistency_attribution_lv_location_sum,border_length_ratio_attribution_lv_location_sum,age_size_symmetry_index_attribution_lv_location_sum,index_age_size_symmetry_attribution_lv_location_sum,isic_id_count_attribution_lv_location,unique_age_approx_count,age_min_diff,age_max_diff,age_min_max
str,i64,str,f64,cat,cat,f64,str,cat,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,cat,cat,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,f64,f64,f64
"""ISIC_0015670""",0,"""IP_1235828""",60.0,"""male""","""lower extremity""",3.04,"""TBP tile: close-up""","""3D: white""",20.244422,16.261975,26.922447,23.954773,33.684638,28.953117,53.058545,55.828924,54.367448,62.025701,3.152561,27.47617,0.0,3.982447,2.967674,-7.658253,8.360566,5.784302,0.901302,"""Right Leg - Upper""","""Right Leg""",1.543016,0.002629,7.09136,0.0,9.307003,0.0,2.036195,…,78004.135004,7684.909573,1913.685199,368.663576,20405.475358,1514000.0,391139.371251,16960.829618,25265.23407,26510.672466,14432.911103,514.835472,395149.311027,48440.778079,47290.80029,9457.223862,-9546.202545,23216.780691,40569.549963,3509.247679,65402.918624,9491.825444,3758.32777,6399.231049,414521.650236,1987.751808,36970000.0,123830.450835,5367.698502,8713.758906,446856.228239,878822.786397,7328,1,0.0,0.0,0.0
"""ISIC_0015845""",0,"""IP_8170065""",60.0,"""male""","""head/neck""",1.1,"""TBP tile: close-up""","""3D: white""",31.71257,25.36474,26.331,24.54929,41.21903,35.29926,39.70291,44.06404,48.86152,55.36236,0.9194971,12.23529,0.0,6.34783,1.781713,-6.500838,6.839008,4.987244,0.639885,"""Head & Neck""","""Head & Neck""",0.8219178,1.3343e-07,2.116402,0.0,3.354148,0.0,0.8532267,…,29593.414866,5039.218144,1016.613771,127.39422,6689.359527,718978.5,144269.028028,8039.362631,14236.559945,7820.517628,6212.004078,230.915986,167124.576543,17531.275823,23667.197915,5079.590321,-4354.303025,13101.224561,19807.614313,1143.560992,33541.700721,4253.172982,1949.282266,1205.585947,189040.346702,914.764435,38825000.0,44203.687612,2115.47535,4111.23204,253586.312627,485735.453933,3229,1,0.0,0.0,0.0
"""ISIC_0015864""",0,"""IP_6724798""",60.0,"""male""","""posterior torso""",3.4,"""TBP tile: close-up""","""3D: XP""",22.57583,17.12817,37.97046,33.48541,44.17492,37.6118,59.26585,62.90973,53.96118,61.67052,3.265153,24.18462,0.0,5.447655,4.485044,-7.709336,9.092376,6.290359,0.932147,"""Torso Back Top Third""","""Torso Back""",1.194905,0.000296,4.798335,0.0,8.886309,0.0,1.743651,…,239071.012982,32865.403033,6933.212924,1063.403953,60693.346543,5291100.0,1289700.0,58995.053121,95808.86714,82170.825756,46823.256423,1684.927226,1316400.0,150027.620966,165597.067809,36721.050763,-28161.530951,89886.782322,140629.006315,10314.04851,249747.521421,32050.923497,13203.155354,14105.546518,1357200.0,7121.510959,259720000.0,385354.766651,17584.903735,29139.637554,1743400.0,3387200.0,23332,1,0.0,0.0,0.0
"""ISIC_0015902""",0,"""IP_4111386""",65.0,"""male""","""anterior torso""",3.22,"""TBP tile: close-up""","""3D: XP""",14.242329,12.164757,21.448144,21.121356,25.7462,24.374023,56.414429,60.060388,18.649518,23.314841,6.07994,14.889242,0.51452,2.077572,0.326788,-4.665323,4.783413,6.400196,0.654458,"""Torso Front Top Half""","""Torso Front""",2.481328,21.989453,1.975874,1.771705,9.514499,0.66469,1.258541,…,46196.855434,6373.898254,1334.330127,235.365801,9894.980348,1083200.0,251528.528207,11067.85652,18394.833594,31656.701129,9169.708402,309.852495,263891.378875,25692.167456,31843.323438,7092.08075,-472.331159,17852.313914,26866.883918,2262.46524,44780.483295,5925.442391,2590.43262,2872.592339,272609.843536,1243.311789,50202000.0,69530.565399,3286.197943,5610.384977,358886.350678,705866.130137,4501,1,0.0,0.0,0.0
"""ISIC_0024200""",0,"""IP_8313778""",55.0,"""male""","""anterior torso""",2.73,"""TBP tile: close-up""","""3D: white""",24.72552,20.05747,26.4649,25.71046,36.21798,32.60874,46.94607,52.04118,46.27631,54.85574,2.101708,19.90256,0.0,4.668053,0.7544338,-8.579431,9.148495,6.531302,0.9464478,"""Torso Front Top Half""","""Torso Front""",0.929916,0.001379,3.658854,0.0,6.467562,0.0,2.085409,…,232325.5148,29319.404709,6352.974263,1035.0393,58298.647994,5245000.0,1299400.0,56003.742316,87310.613933,97072.295404,45203.448249,1571.541089,1230000.0,145325.489514,154267.907116,34623.068941,-20910.04832,84987.933026,131674.585583,10461.829836,243521.099367,30921.005287,12662.93189,13378.685867,1316100.0,6911.140291,249670000.0,379260.128571,17784.334676,27242.515954,1686500.0,3474300.0,22013,2,5.0,0.0,5.0


In [16]:
def custom_metric(estimator, X, y_true):
    y_hat = estimator.predict_proba(X)[:, 1]
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)

    v_gt = abs(y_true - 1)
    v_pred = np.array([1.0 - x for x in y_hat])

    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc


In [17]:
import polars as pl

def process_metadata(past_meta, meta, num_cols):

    # データ型の最適化
    def optimize_dataframe(df):
        df = df.with_columns([
            pl.col(pl.Float64).cast(pl.Float32),
            pl.col(pl.Int64).cast(pl.Int32)
        ])
        return df

#     past_meta = optimize_dataframe(past_meta)
    skin_map = {
        'I': 0,
        'II': 1,
        'III': 2,
        'IV': 3,
    }

    # fitzpatrick_skin_typeの値をマッピングするための式を作成
    mapped_col = (
        pl.when(pl.col('fitzpatrick_skin_type') == 'I').then(0)
        .when(pl.col('fitzpatrick_skin_type') == 'II').then(1)
        .when(pl.col('fitzpatrick_skin_type') == 'III').then(2)
        .when(pl.col('fitzpatrick_skin_type') == 'IV').then(3)
        .otherwise(None).alias('fitzpatrick_skin_type_mapped')
    )

    past_meta = past_meta.with_columns([mapped_col])

    skin_df = (
        past_meta.group_by('patient_id')
        .agg(pl.col('fitzpatrick_skin_type_mapped').min().alias('fitzpatrick_skin_type_mapped'))
        .filter(pl.col('fitzpatrick_skin_type_mapped').is_not_null())
    )
    meta = meta.join(skin_df, on='patient_id', how='left')
    skin_df_cols = skin_df.columns
    del skin_df
    gc.collect()

    # ピボットとグループ化の関数
    def pivot_group_count(df, groupby_cols, pivot_index, pivot_column, count_column):
        group_count = (
            df.group_by(groupby_cols)
            .agg(pl.col(count_column).count().alias(count_column))
        )
#         group_count = optimize_dataframe(group_count)
        pivot_df = group_count.pivot(
            index=pivot_index,
            columns=pivot_column,
            values=count_column
        )
        return pivot_df

    # 初回と2回目のピボットテーブル作成
    pivot_df_1 = pivot_group_count(
        df=past_meta,
        groupby_cols=['patient_id', 'anatom_site_general', 'age_approx',  'benign_malignant'],
        pivot_index=['patient_id', 'anatom_site_general', 'age_approx'],
        pivot_column='benign_malignant',
        count_column='isic_id'
    )

    # データ型を一致させる
    pivot_df_1 = pivot_df_1.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
        pl.col('age_approx').cast(pl.Float64)
    ])

    meta = meta.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
        pl.col('age_approx').cast(pl.Float64)
    ])

    meta = meta.join(pivot_df_1, on=['patient_id', 'anatom_site_general', 'age_approx'], how='left')
    pivot_df_1_cols = pivot_df_1.columns
    del pivot_df_1
    gc.collect()

    pivot_df_2 = pivot_group_count(
        df=past_meta,
        groupby_cols=['patient_id', 'anatom_site_general', 'benign_malignant'],
        pivot_index=['patient_id', 'anatom_site_general'],
        pivot_column='benign_malignant',
        count_column='isic_id'
    )

    # データ型を一致させる
    pivot_df_2 = pivot_df_2.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
#         pl.col('age_approx').cast(pl.Float64)
    ])

    meta = meta.join(pivot_df_2, on=['patient_id', 'anatom_site_general'], how='left')
    pivot_df_2_cols = pivot_df_2.columns
    del pivot_df_2
    gc.collect()

    # ターゲットのマッピング
    benign_malignant_map = {
        'benign': 0,
        'indeterminate': 0,
        'indeterminate/benign': 0,
        'indeterminate/malignant': 1,
        'malignant': 1,
    }

    # ターゲットのマッピング
    mapped_target_col = (
        pl.when(pl.col('benign_malignant') == 'benign').then(0)
        .when(pl.col('benign_malignant') == 'indeterminate').then(0)
        .when(pl.col('benign_malignant') == 'indeterminate/benign').then(0)
        .when(pl.col('benign_malignant') == 'indeterminate/malignant').then(1)
        .when(pl.col('benign_malignant') == 'malignant').then(1)
        .otherwise(None).alias('binary_target')
    )

    past_meta = past_meta.with_columns([mapped_target_col])

    # 集計関数
    def calculate_group_aggregates(df, groupby_columns, columns_to_aggregate, aggregations, suffix):
        agg_df = df.group_by(groupby_columns).agg([
            getattr(pl.col(column), agg)().alias(f'{alias}_{agg}_{suffix}')
            for column, alias in columns_to_aggregate.items()
            for agg in aggregations
        ])
        agg_df = optimize_dataframe(agg_df)
        return agg_df, agg_df.columns

    # 集計を実行
    columns_to_aggregate = {
        'binary_target': 'binary_target_agg',
        'clin_size_long_diam_mm': 'clin_size_long_diam_mm_agg'
    }
    aggregations = ['mean', 'sum']
    # aggregations = ['mean']

    patient_aggregates, patient_aggregates_cols = calculate_group_aggregates(
        past_meta,
        ['patient_id'],
        columns_to_aggregate,
        aggregations,
        '_p'
    )
    meta = meta.join(patient_aggregates, on=['patient_id'], how='left')
    del patient_aggregates
    gc.collect()

    patient_age_aggregates, patient_age_aggregates_cols = calculate_group_aggregates(
        past_meta,
        ['patient_id', 'anatom_site_general'],
        columns_to_aggregate,
        aggregations,
        '_ps',
    )
    patient_age_aggregates = patient_age_aggregates.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
    ])
    meta = meta.join(patient_age_aggregates, on=['patient_id', 'anatom_site_general'], how='left')
    del patient_age_aggregates
    gc.collect()

    patient_age_site_aggregates, patient_age_site_aggregates_cols = calculate_group_aggregates(
        past_meta,
        ['patient_id', 'anatom_site_general', 'age_approx'],
        columns_to_aggregate,
        aggregations,
        '_psa',
    )
    patient_age_site_aggregates = patient_age_site_aggregates.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
        pl.col('age_approx').cast(pl.Float64)
    ])
    meta = meta.join(patient_age_site_aggregates, on=['patient_id', 'anatom_site_general', 'age_approx'], how='left')
    del patient_age_site_aggregates
    gc.collect()

    # 集約処理
    aggregations = ['mean']
    meta_g = (
        meta.group_by(['patient_id', 'anatom_site_general', 'age_approx'], maintain_order=True)
        .agg([
            getattr(pl.col(col), agg)().alias(f'{col}_{agg}')
            for col in num_cols
            for agg in aggregations
        ])
    )
    meta_g = optimize_dataframe(meta_g)

    shift_range = 5
    new_feature_names = []

    for shift in range(1, shift_range + 1):
        for agg in aggregations:
            for col in num_cols:
                base_col = f'{col}_{agg}'

                prev_col = f'{base_col}_prev_{shift}'
                next_col = f'{base_col}_next_{shift}'

                meta_g = meta_g.with_columns([
                    pl.col(base_col).shift(shift).over(['patient_id', 'anatom_site_general']).alias(prev_col),
                    pl.col(base_col).shift(-shift).over(['patient_id', 'anatom_site_general']).alias(next_col)
                ])

                # 生成された列が存在することを確認してから、差分を計算
                meta_g = meta_g.with_columns([
                    (pl.col(base_col) - pl.col(prev_col)).alias(f'{base_col}_diff_prev_{shift}'),
                    (pl.col(base_col) - pl.col(next_col)).alias(f'{base_col}_diff_next_{shift}')
                ])

                new_feature_names.extend([prev_col, next_col, f'{base_col}_diff_prev_{shift}', f'{base_col}_diff_next_{shift}'])

    meta_g = meta_g.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
        pl.col('age_approx').cast(pl.Float64)
    ])
    meta = meta.join(meta_g, on=['patient_id', 'anatom_site_general', 'age_approx'], how='left')

    del meta_g
    gc.collect()


    # 新旧metaデータを跨いだshift系特徴量
    meta_selected = meta.select([
        'patient_id',
        'anatom_site_general',
        'age_approx',
        pl.col('target').alias('binary_target')
    ])
    past_meta_selected = past_meta.select([
        'patient_id',
        'anatom_site_general',
        'age_approx',
        pl.col('binary_target')
    ])
    past_meta_selected = past_meta_selected.with_columns([
        pl.col('anatom_site_general').cast(pl.Categorical),
        pl.col('age_approx').cast(pl.Float64),
        pl.col('binary_target').cast(pl.Int64),
    ])
    combined_df = pl.concat([meta_selected, past_meta_selected])
    combined_df = combined_df.with_columns([
        pl.col('age_approx').cast(pl.Float64)
    ])
    del meta_selected, past_meta_selected
    gc.collect()

    target_columns = ['binary_target']
    aggregations=['mean']
    # 複数のカラムに対して複数の集約を適用
    agg_exprs = [
        getattr(pl.col(col), agg)().alias(f'{col}_{agg}')
        for col in target_columns
        for agg in aggregations
    ]

    shift_features = []
    for agg_cols in [
        ['patient_id', 'anatom_site_general'],
        ['patient_id'],
    ]:
        agg_df = combined_df.group_by(agg_cols + ['age_approx'], maintain_order=True).agg(agg_exprs)
        for shift in range(1, shift_range + 1):
            shift_exprs = []
            diff_exprs = []

            for col in target_columns:
                for agg in aggregations:
                    base_col = f'{col}_{agg}'

                    prev_col = f'{base_col}_prev_{shift}'
                    next_col = f'{base_col}_next_{shift}'

                    shift_exprs.extend([
                        pl.col(base_col).shift(shift).over(agg_cols).alias(prev_col),
                        pl.col(base_col).shift(-shift).over(agg_cols).alias(next_col)
                    ])

                    diff_exprs.extend([
                        (pl.col(base_col) - pl.col(prev_col)).alias(f'{base_col}_diff_prev_{shift}'),
                        (pl.col(base_col) - pl.col(next_col)).alias(f'{base_col}_diff_next_{shift}')
                    ])

                    shift_features.extend([prev_col, next_col, f'{base_col}_diff_prev_{shift}', f'{base_col}_diff_next_{shift}'])

            agg_df = agg_df.with_columns(shift_exprs)
            agg_df = agg_df.with_columns(diff_exprs)

        meta = meta.join(agg_df, on=agg_cols + ['age_approx'], how='left')
        del agg_df
        gc.collect()


    # 除外するjoin key
    join_keys = {'patient_id', 'age_approx', 'anatom_site_general'}
    not_use_cols = {'benign', 'indeterminate', 'indeterminate/benign', 'indeterminate/malignant', 'malignant'}
    all_new_feature_names = [
        col for col in (
            list(skin_df_cols[1:]) +
            list(pivot_df_1_cols[1:]) +
            list(pivot_df_2_cols[1:]) +
            list(patient_aggregates_cols) +
            list(patient_age_aggregates_cols) +
            list(patient_age_site_aggregates_cols) +
            new_feature_names +
            shift_features
        ) if col not in join_keys | not_use_cols
    ]

    return meta, all_new_feature_names

# データの読み込み
past_meta = pl.read_csv(base_path / 'past_metadata.csv')

# 使用例
train_df, new_feature_names = process_metadata(
    past_meta,
    train_df,
    num_cols=num_cols+new_num_cols,
)
train_df = train_df.select([col for col in train_df.columns if not train_df[col].is_null().all()])


  pivot_df = group_count.pivot(
  meta = meta.join(pivot_df_1, on=['patient_id', 'anatom_site_general', 'age_approx'], how='left')
  meta = meta.join(pivot_df_2, on=['patient_id', 'anatom_site_general'], how='left')
  meta = meta.join(patient_age_aggregates, on=['patient_id', 'anatom_site_general'], how='left')
  meta = meta.join(patient_age_site_aggregates, on=['patient_id', 'anatom_site_general', 'age_approx'], how='left')
  past_meta_selected = past_meta_selected.with_columns([
  meta = meta.join(agg_df, on=agg_cols + ['age_approx'], how='left')


In [18]:
print(len(new_feature_names))
train_df.head()

1595


isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,tbp_lv_Cext,tbp_lv_H,tbp_lv_Hext,tbp_lv_L,tbp_lv_Lext,tbp_lv_areaMM2,tbp_lv_area_perim_ratio,tbp_lv_color_std_mean,tbp_lv_deltaA,tbp_lv_deltaB,tbp_lv_deltaL,tbp_lv_deltaLB,tbp_lv_deltaLBnorm,tbp_lv_eccentricity,tbp_lv_location,tbp_lv_location_simple,tbp_lv_minorAxisMM,tbp_lv_nevi_confidence,tbp_lv_norm_border,tbp_lv_norm_color,tbp_lv_perimeterMM,tbp_lv_radial_color_std_max,tbp_lv_stdL,…,volume_approximation_3d_mean_next_1,volume_approximation_3d_mean_diff_prev_1,volume_approximation_3d_mean_diff_next_1,color_range_mean_prev_1,color_range_mean_next_1,color_range_mean_diff_prev_1,color_range_mean_diff_next_1,shape_color_consistency_mean_prev_1,shape_color_consistency_mean_next_1,shape_color_consistency_mean_diff_prev_1,shape_color_consistency_mean_diff_next_1,border_length_ratio_mean_prev_1,border_length_ratio_mean_next_1,border_length_ratio_mean_diff_prev_1,border_length_ratio_mean_diff_next_1,age_size_symmetry_index_mean_prev_1,age_size_symmetry_index_mean_next_1,age_size_symmetry_index_mean_diff_prev_1,age_size_symmetry_index_mean_diff_next_1,index_age_size_symmetry_mean_prev_1,index_age_size_symmetry_mean_next_1,index_age_size_symmetry_mean_diff_prev_1,index_age_size_symmetry_mean_diff_next_1,binary_target_mean,binary_target_mean_prev_1,binary_target_mean_next_1,binary_target_mean_diff_prev_1,binary_target_mean_diff_next_1,binary_target_mean_next_2,binary_target_mean_diff_next_2,binary_target_mean_right,binary_target_mean_prev_1_right,binary_target_mean_next_1_right,binary_target_mean_diff_prev_1_right,binary_target_mean_diff_next_1_right,binary_target_mean_next_2_right,binary_target_mean_diff_next_2_right
str,i64,str,f64,cat,cat,f64,str,cat,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,cat,cat,f64,f64,f64,f64,f64,f64,f64,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""ISIC_0015670""",0,"""IP_1235828""",60.0,"""male""","""lower extremity""",3.04,"""TBP tile: close-up""","""3D: white""",20.244422,16.261975,26.922447,23.954773,33.684638,28.953117,53.058545,55.828924,54.367448,62.025701,3.152561,27.47617,0.0,3.982447,2.967674,-7.658253,8.360566,5.784302,0.901302,"""Right Leg - Upper""","""Right Leg""",1.543016,0.002629,7.09136,0.0,9.307003,0.0,2.036195,…,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,0.0,,,,,,
"""ISIC_0015845""",0,"""IP_8170065""",60.0,"""male""","""head/neck""",1.1,"""TBP tile: close-up""","""3D: white""",31.71257,25.36474,26.331,24.54929,41.21903,35.29926,39.70291,44.06404,48.86152,55.36236,0.9194971,12.23529,0.0,6.34783,1.781713,-6.500838,6.839008,4.987244,0.639885,"""Head & Neck""","""Head & Neck""",0.8219178,1.3343e-07,2.116402,0.0,3.354148,0.0,0.8532267,…,,,,,,,,,,,,,,,,,,,,,,,,0.125,,,,,,,0.00885,,,,,,
"""ISIC_0015864""",0,"""IP_6724798""",60.0,"""male""","""posterior torso""",3.4,"""TBP tile: close-up""","""3D: XP""",22.57583,17.12817,37.97046,33.48541,44.17492,37.6118,59.26585,62.90973,53.96118,61.67052,3.265153,24.18462,0.0,5.447655,4.485044,-7.709336,9.092376,6.290359,0.932147,"""Torso Back Top Third""","""Torso Back""",1.194905,0.000296,4.798335,0.0,8.886309,0.0,1.743651,…,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,0.000532,,,,,,
"""ISIC_0015902""",0,"""IP_4111386""",65.0,"""male""","""anterior torso""",3.22,"""TBP tile: close-up""","""3D: XP""",14.242329,12.164757,21.448144,21.121356,25.7462,24.374023,56.414429,60.060388,18.649518,23.314841,6.07994,14.889242,0.51452,2.077572,0.326788,-4.665323,4.783413,6.400196,0.654458,"""Torso Front Top Half""","""Torso Front""",2.481328,21.989453,1.975874,1.771705,9.514499,0.66469,1.258541,…,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,0.0,,,,,,
"""ISIC_0024200""",0,"""IP_8313778""",55.0,"""male""","""anterior torso""",2.73,"""TBP tile: close-up""","""3D: white""",24.72552,20.05747,26.4649,25.71046,36.21798,32.60874,46.94607,52.04118,46.27631,54.85574,2.101708,19.90256,0.0,4.668053,0.7544338,-8.579431,9.148495,6.531302,0.9464478,"""Torso Front Top Half""","""Torso Front""",0.929916,0.001379,3.658854,0.0,6.467562,0.0,2.085409,…,,,,,,,,,,,,,,,,,,,,,,,,0.0,,0.0,,0.0,1.0,-1.0,0.004535,,0.666667,,-0.662132,0.0,0.004535


In [19]:
train_df = train_df.to_pandas()

In [20]:
# oof_df_fyk = pd.read_csv('oof_df_v20.csv')[['isic_id', 'pred']].rename(columns={'pred': 'pred_fyk'})

oof_df_tsuma_eva = pd.read_parquet(base_path / 'eva_preds.parquet')
oof_df_tsuma_eva = oof_df_tsuma_eva.rename(columns = {'pred': 'pred_tsuma_eva'})

oof_df_sub_67 = pd.concat([
    pd.read_csv(base_path / f'sub67/test_results_fold_{fold}.csv') for fold in range(5)
])[['isic_id', 'pred']]
oof_df_sub_67 = oof_df_sub_67.rename(columns = {'pred': 'pred_sub_67'})

oof_df_sub_68 = pd.concat([
    pd.read_csv(base_path / f'sub68/test_results_fold_{fold}.csv') for fold in range(5)
])[['isic_id', 'pred']]
oof_df_sub_68 = oof_df_sub_68.rename(columns = {'pred': 'pred_sub_68'})

oof_df_sub_69 = pd.concat([
    pd.read_csv(base_path / f'sub69/test_results_fold_{fold}.csv') for fold in range(5)
])[['isic_id', 'pred']]
oof_df_sub_69 = oof_df_sub_69.rename(columns = {'pred': 'pred_sub_69'})

oof_df_sub_71 = pd.concat([
    pd.read_csv(base_path / f'sub71/test_results_fold_{fold}.csv') for fold in range(5)
])[['isic_id', 'pred']]
oof_df_sub_71 = oof_df_sub_71.rename(columns = {'pred': 'pred_sub_71'})

oof_df_sub_72 = pd.concat([
    pd.read_csv(base_path / f'sub72/test_results_fold_{fold}.csv') for fold in range(5)
])[['isic_id', 'pred']]
oof_df_sub_72 = oof_df_sub_72.rename(columns = {'pred': 'pred_sub_72'})

oof_df_sub_73 = pd.concat([
    pd.read_csv(base_path / f'sub73/test_results_fold_{fold}.csv') for fold in range(5)
])[['isic_id', 'pred']]
oof_df_sub_73 = oof_df_sub_73.rename(columns = {'pred': 'pred_sub_73'})

oof_df_sub_75 = pd.concat([
    pd.read_csv(base_path / f'sub75/test_results_fold_{fold}.csv') for fold in range(5)
])[['isic_id', 'pred']]
oof_df_sub_75 = oof_df_sub_75.rename(columns = {'pred': 'pred_sub_75'})

train_df = pd.merge(train_df, oof_df_tsuma_eva, how='left', on='isic_id')
train_df = pd.merge(train_df, oof_df_sub_67, how='left', on='isic_id')
train_df = pd.merge(train_df, oof_df_sub_68, how='left', on='isic_id')
train_df = pd.merge(train_df, oof_df_sub_69, how='left', on='isic_id')
train_df = pd.merge(train_df, oof_df_sub_71, how='left', on='isic_id')
train_df = pd.merge(train_df, oof_df_sub_72, how='left', on='isic_id')
train_df = pd.merge(train_df, oof_df_sub_73, how='left', on='isic_id')
train_df = pd.merge(train_df, oof_df_sub_75, how='left', on='isic_id')


In [21]:
feature_cols += new_feature_names
feature_cols = [
    col for col in feature_cols
    if col not in ['benign', 'indeterminate', 'indeterminate/benign', 'indeterminate/malignant', 'malignant']
]
feature_cols = sorted(list(set(feature_cols)))
feature_cols = [col for col in feature_cols if col in train_df.columns]

print(len(feature_cols))


2163


In [22]:
train_df.head()

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,tbp_lv_Cext,tbp_lv_H,tbp_lv_Hext,tbp_lv_L,tbp_lv_Lext,tbp_lv_areaMM2,tbp_lv_area_perim_ratio,tbp_lv_color_std_mean,tbp_lv_deltaA,tbp_lv_deltaB,tbp_lv_deltaL,tbp_lv_deltaLB,tbp_lv_deltaLBnorm,tbp_lv_eccentricity,tbp_lv_location,tbp_lv_location_simple,tbp_lv_minorAxisMM,tbp_lv_nevi_confidence,tbp_lv_norm_border,tbp_lv_norm_color,tbp_lv_perimeterMM,tbp_lv_radial_color_std_max,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z,attribution,copyright_license,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence,lesion_size_ratio,lesion_size_ratio_with_area,lesion_shape_index,hue_contrast,luminance_contrast,lesion_color_difference,border_complexity,color_uniformity,position_distance_3d,perimeter_to_area_ratio,area_to_perimeter_ratio,lesion_visibility_score,combined_anatomical_site,symmetry_border_consistency,consistency_symmetry_border,color_consistency,consistency_color,size_age_interaction,hue_color_std_interaction,lesion_severity_index,shape_complexity_index,color_contrast_index,log_lesion_area,normalized_lesion_size,mean_hue_difference,std_dev_contrast,color_shape_composite_index,lesion_orientation_3d,overall_color_difference,symmetry_perimeter_interaction,comprehensive_lesion_index,color_variance_ratio,border_color_interaction,border_color_interaction_2,size_color_contrast_ratio,age_normalized_nevi_confidence,age_normalized_nevi_confidence_2,color_asymmetry_index,volume_approximation_3d,color_range,shape_color_consistency,border_length_ratio,age_size_symmetry_index,index_age_size_symmetry,age_approx_patient_norm,...,mean_hue_difference_mean_diff_prev_1,mean_hue_difference_mean_diff_next_1,std_dev_contrast_mean_prev_1,std_dev_contrast_mean_next_1,std_dev_contrast_mean_diff_prev_1,std_dev_contrast_mean_diff_next_1,color_shape_composite_index_mean_prev_1,color_shape_composite_index_mean_next_1,color_shape_composite_index_mean_diff_prev_1,color_shape_composite_index_mean_diff_next_1,lesion_orientation_3d_mean_prev_1,lesion_orientation_3d_mean_next_1,lesion_orientation_3d_mean_diff_prev_1,lesion_orientation_3d_mean_diff_next_1,overall_color_difference_mean_prev_1,overall_color_difference_mean_next_1,overall_color_difference_mean_diff_prev_1,overall_color_difference_mean_diff_next_1,symmetry_perimeter_interaction_mean_prev_1,symmetry_perimeter_interaction_mean_next_1,symmetry_perimeter_interaction_mean_diff_prev_1,symmetry_perimeter_interaction_mean_diff_next_1,comprehensive_lesion_index_mean_prev_1,comprehensive_lesion_index_mean_next_1,comprehensive_lesion_index_mean_diff_prev_1,comprehensive_lesion_index_mean_diff_next_1,color_variance_ratio_mean_prev_1,color_variance_ratio_mean_next_1,color_variance_ratio_mean_diff_prev_1,color_variance_ratio_mean_diff_next_1,border_color_interaction_mean_prev_1,border_color_interaction_mean_next_1,border_color_interaction_mean_diff_prev_1,border_color_interaction_mean_diff_next_1,border_color_interaction_2_mean_prev_1,border_color_interaction_2_mean_next_1,border_color_interaction_2_mean_diff_prev_1,border_color_interaction_2_mean_diff_next_1,size_color_contrast_ratio_mean_prev_1,size_color_contrast_ratio_mean_next_1,size_color_contrast_ratio_mean_diff_prev_1,size_color_contrast_ratio_mean_diff_next_1,age_normalized_nevi_confidence_mean_prev_1,age_normalized_nevi_confidence_mean_next_1,age_normalized_nevi_confidence_mean_diff_prev_1,age_normalized_nevi_confidence_mean_diff_next_1,age_normalized_nevi_confidence_2_mean_prev_1,age_normalized_nevi_confidence_2_mean_next_1,age_normalized_nevi_confidence_2_mean_diff_prev_1,age_normalized_nevi_confidence_2_mean_diff_next_1,color_asymmetry_index_mean_prev_1,color_asymmetry_index_mean_next_1,color_asymmetry_index_mean_diff_prev_1,color_asymmetry_index_mean_diff_next_1,volume_approximation_3d_mean_prev_1,volume_approximation_3d_mean_next_1,volume_approximation_3d_mean_diff_prev_1,volume_approximation_3d_mean_diff_next_1,color_range_mean_prev_1,color_range_mean_next_1,color_range_mean_diff_prev_1,color_range_mean_diff_next_1,shape_color_consistency_mean_prev_1,shape_color_consistency_mean_next_1,shape_color_consistency_mean_diff_prev_1,shape_color_consistency_mean_diff_next_1,border_length_ratio_mean_prev_1,border_length_ratio_mean_next_1,border_length_ratio_mean_diff_prev_1,border_length_ratio_mean_diff_next_1,age_size_symmetry_index_mean_prev_1,age_size_symmetry_index_mean_next_1,age_size_symmetry_index_mean_diff_prev_1,age_size_symmetry_index_mean_diff_next_1,index_age_size_symmetry_mean_prev_1,index_age_size_symmetry_mean_next_1,index_age_size_symmetry_mean_diff_prev_1,index_age_size_symmetry_mean_diff_next_1,binary_target_mean,binary_target_mean_prev_1,binary_target_mean_next_1,binary_target_mean_diff_prev_1,binary_target_mean_diff_next_1,binary_target_mean_next_2,binary_target_mean_diff_next_2,binary_target_mean_right,binary_target_mean_prev_1_right,binary_target_mean_next_1_right,binary_target_mean_diff_prev_1_right,binary_target_mean_diff_next_1_right,binary_target_mean_next_2_right,binary_target_mean_diff_next_2_right,pred_tsuma_eva,pred_sub_67,pred_sub_68,pred_sub_69,pred_sub_71,pred_sub_72,pred_sub_73,pred_sub_75
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,16.261975,26.922447,23.954773,33.684638,28.953117,53.058545,55.828924,54.367448,62.025701,3.152561,27.47617,0.0,3.982447,2.967674,-7.658253,8.360566,5.784302,0.901302,Right Leg - Upper,Right Leg,1.543016,0.002628592,7.09136,0.0,9.307003,0.0,2.036195,2.63778,0.590476,85,-182.703552,613.493652,-42.427948,Memorial Sloan Kettering Cancer Center,CC-BY,,Benign,Benign,,,,,,,97.517282,0.507571,1.600149,0.036395,-2.770379,-7.658253,9.12775,7.681836,0.0,641.525666,2.952204,0.33873,5.784302,lower extremity_Right Leg - Upper,4.187279,0.545088,0.032828,1.971475,182.4,0.0,2.66422,7.718231,5.07617,1.423725,0.050667,54.443734,5.269909,9.355549,1.860241,-0.236044,5.495564,7.241987,0.0,0.0,0.0,0.52556,4.380987e-05,60.076964,0.0,2022.449088,14.608375,0.0,1.478676,107.702857,111.690749,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,0.0,,,,,,,0.029476,1.154462e-06,7e-06,6e-06,3e-06,3.2e-05,1.127719e-06,1e-05
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,25.36474,26.331,24.54929,41.21903,35.29926,39.70291,44.06404,48.86152,55.36236,0.919497,12.23529,0.0,6.34783,1.781713,-6.500838,6.839008,4.987244,0.639885,Head & Neck,Head & Neck,0.821918,1.334303e-07,2.116402,0.0,3.354148,0.0,0.853227,3.912844,0.285714,55,-0.078308,1575.687,57.1745,Memorial Sloan Kettering Cancer Center,CC-BY,IL_6727506,Benign,Benign,,,,,,,3.141455,0.747198,0.687046,0.081731,-4.36113,-6.50084,9.259068,2.402116,0.0,1576.723962,3.647807,0.274137,4.987244,head/neck_Head & Neck,0.604686,0.251731,0.015412,0.840277,66.0,0.0,0.918762,2.483847,6.615949,0.652063,0.018333,41.883475,5.345725,4.173668,1.570846,0.542902,0.958328,3.290222,0.0,0.0,0.0,0.220563,2.223838e-09,60.010082,0.0,1449.79311,14.63038,0.0,0.986739,18.857144,15.762808,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.125,,,,,,,0.00885,,,,,,,0.609087,0.001421404,0.000482,0.000738,0.001004,0.000318,0.0001837846,0.003136
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,17.12817,37.97046,33.48541,44.17492,37.6118,59.26585,62.90973,53.96118,61.67052,3.265153,24.18462,0.0,5.447655,4.485044,-7.709336,9.092376,6.290359,0.932147,Torso Back Top Third,Torso Back,1.194905,0.0002959177,4.798335,0.0,8.886309,0.0,1.743651,1.950777,0.361905,105,123.6497,1472.01,232.9089,Memorial Sloan Kettering Cancer Center,CC-BY,,Benign,Benign,,,,,,,99.80404,0.351443,1.147514,0.041349,-3.64388,-7.70934,10.451145,5.16024,0.0,1495.442825,2.72156,0.367436,6.290359,posterior torso_Torso Back Top Third,1.73654,0.336523,0.028274,1.695707,204.0,0.0,1.910161,5.201588,8.513722,1.450478,0.056667,61.08779,6.033971,8.182175,1.486992,0.741121,3.215998,6.369668,0.0,0.0,0.0,0.54051,4.931962e-06,60.096256,0.0,4882.849628,17.64205,0.0,1.387282,73.828579,70.900473,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,0.000532,,,,,,,0.001576,1.888572e-06,1.8e-05,8e-06,2e-06,1.5e-05,7.042271e-08,6e-06
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,12.164757,21.448144,21.121356,25.7462,24.374023,56.414429,60.060388,18.649518,23.314841,6.07994,14.889242,0.51452,2.077572,0.326788,-4.665323,4.783413,6.400196,0.654458,Torso Front Top Half,Torso Front,2.481328,21.98945,1.975874,1.771705,9.514499,0.66469,1.258541,1.573733,0.209581,130,-141.02478,1442.185791,58.359802,ACEMID MIA,CC-0,,Benign,Benign,,,,,,,99.989998,0.770599,4.685194,0.067163,-3.645959,-4.665323,5.117454,2.185455,0.774064,1450.239191,1.5649,0.639018,8.171901,anterior torso_Torso Front Top Half,0.414105,0.189482,0.05398,1.194084,209.3,29.026369,1.467346,2.252618,4.139233,1.957265,0.049538,58.237408,2.954563,5.204448,1.668272,-0.753654,1.994057,4.381246,0.326943,3.500666,0.934114,0.50311,0.3382993,65.079708,0.139306,8817.367196,7.069683,0.336732,1.088507,43.865269,82.825529,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,0.0,,,,,,,0.003175,5.989273e-07,1e-06,3e-06,1e-06,1e-05,3.237762e-06,1.4e-05
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,20.05747,26.4649,25.71046,36.21798,32.60874,46.94607,52.04118,46.27631,54.85574,2.101708,19.90256,0.0,4.668053,0.754434,-8.579431,9.148495,6.531302,0.946448,Torso Front Top Half,Torso Front,0.929916,0.001378832,3.658854,0.0,6.467562,0.0,2.085409,2.480509,0.313433,20,-72.31564,1488.72,21.42896,Memorial Sloan Kettering Cancer Center,CC-BY,,Benign,Benign,,,,,,,70.44251,0.340629,0.715902,0.050245,-5.09511,-8.57943,9.796251,3.972287,0.0,1490.629394,3.077289,0.324961,6.531302,anterior torso_Torso Front Top Half,1.146805,0.288701,0.038016,2.009033,150.15,0.0,1.535101,4.022532,3.374358,1.131953,0.049636,49.493625,5.655868,6.738664,1.619334,-1.052315,2.027146,5.29061,0.0,0.0,0.0,0.417987,2.506967e-05,55.067712,0.0,3132.867723,14.00192,0.0,1.258489,47.061935,36.230932,0.067495,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,0.0,,0.0,1.0,-1.0,0.004535,,0.666667,,-0.662132,0.0,0.004535,0.032961,4.06106e-06,4.3e-05,3.5e-05,6e-06,0.000111,0.0001609308,2.2e-05


In [23]:
import joblib

le_dict = {}
for c in cat_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[c]]))
    train_df[c] = le.transform(train_df[c])
    # test_df[c] = test_df[c].map(lambda s: '<unknown>' if s not in le.classes_ else s)

    le.classes_ = np.append(le.classes_, '<unknown>')
    # test_df[c] = le.transform(test_df[c])

    le_dict[c] = le

In [24]:
joblib.dump(le_dict, output_dir / "exp20" / "labelEncoder.joblib")


['/content/drive/MyDrive/kaggle/isic2024/output/exp20/labelEncoder.joblib']

In [25]:
target_col = 'target'
# target_col = 'has_lesion_id'


In [26]:
import gc


alpha = 0.25
gamma = 2.0

def focal_loss_lgb(y_true, y_pred, alpha, gamma):
    a, g = alpha, gamma
    y_pred = 1.0 / (1.0 + np.exp(-y_pred))  # sigmoid
    p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
    grad = -a * (1 - p_t) ** g * (y_true - y_pred)
    hess = a * (1 - p_t) ** g * y_pred * (1 - y_pred) * (g * p_t * np.log(p_t) + 1)
    return grad, hess

def focal_loss_lgb_eval(y_true, y_pred, alpha, gamma):
    a, g = alpha, gamma
    y_pred = 1.0 / (1.0 + np.exp(-y_pred))  # sigmoid
    p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
    loss = -a * (1 - p_t) ** g * np.log(p_t)
    return 'focal_loss', np.mean(loss), False


class Trainer(object):
    def __init__(self, X, y, X_valid, y_valid, params, predictors):
        self.X = X
        self.y = y
        self.X_valid = X_valid
        self.y_valid = y_valid
        self.params = params
        self.predictors = predictors

    def fit(self):
        oof_result = []
        X_train, X_valid = self.X, self.X_valid
        y_train, y_valid = self.y, self.y_valid

        sample_weight_train = X_train['sample_weight']
        sample_weight_valid = X_valid['sample_weight']

        dtrain = lgb.Dataset(
            X_train[self.predictors], label=y_train, feature_name=self.predictors, weight=sample_weight_train
        )
        dvalid = lgb.Dataset(
            X_valid[self.predictors], label=y_valid, feature_name=self.predictors, weight=sample_weight_valid
        )

        # もう使わないデータを削除
        del X_train, X_valid
        gc.collect()

        callbacks = [
            lgb.log_evaluation(100),
            lgb.early_stopping(100),
        ]

        clf = lgb.train(
            self.params,
            dtrain,
            valid_sets=[dtrain, dvalid],
            num_boost_round=100000,
            categorical_feature=[],
            callbacks=callbacks,
#             feval=pauc_80, # custome metric
            # fobj=lambda y_true, y_pred: focal_loss_lgb(y_true, y_pred, alpha, gamma),
            # feval=lambda y_true, y_pred: focal_loss_lgb_eval(y_true, y_pred, alpha, gamma),
        )
#         clf = lgb.cv(
#             self.params,
#             dtrain,
#             num_boost_round=100000,
#             categorical_feature=[],
#             callbacks=callbacks,
# #             feval=pauc_80, # custome metric
#             fobj=lambda y_true, y_pred: focal_loss_lgb(y_true, y_pred, alpha, gamma),
#             feval=lambda y_true, y_pred: focal_loss_lgb_eval(y_true, y_pred, alpha, gamma),
#         )
        oof_result.append([x for x in clf.best_score["valid_1"].values()][0])
        self.clf = clf
        self.oof_result = oof_result
        return clf, oof_result

    def fit_xgb(self):
        oof_result = []
        X_train, X_valid = self.X, self.X_valid
        y_train, y_valid = self.y, self.y_valid

        X_train = X_train.replace([np.inf, -np.inf], np.nan)
        X_valid = X_valid.replace([np.inf, -np.inf], np.nan)

        X_train = X_train.fillna(0)
        X_valid = X_valid.fillna(0)

        sample_weight_train = X_train['sample_weight']
        sample_weight_valid = X_valid['sample_weight']

        X_train[self.predictors] = X_train[self.predictors].astype(np.float32)
        X_valid[self.predictors] = X_valid[self.predictors].astype(np.float32)

        dtrain = xgb.DMatrix(X_train[self.predictors], label=y_train, weight=sample_weight_train)
        dvalid = xgb.DMatrix(X_valid[self.predictors], label=y_valid, weight=sample_weight_valid)
        # dvalid = xgb.DMatrix(X_valid[self.predictors], label=y_valid)

        # もう使わないデータを削除
        del X_train, X_valid
        gc.collect()

        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

        clf = xgb.train(
            self.params,
            dtrain,
            10000,
            early_stopping_rounds=100,
            evals=watchlist,
            verbose_eval=50,
        )

        self.clf = clf
        self.oof_result = oof_result
        return clf, oof_result


In [27]:
params_xgb = {
    'objective':  'binary:logistic',
    'eval_metric': 'logloss', # auc, logloss
    # 'objective': 'reg:squarederror',  # binary:logistic から reg:squarederror に変更
    # 'eval_metric': 'rmse',
    "nthread": -1,
    "learning_rate" : 0.003,
    'colsample_bytree': 0.5,
    # 'colsample_bytree': 0.5,
    # 'subsample': 0.8,
    'subsample': 0.6,
    'max_depth': 7,
    # 'lambda': 8,
    'lambda': 5,
    'tree_method':"hist",
    # 'scale_pos_weight':20
    # 'scale_pos_weight':7,
    'scale_pos_weight':8,

    'tree_method': 'hist',  # GPU対応のツリービルディングアルゴリズム
    'device': 'cuda',  # GPUを使用するデバイスとしてCUDAを指定
}

In [28]:
n_splits = 5
gkf = StratifiedGroupKFold(n_splits=n_splits, random_state=42, shuffle=True)
train_df["fold"] = -1

for fold, (train_idx, val_idx) in enumerate(
    gkf.split(train_df, train_df['target'], groups=train_df["patient_id"])
):
    train_df.loc[val_idx, "fold"] = fold

In [29]:
train_df["has_lesion_id"] = (train_df["lesion_id"] != '').astype('int32')


In [30]:
def score(solution: np.ndarray, submission: np.ndarray, min_tpr: float=0.80) -> float:
    v_gt = abs(solution-1)
    v_pred = np.array([1.0 - x for x in submission])
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return(partial_auc)

In [31]:
xgb_clfs = []
for fold in range(5):
    X = train_df[train_df["fold"] != fold]
    X_valid = train_df[train_df["fold"] == fold]

    X['sample_weight'] = 1.0
    # X.loc[X['lesion_id'].notnull() & (X['target'] == 0), 'sample_weight'] = 0.1
    # X.loc[X['lesion_id'].notnull(), 'target'] = 1
    X.loc[X['iddx_1'] == 'Indeterminate', 'target'] = 1
    X.loc[X['iddx_1'] == 'Indeterminate', 'sample_weight'] = 0.5

    X_valid['sample_weight'] = 1.0
    # X_valid.loc[X_valid['lesion_id'].notnull() & (X_valid['target'] == 0), 'sample_weight'] = 0.1
    # X_valid.loc[X_valid['lesion_id'].notnull(), 'target'] = 1
    X_valid.loc[X_valid['iddx_1'] == 'Indeterminate', 'target'] = 1
    X_valid.loc[X_valid['iddx_1'] == 'Indeterminate', 'sample_weight'] = 0.5

    y = X[target_col].astype(float)
    y_valid = X_valid[target_col].astype(float)

    trainer = Trainer(X, y, X_valid, y_valid, params_xgb, feature_cols)
    trainer.fit_xgb()
    xgb_clfs.append(trainer.clf)

    trainer.clf.save_model(output_dir / 'exp20' / f"xgb_fold_{fold}.json")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.13283	eval-logloss:0.13323
[50]	train-logloss:0.11400	eval-logloss:0.11463
[100]	train-logloss:0.09806	eval-logloss:0.09890
[150]	train-logloss:0.08454	eval-logloss:0.08557
[200]	train-logloss:0.07305	eval-logloss:0.07424
[250]	train-logloss:0.06325	eval-logloss:0.06460
[300]	train-logloss:0.05488	eval-logloss:0.05636
[350]	train-logloss:0.04769	eval-logloss:0.04929
[400]	train-logloss:0.04152	eval-logloss:0.04324
[450]	train-logloss:0.03622	eval-logloss:0.03805
[500]	train-logloss:0.03167	eval-logloss:0.03360
[550]	train-logloss:0.02775	eval-logloss:0.02977
[600]	train-logloss:0.02436	eval-logloss:0.02647
[650]	train-logloss:0.02143	eval-logloss:0.02363
[700]	train-logloss:0.01890	eval-logloss:0.02118
[750]	train-logloss:0.01671	eval-logloss:0.01907
[800]	train-logloss:0.01482	eval-logloss:0.01725
[850]	train-logloss:0.01318	eval-logloss:0.01569
[900]	train-logloss:0.01174	eval-logloss:0.01433
[950]	train-logloss:0.01051	eval-logloss:0.01317
[1000]	train-logloss:0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.13314	eval-logloss:0.13299
[50]	train-logloss:0.11427	eval-logloss:0.11442
[100]	train-logloss:0.09831	eval-logloss:0.09869
[150]	train-logloss:0.08477	eval-logloss:0.08535
[200]	train-logloss:0.07325	eval-logloss:0.07401
[250]	train-logloss:0.06343	eval-logloss:0.06435
[300]	train-logloss:0.05505	eval-logloss:0.05612
[350]	train-logloss:0.04785	eval-logloss:0.04906
[400]	train-logloss:0.04169	eval-logloss:0.04300
[450]	train-logloss:0.03638	eval-logloss:0.03782
[500]	train-logloss:0.03181	eval-logloss:0.03336
[550]	train-logloss:0.02787	eval-logloss:0.02951
[600]	train-logloss:0.02447	eval-logloss:0.02622
[650]	train-logloss:0.02154	eval-logloss:0.02337
[700]	train-logloss:0.01899	eval-logloss:0.02092
[750]	train-logloss:0.01679	eval-logloss:0.01879
[800]	train-logloss:0.01488	eval-logloss:0.01696
[850]	train-logloss:0.01323	eval-logloss:0.01539
[900]	train-logloss:0.01179	eval-logloss:0.01402
[950]	train-logloss:0.01055	eval-logloss:0.01284
[1000]	train-logloss:0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.13339	eval-logloss:0.13278
[50]	train-logloss:0.11452	eval-logloss:0.11423
[100]	train-logloss:0.09857	eval-logloss:0.09851
[150]	train-logloss:0.08503	eval-logloss:0.08515
[200]	train-logloss:0.07354	eval-logloss:0.07379
[250]	train-logloss:0.06374	eval-logloss:0.06411
[300]	train-logloss:0.05535	eval-logloss:0.05584
[350]	train-logloss:0.04814	eval-logloss:0.04874
[400]	train-logloss:0.04196	eval-logloss:0.04266
[450]	train-logloss:0.03666	eval-logloss:0.03745
[500]	train-logloss:0.03210	eval-logloss:0.03296
[550]	train-logloss:0.02816	eval-logloss:0.02911
[600]	train-logloss:0.02477	eval-logloss:0.02580
[650]	train-logloss:0.02184	eval-logloss:0.02294
[700]	train-logloss:0.01930	eval-logloss:0.02046
[750]	train-logloss:0.01711	eval-logloss:0.01834
[800]	train-logloss:0.01521	eval-logloss:0.01649
[850]	train-logloss:0.01355	eval-logloss:0.01491
[900]	train-logloss:0.01212	eval-logloss:0.01355
[950]	train-logloss:0.01088	eval-logloss:0.01237
[1000]	train-logloss:0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.13286	eval-logloss:0.13315
[50]	train-logloss:0.11402	eval-logloss:0.11452
[100]	train-logloss:0.09806	eval-logloss:0.09875
[150]	train-logloss:0.08453	eval-logloss:0.08539
[200]	train-logloss:0.07302	eval-logloss:0.07403
[250]	train-logloss:0.06321	eval-logloss:0.06435
[300]	train-logloss:0.05482	eval-logloss:0.05609
[350]	train-logloss:0.04762	eval-logloss:0.04902
[400]	train-logloss:0.04145	eval-logloss:0.04296
[450]	train-logloss:0.03614	eval-logloss:0.03776
[500]	train-logloss:0.03159	eval-logloss:0.03331
[550]	train-logloss:0.02765	eval-logloss:0.02947
[600]	train-logloss:0.02426	eval-logloss:0.02618
[650]	train-logloss:0.02133	eval-logloss:0.02335
[700]	train-logloss:0.01880	eval-logloss:0.02091
[750]	train-logloss:0.01661	eval-logloss:0.01881
[800]	train-logloss:0.01471	eval-logloss:0.01700
[850]	train-logloss:0.01307	eval-logloss:0.01543
[900]	train-logloss:0.01164	eval-logloss:0.01408
[950]	train-logloss:0.01041	eval-logloss:0.01292
[1000]	train-logloss:0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


[0]	train-logloss:0.13297	eval-logloss:0.13309
[50]	train-logloss:0.11415	eval-logloss:0.11443
[100]	train-logloss:0.09822	eval-logloss:0.09863
[150]	train-logloss:0.08470	eval-logloss:0.08523
[200]	train-logloss:0.07321	eval-logloss:0.07386
[250]	train-logloss:0.06341	eval-logloss:0.06416
[300]	train-logloss:0.05502	eval-logloss:0.05587
[350]	train-logloss:0.04783	eval-logloss:0.04878
[400]	train-logloss:0.04167	eval-logloss:0.04271
[450]	train-logloss:0.03637	eval-logloss:0.03750
[500]	train-logloss:0.03181	eval-logloss:0.03302
[550]	train-logloss:0.02788	eval-logloss:0.02917
[600]	train-logloss:0.02448	eval-logloss:0.02584
[650]	train-logloss:0.02155	eval-logloss:0.02299
[700]	train-logloss:0.01900	eval-logloss:0.02053
[750]	train-logloss:0.01682	eval-logloss:0.01842
[800]	train-logloss:0.01492	eval-logloss:0.01659
[850]	train-logloss:0.01327	eval-logloss:0.01502
[900]	train-logloss:0.01185	eval-logloss:0.01367
[950]	train-logloss:0.01061	eval-logloss:0.01250
[1000]	train-logloss:0.

In [32]:
xgb_oof_df = train_df[['isic_id', target_col]].copy()


In [33]:
xgb_oof_scores = []
for fold in range(5):
    X = train_df[train_df["fold"] != fold]
    X_valid = train_df[train_df["fold"] == fold].copy()

    X_valid = X_valid.replace([np.inf, -np.inf], np.nan)
    X_valid = X_valid.fillna(0)

    dvalid = xgb.DMatrix(X_valid[feature_cols])

    tmp = xgb_clfs[fold].predict(dvalid)
    print(score(X_valid[target_col], tmp))
    xgb_oof_scores.append(score(X_valid[target_col], tmp))

    xgb_oof_df.loc[X_valid.index, 'pred'] = tmp


0.17467765859515771
0.18312691978418447
0.19191639277351719
0.1726309125313984
0.1896041029364291


In [34]:
print(np.mean(xgb_oof_scores))

0.1823911973241374


In [35]:
print(score(xgb_oof_df[target_col], xgb_oof_df['pred']))

0.18214841246068292


In [36]:
# params = {
#     'objective': 'binary',
#     "boosting_type": "gbdt",
#     "verbosity": -1,
#     'learning_rate': 0.005,
#     'bagging_freq': 1,
#     'feature_fraction': 0.6, # 小さめでも問題ない?
#     'subsample': 0.6, # 小さめでも問題ない?
#     'lambda_l1': 1.5,
#     'lambda_l2': 2.5,
#     'num_leaves': 32,
#     "min_data_in_leaf": 20,
#     'scale_pos_weight': 7,

#     # GPU対応のパラメータ
#     'device': 'gpu',  # GPUを使用
#     'max_bin': 127,  # GPUに適したbinの数
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
# }
# 0.17056919821372732　# こっちの方がxgboostと比較して


# params = {
#     'objective': 'binary',
#     "boosting_type": "gbdt",
#     "verbosity": -1,
#     'learning_rate': 0.005,
#     'bagging_freq': 1,
#     'feature_fraction': 0.6, # 小さめでも問題ない?
#     'subsample': 0.6, # 小さめでも問題ない?
#     'lambda_l1': 1.5,
#     'lambda_l2': 3.5,
#     'num_leaves': 32,
#     "min_data_in_leaf": 20,
#     'scale_pos_weight': 7,

#     # GPU対応のパラメータ
#     'device': 'gpu',  # GPUを使用
#     'max_bin': 127,  # GPUに適したbinの数
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
# }

# 0.17079946240654348

# params = {
#     'objective': 'binary',
#     "boosting_type": "gbdt",
#     "verbosity": -1,
#     'learning_rate': 0.005,
#     'bagging_freq': 2,
#     # 'bagging_freq': 1,
#     # 'feature_fraction': 0.6,
#     'feature_fraction': 0.6, # 小さめでも問題ない?
#     'subsample': 0.6, # 小さめでも問題ない?
#     'lambda_l1': 1.5,
#     'lambda_l2': 5.5,
#     'num_leaves': 32,
#     "min_data_in_leaf": 20,
#     'scale_pos_weight': 7,

#     # GPU対応のパラメータ
#     'device': 'gpu',  # GPUを使用
#     'max_bin': 127,  # GPUに適したbinの数
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
# }

# 0.17083970710395688


# params = {
#     'objective': 'binary',
#     "boosting_type": "gbdt",
#     "verbosity": -1,
#     'learning_rate': 0.005,
#     'bagging_freq': 2,
#     'feature_fraction': 0.6, # 小さめでも問題ない?
#     'subsample': 0.6, # 小さめでも問題ない?
#     'lambda_l1': 1.5,
#     'lambda_l2': 5.5,
#     'num_leaves': 32,
#     "min_data_in_leaf": 20,
#     'scale_pos_weight': 10,

#     # GPU対応のパラメータ
#     'device': 'gpu',  # GPUを使用
#     'max_bin': 127,  # GPUに適したbinの数
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
# }

# 0.1550121579376953

# params = {
#     'objective': 'binary',
#     "boosting_type": "gbdt",
#     "verbosity": -1,
#     'learning_rate': 0.005,
#     'bagging_freq': 2,
#     # 'bagging_freq': 1,
#     # 'feature_fraction': 0.6,
#     'feature_fraction': 0.6, # 小さめでも問題ない?
#     'subsample': 0.6, # 小さめでも問題ない?
#     'lambda_l1': 1.5,
#     'lambda_l2': 5.5,
#     'num_leaves': 32,
#     "min_data_in_leaf": 20,
#     'scale_pos_weight': 5,

#     # GPU対応のパラメータ
#     'device': 'gpu',  # GPUを使用
#     'max_bin': 127,  # GPUに適したbinの数
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
# }

# 0.17098928883917178


# params = {
#     'objective': 'binary',
#     "boosting_type": "gbdt",
#     "verbosity": -1,
#     'learning_rate': 0.005,
#     'bagging_freq': 2,
#     # 'bagging_freq': 1,
#     # 'feature_fraction': 0.6,
#     'feature_fraction': 0.6, # 小さめでも問題ない?
#     'subsample': 0.6, # 小さめでも問題ない?
#     'lambda_l1': 1.5,
#     'lambda_l2': 5.5,
#     'num_leaves': 32,
#     "min_data_in_leaf": 20,
#     'scale_pos_weight': 3,

#     # GPU対応のパラメータ
#     'device': 'gpu',  # GPUを使用
#     'max_bin': 127,  # GPUに適したbinの数
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
# }

# 0.17032848957884608


In [37]:
params = {
    'objective': 'binary',
    "boosting_type": "gbdt",
    "verbosity": -1,
    'learning_rate': 0.005,
    'bagging_freq': 2,
    # 'bagging_freq': 1,
    # 'feature_fraction': 0.6,
    'feature_fraction': 0.6, # 小さめでも問題ない?
    'subsample': 0.6, # 小さめでも問題ない?
    'lambda_l1': 1.5,
    'lambda_l2': 5.5,
    'num_leaves': 32,
    "min_data_in_leaf": 20,
    # 'scale_pos_weight': 5,

    # GPU対応のパラメータ
    'device': 'gpu',  # GPUを使用
    'max_bin': 127,  # GPUに適したbinの数
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
}


In [None]:
clfs = []
for fold in range(5):
    X = train_df[train_df["fold"] != fold]
    X_valid = train_df[train_df["fold"] == fold]

    X['sample_weight'] = 1.0
    # # X.loc[X['lesion_id'].notnull() & (X['target'] == 0), 'sample_weight'] = 0.1
    # # X.loc[X['lesion_id'].notnull(), 'target'] = 1
    # X.loc[X['iddx_1'] == 'Indeterminate', 'target'] = 1
    # X.loc[X['iddx_1'] == 'Indeterminate', 'sample_weight'] = 0.5

    X_valid['sample_weight'] = 1.0
    # # X_valid.loc[X_valid['lesion_id'].notnull() & (X_valid['target'] == 0), 'sample_weight'] = 0.1
    # # X_valid.loc[X_valid['lesion_id'].notnull(), 'target'] = 1
    # X_valid.loc[X_valid['iddx_1'] == 'Indeterminate', 'target'] = 1
    # X_valid.loc[X_valid['iddx_1'] == 'Indeterminate', 'sample_weight'] = 0.5

    y = X['target'].astype(float)
    y_valid = X_valid['target'].astype(float)


    lgbm_trainer = Trainer(X, y, X_valid, y_valid, params, feature_cols)
    lgbm_trainer.fit()
    clfs.append(lgbm_trainer.clf)

    # lgbm_trainer.clf.save_model(OUTOUT_DIR / f"lgb_fold_{fold}.json")
    # lgbm_trainer.clf.save_model(output_dir / 'exp1' / f"lgb_fold_{fold}.json")
    lgbm_trainer.clf.save_model(output_dir / 'exp18' / f"lgb_fold_{fold}.json")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sample_weight'] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sample_weight'] = 1.0


In [None]:
oof_df = train_df[['isic_id', 'target']].copy()

In [None]:
# valid
oof_scores = []
for fold in range(5):
    X = train_df[train_df["fold"] != fold]
    X_valid = train_df[train_df["fold"] == fold]
    tmp = clfs[fold].predict(X_valid[feature_cols])
    print(score(X_valid['target'], tmp))
    oof_scores.append(score(X_valid['target'], tmp))

    oof_df.loc[X_valid.index, 'pred'] = tmp

In [None]:
np.mean(oof_scores)

In [None]:
score(oof_df['target'], oof_df['pred'])

In [None]:
importance_df = []
for e, c in enumerate(clfs):
    importance_df_tmp = pd.DataFrame({
        'fold':e,
        'feature_name':c.feature_name(),
        'imporatance_gain':c.feature_importance(importance_type='gain')
    })
    importance_df.append(importance_df_tmp)
importance_df = pd.concat(importance_df)

In [None]:
importance_df = importance_df.sort_values('imporatance_gain', ascending=False)

In [None]:
sns.boxplot(data=importance_df[:50], x="imporatance_gain", y="feature_name")

In [None]:
# ensemble
ensemble_oof_scores = []
ensemble_oof_df = train_df[['isic_id', 'target']].copy()


for fold in range(5):
    X = train_df[train_df["fold"] != fold]
    X_valid = train_df[train_df["fold"] == fold].copy()

    # xgb
    X_valid = X_valid.replace([np.inf, -np.inf], np.nan)
    X_valid = X_valid.fillna(0)

    dvalid = xgb.DMatrix(X_valid[feature_cols], enable_categorical=True)

    tmp_xgb = xgb_clfs[fold].predict(dvalid)
    print(f'fold {fold}', 'xgb', score(X_valid['target'], tmp_xgb))

    # lgb
    X_valid = train_df[train_df["fold"] == fold].copy()
    tmp_lgb = clfs[fold].predict(X_valid[feature_cols])
    print(f'fold {fold}', 'lgb', score(X_valid['target'], tmp_lgb))

    blend_tmp =  tmp_xgb * 0.95 + tmp_lgb * 0.05
    print(f'fold {fold}', 'blend', score(X_valid['target'], blend_tmp))

    ensemble_oof_scores.append(score(X_valid['target'], blend_tmp))

    ensemble_oof_df.loc[X_valid.index, 'pred'] = blend_tmp
    ensemble_oof_df.loc[X_valid.index, 'pred_xgb'] = tmp_xgb
    ensemble_oof_df.loc[X_valid.index, 'pred_lgb'] = tmp_lgb


In [None]:
np.mean(ensemble_oof_scores)

In [None]:
score(ensemble_oof_df['target'], ensemble_oof_df['pred'])

In [None]:
ensemble_oof_df.head()

In [None]:
ensemble_oof_df.to_csv(output_dir / 'exp18' / 'ensemble_oof_df.csv', index=False)


In [None]:
np.mean(ensemble_oof_scores)

In [None]:
from matplotlib import pyplot as plt

fpr_train, tpr_train, thresholds_train = roc_curve(ensemble_oof_df['target'], ensemble_oof_df['pred'], drop_intermediate=False)
plt.plot(fpr_train, tpr_train, color="blue", label="train")
plt.plot([0, 1], [0.8, 0.8], color="green", label="tpr=0.8")
plt.xlabel('FPR: False positive rate')
plt.ylabel('TPR: True positive rate')
plt.xlim(0, 1.02)
plt.ylim(0, 1.02)
plt.grid()
plt.legend()
plt.show()


In [None]:
len(train_df.columns)

In [None]:
# params_catboost = {
#     'objective': 'Logloss',  # 二値分類の目的関数
#     'eval_metric': 'Logloss',  # AUCやF1に変更可能
#     'learning_rate': 0.004,  # XGBoostとLightGBMの間を取って0.004に設定
#     'depth': 7,  # max_depthに対応
#     'colsample_bylevel': 0.5,  # colsample_bytreeに対応するパラメータ
#     'subsample': 0.6,  # XGBoostのsubsampleに対応
#     'l2_leaf_reg': 5.0,  # L2正則化（lambdaに対応）
#     'scale_pos_weight': 8,  # クラス不均衡の調整
#     'bootstrap_type': 'Bernoulli',  # subsampleを有効にするために必要
#     'boosting_type': 'Plain',  # LightGBMの"gbdt"に相当
#     'max_bin': 127,  # LightGBMに対応するパラメータ
#     'task_type': 'GPU',  # GPUを使用
#     'devices': '0:0',  # GPUデバイスIDの指定
#     'verbose': False  # ログの出力を制限
# }
