In [None]:
# !pip install polars pyarrow xgboost lightgbm

In [None]:
# Mount Google Drive at /content/drive so later cells can read files from it (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Upgrade polars and optuna to their latest releases; pip's stdout is redirected to /dev/null.
# NOTE(review): versions are not pinned, so a re-run may pick up different releases — consider pinning.
!pip install -U polars optuna >> /dev/null
# !pip install lightgbm --install-option=--gpu

In [None]:
# Create the OpenCL vendors directory and write an ICD file that names the NVIDIA OpenCL library.
# NOTE(review): presumably required for a GPU-enabled LightGBM build on Colab — confirm.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score
import xgboost as xgb
import lightgbm as lgb

import seaborn as sns
import optuna

# Raise pandas' display limits: show up to 200 columns and up to 200 rows
# when a DataFrame is rendered.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 200)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Folder on the mounted Drive that holds every input file read by this notebook.
base_path = Path('/content/drive/MyDrive/kaggle/isic2024')

In [None]:
# Load the raw training metadata into a pandas DataFrame.
# NOTE(review): the captured cell output echoes this line, which suggests pandas emitted a
# warning here (possibly a mixed-dtype DtypeWarning) — confirm and consider explicit dtypes.
df = pd.read_csv(base_path / "train-metadata.csv")

  df = pd.read_csv(base_path / "train-metadata.csv")


In [None]:
# df.isnull().sum()

In [None]:
# df['file_exists'] = df['file_path'].apply(lambda x: os.path.exists(x))
# df = df[df['file_exists']].drop(columns=['file_exists'])

# Store the label as a 32-bit integer.
df["target"] = df["target"].astype('int32')

# 1 when the row carries a lesion_id, 0 when it is missing.
# Built with notnull() on purpose: `~series.isnull().astype('int32')` casts to int first and
# then applies bitwise NOT to the integers, producing -1 / -2 instead of a 0 / 1 flag.
df["has_lesion_id"] = df["lesion_id"].notnull().astype('int32')


In [None]:
# Working directory and the folder (under base_path) intended for outputs.
root = Path('./')
output_dir = base_path / 'output'

# Competition input files.
train_path = base_path / 'train-metadata.csv'
test_path = base_path / 'test-metadata.csv'
subm_path = base_path / 'sample_submission.csv'

# Column roles: row identifier, label, and the grouping key (patient).
id_col = 'isic_id'
target_col = 'target'
group_col = 'patient_id'

# Small constant added to denominators in read_data() to avoid division by zero.
err = 1e-5
# NOTE(review): not used in the cells visible here — presumably a negative down-sampling ratio; confirm.
sampling_ratio = 0.01
# Random seed (not referenced in the cells visible here).
seed = 42

# Raw numeric metadata columns used as features. Descriptions are carried over from the
# original inline notes.
num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).
    'tbp_lv_A',                          # A inside lesion.
    'tbp_lv_Aext',                       # A outside lesion.
    'tbp_lv_B',                          # B inside lesion.
    'tbp_lv_Bext',                       # B outside lesion.
    'tbp_lv_C',                          # Chroma inside lesion.
    'tbp_lv_Cext',                       # Chroma outside lesion.
    'tbp_lv_H',                          # Hue inside the lesion; the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).
    'tbp_lv_Hext',                       # Hue outside lesion.
    'tbp_lv_L',                          # L inside lesion.
    'tbp_lv_Lext',                       # L outside lesion.
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).
    'tbp_lv_area_perim_ratio',           # Border jaggedness: ratio between the lesion's perimeter and area. Circular lesions have low values, irregular ones higher values. Values range 0-10.
    'tbp_lv_color_std_mean',             # Color irregularity: variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).
    'tbp_lv_deltaLB',                    # (no description provided)
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin: average delta LB of the lesion relative to its immediate background in LAB* color space. Low for faint lesions such as freckles, high for darker pigment. Typical values range from 5.5 to 25.
    'tbp_lv_eccentricity',               # Eccentricity.
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100): a CNN classifier's estimated probability that the lesion is a nevus. The network was trained on approximately 57,000 lesions classified and labeled by a dermatologist.
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).
    'tbp_lv_radial_color_std_max',       # Color asymmetry: average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.
    'tbp_lv_stdL',                       # Standard deviation of L inside lesion.
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.
    'tbp_lv_symm_2axis',                 # Border asymmetry about the axis perpendicular to the lesion's most symmetric axis. Computed by comparing opposite halves of the contour over many rotation angles; the angle where the halves are most similar gives the principal axis, and the asymmetry about the perpendicular (second) axis is reported. Lesions with two axes of symmetry score low. Values range 0-10.
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.
]

# Engineered numeric features created in read_data(). Each comment gives the formula exactly as
# read_data() computes it.
new_num_cols = [
    'lesion_size_ratio',             # tbp_lv_minorAxisMM / clin_size_long_diam_mm
    'lesion_size_ratio_with_area',   # tbp_lv_minorAxisMM / clin_size_long_diam_mm * tbp_lv_areaMM2
    'lesion_shape_index',            # tbp_lv_areaMM2 / tbp_lv_perimeterMM ** 2
    'hue_contrast',                  # tbp_lv_H - tbp_lv_Hext  (signed; the abs() variant is commented out in read_data)
    'luminance_contrast',            # tbp_lv_L - tbp_lv_Lext  (signed; the abs() variant is commented out in read_data)
    'lesion_color_difference',       # sqrt(tbp_lv_deltaA ** 2 + tbp_lv_deltaB ** 2 + tbp_lv_deltaL ** 2)
    'border_complexity',             # tbp_lv_norm_border + tbp_lv_symm_2axis
    'color_uniformity',              # tbp_lv_color_std_mean / (tbp_lv_radial_color_std_max + err)

    'position_distance_3d',          # sqrt(tbp_lv_x ** 2 + tbp_lv_y ** 2 + tbp_lv_z ** 2)
    'perimeter_to_area_ratio',       # tbp_lv_perimeterMM / tbp_lv_areaMM2
    'area_to_perimeter_ratio',       # tbp_lv_areaMM2 / tbp_lv_perimeterMM
    'lesion_visibility_score',       # tbp_lv_deltaLBnorm + tbp_lv_norm_color
    'symmetry_border_consistency',   # tbp_lv_symm_2axis * tbp_lv_norm_border
    'consistency_symmetry_border',   # tbp_lv_symm_2axis * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',             # tbp_lv_stdL / tbp_lv_Lext
    'consistency_color',             # tbp_lv_stdL * tbp_lv_Lext / (tbp_lv_stdL + tbp_lv_Lext)
    'size_age_interaction',          # clin_size_long_diam_mm * age_approx
    'hue_color_std_interaction',     # tbp_lv_H * tbp_lv_color_std_mean
    'lesion_severity_index',         # (tbp_lv_norm_border + tbp_lv_norm_color + tbp_lv_eccentricity) / 3
    'shape_complexity_index',        # border_complexity + lesion_shape_index
    'color_contrast_index',          # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',               # log(tbp_lv_areaMM2 + 1)
    'normalized_lesion_size',        # clin_size_long_diam_mm / age_approx
    'mean_hue_difference',           # (tbp_lv_H + tbp_lv_Hext) / 2
    'std_dev_contrast',              # sqrt((tbp_lv_deltaA ** 2 + tbp_lv_deltaB ** 2 + tbp_lv_deltaL ** 2) / 3)
    'color_shape_composite_index',   # (tbp_lv_color_std_mean + tbp_lv_area_perim_ratio + tbp_lv_symm_2axis) / 3
    'lesion_orientation_3d',         # arctan2(tbp_lv_y, tbp_lv_x)
    'overall_color_difference',      # (tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL) / 3

    'symmetry_perimeter_interaction',# tbp_lv_symm_2axis * tbp_lv_perimeterMM
    'comprehensive_lesion_index',    # (tbp_lv_area_perim_ratio + tbp_lv_eccentricity + tbp_lv_norm_color + tbp_lv_symm_2axis) / 4
    'color_variance_ratio',          # tbp_lv_color_std_mean / tbp_lv_stdLExt
    'border_color_interaction',      # tbp_lv_norm_border * tbp_lv_norm_color
    'border_color_interaction_2',    # tbp_lv_norm_border * tbp_lv_norm_color / (tbp_lv_norm_border + tbp_lv_norm_color)
    'size_color_contrast_ratio',     # clin_size_long_diam_mm / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',# tbp_lv_nevi_confidence / age_approx
    'age_normalized_nevi_confidence_2',  # sqrt(clin_size_long_diam_mm ** 2 + age_approx ** 2)  NOTE(review): formula does not involve nevi confidence despite the name
    'color_asymmetry_index',         # tbp_lv_radial_color_std_max * tbp_lv_symm_2axis

    'volume_approximation_3d',       # tbp_lv_areaMM2 * sqrt(tbp_lv_x ** 2 + tbp_lv_y ** 2 + tbp_lv_z ** 2)
    'color_range',                   # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',       # tbp_lv_eccentricity * tbp_lv_color_std_mean
    'border_length_ratio',           # tbp_lv_perimeterMM / (2 * pi * sqrt(tbp_lv_areaMM2 / pi))
    'age_size_symmetry_index',       # age_approx * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',       # age_approx * tbp_lv_areaMM2 * tbp_lv_symm_2axis
]

# Columns cast to polars Categorical at the end of read_data().
cat_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple', 'attribution']
# Names of the group z-score columns produced by read_data(), one list per grouping:
# patient, attribution, patient x location, attribution x location, location.
norm_cols1 = [f'{col}_patient_norm' for col in num_cols + new_num_cols]
norm_cols2 = [f'{col}_attribution_norm' for col in num_cols + new_num_cols]
norm_cols3 = [f'{col}_patient_lv_location_norm' for col in num_cols + new_num_cols]
norm_cols4 = [f'{col}_attribution_lv_location_norm' for col in num_cols + new_num_cols]
norm_cols5 = [f'{col}_lv_location_norm' for col in num_cols + new_num_cols]

# Names of the per-patient and per-patient-per-location summary statistics produced by
# feature_engineering_patient_id_pl().
agg_list = ['mean', 'std', 'max', 'min', 'skew']
additinal_features1 = [f'{col}_patient_{agg}' for col in num_cols + new_num_cols for agg in agg_list]
additinal_features2 = [f'{col}_patient_lv_location_{agg}' for col in num_cols + new_num_cols for agg in agg_list]

# Row-count features (feature_engineering_patient_id_pl) and age features (feature_engineering_age_pl).
cnt_features = ['isic_id_count', 'isic_id_count_patient_lv_location']
age_features = ['unique_age_approx_count', 'age_min_diff', 'age_max_diff', 'age_min_max']
# Prediction columns from other models; every entry is currently commented out, so the list is empty.
image_cols = [
    # 'pred_fyk',
    # 'pred_tsuma_image',
    # 'pred_tsuma_eff',
    # 'pred_tsuma_eva',
    # 'pred_hatry',
]
# Hard-coded names of the per-patient category-count columns, in the '<column>_<value>_count'
# format that feature_engineering_patient_id_pl() generates from the pivoted category values.
# NOTE(review): the generated names depend on the values present in the data — verify that this
# list still matches whenever the input data changes.
pivot_cnt_features = ['anatom_site_general_anterior torso_count',
 'anatom_site_general_upper extremity_count',
 'anatom_site_general_lower extremity_count',
 'anatom_site_general_posterior torso_count',
 'anatom_site_general_head/neck_count',
 'anatom_site_general__count',
 'combined_anatomical_site_lower extremity_Left Leg - Lower_count',
 'combined_anatomical_site_anterior torso_Torso Front Top Half_count',
 'combined_anatomical_site_upper extremity_Right Arm - Lower_count',
 'combined_anatomical_site_upper extremity_Left Arm - Lower_count',
 'combined_anatomical_site_head/neck_Head & Neck_count',
 'combined_anatomical_site_anterior torso_Torso Front Bottom Half_count',
 'combined_anatomical_site_lower extremity_Right Leg - Lower_count',
 'combined_anatomical_site_posterior torso_Torso Back Bottom Third_count',
 'combined_anatomical_site_upper extremity_Right Arm - Upper_count',
 'combined_anatomical_site_lower extremity_Right Leg - Upper_count',
 'combined_anatomical_site_posterior torso_Torso Back Top Third_count',
 'combined_anatomical_site_upper extremity_Left Arm - Upper_count',
 'combined_anatomical_site_lower extremity_Left Leg - Upper_count',
 'combined_anatomical_site_posterior torso_Torso Back Middle Third_count',
 'combined_anatomical_site_upper extremity_Left Arm_count',
 'combined_anatomical_site_lower extremity_Right Leg_count',
 'combined_anatomical_site_lower extremity_Left Leg_count',
 'combined_anatomical_site_upper extremity_Right Arm_count',
 'combined_anatomical_site__Unknown_count',
 'combined_anatomical_site_posterior torso_Torso Back_count',
 'combined_anatomical_site_anterior torso_Torso Front_count',
 'tbp_lv_location_Left Leg - Upper_count',
 'tbp_lv_location_Right Arm - Lower_count',
 'tbp_lv_location_Right Leg - Upper_count',
 'tbp_lv_location_Torso Back Bottom Third_count',
 'tbp_lv_location_Left Arm - Upper_count',
 'tbp_lv_location_Torso Front Top Half_count',
 'tbp_lv_location_Torso Back Middle Third_count',
 'tbp_lv_location_Torso Front Bottom Half_count',
 'tbp_lv_location_Torso Back Top Third_count',
 'tbp_lv_location_Right Arm - Upper_count',
 'tbp_lv_location_Head & Neck_count',
 'tbp_lv_location_Left Arm_count',
 'tbp_lv_location_Left Arm - Lower_count',
 'tbp_lv_location_Right Leg - Lower_count',
 'tbp_lv_location_Left Leg - Lower_count',
 'tbp_lv_location_Torso Front_count',
 'tbp_lv_location_Left Leg_count',
 'tbp_lv_location_Right Arm_count',
 'tbp_lv_location_Right Leg_count',
 'tbp_lv_location_Unknown_count',
 'tbp_lv_location_Torso Back_count'
]

# Full model feature list: every feature group above, concatenated in a fixed order.
feature_cols = [
    *num_cols,
    *new_num_cols,
    *cat_cols,
    *norm_cols1,
    *norm_cols2,
    *norm_cols3,
    *norm_cols4,
    *norm_cols5,
    *additinal_features1,
    *additinal_features2,
    *cnt_features,
    *age_features,
    *pivot_cnt_features,
    *image_cols,
]


In [None]:
def _group_zscore_exprs(cols, group_keys, suffix):
    """Build one group-wise z-score expression per column.

    Each expression is ``(value - group mean) / (group std + err)`` where the group is defined
    by ``group_keys``, and is aliased ``f'{col}_{suffix}'``. ``err`` is the module-level
    constant that keeps the denominator away from zero.
    """
    return [
        (
            (pl.col(col) - pl.col(col).mean().over(group_keys))
            / (pl.col(col).std().over(group_keys) + err)
        ).alias(f'{col}_{suffix}')
        for col in cols
    ]


def read_data(path):
    """Read a metadata CSV with polars and add the engineered feature columns.

    Steps, in order:
      1. ``age_approx`` is cast to string, the literal ``'NA'`` is replaced by NaN, and the
         column is cast to Float64.
      2. NaN values in every Float64 column are replaced by that column's median.
      3. The engineered columns listed in ``new_num_cols`` (plus ``combined_anatomical_site``)
         are added in several stages; later stages use columns created by earlier ones.
      4. For every column in ``num_cols + new_num_cols`` a z-score is added within each of:
         patient, attribution, location, patient x location, attribution x location.
      5. The columns in ``cat_cols`` are cast to polars Categorical.

    Args:
        path: Location of the CSV file, passed to ``pl.read_csv``.

    Returns:
        A polars DataFrame with the original columns plus the columns described above.
    """
    base_cols = num_cols + new_num_cols
    return (
        pl.read_csv(path)
        .with_columns(
            pl.col('age_approx').cast(pl.Utf8).replace('NA', np.nan).cast(pl.Float64),
        )
        .with_columns(
            # NOTE(review): fill_nan targets NaN only; polars nulls are a separate notion —
            # confirm no nulls remain that were meant to be imputed.
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()), # You may want to impute test data with train
        )
        .with_columns(
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_size_ratio_with_area    = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_areaMM2'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            # Signed differences are used; the absolute-value variants are kept for reference.
            # hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            # luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')),
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')),
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
            border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
            color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
        )
        .with_columns(
            position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
            combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
            symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
            consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        )
        .with_columns(
            color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
            lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
            # Uses border_complexity and lesion_shape_index created in the first feature stage.
            shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
        )
        .with_columns(
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
            lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
            comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
            color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
            border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
            age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
            color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
            age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
            index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        )
        # TODO: add features that consider color and size jointly.
        # How far this lesion deviates from the patient's other lesions.
        .with_columns(_group_zscore_exprs(base_cols, 'patient_id', 'patient_norm'))
        # How far this lesion deviates from the other lesions with the same attribution (hospital).
        .with_columns(_group_zscore_exprs(base_cols, ['attribution'], 'attribution_norm'))
        # How far this lesion deviates from the other lesions at the same body location.
        .with_columns(_group_zscore_exprs(base_cols, ['tbp_lv_location'], 'lv_location_norm'))
        # How far this lesion deviates within the same patient and body location.
        .with_columns(_group_zscore_exprs(base_cols, ['patient_id', 'tbp_lv_location'], 'patient_lv_location_norm'))
        # How far this lesion deviates within the same attribution and body location.
        .with_columns(_group_zscore_exprs(base_cols, ['attribution', 'tbp_lv_location'], 'attribution_lv_location_norm'))
        .with_columns(
            pl.col(cat_cols).cast(pl.Categorical),
        )
    )


In [None]:
def feature_engineering_patient_id_pl(df: pl.DataFrame, num_cols: list[str]) -> pl.DataFrame:
    """Attach per-patient summary statistics and category counts to every row.

    Added columns:
      * ``{col}_patient_{stat}`` and ``isic_id_count``: statistics of each column in
        ``num_cols`` and the ``isic_id`` count, grouped by ``patient_id``.
      * ``{col}_patient_lv_location_{stat}`` and ``isic_id_count_patient_lv_location``:
        the same, grouped by ``patient_id`` and ``tbp_lv_location``.
      * ``{category_column}_{value}_count``: per-patient row counts for each value of
        ``anatom_site_general``, ``combined_anatomical_site`` and ``tbp_lv_location``.

    ``stat`` is one of mean, std, max, min, skew.
    """
    stat_names = ('mean', 'std', 'max', 'min', 'skew')

    def _stat_exprs(tag: str) -> list:
        # Statistic-major order: every mean first, then every std, and so on.
        return [
            getattr(pl.col(name), stat)().alias(f'{name}_{tag}_{stat}')
            for stat in stat_names
            for name in num_cols
        ]

    # Numeric summaries per patient.
    per_patient = df.group_by('patient_id').agg(
        _stat_exprs('patient') + [pl.count('isic_id').alias('isic_id_count')]
    )

    # Numeric summaries per patient and body location.
    per_patient_location = df.group_by(['patient_id', 'tbp_lv_location']).agg(
        _stat_exprs('patient_lv_location')
        + [pl.count('isic_id').alias('isic_id_count_patient_lv_location')]
    )

    # One count column per category value, pivoted to one row per patient.
    for cat in ('anatom_site_general', 'combined_anatomical_site', 'tbp_lv_location'):
        count_name = f'{cat}_count'
        # NOTE(review): the notebook output echoes the pl.count() / group_by lines, which suggests
        # deprecation warnings here; a pl.len() variant was previously tried and commented out,
        # so the existing calls are kept as they are — confirm the target polars version.
        wide = (
            df.group_by(['patient_id', cat])
            .agg(pl.count().alias(count_name))
            .pivot(
                index='patient_id',
                columns=cat,
                values=count_name,
                aggregate_function='sum',
            )
            .fill_null(0)
        )
        # Rename the value columns positionally to '<cat>_<value>_count'; 'patient_id' stays first.
        renamed = ['patient_id'] + [f'{cat}_{value}_count' for value in wide.columns if value != 'patient_id']
        wide = wide.rename(dict(zip(wide.columns, renamed)))

        df = df.join(wide, on='patient_id', how='left')

    # Finally attach the numeric summaries.
    return (
        df.join(per_patient, on='patient_id', how='left')
        .join(per_patient_location, on=['patient_id', 'tbp_lv_location'], how='left')
    )


In [None]:
def feature_engineering_age_pl(df: pl.DataFrame) -> pl.DataFrame:
    """Add age-based per-patient features.

    Added columns:
      * ``unique_age_approx_count``: number of distinct ``age_approx`` values for the patient.
      * ``age_min_diff``: ``age_approx`` minus the patient's minimum ``age_approx``.
      * ``age_max_diff``: ``age_approx`` minus the patient's maximum ``age_approx``.
      * ``age_min_max``: the sum of the two differences above.
    """
    age = pl.col('age_approx')
    youngest = age.min().over('patient_id')
    oldest = age.max().over('patient_id')

    # Distinct ages per patient, joined back onto every row of that patient.
    distinct_ages = df.group_by('patient_id').agg(
        age.n_unique().alias('unique_age_approx_count')
    )

    return df.join(distinct_ages, on='patient_id', how='left').with_columns(
        age_min_diff=age - youngest,
        age_max_diff=age - oldest,
        age_min_max=(age - youngest) + (age - oldest),
    )


In [None]:
# Build the training frame: load + row-level features, then per-patient aggregates,
# then the age features.
train_df = (
    read_data(train_path)
    .pipe(feature_engineering_patient_id_pl, num_cols + new_num_cols)
    .pipe(feature_engineering_age_pl)
)


  .agg(pl.count().alias(f'{col}_count'))
  df.group_by(['patient_id', col])


In [None]:
def custom_metric(estimator, X, y_true):
    """Score ``estimator`` on ``(X, y_true)`` with a partial AUC above 80% TPR.

    The positive-class probabilities from ``estimator.predict_proba`` and the labels are both
    flipped, so a TPR floor of ``min_tpr`` becomes an FPR ceiling of ``1 - min_tpr`` for
    ``roc_auc_score``. The score returned by ``roc_auc_score(..., max_fpr=...)`` is then mapped
    back from its rescaled form to an area between 0 and ``max_fpr``.

    Args:
        estimator: Fitted model exposing ``predict_proba``.
        X: Feature matrix passed to ``predict_proba``.
        y_true: Labels; the flip ``abs(y_true - 1)`` assumes they are 0/1.

    Returns:
        The partial AUC as a float.
    """
    positive_scores = estimator.predict_proba(X)[:, 1]

    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)

    # Flip both labels and scores so the region of interest sits at low FPR.
    flipped_labels = abs(y_true - 1)
    flipped_scores = np.array([1.0 - score for score in positive_scores])

    scaled = roc_auc_score(flipped_labels, flipped_scores, max_fpr=max_fpr)

    # Undo the rescaling: 0.5 maps to the chance-level area, 1.0 maps to max_fpr.
    chance_area = 0.5 * max_fpr**2
    return chance_area + (max_fpr - chance_area) / (1.0 - 0.5) * (scaled - 0.5)


In [None]:
# Convert the polars frame to pandas for the pandas-based steps below.
# (Alternative that also sets the id column as index, kept for reference:)
# train_df = train_df.to_pandas().set_index(id_col)
train_df = train_df.to_pandas()


In [None]:
# oof_df_fyk = pd.read_csv('oof_df_v20.csv')[['isic_id', 'pred']].rename(columns={'pred': 'pred_fyk'})

# Predictions stored in eva_preds.parquet; 'pred' is renamed to 'pred_tsuma_eva'.
oof_df_tsuma_eva = (
    pd.read_parquet(base_path / 'eva_preds.parquet')
    .rename(columns={'pred': 'pred_tsuma_eva'})
)

# Predictions from the five sub68 fold files, stacked; only isic_id and the prediction are
# kept, and 'pred' is renamed to 'pred_hatry'.
oof_df_hatry = (
    pd.concat([
        pd.read_csv(base_path / f'sub68/test_results_fold_{fold}.csv') for fold in range(5)
    ])[['isic_id', 'pred']]
    .rename(columns={'pred': 'pred_hatry'})
)


In [None]:
# Left-join both sets of model predictions onto the training rows by isic_id.
train_df = (
    train_df
    .merge(oof_df_hatry, how='left', on='isic_id')
    .merge(oof_df_tsuma_eva, how='left', on='isic_id')
)


In [None]:
import pandas as pd


def process_metadata(past_meta, meta, num_cols):
    """Enrich ``meta`` with features derived from ``past_meta`` and from ``meta`` itself.

    Args:
        past_meta: pandas DataFrame of historical records. Columns read: ``patient_id``,
            ``fitzpatrick_skin_type``, ``anatom_site_general``, ``age_approx``,
            ``benign_malignant``, ``isic_id``, ``clin_size_long_diam_mm``.
            It is modified in place: the columns ``fitzpatrick_skin_type_mapped`` and
            ``binary_target`` are added to it.
        meta: pandas DataFrame to enrich. Must contain ``patient_id``,
            ``anatom_site_general``, ``age_approx`` and every column in ``num_cols``.
        num_cols: Numeric columns of ``meta`` to aggregate per
            (patient, site, age) and to build shift features from.

    Returns:
        ``(meta, feature_names)`` where ``meta`` is the left-merged frame and
        ``feature_names`` lists the new feature columns (join keys and the raw
        benign/malignant pivot columns excluded), without duplicates.
    """
    # Fitzpatrick types outside this map (and missing values) become NaN.
    skin_map = {
        'I': 0,
        'II': 1,
        'III': 2,
        'IV': 3,
    }
    past_meta['fitzpatrick_skin_type_mapped'] = past_meta.fitzpatrick_skin_type.map(skin_map)
    # One skin-type value per patient (the minimum); patients without any value are dropped.
    skin_df = past_meta.groupby(['patient_id'])[['fitzpatrick_skin_type_mapped']].min().reset_index()
    skin_df = skin_df[~skin_df.fitzpatrick_skin_type_mapped.isnull()]

    def pivot_group_count(df, groupby_cols, pivot_index, pivot_column, count_column):
        """Count ``count_column`` per group and spread ``pivot_column`` values into columns."""
        group_count = df.groupby(groupby_cols).count()[count_column].reset_index()
        pivot_df = group_count.pivot(
            index=pivot_index,
            columns=pivot_column,
            values=count_column,
        ).reset_index()
        return pivot_df

    # Counts of each benign_malignant value per (patient, site, age) and per (patient, site).
    pivot_df_1 = pivot_group_count(
        df=past_meta,
        groupby_cols=['patient_id', 'anatom_site_general', 'age_approx',  'benign_malignant'],
        pivot_index=['patient_id', 'anatom_site_general', 'age_approx'],
        pivot_column='benign_malignant',
        count_column='isic_id'
    )

    pivot_df_2 = pivot_group_count(
        df=past_meta,
        groupby_cols=['patient_id', 'anatom_site_general', 'benign_malignant'],
        pivot_index=['patient_id', 'anatom_site_general'],
        pivot_column='benign_malignant',
        count_column='isic_id'
    )

    # pivot_df_3 = pivot_group_count(
    #     df=past_meta,
    #     groupby_cols=['patient_id', 'benign_malignant'],
    #     pivot_index=['patient_id'],
    #     pivot_column='benign_malignant',
    #     count_column='isic_id'
    # )

    # Collapse benign_malignant into a binary target.
    benign_malignant_map = {
        'benign': 0,
        'indeterminate': 0,
        'indeterminate/benign': 0,
        'indeterminate/malignant': 1,
        'malignant': 1,
    }
    target_column = 'binary_target'
    size_column = 'clin_size_long_diam_mm'
    past_meta[target_column] = past_meta['benign_malignant'].map(benign_malignant_map)

    def calculate_group_aggregates(df, groupby_columns, columns_to_aggregate, aggregations):
        """Aggregate the given columns per group; flatten names to '<col>_<agg>_agg1'."""
        agg_funcs = {col: aggregations for col in columns_to_aggregate.keys()}
        agg_df = df.groupby(groupby_columns).agg(agg_funcs).reset_index()

        agg_df.columns = [
            '_'.join([columns_to_aggregate.get(col[0], col[0]), col[1]]) + '_agg1' if col[1] else col[0]
            for col in agg_df.columns
        ]
        return agg_df, agg_df.columns.tolist()

    columns_to_aggregate = {
        target_column: 'binary_target',
        size_column: 'clin_size_long_diam_mm'
    }
    # aggregations = ['mean', 'max', 'min', 'count', 'sum']
    aggregations = ['mean', 'count']
    # NOTE(review): all three levels produce identical '*_agg1' column names. On merge, pandas
    # therefore suffixes the first two levels with '_x' / '_y', and only the finest level
    # (patient, site, age) keeps the plain name that is reported as a feature. Names are left
    # unchanged here to stay compatible with existing consumers — confirm whether distinct
    # suffixes per level were intended.
    patient_aggregates, patient_aggregates_cols = calculate_group_aggregates(past_meta, ['patient_id'], columns_to_aggregate, aggregations)
    patient_age_aggregates, patient_age_aggregates_cols = calculate_group_aggregates(past_meta, ['patient_id', 'anatom_site_general'], columns_to_aggregate, aggregations)
    patient_age_site_aggregates, patient_age_site_aggregates_cols = calculate_group_aggregates(past_meta, ['patient_id', 'anatom_site_general', 'age_approx'], columns_to_aggregate, aggregations)

    # Per (patient, site, age) mean/count of each numeric column of `meta`.
    # NOTE(review): if a grouping key is categorical, pandas' default observed=False may also
    # emit unobserved key combinations — confirm that is intended.
    aggregations = ['mean', 'count']
    meta_g = meta.groupby(['patient_id', 'anatom_site_general', 'age_approx'])[num_cols].agg(aggregations).reset_index()
    meta_g.columns = ['_'.join(col).strip() if col[1] else col[0] for col in meta_g.columns]

    # Shift features: values of the neighbouring rows within each (patient, site) group of
    # meta_g, plus the difference between the current row and those neighbours.
    shift_range = 5
    new_columns = {}
    new_feature_names = []
    # The grouping does not depend on the loop variables, so build it once.
    site_groups = meta_g.groupby(['patient_id', 'anatom_site_general'])
    for shift in range(1, shift_range + 1):
        for agg in aggregations:
            for col in num_cols:
                base_col = f'{col}_{agg}'
                prev_col = f'{base_col}_prev_{shift}'
                next_col = f'{base_col}_next_{shift}'
                diff_prev_col = f'{base_col}_diff_prev_{shift}'
                diff_next_col = f'{base_col}_diff_next_{shift}'

                new_columns[prev_col] = site_groups[base_col].shift(shift)
                new_columns[next_col] = site_groups[base_col].shift(-shift)
                new_columns[diff_prev_col] = meta_g[base_col] - new_columns[prev_col]
                new_columns[diff_next_col] = meta_g[base_col] - new_columns[next_col]
                new_feature_names.extend([prev_col, next_col, diff_prev_col, diff_next_col])

    new_features_df = pd.DataFrame(new_columns)
    meta_merged = pd.concat([meta_g, new_features_df], axis=1)

    # Open question from the author: would swapping age_approx and anatom_site_general be better?
    # Left-merge every derived table onto meta.
    meta = pd.merge(meta, skin_df, on='patient_id', how='left')
    meta = pd.merge(meta, pivot_df_1, on=['patient_id', 'anatom_site_general', 'age_approx'], how='left')
    meta = pd.merge(meta, pivot_df_2, on=['patient_id', 'anatom_site_general'], how='left')
    # meta = pd.merge(meta, pivot_df_3, on=['patient_id'], how='left')
    meta = pd.merge(meta, patient_aggregates, on=['patient_id'], how='left')
    meta = pd.merge(meta, patient_age_aggregates, on=['patient_id', 'anatom_site_general'], how='left')
    meta = pd.merge(meta, patient_age_site_aggregates, on=['patient_id', 'anatom_site_general', 'age_approx'], how='left')
    meta = pd.merge(meta, meta_merged, on=['patient_id', 'anatom_site_general', 'age_approx'], how='left')

    # Names that must not be reported as features: join keys and the raw pivot columns.
    join_keys = {'patient_id', 'age_approx', 'anatom_site_general'}
    not_use_cols = {'benign', 'indeterminate', 'indeterminate/benign', 'indeterminate/malignant', 'malignant'}
    excluded = join_keys | not_use_cols

    candidate_names = (
        skin_df.columns.tolist()[1:] +
        pivot_df_1.columns.tolist()[1:] +
        pivot_df_2.columns.tolist()[1:] +
        # pivot_df_3.columns.tolist()[1:] +
        patient_aggregates_cols +
        patient_age_aggregates_cols +
        patient_age_site_aggregates_cols +
        new_feature_names
    )
    # dict.fromkeys drops repeated names (the three aggregate levels share names) while
    # keeping first-seen order, so the list can be used directly to select columns.
    all_new_feature_names = list(dict.fromkeys(
        col for col in candidate_names if col not in excluded
    ))

    return meta, all_new_feature_names

# Load the past metadata CSV from base_path.
past_meta = pd.read_csv(base_path / 'past_metadata.csv')
# Run process_metadata on past_meta and train_df; it returns the merged frame and the
# list of feature names it added.
# NOTE(review): train_df is both an input and the rebound output here, so re-running
# this cell passes the already-merged frame back in — confirm that is acceptable or
# restart the kernel before re-running.
train_df, new_feature_names = process_metadata(
    past_meta,
    train_df,
    # num_cols=num_cols + new_num_cols,
    num_cols=num_cols,
)


  past_meta = pd.read_csv(base_path / 'past_metadata.csv')
  meta_g = meta.groupby(['patient_id', 'anatom_site_general', 'age_approx'])[num_cols].agg(aggregations).reset_index()
  new_columns[prev_col] = meta_g.groupby(['patient_id', 'anatom_site_general'])[base_col].shift(shift)
  new_columns[next_col] = meta_g.groupby(['patient_id', 'anatom_site_general'])[base_col].shift(-shift)
  new_columns[prev_col] = meta_g.groupby(['patient_id', 'anatom_site_general'])[base_col].shift(shift)
  new_columns[next_col] = meta_g.groupby(['patient_id', 'anatom_site_general'])[base_col].shift(-shift)
  new_columns[prev_col] = meta_g.groupby(['patient_id', 'anatom_site_general'])[base_col].shift(shift)
  new_columns[next_col] = meta_g.groupby(['patient_id', 'anatom_site_general'])[base_col].shift(-shift)
  new_columns[prev_col] = meta_g.groupby(['patient_id', 'anatom_site_general'])[base_col].shift(shift)
  new_columns[next_col] = meta_g.groupby(['patient_id', 'anatom_site_general'])[base_col].shift(-s

In [None]:
# Extend the feature list with the engineered feature names, drop the
# benign/indeterminate/malignant columns, and de-duplicate.
# dict.fromkeys keeps first-seen order, so the column order handed to the models is the
# same on every run; list(set(...)) ordering varies with Python's string hash seed.
# A new list is built (no in-place +=), so re-running the cell gives the same result.
feature_cols = [
    col for col in dict.fromkeys(list(feature_cols) + list(new_feature_names))
    if col not in {'benign', 'indeterminate', 'indeterminate/benign', 'indeterminate/malignant', 'malignant'}
]

In [None]:
# Preview the first rows of train_df after the process_metadata merge.
train_df.head()

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,tbp_lv_Cext,tbp_lv_H,tbp_lv_Hext,tbp_lv_L,tbp_lv_Lext,tbp_lv_areaMM2,tbp_lv_area_perim_ratio,tbp_lv_color_std_mean,tbp_lv_deltaA,tbp_lv_deltaB,tbp_lv_deltaL,tbp_lv_deltaLB,tbp_lv_deltaLBnorm,tbp_lv_eccentricity,tbp_lv_location,tbp_lv_location_simple,tbp_lv_minorAxisMM,tbp_lv_nevi_confidence,tbp_lv_norm_border,tbp_lv_norm_color,tbp_lv_perimeterMM,tbp_lv_radial_color_std_max,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z,attribution,copyright_license,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence,lesion_size_ratio,lesion_size_ratio_with_area,lesion_shape_index,hue_contrast,luminance_contrast,lesion_color_difference,border_complexity,color_uniformity,position_distance_3d,perimeter_to_area_ratio,area_to_perimeter_ratio,lesion_visibility_score,combined_anatomical_site,symmetry_border_consistency,consistency_symmetry_border,color_consistency,consistency_color,size_age_interaction,hue_color_std_interaction,lesion_severity_index,shape_complexity_index,color_contrast_index,log_lesion_area,normalized_lesion_size,mean_hue_difference,std_dev_contrast,color_shape_composite_index,lesion_orientation_3d,overall_color_difference,symmetry_perimeter_interaction,comprehensive_lesion_index,color_variance_ratio,border_color_interaction,border_color_interaction_2,size_color_contrast_ratio,age_normalized_nevi_confidence,age_normalized_nevi_confidence_2,color_asymmetry_index,volume_approximation_3d,color_range,shape_color_consistency,border_length_ratio,age_size_symmetry_index,index_age_size_symmetry,age_approx_patient_norm,...,tbp_lv_Hext_count_prev_5,tbp_lv_Hext_count_next_5,tbp_lv_Hext_count_diff_prev_5,tbp_lv_Hext_count_diff_next_5,tbp_lv_L_count_prev_5,tbp_lv_L_count_next_5,tbp_lv_L_count_diff_prev_5,tb
p_lv_L_count_diff_next_5,tbp_lv_Lext_count_prev_5,tbp_lv_Lext_count_next_5,tbp_lv_Lext_count_diff_prev_5,tbp_lv_Lext_count_diff_next_5,tbp_lv_areaMM2_count_prev_5,tbp_lv_areaMM2_count_next_5,tbp_lv_areaMM2_count_diff_prev_5,tbp_lv_areaMM2_count_diff_next_5,tbp_lv_area_perim_ratio_count_prev_5,tbp_lv_area_perim_ratio_count_next_5,tbp_lv_area_perim_ratio_count_diff_prev_5,tbp_lv_area_perim_ratio_count_diff_next_5,tbp_lv_color_std_mean_count_prev_5,tbp_lv_color_std_mean_count_next_5,tbp_lv_color_std_mean_count_diff_prev_5,tbp_lv_color_std_mean_count_diff_next_5,tbp_lv_deltaA_count_prev_5,tbp_lv_deltaA_count_next_5,tbp_lv_deltaA_count_diff_prev_5,tbp_lv_deltaA_count_diff_next_5,tbp_lv_deltaB_count_prev_5,tbp_lv_deltaB_count_next_5,tbp_lv_deltaB_count_diff_prev_5,tbp_lv_deltaB_count_diff_next_5,tbp_lv_deltaL_count_prev_5,tbp_lv_deltaL_count_next_5,tbp_lv_deltaL_count_diff_prev_5,tbp_lv_deltaL_count_diff_next_5,tbp_lv_deltaLB_count_prev_5,tbp_lv_deltaLB_count_next_5,tbp_lv_deltaLB_count_diff_prev_5,tbp_lv_deltaLB_count_diff_next_5,tbp_lv_deltaLBnorm_count_prev_5,tbp_lv_deltaLBnorm_count_next_5,tbp_lv_deltaLBnorm_count_diff_prev_5,tbp_lv_deltaLBnorm_count_diff_next_5,tbp_lv_eccentricity_count_prev_5,tbp_lv_eccentricity_count_next_5,tbp_lv_eccentricity_count_diff_prev_5,tbp_lv_eccentricity_count_diff_next_5,tbp_lv_minorAxisMM_count_prev_5,tbp_lv_minorAxisMM_count_next_5,tbp_lv_minorAxisMM_count_diff_prev_5,tbp_lv_minorAxisMM_count_diff_next_5,tbp_lv_nevi_confidence_count_prev_5,tbp_lv_nevi_confidence_count_next_5,tbp_lv_nevi_confidence_count_diff_prev_5,tbp_lv_nevi_confidence_count_diff_next_5,tbp_lv_norm_border_count_prev_5,tbp_lv_norm_border_count_next_5,tbp_lv_norm_border_count_diff_prev_5,tbp_lv_norm_border_count_diff_next_5,tbp_lv_norm_color_count_prev_5,tbp_lv_norm_color_count_next_5,tbp_lv_norm_color_count_diff_prev_5,tbp_lv_norm_color_count_diff_next_5,tbp_lv_perimeterMM_count_prev_5,tbp_lv_perimeterMM_count_next_5,tbp_lv_perimeterMM_count_diff_prev_5,tbp_lv_perimet
erMM_count_diff_next_5,tbp_lv_radial_color_std_max_count_prev_5,tbp_lv_radial_color_std_max_count_next_5,tbp_lv_radial_color_std_max_count_diff_prev_5,tbp_lv_radial_color_std_max_count_diff_next_5,tbp_lv_stdL_count_prev_5,tbp_lv_stdL_count_next_5,tbp_lv_stdL_count_diff_prev_5,tbp_lv_stdL_count_diff_next_5,tbp_lv_stdLExt_count_prev_5,tbp_lv_stdLExt_count_next_5,tbp_lv_stdLExt_count_diff_prev_5,tbp_lv_stdLExt_count_diff_next_5,tbp_lv_symm_2axis_count_prev_5,tbp_lv_symm_2axis_count_next_5,tbp_lv_symm_2axis_count_diff_prev_5,tbp_lv_symm_2axis_count_diff_next_5,tbp_lv_symm_2axis_angle_count_prev_5,tbp_lv_symm_2axis_angle_count_next_5,tbp_lv_symm_2axis_angle_count_diff_prev_5,tbp_lv_symm_2axis_angle_count_diff_next_5,tbp_lv_x_count_prev_5,tbp_lv_x_count_next_5,tbp_lv_x_count_diff_prev_5,tbp_lv_x_count_diff_next_5,tbp_lv_y_count_prev_5,tbp_lv_y_count_next_5,tbp_lv_y_count_diff_prev_5,tbp_lv_y_count_diff_next_5,tbp_lv_z_count_prev_5,tbp_lv_z_count_next_5,tbp_lv_z_count_diff_prev_5,tbp_lv_z_count_diff_next_5
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,16.261975,26.922447,23.954773,33.684638,28.953117,53.058545,55.828924,54.367448,62.025701,3.152561,27.47617,0.0,3.982447,2.967674,-7.658253,8.360566,5.784302,0.901302,Right Leg - Upper,Right Leg,1.543016,0.002628592,7.09136,0.0,9.307003,0.0,2.036195,2.63778,0.590476,85,-182.703552,613.493652,-42.427948,Memorial Sloan Kettering Cancer Center,CC-BY,,Benign,Benign,,,,,,,97.517282,0.507571,1.600149,0.036395,-2.770379,-7.658253,9.12775,7.681836,0.0,641.525666,2.952204,0.33873,5.784302,lower extremity_Right Leg - Upper,4.187279,0.545088,0.032828,1.971475,182.4,0.0,2.66422,7.718231,5.07617,1.423725,0.050667,54.443734,5.269909,9.355549,1.860241,-0.236044,5.495564,7.241987,0.0,0.0,0.0,0.52556,4.380987e-05,60.076964,0.0,2022.449088,14.608375,0.0,1.478676,107.702857,111.690749,0.0,...,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0,0.0,0.0,118.0,118.0
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,25.36474,26.331,24.54929,41.21903,35.29926,39.70291,44.06404,48.86152,55.36236,0.919497,12.23529,0.0,6.34783,1.781713,-6.500838,6.839008,4.987244,0.639885,Head & Neck,Head & Neck,0.821918,1.334303e-07,2.116402,0.0,3.354148,0.0,0.853227,3.912844,0.285714,55,-0.078308,1575.687,57.1745,Memorial Sloan Kettering Cancer Center,CC-BY,IL_6727506,Benign,Benign,,,,,,,3.141455,0.747198,0.687046,0.081731,-4.36113,-6.50084,9.259068,2.402116,0.0,1576.723962,3.647807,0.274137,4.987244,head/neck_Head & Neck,0.604686,0.251731,0.015412,0.840277,66.0,0.0,0.918762,2.483847,6.615949,0.652063,0.018333,41.883475,5.345725,4.173668,1.570846,0.542902,0.958328,3.290222,0.0,0.0,0.0,0.220563,2.223838e-09,60.010082,0.0,1449.79311,14.63038,0.0,0.986739,18.857144,15.762808,0.0,...,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0,0.0,0.0,8.0,8.0
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,17.12817,37.97046,33.48541,44.17492,37.6118,59.26585,62.90973,53.96118,61.67052,3.265153,24.18462,0.0,5.447655,4.485044,-7.709336,9.092376,6.290359,0.932147,Torso Back Top Third,Torso Back,1.194905,0.0002959177,4.798335,0.0,8.886309,0.0,1.743651,1.950777,0.361905,105,123.6497,1472.01,232.9089,Memorial Sloan Kettering Cancer Center,CC-BY,,Benign,Benign,,,,,,,99.80404,0.351443,1.147514,0.041349,-3.64388,-7.70934,10.451145,5.16024,0.0,1495.442825,2.72156,0.367436,6.290359,posterior torso_Torso Back Top Third,1.73654,0.336523,0.028274,1.695707,204.0,0.0,1.910161,5.201588,8.513722,1.450478,0.056667,61.08779,6.033971,8.182175,1.486992,0.741121,3.215998,6.369668,0.0,0.0,0.0,0.54051,4.931962e-06,60.096256,0.0,4882.849628,17.64205,0.0,1.387282,73.828579,70.900473,0.0,...,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0,0.0,0.0,466.0,466.0
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,12.164757,21.448144,21.121356,25.7462,24.374023,56.414429,60.060388,18.649518,23.314841,6.07994,14.889242,0.51452,2.077572,0.326788,-4.665323,4.783413,6.400196,0.654458,Torso Front Top Half,Torso Front,2.481328,21.98945,1.975874,1.771705,9.514499,0.66469,1.258541,1.573733,0.209581,130,-141.02478,1442.185791,58.359802,ACEMID MIA,CC-0,,Benign,Benign,,,,,,,99.989998,0.770599,4.685194,0.067163,-3.645959,-4.665323,5.117454,2.185455,0.774064,1450.239191,1.5649,0.639018,8.171901,anterior torso_Torso Front Top Half,0.414105,0.189482,0.05398,1.194084,209.3,29.026369,1.467346,2.252618,4.139233,1.957265,0.049538,58.237408,2.954563,5.204448,1.668272,-0.753654,1.994057,4.381246,0.326943,3.500666,0.934114,0.50311,0.3382993,65.079708,0.139306,8817.367196,7.069683,0.336732,1.088507,43.865269,82.825529,0.0,...,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,,0.0,,161.0,
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,20.05747,26.4649,25.71046,36.21798,32.60874,46.94607,52.04118,46.27631,54.85574,2.101708,19.90256,0.0,4.668053,0.754434,-8.579431,9.148495,6.531302,0.946448,Torso Front Top Half,Torso Front,0.929916,0.001378832,3.658854,0.0,6.467562,0.0,2.085409,2.480509,0.313433,20,-72.31564,1488.72,21.42896,Memorial Sloan Kettering Cancer Center,CC-BY,,Benign,Benign,,,,,,,70.44251,0.340629,0.715902,0.050245,-5.09511,-8.57943,9.796251,3.972287,0.0,1490.629394,3.077289,0.324961,6.531302,anterior torso_Torso Front Top Half,1.146805,0.288701,0.038016,2.009033,150.15,0.0,1.535101,4.022532,3.374358,1.131953,0.049636,49.493625,5.655868,6.738664,1.619334,-1.052315,2.027146,5.29061,0.0,0.0,0.0,0.417987,2.506967e-05,55.067712,0.0,3132.867723,14.00192,0.0,1.258489,47.061935,36.230932,0.067495,...,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0,0.0,0.0,95.0,95.0


In [None]:
import joblib

# Label-encode every categorical column of train_df in place and keep the fitted
# encoder per column in le_dict.
le_dict = {}
for c in cat_cols:
    le = LabelEncoder()
    train_df[c] = le.fit_transform(train_df[c])

    # After encoding, register an extra '<unknown>' class on the encoder
    # (the commented-out test-time mapping in the original used it for unseen values).
    le.classes_ = np.append(le.classes_, '<unknown>')

    le_dict[c] = le

In [None]:
# joblib.dump(le_dict, output_dir / "exp1" / "labelEncoder.joblib")


In [None]:
# Name of the label column read by the cells below that reference target_col.
target_col = 'target'
# target_col = 'has_lesion_id'


In [None]:
import gc

class Trainer(object):
    """Train one LightGBM or XGBoost booster on a fixed train/validation split.

    Args:
        X: Training frame; must contain every column named in ``predictors``.
        y: Training labels aligned with ``X``.
        X_valid: Validation frame with the same predictor columns.
        y_valid: Validation labels aligned with ``X_valid``.
        params: Parameter dict passed unchanged to ``lgb.train`` / ``xgb.train``.
        predictors: List of column names used as model inputs.

    After ``fit`` or ``fit_xgb`` the trained booster is stored in ``self.clf`` and
    the best validation score in ``self.oof_result`` (a one-element list).
    """

    def __init__(self, X, y, X_valid, y_valid, params, predictors):
        self.X = X
        self.y = y
        self.X_valid = X_valid
        self.y_valid = y_valid
        self.params = params
        self.predictors = predictors

    def fit(self):
        """Train a LightGBM booster with early stopping on the validation set.

        Returns:
            tuple: ``(clf, oof_result)`` where ``clf`` is the trained booster and
            ``oof_result`` holds the first metric value recorded under
            ``best_score["valid_1"]``.
        """
        oof_result = []

        dtrain = lgb.Dataset(
            self.X[self.predictors], label=self.y, feature_name=self.predictors
        )
        dvalid = lgb.Dataset(
            self.X_valid[self.predictors], label=self.y_valid, feature_name=self.predictors
        )

        # Release any temporaries created while slicing the predictor columns.
        gc.collect()

        callbacks = [
            lgb.log_evaluation(100),
            lgb.early_stopping(100),
            # lgb.log_evaluation(200),
            # lgb.early_stopping(200),
        ]

        clf = lgb.train(
            self.params,
            dtrain,
            valid_sets=[dtrain, dvalid],
            num_boost_round=100000,
            categorical_feature=[],
            callbacks=callbacks,
#             feval=pauc_80, # custom metric
        )
        # best_score["valid_1"] maps metric name -> value; keep the first metric.
        oof_result.append(list(clf.best_score["valid_1"].values())[0])
        self.clf = clf
        self.oof_result = oof_result
        return clf, oof_result

    def _xgb_matrix(self, frame, label):
        """Build a DMatrix from the predictor columns of ``frame``.

        Only the predictor columns are cleaned (inf -> NaN -> 0, cast to float32),
        so non-feature columns are never copied and ``frame`` itself is not modified.
        """
        features = (
            frame[self.predictors]
            .replace([np.inf, -np.inf], np.nan)
            .fillna(0)
            .astype(np.float32)
        )
        return xgb.DMatrix(features, label=label)

    def fit_xgb(self):
        """Train an XGBoost booster with early stopping on the validation set.

        Returns:
            tuple: ``(clf, oof_result)`` where ``clf`` is the trained booster and
            ``oof_result`` holds the booster's ``best_score`` when early stopping
            recorded one (empty otherwise).
        """
        oof_result = []

        dtrain = self._xgb_matrix(self.X, self.y)
        dvalid = self._xgb_matrix(self.X_valid, self.y_valid)

        # Release the cleaned intermediate frames now that the DMatrix objects exist.
        gc.collect()

        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

        clf = xgb.train(
            self.params,
            dtrain,
            10000,
            early_stopping_rounds=100,
            evals=watchlist,
            verbose_eval=50,
        )

        # Mirror fit(): report the best validation score instead of an empty list.
        best_score = getattr(clf, "best_score", None)
        if best_score is not None:
            oof_result.append(best_score)

        self.clf = clf
        self.oof_result = oof_result
        return clf, oof_result


    # def fit_xgb(self):
    #     oof_result = []
    #     X_train, X_valid = self.X.copy(), self.X_valid.copy()
    #     y_train, y_valid = self.y.copy(), self.y_valid.copy()

    #     X_train = X_train.replace([np.inf, -np.inf], np.nan)
    #     X_valid = X_valid.replace([np.inf, -np.inf], np.nan)

    #     X_train = X_train.fillna(0)
    #     X_valid = X_valid.fillna(0)

    #     X_train[self.predictors] = X_train[self.predictors].astype(np.float32)
    #     X_valid[self.predictors] = X_valid[self.predictors].astype(np.float32)

    #     dtrain = xgb.DMatrix(X_train[self.predictors], label=y_train)
    #     dvalid = xgb.DMatrix(X_valid[self.predictors], label=y_valid)

    #     watchlist = [(dtrain, 'train'), (dvalid, 'eval')]#訓練データはdtrain、評価用のテストデータはdvalidと設定

    #     clf = xgb.train(
    #         self.params,
    #         dtrain,
    #         10000,
    #         early_stopping_rounds=50,
    #         evals=watchlist,
    #         verbose_eval=50
    #     )

    #     self.clf = clf
    #     self.oof_result = oof_result
    #     return clf, oof_result


In [None]:
# XGBoost parameters shared by every fold.
params_xgb = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',  # options: auc, logloss
    "nthread": -1,
    "learning_rate": 0.005,
    'colsample_bytree': 0.5,
    'subsample': 0.6,
    'max_depth': 7,
    'lambda': 5,  # previously tried: 8
    'scale_pos_weight': 8,  # previously tried: 20, 7

    # GPU training: histogram tree builder on the CUDA device.
    'tree_method': 'hist',
    'device': 'cuda',
}

In [None]:
# Assign every row to one of n_splits validation folds, stratified on the target and
# grouped by patient_id.
n_splits = 5
gkf = StratifiedGroupKFold(n_splits=n_splits, random_state=42, shuffle=True)

# split() yields positional row indices, so they are written into a positional array;
# label-based .loc is only correct when train_df has a 0..n-1 index.
fold_ids = np.full(len(train_df), -1, dtype=np.int64)
for fold, (train_idx, val_idx) in enumerate(
    gkf.split(train_df, train_df['target'], groups=train_df["patient_id"])
):
    fold_ids[val_idx] = fold
train_df["fold"] = fold_ids

In [None]:
# Flag rows that carry a lesion_id. Both NaN and the empty string count as missing;
# a bare `!= ''` comparison evaluates to True for NaN and would flag those rows as 1.
train_df["has_lesion_id"] = (
    train_df["lesion_id"].notna() & (train_df["lesion_id"] != '')
).astype('int32')
train_df["has_lesion_id"]

Unnamed: 0,has_lesion_id
0,0
1,1
2,0
3,0
4,0
...,...
401054,1
401055,0
401056,1
401057,0


In [None]:
def score(solution: np.ndarray, submission: np.ndarray, min_tpr: float=0.80) -> float:
    """Partial AUC restricted to the region where TPR is at least ``min_tpr``.

    Labels and predictions are both flipped before calling ``roc_auc_score`` with
    ``max_fpr = 1 - min_tpr``; an FPR limit on the flipped problem is a TPR floor on
    the original one. The standardized value returned by ``roc_auc_score`` is then
    mapped back to an unscaled area.

    Args:
        solution: Binary ground-truth labels (0/1); any array-like.
        submission: Predicted scores in [0, 1]; any array-like of the same length.
        min_tpr: Lower bound on the true-positive rate of the scored region.

    Returns:
        The partial AUC; equals ``1 - min_tpr`` for a perfect ranking.
    """
    # Vectorized flip of labels and scores (accepts lists, Series, and ndarrays).
    v_gt = np.abs(np.asarray(solution) - 1)
    v_pred = 1.0 - np.asarray(submission, dtype=float)
    max_fpr = abs(1 - min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc

In [None]:
# train_df[feature_cols] = train_df[feature_cols]

In [None]:
# Train one XGBoost model per fold; fold k is held out for validation.
xgb_clfs = []
# Iterate over the configured number of folds rather than a hard-coded 5, so this
# loop stays in sync with the fold-assignment cell.
for fold in range(n_splits):
    X = train_df[train_df["fold"] != fold]
    X_valid = train_df[train_df["fold"] == fold]

    y = X[target_col].astype(float)
    y_valid = X_valid[target_col].astype(float)

    trainer = Trainer(X, y, X_valid, y_valid, params_xgb, feature_cols)
    trainer.fit_xgb()
    xgb_clfs.append(trainer.clf)

    # trainer.clf.save_model(output_dir / 'exp1' / f"xgb_fold_{fold}.json")


[0]	train-logloss:0.13171	eval-logloss:0.13216
[50]	train-logloss:0.10222	eval-logloss:0.10295
[100]	train-logloss:0.07982	eval-logloss:0.08079
[150]	train-logloss:0.06266	eval-logloss:0.06386
[200]	train-logloss:0.04940	eval-logloss:0.05082
[250]	train-logloss:0.03912	eval-logloss:0.04076
[300]	train-logloss:0.03112	eval-logloss:0.03295
[350]	train-logloss:0.02485	eval-logloss:0.02687
[400]	train-logloss:0.01991	eval-logloss:0.02213
[450]	train-logloss:0.01603	eval-logloss:0.01843
[500]	train-logloss:0.01296	eval-logloss:0.01554
[550]	train-logloss:0.01056	eval-logloss:0.01329
[600]	train-logloss:0.00863	eval-logloss:0.01152
[650]	train-logloss:0.00712	eval-logloss:0.01015
[700]	train-logloss:0.00591	eval-logloss:0.00908
[750]	train-logloss:0.00494	eval-logloss:0.00824
[800]	train-logloss:0.00417	eval-logloss:0.00759
[850]	train-logloss:0.00355	eval-logloss:0.00708
[900]	train-logloss:0.00305	eval-logloss:0.00667
[950]	train-logloss:0.00265	eval-logloss:0.00637
[1000]	train-logloss:0.

In [None]:
# Start the XGBoost out-of-fold table from the id and label columns of train_df.
xgb_oof_df = train_df[['isic_id', target_col]].copy()


In [None]:
# Score each fold's XGBoost model on its own held-out rows and store the predictions.
xgb_oof_scores = []
for fold in range(n_splits):
    X_valid = train_df[train_df["fold"] == fold]

    # Same cleaning as Trainer.fit_xgb (inf -> NaN -> 0), applied to the feature
    # columns only so the rest of the wide frame is not copied.
    valid_features = X_valid[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
    dvalid = xgb.DMatrix(valid_features)

    tmp = xgb_clfs[fold].predict(dvalid)
    # Compute the fold score once and reuse it for both logging and bookkeeping.
    fold_score = score(X_valid[target_col], tmp)
    print(fold_score)
    xgb_oof_scores.append(fold_score)

    xgb_oof_df.loc[X_valid.index, 'pred'] = tmp


0.17274424578916667
0.17642746096928816
0.18537560954651877
0.1674829393534765
0.17535867288961515


In [None]:
# Mean of the per-fold XGBoost scores.
print(np.mean(xgb_oof_scores))

0.17547778570961306


In [None]:
# Score computed once over all XGBoost out-of-fold predictions pooled together.
print(score(xgb_oof_df[target_col], xgb_oof_df['pred']))

0.1752876917946885


In [None]:
# LightGBM parameters shared by every fold.
params = dict(
    objective='binary',
    boosting_type='gbdt',
    verbosity=-1,
    learning_rate=0.01,
    # NOTE(review): no bagging_fraction is set alongside bagging_freq — confirm that
    # bagging_freq alone has the intended effect.
    bagging_freq=1,
    feature_fraction=0.6,
    lambda_l1=2.5,  # previously tried: 1.5
    lambda_l2=3.5,  # previously tried: 2.5
    num_leaves=32,
    min_data_in_leaf=20,
    scale_pos_weight=7,

    # GPU settings
    device='gpu',  # train on the GPU
    max_bin=127,  # bin count suited to the GPU
    gpu_platform_id=0,
    gpu_device_id=0,
)


In [None]:
# Train one LightGBM model per fold; fold k is held out for validation.
clfs = []
# Iterate over the configured number of folds rather than a hard-coded 5, so this
# loop stays in sync with the fold-assignment cell.
for fold in range(n_splits):
    X = train_df[train_df["fold"] != fold]
    X_valid = train_df[train_df["fold"] == fold]

    y = X['target'].astype(float)
    y_valid = X_valid['target'].astype(float)

    lgbm_trainer = Trainer(X, y, X_valid, y_valid, params, feature_cols)
    lgbm_trainer.fit()
    clfs.append(lgbm_trainer.clf)

    # lgbm_trainer.clf.save_model(output_dir / 'exp1' / f"lgb_fold_{fold}.json")




In [None]:
# Start the LightGBM out-of-fold table from the id and label columns of train_df.
oof_df = train_df[['isic_id', 'target']].copy()

In [None]:
# Score each fold's LightGBM model on its own held-out rows and store the predictions.
oof_scores = []
for fold in range(n_splits):
    X_valid = train_df[train_df["fold"] == fold]
    tmp = clfs[fold].predict(X_valid[feature_cols])
    # Compute the fold score once and reuse it for both logging and bookkeeping.
    fold_score = score(X_valid['target'], tmp)
    print(fold_score)
    oof_scores.append(fold_score)

    oof_df.loc[X_valid.index, 'pred'] = tmp

In [None]:
# Mean of the per-fold LightGBM scores.
np.mean(oof_scores)

In [None]:
# Score computed once over all LightGBM out-of-fold predictions pooled together.
score(oof_df['target'], oof_df['pred'])

In [None]:
# Collect gain-based feature importances from every LightGBM fold model into one
# long frame (one row per fold/feature pair).
importance_df = pd.concat([
    pd.DataFrame({
        'fold': fold_no,
        'feature_name': booster.feature_name(),
        'imporatance_gain': booster.feature_importance(importance_type='gain'),
    })
    for fold_no, booster in enumerate(clfs)
])

In [None]:
# Order the fold/feature rows by gain, largest first.
importance_df = importance_df.sort_values('imporatance_gain', ascending=False)

In [None]:
# Plot the gain distribution across folds for the 50 most important features.
# Slicing the first 50 rows would pick 50 fold/feature pairs, leaving most features
# with only some of their folds; select the features first, then keep all their rows.
top_features = (
    importance_df.groupby('feature_name')['imporatance_gain']
    .mean()
    .sort_values(ascending=False)
    .head(50)
    .index
    .tolist()
)
sns.boxplot(
    data=importance_df[importance_df['feature_name'].isin(top_features)],
    x="imporatance_gain",
    y="feature_name",
    order=top_features,
)

In [None]:
# Blend the per-fold XGBoost and LightGBM predictions and score the blend out of fold.
XGB_WEIGHT, LGB_WEIGHT = 0.9, 0.1

ensemble_oof_scores = []
ensemble_oof_df = train_df[['isic_id', 'target']].copy()


for fold in range(n_splits):
    # Slice the validation rows once; neither model path below mutates the slice.
    X_valid = train_df[train_df["fold"] == fold]

    # xgb: same cleaning as Trainer.fit_xgb (inf -> NaN -> 0) on the feature columns,
    # and the same plain DMatrix construction used when the models were trained.
    xgb_features = X_valid[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
    dvalid = xgb.DMatrix(xgb_features)

    tmp_xgb = xgb_clfs[fold].predict(dvalid)
    print(f'fold {fold}', 'xgb', score(X_valid['target'], tmp_xgb))

    # lgb: predicts on the raw feature columns
    tmp_lgb = clfs[fold].predict(X_valid[feature_cols])
    print(f'fold {fold}', 'lgb', score(X_valid['target'], tmp_lgb))

    blend_tmp = tmp_xgb * XGB_WEIGHT + tmp_lgb * LGB_WEIGHT
    blend_score = score(X_valid['target'], blend_tmp)
    print(f'fold {fold}', 'blend', blend_score)

    ensemble_oof_scores.append(blend_score)

    ensemble_oof_df.loc[X_valid.index, 'pred'] = blend_tmp
    ensemble_oof_df.loc[X_valid.index, 'pred_xgb'] = tmp_xgb
    ensemble_oof_df.loc[X_valid.index, 'pred_lgb'] = tmp_lgb


In [None]:
# Mean of the per-fold blended scores.
np.mean(ensemble_oof_scores)

In [None]:
# Score computed once over all blended out-of-fold predictions pooled together.
score(ensemble_oof_df['target'], ensemble_oof_df['pred'])

In [None]:
# Preview the blended, XGBoost, and LightGBM out-of-fold prediction columns.
ensemble_oof_df.head()

In [None]:
# Persist the out-of-fold predictions as CSV. The path is relative, so the file is
# written to the current working directory rather than output_dir.
ensemble_oof_df.to_csv('ensemble_oof_df_exp1_without_fyk.csv', index=False)


In [None]:
# Mean of the per-fold blended scores (repeats the earlier display of the same value).
np.mean(ensemble_oof_scores)

In [None]:
from matplotlib import pyplot as plt

# ROC curve of the blended out-of-fold predictions, with the TPR = 0.8 level marked.
fpr_train, tpr_train, thresholds_train = roc_curve(
    ensemble_oof_df['target'],
    ensemble_oof_df['pred'],
    drop_intermediate=False,
)

fig, ax = plt.subplots()
ax.plot(fpr_train, tpr_train, color="blue", label="train")
ax.plot([0, 1], [0.8, 0.8], color="green", label="tpr=0.8")
ax.set_xlabel('FPR: False positive rate')
ax.set_ylabel('TPR: True positive rate')
ax.set_xlim(0, 1.02)
ax.set_ylim(0, 1.02)
ax.grid()
ax.legend()
plt.show()


In [None]:
# Number of columns currently in train_df.
len(train_df.columns)