In [1]:
import re
import numpy as np
import pandas as pd
import pandas.api.types
import matplotlib.pyplot as plt

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.ensemble import VotingClassifier

import optuna
import catboost as cb
import lightgbm as lgb
import xgboost as xgb

OPTIMIZE_OPTUNA = True
SUBSAMPLE = False
SUBSAMPLE_RATIO = 0.5 # only effective if SUBSAMPLE=True
DISPLAY_FEATURE_IMPORTANCE = False
path='/kaggle/input/modified-datav0/modified_dataset.csv'

In [2]:
df_train=pd.read_csv(path,low_memory=False)

In [3]:
df_test = pd.read_csv("/kaggle/input/isic-2024-challenge/test-metadata.csv")

In [4]:
df_test.head()

Unnamed: 0,isic_id,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,...,tbp_lv_radial_color_std_max,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z,attribution,copyright_license
0,ISIC_0015657,IP_6074337,45.0,male,posterior torso,2.7,TBP tile: close-up,3D: XP,22.80433,20.00727,...,0.304827,1.281532,2.299935,0.479339,20,-155.0651,1511.222,113.9801,Memorial Sloan Kettering Cancer Center,CC-BY
1,ISIC_0015729,IP_1664139,35.0,female,lower extremity,2.52,TBP tile: close-up,3D: XP,16.64867,9.657964,...,0.0,1.27194,2.011223,0.42623,25,-112.36924,629.535889,-15.019287,"Frazer Institute, The University of Queensland...",CC-BY
2,ISIC_0015740,IP_7142616,65.0,male,posterior torso,3.16,TBP tile: close-up,3D: XP,24.25384,19.93738,...,0.230742,1.080308,2.705857,0.366071,110,-84.29282,1303.978,-28.57605,FNQH Cairns,CC-BY


In [5]:
for col in df_test.columns:
    if((df_test[col].dtype=='float64' or df_test[col].dtype=='int64') and df_test[col].isna().sum()!=0):
        df_test[col].fillna(value=df_test[col].mean(),inplace=True);
    if(df_test[col].dtype=='object' and df_test[col].isna().sum()!=0):
        df_test[col].fillna(value=df_test[col].mode()[0],inplace=True);

In [6]:
obj_but_not_cat=['image_type','copyright_license','patient_id','attribution','image_type']

In [7]:
train=pd.read_csv('/kaggle/input/isic-2024-challenge/train-metadata.csv',low_memory=False)
for col in train.columns:
    if((train[col].dtype=='float64' or train[col].dtype=='int64') and train[col].isna().sum()!=0):
        train[col].fillna(value=train[col].mean(),inplace=True);
    if(train[col].dtype=='object' and train[col].isna().sum()!=0):
        train[col].fillna(value=train[col].mode()[0],inplace=True);

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(value=train[col].mean(),inplace=True);
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(value=train[col].mode()[0],inplace=True);


In [8]:
cat_labels=[col for col in df_test.columns if (df_test[col].dtype=='object' and col not in obj_but_not_cat and col!='isic_id')]

In [9]:
cat_labels

['sex',
 'anatom_site_general',
 'tbp_tile_type',
 'tbp_lv_location',
 'tbp_lv_location_simple']

In [10]:
cols_unique_train=[col for col in train.columns if col not in df_test.columns]
train_df=train.drop(cols_unique_train,axis=1)
encoder = OneHotEncoder(sparse_output=False, drop='first')
one_hot_encoded = encoder.fit_transform(train_df[cat_labels])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(cat_labels))
train_df_encoded = pd.concat([train_df.drop(columns=cat_labels), one_hot_df], axis=1)
target=train['target']
train_df_original=train_df_encoded.drop(obj_but_not_cat,axis=1)
train_df_original['targets']=target
train_df_original.shape

(401059, 69)

In [11]:
one_hot_encoded = encoder.transform(df_test[cat_labels])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(cat_labels))
test_df_encoded = pd.concat([df_test.drop(columns=cat_labels), one_hot_df], axis=1)

In [12]:
cat_cols=encoder.get_feature_names_out()

In [13]:
cat_cols

array(['sex_male', 'anatom_site_general_head/neck',
       'anatom_site_general_lower extremity',
       'anatom_site_general_posterior torso',
       'anatom_site_general_upper extremity', 'tbp_tile_type_3D: white',
       'tbp_lv_location_Left Arm', 'tbp_lv_location_Left Arm - Lower',
       'tbp_lv_location_Left Arm - Upper', 'tbp_lv_location_Left Leg',
       'tbp_lv_location_Left Leg - Lower',
       'tbp_lv_location_Left Leg - Upper', 'tbp_lv_location_Right Arm',
       'tbp_lv_location_Right Arm - Lower',
       'tbp_lv_location_Right Arm - Upper', 'tbp_lv_location_Right Leg',
       'tbp_lv_location_Right Leg - Lower',
       'tbp_lv_location_Right Leg - Upper', 'tbp_lv_location_Torso Back',
       'tbp_lv_location_Torso Back Bottom Third',
       'tbp_lv_location_Torso Back Middle Third',
       'tbp_lv_location_Torso Back Top Third',
       'tbp_lv_location_Torso Front',
       'tbp_lv_location_Torso Front Bottom Half',
       'tbp_lv_location_Torso Front Top Half', 'tbp_lv_l

In [14]:
test_df_encoded.shape
test_df=test_df_encoded.drop(obj_but_not_cat,axis=1)
df_test.shape,test_df.shape

((3, 44), (3, 68))

In [15]:
test_df.shape,df_train.shape

((3, 68), (10786, 69))

In [16]:
[col for col in df_train.columns if col not in test_df.columns]

['targets']

In [17]:
[col for col in ['anatom_site_general_head_neck',
 'tbp_tile_type_3D_ white',
 'tbp_lv_location_Left Arm _ Lower',
 'tbp_lv_location_Left Arm _ Upper',
 'tbp_lv_location_Left Leg _ Lower',
 'tbp_lv_location_Left Leg _ Upper',
 'tbp_lv_location_Right Arm _ Lower',
 'tbp_lv_location_Right Arm _ Upper',
 'tbp_lv_location_Right Leg _ Lower',
 'tbp_lv_location_Right Leg _ Upper',
 'targets',
 'fold'] if col not in df_train.columns]

['anatom_site_general_head_neck',
 'tbp_tile_type_3D_ white',
 'tbp_lv_location_Left Arm _ Lower',
 'tbp_lv_location_Left Arm _ Upper',
 'tbp_lv_location_Left Leg _ Lower',
 'tbp_lv_location_Left Leg _ Upper',
 'tbp_lv_location_Right Arm _ Lower',
 'tbp_lv_location_Right Arm _ Upper',
 'tbp_lv_location_Right Leg _ Lower',
 'tbp_lv_location_Right Leg _ Upper',
 'fold']

In [18]:
df_train.head()

Unnamed: 0,isic_id,age_approx,clin_size_long_diam_mm,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,tbp_lv_Cext,tbp_lv_H,...,tbp_lv_location_Torso Front Top Half,tbp_lv_location_Unknown,tbp_lv_location_simple_Left Arm,tbp_lv_location_simple_Left Leg,tbp_lv_location_simple_Right Arm,tbp_lv_location_simple_Right Leg,tbp_lv_location_simple_Torso Back,tbp_lv_location_simple_Torso Front,tbp_lv_location_simple_Unknown,targets
0,ISIC_4798323,75.0,4.64,21.60004,12.26432,24.05497,23.69372,32.3296,26.67969,48.0779,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,ISIC_2133518,55.0,2.8,24.71447,16.41283,37.20803,34.84004,44.66814,38.51246,56.40693,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,ISIC_4495069,85.0,8.54,16.80642,17.87494,17.12793,23.33297,23.99629,29.39287,45.54284,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
3,ISIC_6355136,35.0,3.84,17.366963,14.068074,26.068361,27.089353,31.323647,30.524478,56.328111,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,ISIC_3413459,45.0,2.82,25.223707,19.673211,33.591657,30.663451,42.007557,36.431888,53.097407,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0


In [19]:
df_train.columns = df_train.columns.str.replace(r'[^\w\s]', '_', regex=True)
test_df.columns=test_df.columns.str.replace(r'[^\w\s]', '_', regex=True)
cat_cols = [re.sub(r'[^\w\s]', '_', col) for col in cat_cols]

In [20]:
test_df[cat_cols]=test_df[cat_cols].astype(int)
df_train[cat_cols]=df_train[cat_cols].astype(int)

In [21]:
df_train.shape

(10786, 69)

In [22]:
N_SPLITS = 5

gkf = StratifiedKFold(n_splits=N_SPLITS,shuffle=True,random_state=42)

if SUBSAMPLE:
    df_pos = df_train[df_train["targets"] == 1]
    df_neg = df_train[df_train["targets"] == 0]
    df_neg = df_neg.sample(frac=SUBSAMPLE_RATIO, random_state=42)
    df_train = pd.concat([df_pos, df_neg]).sample(frac=1.0, random_state=42).reset_index(drop=True)    

df_train["fold"] = -1

for idx, (train_idx, val_idx) in enumerate(gkf.split(df_train, df_train["targets"])):
    df_train.loc[val_idx, "fold"] = idx

In [23]:
train_cols=df_train.drop(['targets','isic_id','fold'],axis=1).columns

In [24]:
len(train_cols)

67

In [25]:
[col for col in df_train.columns if col not in test_df.columns]

['targets', 'fold']

In [26]:
def comp_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, min_tpr: float=0.80):
    v_gt = abs(np.asarray(solution.values)-1)
    v_pred = np.array([1.0 - x for x in submission.values])
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return partial_auc

def custom_lgbm_metric(y_true, y_hat):
    # TODO: Refactor with the above.
    min_tpr = 0.80
    v_gt = abs(y_true-1)
    v_pred = np.array([1.0 - x for x in y_hat])
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return "pauc80", partial_auc, True

In [27]:
def objective(trial):
    param = {
        "objective":         "binary",
        # "metric":           "custom",
        "verbosity":         -1,
        "boosting_type":     "gbdt",
        "lambda_l1":         trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2":         trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves":        trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction":  trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction":  trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq":      trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "device":            "gpu"
    }
    
    scores = []
    
    for fold in range(N_SPLITS):
        _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
        _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
        dtrain = lgb.Dataset(_df_train[train_cols], label=_df_train["targets"])
        gbm = lgb.train(param, dtrain)
        preds = gbm.predict(_df_valid[train_cols])
        score = comp_score(_df_valid[["targets"]], pd.DataFrame(preds, columns=["prediction"]), "")
        scores.append(score)
        
    return np.mean(scores)

In [28]:
if OPTIMIZE_OPTUNA:
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=21)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))


[I 2024-08-12 13:31:49,708] A new study created in memory with name: no-name-9d2bedab-3153-40bc-95ab-411adfd9d8e7
[I 2024-08-12 13:31:58,377] Trial 0 finished with value: 0.187833751511731 and parameters: {'lambda_l1': 3.4705418676925754, 'lambda_l2': 3.569686689772692e-08, 'num_leaves': 225, 'feature_fraction': 0.7589582192977397, 'bagging_fraction': 0.5028147443018077, 'bagging_freq': 2, 'min_child_samples': 74}. Best is trial 0 with value: 0.187833751511731.
[I 2024-08-12 13:32:01,478] Trial 1 finished with value: 0.1917363210513585 and parameters: {'lambda_l1': 6.582596796625422e-05, 'lambda_l2': 0.038990677300184684, 'num_leaves': 24, 'feature_fraction': 0.8140921467686277, 'bagging_fraction': 0.5770870690658569, 'bagging_freq': 7, 'min_child_samples': 96}. Best is trial 1 with value: 0.1917363210513585.
[I 2024-08-12 13:32:07,306] Trial 2 finished with value: 0.19345886882205915 and parameters: {'lambda_l1': 0.00015099145265295538, 'lambda_l2': 0.00024452735224089213, 'num_leaves

Number of finished trials: 21
Best trial:
  Value: 0.19435193259695233
  Params: 
    lambda_l1: 0.014070211412687408
    lambda_l2: 0.001782397068061876
    num_leaves: 163
    feature_fraction: 0.8537416122698143
    bagging_fraction: 0.5901122028199145
    bagging_freq: 3
    min_child_samples: 33


In [29]:
new_params = {
    "objective": "binary",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "n_estimators": 200,
    'learning_rate': 0.05,    
    'lambda_l1': 0.00013629833487533019,
    'lambda_l2': 6.521166069777159e-08,
    'num_leaves': 61,
    'feature_fraction': 0.6984612122416661,
    'bagging_fraction': 0.9064787175972882,
    'bagging_freq': 3,
    'min_child_samples': 16,
    "device": "gpu"
}

In [30]:
new_params_original = {
    "objective": "binary",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "n_estimators": 200,
    'learning_rate': 0.05,    
    'lambda_l1': 0.0004681884533249742, 
    'lambda_l2': 8.765240856362274, 
    'num_leaves': 136, 
    'feature_fraction': 0.5392005444882538, 
    'bagging_fraction': 0.9577412548866563, 
    'bagging_freq': 6,
    'min_child_samples': 60,
    "device": "gpu"
}
lgb_scores = []
lgb_models = []
oof_df = pd.DataFrame()
for fold in range(N_SPLITS):
    _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
    _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
    # model = lgb.LGBMClassifier(**new_params)
    model = VotingClassifier([(f"lgb_{i}", lgb.LGBMClassifier(random_state=i, **new_params)) for i in range(7)], voting="soft")
    model.fit(_df_train[train_cols], _df_train["targets"])
    preds = model.predict_proba(_df_valid[train_cols])[:, 1]
    score = comp_score(_df_valid[["targets"]], pd.DataFrame(preds, columns=["prediction"]), "")
    print(f"fold: {fold} - Partial AUC Score: {score:.5f}")
    lgb_models.append(model)
    oof_single = _df_valid[["isic_id", "targets"]].copy()
    oof_single["pred"] = preds
    oof_df = pd.concat([oof_df, oof_single])

fold: 0 - Partial AUC Score: 0.19844
fold: 1 - Partial AUC Score: 0.19715
fold: 2 - Partial AUC Score: 0.19739
fold: 3 - Partial AUC Score: 0.19005
fold: 4 - Partial AUC Score: 0.19011


In [31]:
lgbm_score = comp_score(oof_df["targets"], oof_df["pred"], "")
print(f"LGBM Score: {lgbm_score:.5f}")

LGBM Score: 0.19450


In [32]:
if DISPLAY_FEATURE_IMPORTANCE:
    # Make sure that this is a single model, not voting classifier. Will handle that later on.
    importances = np.mean([model.feature_importances_ for model in lgb_models], 0)
    df_imp = pd.DataFrame({"feature": model.feature_name_, "importance": importances}).sort_values("importance").reset_index(drop=True)

    plt.figure(figsize=(16, 12))
    plt.barh(df_imp["feature"], df_imp["importance"])
    plt.show()

In [33]:
def objective(trial):
    param = {
        "objective":         trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth":             trial.suggest_int("depth", 1, 12),
        "boosting_type":     trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type":    trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        # "task_type":       "GPU",
        # "used_ram_limit":  "3gb",
    }
    
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    scores = []
    
    for fold in range(N_SPLITS):
        _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
        _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
        gbm = cb.CatBoostClassifier(**param)
        gbm.fit(_df_train[train_cols], _df_train["targets"], eval_set=[(_df_valid[train_cols], _df_valid["targets"])], verbose=0, early_stopping_rounds=100)
        preds = gbm.predict(_df_valid[train_cols])
        score = comp_score(_df_valid[["targets"]], pd.DataFrame(preds, columns=["prediction"]), "")
        scores.append(score)
        
    return np.mean(scores)

In [34]:
if OPTIMIZE_OPTUNA:
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=21, timeout=500)
    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial
    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2024-08-12 13:35:07,923] A new study created in memory with name: no-name-d3454f38-d8f9-42bc-a63a-35f2431f3826
[I 2024-08-12 13:35:20,330] Trial 0 finished with value: 0.04778276139544104 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.03649281578904136, 'depth': 2, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.04778276139544104.
[I 2024-08-12 13:35:33,327] Trial 1 finished with value: 0.06860134816555252 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.021297472875530164, 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.06860134816555252.
[I 2024-08-12 13:36:26,599] Trial 2 finished with value: 0.1549215152911431 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.09894586912893882, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.8835930550238581}. Best is trial 2 with value: 0.1549215152911431.
[I 2024-08-12 13:36

Number of finished trials: 11
Best trial:
  Value: 0.1549215152911431
  Params: 
    objective: CrossEntropy
    colsample_bylevel: 0.09894586912893882
    depth: 9
    boosting_type: Plain
    bootstrap_type: Bernoulli
    subsample: 0.8835930550238581


In [35]:
cb_params = {
    'objective': 'Logloss',
    "iterations": 400,
    "learning_rate": 0.05,
    "cat_features": cat_cols,
    "max_depth": 8,
    "l2_leaf_reg": 5,
    "task_type": "GPU",
    # "scale_pos_weight": 2,
    "verbose": 0,
}
cb_scores = []
cb_models = []
for fold in range(N_SPLITS):
    _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
    _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
    # model = cb.CatBoostClassifier(**cb_params)
    model = VotingClassifier([(f"cb_{i}", cb.CatBoostClassifier(random_state=i, **cb_params)) for i in range(3)], voting="soft")
    # eval_set=(_df_valid[train_cols], _df_valid["target"]), early_stopping_rounds=50
    model.fit(_df_train[train_cols], _df_train["targets"])
    preds = model.predict_proba(_df_valid[train_cols])[:, 1]
    score = comp_score(_df_valid[["targets"]], pd.DataFrame(preds, columns=["prediction"]), "")
    print(f"fold: {fold} - Partial AUC Score: {score:.5f}")
    cb_scores.append(score)
    cb_models.append(model)

fold: 0 - Partial AUC Score: 0.19744
fold: 1 - Partial AUC Score: 0.19672
fold: 2 - Partial AUC Score: 0.19651
fold: 3 - Partial AUC Score: 0.19193
fold: 4 - Partial AUC Score: 0.19059


In [36]:
cb_score = np.mean(cb_scores)
print(f"CatBoost Score: {cb_score:.5f}")

CatBoost Score: 0.19464


In [37]:
test_df.shape

(3, 68)

In [38]:
lgb_preds = np.mean([model.predict_proba(test_df[train_cols])[:, 1] for model in lgb_models], 0)
cb_preds  = np.mean([model.predict_proba(test_df[train_cols])[:, 1] for model in cb_models],  0)
preds = lgb_preds * 0.5 + cb_preds * 0.5

In [39]:
df_sub = pd.read_csv("/kaggle/input/isic-2024-challenge/sample_submission.csv")
df_sub["target"] = preds
df_sub.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
target,3.0,0.001238,0.000939,0.000396,0.000732,0.001067,0.001658,0.00225


In [40]:
df_sub

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.001067
1,ISIC_0015729,0.000396
2,ISIC_0015740,0.00225


In [41]:
df_sub.to_csv("submission.csv", index=False)