In [35]:
from os.path import join, basename, splitext
from collections import defaultdict
import itertools
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from coreml.utils.io import read_yml

In [36]:
# Run directories (relative to /output) whose evaluation CSVs are stacked below.
# The first entry is the root directory itself (trailing slash kept as-is);
# the remaining four are its `fold2` .. `fold5` sub-directories.
config_root = 'competitions/2020/melanoma-classification/configs/effb5/best-1cycle-wd4e-1-384/'
configs = [config_root] + [f'{config_root}fold{fold}' for fold in range(2, 6)]

In [37]:
# Raw metadata CSVs: 2020 and 2019 training tables plus the 2020 test table.
raw_dir = '/data/siim-isic-melanoma/raw'
whole_train_2020 = pd.read_csv(join(raw_dir, '2020', 'train.csv'))
whole_train_2019 = pd.read_csv(join(raw_dir, '2019', 'train.csv'))
test_2020 = pd.read_csv(join(raw_dir, '2020', 'test.csv'))

In [38]:
# Preview the first five rows of the 2020 training metadata.
whole_train_2020.head(n=5)

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,0,6000,4000
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0,6000,4000
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,6,1872,1053
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,0,1872,1053
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,11,6000,4000


In [39]:
# Preview the first five rows of the 2019 training metadata.
whole_train_2019.head(n=5)

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
0,ISIC_0000000,-1,female,55.0,anterior torso,NV,benign,0,4,1022,767
1,ISIC_0000001,-1,female,30.0,anterior torso,NV,benign,0,18,1022,767
2,ISIC_0000002,-1,female,60.0,upper extremity,MEL,malignant,1,0,1022,767
3,ISIC_0000003,-1,male,30.0,upper extremity,NV,benign,0,24,1022,767
4,ISIC_0000004,-1,male,80.0,posterior torso,MEL,malignant,1,14,1022,767


In [40]:
# Frequency of each anatomical site in the 2020 table (NaN rows are not counted).
whole_train_2020['anatom_site_general_challenge'].value_counts(dropna=True)

torso              16845
lower extremity     8417
upper extremity     4983
head/neck           1855
palms/soles          375
oral/genital         124
Name: anatom_site_general_challenge, dtype: int64

In [41]:
# Frequency of each anatomical site in the 2019 table (NaN rows are not counted).
whole_train_2019['anatom_site_general_challenge'].value_counts(dropna=True)

anterior torso     6915
lower extremity    4990
head/neck          4587
upper extremity    2910
posterior torso    2787
palms/soles         398
oral/genital         59
lateral torso        54
Name: anatom_site_general_challenge, dtype: int64

In [42]:
def _collapse_torso(site):
    """Return 'torso' for any string containing 'torso'; pass every other value through.

    Non-string values (e.g. NaN for a missing site) are returned unchanged.
    """
    if isinstance(site, str) and 'torso' in site:
        return 'torso'
    return site


# The 2019 table splits torso into several sub-sites; fold them into one
# 'torso' label so the column uses the same vocabulary as the 2020 table.
whole_train_2019['anatom_site_general_challenge'] = (
    whole_train_2019['anatom_site_general_challenge'].map(_collapse_torso)
)

In [43]:
# Re-check the 2019 site frequencies after the torso labels were merged.
whole_train_2019['anatom_site_general_challenge'].value_counts(dropna=True)

torso              9756
lower extremity    4990
head/neck          4587
upper extremity    2910
palms/soles         398
oral/genital         59
Name: anatom_site_general_challenge, dtype: int64

In [44]:
# Frequency of each diagnosis label in the 2020 table (NaN rows are not counted).
whole_train_2020['diagnosis'].value_counts(dropna=True)

unknown                               27124
nevus                                  5193
melanoma                                584
seborrheic keratosis                    135
lentigo NOS                              44
lichenoid keratosis                      37
solar lentigo                             7
atypical melanocytic proliferation        1
cafe-au-lait macule                       1
Name: diagnosis, dtype: int64

In [45]:
# Frequency of each diagnosis label in the 2019 table (NaN rows are not counted).
whole_train_2019['diagnosis'].value_counts(dropna=True)

NV      12875
MEL      4522
BCC      3323
BKL      2624
AK        867
SCC       628
VASC      253
DF        239
Name: diagnosis, dtype: int64

In [46]:
# Stack the 2019 rows on top of the 2020 rows into one training-metadata table.
# pd.concat replaces DataFrame.append, which is deprecated and no longer exists
# in pandas 2.x; as before, each frame keeps its own original row index.
whole_train_data = pd.concat([whole_train_2019, whole_train_2020])

In [47]:
# Preview the first five rows of the combined 2019 + 2020 training metadata.
whole_train_data.head(n=5)

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
0,ISIC_0000000,-1,female,55.0,torso,NV,benign,0,4,1022,767
1,ISIC_0000001,-1,female,30.0,torso,NV,benign,0,18,1022,767
2,ISIC_0000002,-1,female,60.0,upper extremity,MEL,malignant,1,0,1022,767
3,ISIC_0000003,-1,male,30.0,upper extremity,NV,benign,0,24,1022,767
4,ISIC_0000004,-1,male,80.0,torso,MEL,malignant,1,14,1022,767


In [48]:
# Remove the metadata's own `target` column (redundant, presumably with
# `benign_malignant`). The per-fold frames that are merged with this table
# later already carry a `target` column holding the image-model score, so
# dropping it here also avoids a name clash in that merge.
# errors='ignore' keeps this cell safe to re-run once the column is gone.
whole_train_data = whole_train_data.drop(columns=['target'], errors='ignore')

In [49]:
# Confirm that `target` is no longer among the columns.
whole_train_data.head(n=5)

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,tfrecord,width,height
0,ISIC_0000000,-1,female,55.0,torso,NV,benign,4,1022,767
1,ISIC_0000001,-1,female,30.0,torso,NV,benign,18,1022,767
2,ISIC_0000002,-1,female,60.0,upper extremity,MEL,malignant,0,1022,767
3,ISIC_0000003,-1,male,30.0,upper extremity,NV,benign,24,1022,767
4,ISIC_0000004,-1,male,80.0,torso,MEL,malignant,14,1022,767


In [50]:
# Discard every row that has a missing value in any column, then renumber
# the rows 0..n-1 (the concatenated frame had repeated index values).
whole_train_data = whole_train_data.dropna(axis='index', how='any')
whole_train_data = whole_train_data.reset_index(drop=True)

In [57]:
# Number of training-metadata rows left after dropping incomplete ones.
whole_train_data.shape[0]

55011

In [51]:
# Disabled alternative to dropping incomplete rows: replace NaN in anatom_site_general_challenge and sex with 'unknown'.
# whole_train_data['anatom_site_general_challenge'] = whole_train_data['anatom_site_general_challenge'].fillna(value='unknown')
# whole_train_data['sex'] = whole_train_data['sex'].fillna(value='unknown')

In [52]:
# Preview the first five rows of the 2020 test metadata.
test_2020.head(n=5)

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge
0,ISIC_0052060,IP_3579794,male,70.0,
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity
2,ISIC_0058510,IP_7960270,female,55.0,torso
3,ISIC_0073313,IP_6375035,female,50.0,torso
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity


In [53]:
# Test rows cannot be dropped, so a missing anatomical site is replaced by the
# placeholder string 'unknown' instead.
site_column = 'anatom_site_general_challenge'
test_2020[site_column] = test_2020[site_column].fillna('unknown')

In [54]:
# Confirm that the missing site in the first test row now reads 'unknown'.
test_2020.head(n=5)

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge
0,ISIC_0052060,IP_3579794,male,70.0,unknown
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity
2,ISIC_0058510,IP_7960270,female,55.0,torso
3,ISIC_0073313,IP_6375035,female,50.0,torso
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity


In [55]:
# One-hot encoder for the categorical metadata columns (despite the name it is
# not a LabelEncoder). handle_unknown='ignore' makes categories that were not
# seen during fit encode as all zeros instead of raising at transform time.
label_encoder = OneHotEncoder(
    categories='auto',
    handle_unknown='ignore',
)

In [56]:
# Learn the category vocabulary for sex and anatomical site from the training
# metadata only; the fitted encoder is reused for every fold below.
categorical_columns = ['sex', 'anatom_site_general_challenge']
label_encoder.fit(whole_train_data[categorical_columns])

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True)

In [74]:
def _prepare_split(records):
    """Build a per-image frame from one split of the data config.

    Adds `image_name` (basename of each record's `file`, extension stripped),
    unwraps `label` to its `classification` entry, and drops `file`.
    """
    frame = pd.DataFrame(records)
    frame['image_name'] = frame['file'].apply(lambda x: splitext(basename(x))[0])
    frame['label'] = frame['label'].apply(lambda x: x['classification'])
    return frame.drop(columns=['file'])


def _build_features(frame):
    """Return the stacking-model input matrix for `frame`.

    Column order is [target, age_approx, one-hot(sex, anatom_site_general_challenge)],
    where `target` is the score read from the evaluation CSVs. Column 0 is
    therefore always the image model's own prediction.
    """
    sex_site = label_encoder.transform(
        frame[['sex', 'anatom_site_general_challenge']]).toarray()
    age_target = frame[['target', 'age_approx']]
    return np.concatenate([age_target, sex_site], -1)


all_test_predictions = []
val_auc_no_meta = []
val_auc_meta = []

# Fold-independent part of the XGBoost grid; the class-balance weight is
# derived per fold inside the loop.
lrs = [0.05, 0.1]
depths = [2, 3, 5]

for index, config in enumerate(configs):
    eval_dir = join('/output', config, 'logs/evaluation')
    prediction_train = pd.read_csv(join(eval_dir, 'train.csv'))
    prediction_val = pd.read_csv(join(eval_dir, 'val.csv'))
    prediction_test = pd.read_csv(join(eval_dir, 'test.csv'))

    # The position of a config in `configs` selects the data version file.
    data_config_path = f'/data/siim-isic-melanoma/processed/versions/v3.0.{index}.yml'
    print(f'Reading data config: {data_config_path}')
    data_config = read_yml(data_config_path)

    train = _prepare_split(data_config['train'])
    val = _prepare_split(data_config['val'])
    test = _prepare_split(data_config['test'])

    print('Shapes:')
    print(len(prediction_train), len(train))
    print(len(prediction_val), len(val))
    print(len(prediction_test), len(test))
    print()

    # No `on=` given: pandas joins on every column the two frames share.
    train = pd.merge(prediction_train, train)
    val = pd.merge(prediction_val, val)
    test = pd.merge(prediction_test, test)

    # NOTE: this baseline is computed on all validation rows, before the
    # metadata merge and dropna below can shrink `val`, so it is not measured
    # on exactly the same rows as the 'Final AUC' reported for this fold.
    print('Performance without using tabular data')
    val_preds = val['target'].values
    val_labels = val['label'].values
    roc = roc_auc_score(val_labels, val_preds)
    print(roc)
    val_auc_no_meta.append(roc)
    print()

    # Attach patient metadata; the default inner join drops images that have
    # no row in the metadata table.
    train = pd.merge(train, whole_train_data, on='image_name')
    val = pd.merge(val, whole_train_data, on='image_name')
    test = pd.merge(test, test_2020, on='image_name')
    print(len(train), len(val), len(test), '\n')

    # Only train/val are filtered; every test row must be kept for submission.
    train = train.dropna().reset_index(drop=True)
    val = val.dropna().reset_index(drop=True)
    print(len(train), len(val), len(test), '\n')

    X_train = _build_features(train)
    X_val = _build_features(val)
    X_test = _build_features(test)

    y_train = train['label'].values
    y_val = val['label'].values

    print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, '\n')
    # Ratio of label-0 to label-1 training rows, used as XGBoost's
    # scale_pos_weight for the class-balanced variants.
    count_dict = dict(train['label'].value_counts())
    pos_scale = count_dict[0] / count_dict[1]

    # Seed both ensembles with column 0 (the raw image-model score) so it is
    # averaged together with the stacked models.
    test_predictions = [X_test[:, 0]]
    val_predictions = [X_val[:, 0]]
    scales = [1, pos_scale]

    for lr, depth, scale in itertools.product(lrs, depths, scales):
        print(f'LR={lr}, max_depth={depth}, pos_scale={scale}')
        model = XGBClassifier(learning_rate=lr, max_depth=depth, scale_pos_weight=scale)
        model.fit(X_train, y_train)

        y_val_pred = model.predict_proba(X_val)
        print(roc_auc_score(y_val, y_val_pred[:, 1]))
        print()

        val_predictions.append(y_val_pred[:, 1])

        y_test_pred = model.predict_proba(X_test)
        test_predictions.append(y_test_pred[:, 1])

    # Unweighted mean over the raw score and every grid model.
    ensemble_val = np.stack(val_predictions, -1).mean(-1)
    print('Final AUC')
    roc = roc_auc_score(y_val, ensemble_val)
    print(roc)
    val_auc_meta.append(roc)
    # Flat list across folds: one entry per (fold, model) test prediction vector.
    all_test_predictions += test_predictions
    print('=========================================================')

Reading data config: /data/siim-isic-melanoma/processed/versions/v3.0.0.yml
Shapes:
39015 39015
6536 6536
10982 10982

Performance without using tabular data
0.9181316467934257

35920 6455 10982 

35920 6455 10982 

(35920, 10) (35920,) (6455, 10) (6455,) (10982, 10) 

LR=0.05, max_depth=2, pos_scale=1
0.818977756743966

LR=0.05, max_depth=2, pos_scale=17.3640081799591
0.8925228878698369

LR=0.05, max_depth=3, pos_scale=1
0.8189274387888876

LR=0.05, max_depth=3, pos_scale=17.3640081799591
0.8923848534795547

LR=0.05, max_depth=5, pos_scale=1
0.8187078077146945

LR=0.05, max_depth=5, pos_scale=17.3640081799591
0.844814666731944

LR=0.1, max_depth=2, pos_scale=1
0.8957160381002116

LR=0.1, max_depth=2, pos_scale=17.3640081799591
0.8945600850781424

LR=0.1, max_depth=3, pos_scale=1
0.8944451697483013

LR=0.1, max_depth=3, pos_scale=17.3640081799591
0.8949633086911347

LR=0.1, max_depth=5, pos_scale=1
0.8883655368245835

LR=0.1, max_depth=5, pos_scale=17.3640081799591
0.8609796497870327



In [75]:
# Turn the list of per-(fold, model) prediction vectors into a single array,
# one row per vector.
all_test_predictions = np.asarray(all_test_predictions)

In [76]:
# Average over axis 0 (one row per fold/model prediction vector) to get one
# score per test image. The ndim guard makes the cell safe to re-run: without
# it a second run would average the 1-D result down to a single scalar, which
# would then be silently broadcast to every row of the submission.
if all_test_predictions.ndim > 1:
    all_test_predictions = all_test_predictions.mean(axis=0)

In [77]:
# `test` is the merged test frame left over from the last fold of the loop
# above; only its image names (and their order) are used for the submission.
# .copy() gives final_df its own data, so adding a column to it later does not
# raise pandas' SettingWithCopyWarning.
final_df = test[['image_name']].copy()

In [78]:
# Preview the submission frame before the scores are attached.
final_df.head(n=5)

Unnamed: 0,image_name
0,ISIC_0052060
1,ISIC_0052349
2,ISIC_0058510
3,ISIC_0073313
4,ISIC_0073502


In [79]:
# Attach the averaged ensemble score to each test image, by row position.
test_scores = all_test_predictions.tolist()
final_df['target'] = test_scores

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [80]:
# Preview the submission frame with the `target` scores attached.
final_df.head(n=5)

Unnamed: 0,image_name,target
0,ISIC_0052060,0.0019
1,ISIC_0052349,0.001756
2,ISIC_0058510,0.001759
3,ISIC_0073313,0.001817
4,ISIC_0073502,0.001838


In [81]:
# Write the submission file (image_name, target) without the row index.
# The path is relative to the notebook's working directory.
submission_path = 'submissions/v1.csv'
final_df.to_csv(submission_path, index=False)