In [279]:
from os.path import join, basename, splitext
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from coreml.utils.io import read_yml

In [2]:
config = 'competitions/2020/melanoma-classification/configs/efficientnet/best-ns-colorjitter'

In [414]:
prediction_train = pd.read_csv(join('/output', config, 'logs/evaluation/train.csv'))
prediction_val = pd.read_csv(join('/output', config, 'logs/evaluation/val.csv'))

In [415]:
data_config = read_yml(join('/data/siim-isic-melanoma/processed/versions/v3.0.0.yml'))

In [416]:
train, val = data_config['train'], data_config['val']

In [417]:
train = pd.DataFrame(train)
val = pd.DataFrame(val)

In [418]:
train['name'] = train['file'].apply(lambda x: splitext(basename(x))[0])
val['name'] = val['file'].apply(lambda x: splitext(basename(x))[0])

In [419]:
train['label'] = train['label'].apply(lambda x: x['classification'])
val['label'] = val['label'].apply(lambda x: x['classification'])

In [420]:
train = train.drop(columns=['file'])
val = val.drop(columns=['file'])

In [421]:
train = train.rename(columns={'name': 'image_name'})
val = val.rename(columns={'name': 'image_name'})

In [422]:
train.head()

Unnamed: 0,label,image_name
0,0,ISIC_0052212
1,0,ISIC_0074311
2,0,ISIC_0074542
3,0,ISIC_0076545
4,0,ISIC_0076995


In [423]:
whole_train_2020 = pd.read_csv('/data/siim-isic-melanoma/raw/2020/train.csv')
whole_train_2019 = pd.read_csv('/data/siim-isic-melanoma/raw/2019/train.csv')

In [424]:
prediction_train

Unnamed: 0,image_name,target
0,ISIC_0052212,0.067617
1,ISIC_0188432,0.508675
2,ISIC_0074311,0.019427
3,ISIC_0207268,0.471323
4,ISIC_0074542,0.116158
...,...,...
4227,ISIC_0034313,0.933255
4228,ISIC_0905652,0.022936
4229,ISIC_0034316,0.973842
4230,ISIC_0906053,0.555337


In [425]:
len(prediction_train), len(train)

(4232, 39015)

In [426]:
len(prediction_val), len(val)

(6536, 6536)

In [427]:
train = pd.merge(prediction_train, train)
val = pd.merge(prediction_val, val)

## Performance without using tabular data

In [428]:
val_preds = val['target'].values
val_labels = val['label'].values

In [429]:
roc_auc_score(val_labels, val_preds)

0.9166250402835966

## Adding tabular data

In [430]:
whole_train_2020.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,0,6000,4000
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0,6000,4000
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,6,1872,1053
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,0,1872,1053
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,11,6000,4000


In [431]:
whole_train_2019.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
0,ISIC_0000000,-1,female,55.0,anterior torso,NV,benign,0,4,1022,767
1,ISIC_0000001,-1,female,30.0,anterior torso,NV,benign,0,18,1022,767
2,ISIC_0000002,-1,female,60.0,upper extremity,MEL,malignant,1,0,1022,767
3,ISIC_0000003,-1,male,30.0,upper extremity,NV,benign,0,24,1022,767
4,ISIC_0000004,-1,male,80.0,posterior torso,MEL,malignant,1,14,1022,767


In [432]:
whole_train_2020['anatom_site_general_challenge'].value_counts()

torso              16845
lower extremity     8417
upper extremity     4983
head/neck           1855
palms/soles          375
oral/genital         124
Name: anatom_site_general_challenge, dtype: int64

In [433]:
whole_train_2019['anatom_site_general_challenge'].value_counts()

anterior torso     6915
lower extremity    4990
head/neck          4587
upper extremity    2910
posterior torso    2787
palms/soles         398
oral/genital         59
lateral torso        54
Name: anatom_site_general_challenge, dtype: int64

In [434]:
# convert torso varieties to torso in 2019
whole_train_2019['anatom_site_general_challenge'] = whole_train_2019['anatom_site_general_challenge'].apply(lambda x: 'torso' if isinstance(x, str) and 'torso' in x else x)

In [435]:
whole_train_2019['anatom_site_general_challenge'].value_counts()

torso              9756
lower extremity    4990
head/neck          4587
upper extremity    2910
palms/soles         398
oral/genital         59
Name: anatom_site_general_challenge, dtype: int64

In [436]:
whole_train_2020['diagnosis'].value_counts()

unknown                               27124
nevus                                  5193
melanoma                                584
seborrheic keratosis                    135
lentigo NOS                              44
lichenoid keratosis                      37
solar lentigo                             7
cafe-au-lait macule                       1
atypical melanocytic proliferation        1
Name: diagnosis, dtype: int64

In [437]:
whole_train_2019['diagnosis'].value_counts()

NV      12875
MEL      4522
BCC      3323
BKL      2624
AK        867
SCC       628
VASC      253
DF        239
Name: diagnosis, dtype: int64

In [438]:
whole_train_data = whole_train_2019.append(whole_train_2020)

In [439]:
whole_train_data.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
0,ISIC_0000000,-1,female,55.0,torso,NV,benign,0,4,1022,767
1,ISIC_0000001,-1,female,30.0,torso,NV,benign,0,18,1022,767
2,ISIC_0000002,-1,female,60.0,upper extremity,MEL,malignant,1,0,1022,767
3,ISIC_0000003,-1,male,30.0,upper extremity,NV,benign,0,24,1022,767
4,ISIC_0000004,-1,male,80.0,torso,MEL,malignant,1,14,1022,767


In [440]:
# remove redundant target column
whole_train_data = whole_train_data.drop(columns=['target'])

In [441]:
whole_train_data.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,tfrecord,width,height
0,ISIC_0000000,-1,female,55.0,torso,NV,benign,4,1022,767
1,ISIC_0000001,-1,female,30.0,torso,NV,benign,18,1022,767
2,ISIC_0000002,-1,female,60.0,upper extremity,MEL,malignant,0,1022,767
3,ISIC_0000003,-1,male,30.0,upper extremity,NV,benign,24,1022,767
4,ISIC_0000004,-1,male,80.0,torso,MEL,malignant,14,1022,767


In [442]:
train = pd.merge(train, whole_train_data, on='image_name')

In [443]:
val = pd.merge(val, whole_train_data, on='image_name')

In [444]:
len(train), len(val)

(4232, 6536)

In [445]:
train.head()

Unnamed: 0,image_name,target,label,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,tfrecord,width,height
0,ISIC_0052212,0.067617,0,IP_2842074,female,50.0,lower extremity,nevus,benign,6,1872,1053
1,ISIC_0188432,0.508675,1,IP_0135517,female,50.0,upper extremity,melanoma,malignant,5,3264,2448
2,ISIC_0074311,0.019427,0,IP_2950485,female,40.0,lower extremity,unknown,benign,1,6000,4000
3,ISIC_0207268,0.471323,1,IP_7735373,male,55.0,torso,melanoma,malignant,3,6000,4000
4,ISIC_0074542,0.116158,0,IP_4698288,male,25.0,lower extremity,unknown,benign,14,5184,3456


In [446]:
relevant_columns = ['image_name', 'target', 'label', 'sex', 'age_approx', 'anatom_site_general_challenge']

In [447]:
train = train[relevant_columns]
val = val[relevant_columns]

In [448]:
val.head()

Unnamed: 0,image_name,target,label,sex,age_approx,anatom_site_general_challenge
0,ISIC_2637011,0.063738,0,male,45.0,head/neck
1,ISIC_0015719,0.017058,0,female,45.0,upper extremity
2,ISIC_0068279,0.075364,0,female,45.0,head/neck
3,ISIC_0074268,0.017126,0,female,55.0,upper extremity
4,ISIC_0075663,0.042783,0,female,35.0,torso


In [449]:
len(train)

4232

In [450]:
train = train.dropna().reset_index()
val = val.dropna().reset_index()

In [451]:
sex_label_encoder = OneHotEncoder(handle_unknown='ignore')
anatom_label_encoder = OneHotEncoder(handle_unknown='ignore')

In [452]:
sex_label_encoder.fit([['male'], ['female']])
anatom_label_encoder.fit(whole_train_data['anatom_site_general_challenge'].value_counts().keys().values.reshape(-1, 1));

In [453]:
train['sex'] = train['sex'].apply(lambda x: '0' if not isinstance(x, str) else x)
val['sex'] = val['sex'].apply(lambda x: '0' if not isinstance(x, str) else x)

In [454]:
train['sex'] = train['sex'].apply(lambda x: sex_label_encoder.transform([[x]]).toarray()[0]).values
val['sex'] = val['sex'].apply(lambda x: sex_label_encoder.transform([[x]]).toarray()[0]).values

In [455]:
train['anatom_site_general_challenge'] = train['anatom_site_general_challenge'].apply(lambda x: '0' if not isinstance(x, str) else x)
val['anatom_site_general_challenge'] = val['anatom_site_general_challenge'].apply(lambda x: '0' if not isinstance(x, str) else x)

In [456]:
train['anatom_site_general_challenge'] = train['anatom_site_general_challenge'].apply(lambda x: anatom_label_encoder.transform([[x]]).toarray()[0]).values
val['anatom_site_general_challenge'] = val['anatom_site_general_challenge'].apply(lambda x: anatom_label_encoder.transform([[x]]).toarray()[0]).values

In [457]:
X_train = []
y_train = []

for index in range(len(train)):
    row = train.loc[index]
    row_value = [row['target']] + row['sex'].tolist() + [row['age_approx']] + row['anatom_site_general_challenge'].tolist()
    y_train.append(row['label'])
    X_train.append(row_value)

In [458]:
np.array(X_train).shape, np.array(y_train).shape

((4018, 10), (4018,))

In [459]:
X_val = []
y_val = []

for index in range(len(val)):
    row = val.loc[index]
    row_value = [row['target']] + row['sex'].tolist() + [row['age_approx']] + row['anatom_site_general_challenge'].tolist()
    y_val.append(row['label'])
    X_val.append(row_value)

In [460]:
np.array(X_val).shape, np.array(y_val).shape

((6455, 10), (6455,))

In [461]:
X_train = np.array(X_train)
X_val = np.array(X_val)
y_train = np.array(y_train)
y_val = np.array(y_val)

In [462]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((4018, 10), (6455, 10), (4018,), (6455,))

In [463]:
ensemble_pred = X_val[:, 0]

In [498]:
model = SVC(probability=True, C=100, gamma='scale') 

In [499]:
model.fit(X_train, y_train)

SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [500]:
y_val_pred = model.predict_proba(X_val)

In [501]:
roc_auc_score(y_val, y_val_pred[:, 1])

0.9175370312950482

In [469]:
model = GradientBoostingClassifier(loss='exponential', n_estimators=100, learning_rate=0.1) 

In [470]:
model.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='exponential', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [471]:
y_val_pred = model.predict_proba(X_val)

In [472]:
ensemble_pred = np.concatenate([ensemble_pred, y_val_pred[:, 1].reshape(-1, 1)], -1)

In [473]:
roc_auc_score(y_val, y_val_pred[:, 1])

0.9150836909987978

In [474]:
model = RandomForestClassifier(n_estimators=500) 

In [475]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [476]:
y_val_pred = model.predict_proba(X_val)

In [477]:
ensemble_pred = np.concatenate([ensemble_pred, y_val_pred[:, 1].reshape(-1, 1)], -1)

In [478]:
roc_auc_score(y_val, y_val_pred[:, 1])

0.8950979432195876

In [479]:
ensemble_pred.shape

(6455, 4)

In [480]:
ensemble_pred = ensemble_pred.mean(-1)

In [481]:
roc_auc_score(y_val, ensemble_pred)

0.9159118973404922

## Run on test

In [502]:
test_2020 = pd.read_csv('/data/siim-isic-melanoma/raw/2020/test.csv')
prediction_test = pd.read_csv(join('/output', config, 'logs/evaluation/test.csv'))

In [503]:
test_2020.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge
0,ISIC_0052060,IP_3579794,male,70.0,
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity
2,ISIC_0058510,IP_7960270,female,55.0,torso
3,ISIC_0073313,IP_6375035,female,50.0,torso
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity


In [162]:
prediction_test.head()

Unnamed: 0,image_name,target
0,ISIC_0052060,0.018915
1,ISIC_0052349,0.003282
2,ISIC_0058510,0.007697
3,ISIC_0073313,0.00456
4,ISIC_0073502,0.05195
