In [17]:
import pandas as pd

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import (GradientBoostingClassifier, 
        RandomForestClassifier, RandomForestRegressor)
from sklearn.base import BaseEstimator, TransformerMixin

from fastai.vision import *
from fastai.callbacks.hooks import *
from fastai.utils.mem import *

import matplotlib.pyplot as plt

import xgboost as xgb

%matplotlib inline
%xmode Plain

pd.set_option('display.max_columns', 100)

Exception reporting mode: Plain


In [18]:
path = Config.data_path()/'melclass'
df = pd.read_csv(path/'predictions-resnet50-full-train-subset.csv')

print(df.head())
df.anatom_site_general_challenge.fillna('unknown', inplace=True)
df.sex.fillna('unknown', inplace=True)
df.age_approx.fillna(np.nanmean(df.age_approx), inplace=True)

df.isnull().sum()

numeric_cols = ['age_approx', 'prob']
factor_cols = ['sex','anatom_site_general_challenge']
feature_cols = numeric_cols + factor_cols

     image_name  patient_id     sex  age_approx anatom_site_general_challenge  \
0  ISIC_0355023  IP_9623174    male        45.0               lower extremity   
1  ISIC_7508527  IP_5945442  female        35.0                         torso   
2  ISIC_2541969  IP_8039381    male        65.0               upper extremity   
3  ISIC_1495182  IP_0621614    male        40.0               lower extremity   
4  ISIC_6845907  IP_6175417  female        45.0                  oral/genital   

  diagnosis benign_malignant  target      prob  modclass  
0   unknown           benign       0  0.000303         0  
1     nevus           benign       0  0.001300         0  
2   unknown           benign       0  0.000424         0  
3   unknown           benign       0  0.002465         0  
4   unknown           benign       0  0.000464         0  


In [19]:
num_pipeline = Pipeline([
    ('replacenans', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('std_scaler', StandardScaler())
    ])

cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    # Input the numerical pipeline object as the processor (num_pipeline) for
    # the numerical attributes (num_attribs)
    # num_attribs and cat_attribs are lists of column names
        ("num", num_pipeline, numeric_cols),
        ("cat", cat_pipeline, factor_cols)
    ])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df, df['target'].values, 
    test_size = 0.5, random_state=42)

X_proc = preprocessor.fit_transform(df[feature_cols])

In [21]:
X_train_proc = preprocessor.transform(X_train[feature_cols])
X_test_proc = preprocessor.transform(X_test[feature_cols])

In [22]:
ratio = df.shape[0] / df.shape[1]

model = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=1000, 
    seed=1337, 
    learning_rate=0.3,
    objective='binary:logistic', 
    eval_metric='auc',
    scale_pos_weight = ratio,
    n_jobs = 1
)

In [23]:
model.fit(X_train_proc, y_train, eval_set=[(X_test_proc, y_test)], early_stopping_rounds=15)

[0]	validation_0-auc:0.85550
Will train until validation_0-auc hasn't improved in 15 rounds.
[1]	validation_0-auc:0.92117
[2]	validation_0-auc:0.92163
[3]	validation_0-auc:0.92095
[4]	validation_0-auc:0.91968
[5]	validation_0-auc:0.91889
[6]	validation_0-auc:0.91546
[7]	validation_0-auc:0.91354
[8]	validation_0-auc:0.91442
[9]	validation_0-auc:0.92905
[10]	validation_0-auc:0.93025
[11]	validation_0-auc:0.93091
[12]	validation_0-auc:0.93180
[13]	validation_0-auc:0.93211
[14]	validation_0-auc:0.93239
[15]	validation_0-auc:0.93244
[16]	validation_0-auc:0.93257
[17]	validation_0-auc:0.93175
[18]	validation_0-auc:0.93151
[19]	validation_0-auc:0.93216
[20]	validation_0-auc:0.93047
[21]	validation_0-auc:0.92883
[22]	validation_0-auc:0.92935
[23]	validation_0-auc:0.92966
[24]	validation_0-auc:0.93042
[25]	validation_0-auc:0.92982
[26]	validation_0-auc:0.92992
[27]	validation_0-auc:0.93004
[28]	validation_0-auc:0.92983
[29]	validation_0-auc:0.92849
[30]	validation_0-auc:0.92875
[31]	validation_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.3, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=1,
              num_parallel_tree=1, random_state=1337, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=87.6, seed=1337, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
model.feature_importances_

array([0.339264, 0.458768, 0.      , 0.      , 0.      , 0.      , 0.086046, 0.      , 0.      , 0.059345, 0.      ,
       0.056577], dtype=float32)

In [25]:
onehotnames = list(preprocessor.named_transformers_.cat['onehot'].get_feature_names())
numericnames = list(preprocessor.named_transformers_.num)
featurenames = numeric_cols + onehotnames

newfeaturenames = featurenames
for fi in range(len(factor_cols)):
    newfeaturenames = [s.replace('x%d' % fi, factor_cols[fi]) for s in newfeaturenames]
newfeaturenames

['age_approx',
 'prob',
 'sex_female',
 'sex_male',
 'sex_unknown',
 'anatom_site_general_challenge_head/neck',
 'anatom_site_general_challenge_lower extremity',
 'anatom_site_general_challenge_oral/genital',
 'anatom_site_general_challenge_palms/soles',
 'anatom_site_general_challenge_torso',
 'anatom_site_general_challenge_unknown',
 'anatom_site_general_challenge_upper extremity']

In [26]:
pd.DataFrame({'name':newfeaturenames, 'importance':model.feature_importances_}).sort_values('importance', ascending=False)

Unnamed: 0,name,importance
1,prob,0.458768
0,age_approx,0.339264
6,anatom_site_general_challenge_lower extremity,0.086046
9,anatom_site_general_challenge_torso,0.059345
11,anatom_site_general_challenge_upper extremity,0.056577
2,sex_female,0.0
3,sex_male,0.0
4,sex_unknown,0.0
5,anatom_site_general_challenge_head/neck,0.0
7,anatom_site_general_challenge_oral/genital,0.0
