In [19]:
import pandas as pd

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import (GradientBoostingClassifier, 
        RandomForestClassifier, RandomForestRegressor)
from sklearn.base import BaseEstimator, TransformerMixin

from fastai.vision import *
from fastai.callbacks.hooks import *
from fastai.utils.mem import *

import matplotlib.pyplot as plt

import xgboost as xgb

%matplotlib inline
%xmode Plain

pd.set_option('display.max_columns', 100)
np.random.seed(42)

Exception reporting mode: Plain


In [20]:
path = Config.data_path()/'melclass'
df = pd.read_csv(path/'predictions-resnet18-train.csv')

print(df.head())

     image_name  patient_id     sex  age_approx anatom_site_general_challenge  \
0  ISIC_2637011  IP_7279968    male        45.0                     head/neck   
1  ISIC_0015719  IP_3075186  female        45.0               upper extremity   
2  ISIC_0052212  IP_2842074  female        50.0               lower extremity   
3  ISIC_0068279  IP_6890425  female        45.0                     head/neck   
4  ISIC_0074268  IP_8723313  female        55.0               upper extremity   

  diagnosis benign_malignant  target      prob  modclass  
0   unknown           benign       0  0.000712         0  
1   unknown           benign       0  0.001208         0  
2     nevus           benign       0  0.000321         0  
3   unknown           benign       0  0.001085         0  
4   unknown           benign       0  0.003588         0  


In [21]:
# df.anatom_site_general_challenge.fillna('unknown', inplace=True)
# df.sex.fillna('unknown', inplace=True)
# df.age_approx.fillna(np.nanmean(df.age_approx), inplace=True)

# df.isnull().sum()

numeric_cols = ['age_approx']
factor_cols = ['anatom_site_general_challenge']
feature_cols = numeric_cols + factor_cols

In [22]:
num_pipeline = Pipeline([
    ('replacenans', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('std_scaler', StandardScaler())
    ])

cat_pipeline = Pipeline([
    ('replacenans', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    # Input the numerical pipeline object as the processor (num_pipeline) for
    # the numerical attributes (num_attribs)
    # num_attribs and cat_attribs are lists of column names
        ("num", num_pipeline, numeric_cols),
        ("cat", cat_pipeline, factor_cols)
    ])

In [23]:
X_train, X_test, y_train, y_test = train_test_split(df, df['target'].values, 
    test_size = 0.5, random_state=42)

X_proc = preprocessor.fit_transform(df[feature_cols])

In [24]:
X_train_proc = preprocessor.transform(X_train[feature_cols])
X_test_proc = preprocessor.transform(X_test[feature_cols])

In [25]:
ratio = df.shape[0] / df.shape[1]

model = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=1000, 
    seed=1337, 
    learning_rate=0.3,
    eval_metric='auc'
)

In [26]:
model.fit(X_train_proc, y_train, eval_set=[(X_test_proc, y_test)], early_stopping_rounds=15)

[0]	validation_0-auc:0.57620
Will train until validation_0-auc hasn't improved in 15 rounds.
[1]	validation_0-auc:0.64458
[2]	validation_0-auc:0.64465
[3]	validation_0-auc:0.64535
[4]	validation_0-auc:0.64526
[5]	validation_0-auc:0.64818
[6]	validation_0-auc:0.66366
[7]	validation_0-auc:0.67298
[8]	validation_0-auc:0.67495
[9]	validation_0-auc:0.67654
[10]	validation_0-auc:0.68005
[11]	validation_0-auc:0.68171
[12]	validation_0-auc:0.68209
[13]	validation_0-auc:0.67893
[14]	validation_0-auc:0.67860
[15]	validation_0-auc:0.68243
[16]	validation_0-auc:0.68231
[17]	validation_0-auc:0.68063
[18]	validation_0-auc:0.68389
[19]	validation_0-auc:0.68284
[20]	validation_0-auc:0.68305
[21]	validation_0-auc:0.68464
[22]	validation_0-auc:0.68483
[23]	validation_0-auc:0.68388
[24]	validation_0-auc:0.68336
[25]	validation_0-auc:0.68367
[26]	validation_0-auc:0.68528
[27]	validation_0-auc:0.68450
[28]	validation_0-auc:0.68325
[29]	validation_0-auc:0.68351
[30]	validation_0-auc:0.68370
[31]	validation_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.3, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=1,
              num_parallel_tree=1, random_state=1337, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, seed=1337, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [27]:
onehotnames = list(preprocessor.named_transformers_.cat['onehot'].get_feature_names())
numericnames = list(preprocessor.named_transformers_.num)
featurenames = numeric_cols + onehotnames

newfeaturenames = featurenames
for fi in range(len(factor_cols)):
    newfeaturenames = [s.replace('x%d' % fi, factor_cols[fi]) for s in newfeaturenames]

pd.DataFrame({
    'name':newfeaturenames, 
    'importance':model.feature_importances_
}).sort_values(
    'importance', 
    ascending=False
)

Unnamed: 0,name,importance
0,age_approx,0.306465
1,anatom_site_general_challenge_head/neck,0.161715
6,anatom_site_general_challenge_unknown,0.153441
2,anatom_site_general_challenge_lower extremity,0.097911
5,anatom_site_general_challenge_torso,0.085253
3,anatom_site_general_challenge_oral/genital,0.082927
7,anatom_site_general_challenge_upper extremity,0.072467
4,anatom_site_general_challenge_palms/soles,0.039822


In [28]:
dftest = pd.read_csv(path/'predictions-resnet18-test.csv')
Xtest = preprocessor.transform(dftest[feature_cols])
Xtrain_check = preprocessor.transform(df[feature_cols])

In [29]:
target = model.predict(Xtest)
target_check = model.predict(Xtrain_check)

In [30]:
pd.Series(target_check).value_counts()
# df['target'].value_counts()

0    33126
dtype: int64

In [31]:
dfsubmit = pd.DataFrame({'image_name': dftest.image_name, 'target': target})
dfsubmit.head()

Unnamed: 0,image_name,target
0,ISIC_0052060,0
1,ISIC_0052349,0
2,ISIC_0058510,0
3,ISIC_0073313,0
4,ISIC_0073502,0


In [32]:
dfsubmit.target.value_counts()

0    10982
Name: target, dtype: int64

In [None]:
import subprocess as sp
dfsubmit.to_csv(path/'submission1.csv', index=False)
p = sp.run([
    'kaggle', 'competitions', 'submit',
    'siim-isic-melanoma-classification',
    '-f', path/'submission1.csv',
    '-m', '"test submission API"'
    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
print(p.stdout.decode('utf-8'))
print(p.stderr.decode('utf-8'))

In [None]:
print(p.stdout.decode('utf-8'))

In [None]:
dfsubmit.target.value_counts()