In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.vision import *
from fastai.callbacks.hooks import *
from fastai.utils.mem import *
import pandas as pd

In [3]:
# Download the data - 106 GB !!!
# !kaggle competitions download -c siim-isic-melanoma-classification

In [4]:
# Working folder for this competition, under fastai's configured data path;
# created on first run (no error if it already exists).
path = Config.data_path()/'melclass'
path.mkdir(parents=True, exist_ok=True)

bs = 64          # passed as `bs` when the ImageDataBunch is built
imsize = 224     # passed as `size` when the ImageDataBunch is built
valid_pct = .5   # passed as `valid_pct` (validation fraction) to from_csv

# Seed numpy's global RNG before the data bunches are created.
# NOTE(review): presumably this is what makes the random train/validation
# split reproducible — confirm against the fastai version in use.
np.random.seed(42)

# Only a cell's *last* expression is echoed by Jupyter, so the path is shown
# here rather than in the middle of the cell (where it was silently ignored).
path

In [5]:
# Build the data bunch for the small labelled subset listed in subset.csv.
# File names are read from CSV column 0 and labels from CSV column 7; the
# '.jpg' suffix is appended to each file name, and images are looked up in
# the 'jpeg/train' folder under `path`.
data = ImageDataBunch.from_csv(
    path, 'jpeg/train',
    csv_labels='subset.csv', fn_col=0, label_col=7, suffix='.jpg',
    valid_pct=valid_pct,
    ds_tfms=get_transforms(), size=imsize, bs=bs,  # size is 224
)

# Normalise with the ImageNet statistics, because those are the statistics
# the pretrained model was trained with.
data = data.normalize(imagenet_stats)

In [6]:
# data.show_batch(rows=5, figsize=(7,6))

In [7]:
# Sanity check on the labels: print the class list, then show the number of
# classes computed two ways (len of the list and the data bunch's `c`).
print(data.classes)
len(data.classes),data.c

[0, 1]


(2, 2)

In [8]:
# learn = cnn_learner(data, models.resnet50, metrics=[error_rate, AUROC()])
# Could try resnet101 or 152. Also densenet etc.

In [9]:
# interp = ClassificationInterpretation.from_learner(learn)
# losses,idxs = interp.top_losses()
# len(data.valid_ds)==len(losses)==len(idxs)

In [10]:
# interp.plot_top_losses(9, figsize=(15,11))

In [11]:
# interp.plot_confusion_matrix(figsize=(12,12), dpi=60)

In [12]:
# learn.lr_find()

In [13]:
# learn.recorder.plot()

In [14]:
# learn.unfreeze()
# s = slice(1e-3/(2.6**4),1e-3)
# learn.fit_one_cycle(10, max_lr=s)
# # learn.save('melclass-stage-2-resnet50');

Note: `freeze_to` can also be used to unfreeze only the last few layer groups (e.g. `freeze_to(-2)` leaves just the last two layer groups trainable) - perhaps only relevant for NLP.

OK now continue to train overnight using the entire training data.

In [15]:
# Rebuild the data bunch from the full training CSV (train.csv), reading the
# images from the 'jpeg/train_resized' folder under `path`. File names come
# from CSV column 0 and labels from CSV column 7, as for the subset.
data = ImageDataBunch.from_csv(
    path, 'jpeg/train_resized',
    csv_labels='train.csv',  # use full training data
    fn_col=0, label_col=7, suffix='.jpg',
    valid_pct=valid_pct,
    ds_tfms=get_transforms(), size=imsize, bs=bs,  # size is 224
)

# Normalise with the ImageNet statistics, because those are the statistics
# the pretrained model was trained with.
data = data.normalize(imagenet_stats)

# ResNet-50 learner on the full data, reporting error rate and AUROC.
learn = cnn_learner(
    data,
    models.resnet50,
    metrics=[error_rate, AUROC()],
)

In [16]:
# Learning-rate range reused by both training stages: upper bound 1e-3,
# lower bound 1e-3 / 2.6**4 (about 2.19e-5).
s = slice(1e-3/(2.6**4),1e-3)
# Stage 1: a single one-cycle epoch (run before learn.unfreeze() is called).
learn.fit_one_cycle(1, s)

epoch,train_loss,valid_loss,error_rate,auroc,time
0,0.259109,0.124215,0.019079,0.619649,01:33


In [17]:
# Export the learner as it stands after the first one-cycle epoch.
# Note: create_preds loads 'resnet50-stage2.pkl', not this file.
learn.export(file = 'resnet50.pkl')

In [18]:
# Stage 2: unfreeze the learner before training further.
learn.unfreeze()

# Five more one-cycle epochs with the same learning-rate slice `s`
# (defined in the previous training cell) and wd=0.1.
learn.fit_one_cycle(5, max_lr=s, wd=1e-1)

# Persist the result twice: `save` under a checkpoint name, and `export` to
# 'resnet50-stage2.pkl', which is the file create_preds loads via load_learner.
learn.save('melclass-stage-2-resnet50-full');
learn.export(file = 'resnet50-stage2.pkl')

epoch,train_loss,valid_loss,error_rate,auroc,time
0,0.11972,0.083752,0.019501,0.774213,01:53
1,0.08917,0.086419,0.017932,0.787217,01:52
2,0.072914,0.077312,0.017932,0.853935,01:52
3,0.066137,0.068513,0.017811,0.881042,01:52
4,0.059681,0.068686,0.017932,0.879539,01:52


In [None]:
import numpy

def create_preds(folderpath, imset, model_file='resnet50-stage2.pkl'):
    """Score every image listed in a CSV with an exported learner.

    Parameters
    ----------
    folderpath : str
        Name of the image folder under ``path/'jpeg'`` (e.g. ``'train_resized'``).
    imset : str
        Name of a CSV file under ``path`` listing the images to score. It must
        contain an ``image_name`` column, which is used as the merge key.
        NOTE(review): ``ImageList.from_csv`` is called without ``cols``, so the
        file names are presumably taken from the first CSV column — confirm.
    model_file : str, optional
        File name (under ``path``) of the exported learner to load. Defaults to
        the stage-2 ResNet-50 export, which was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The rows of ``imset`` merged (pandas' default inner join on
        ``image_name``) with two extra columns: ``prob``, column 1 of the
        prediction tensor, and ``modclass``, the argmax over the prediction
        columns as int64.
    """
    # Local import: only needed here, to strip folder and extension portably.
    from pathlib import Path

    imlist = ImageList.from_csv(
        path/'jpeg'/folderpath,
        csv_name=path/imset,
        suffix='.jpg',
    )

    # Attach the images as the test set of the exported learner and predict.
    # The second return value (targets) is not used for a test set.
    learner = load_learner(path, file=model_file, test=imlist)
    preds, _ = learner.get_preds(ds_type=DatasetType.Test)
    probs = preds.numpy()

    df_preds = pd.DataFrame({
        # Path.stem drops the folder and the trailing '.jpg' regardless of the
        # OS path separator (splitting on '/' only works on POSIX paths).
        'image_name': [Path(str(item)).stem for item in imlist.items],
        'prob': probs[:, 1],
        # Vectorised row-wise argmax instead of a Python loop over the rows.
        'modclass': probs.argmax(axis=1).astype('int64'),
    })

    df = pd.read_csv(path/imset)
    return df.merge(df_preds, on='image_name')


In [20]:
# Predictions for every image listed in train.csv (images from jpeg/train_resized).
dfouttrain = create_preds('train_resized','train.csv')

In [21]:
# Predictions for every image listed in test.csv (images from jpeg/test_resized).
dfouttest = create_preds('test_resized','test.csv')

In [22]:
# Predictions for the images listed in subset.csv (images from jpeg/train_resized).
dfoutsubset = create_preds('train_resized','subset.csv')

In [29]:
# How many train.csv images fall into each predicted class (argmax).
dfouttrain.modclass.value_counts()

0    32967
1      159
Name: modclass, dtype: int64

In [24]:
# How many test.csv images fall into each predicted class (argmax).
dfouttest.modclass.value_counts()

0    10949
1       33
Name: modclass, dtype: int64

In [28]:
# How many subset.csv images fall into each predicted class (argmax).
dfoutsubset.modclass.value_counts()

0    1646
1     106
Name: modclass, dtype: int64

In [25]:
# Write the merged prediction tables next to the data, without the index column.
# The subset predictions (dfoutsubset) are not written out.
dfouttrain.to_csv(path/'predictions-resnet50-train.csv', index=False)
dfouttest.to_csv(path/'predictions-resnet50-test.csv', index=False)

In [26]:
# Eyeball the merged test-set predictions (pandas truncates the middle rows).
dfouttest

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,prob,modclass
0,ISIC_0052060,IP_3579794,male,70.0,,3.514509e-07,0
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity,2.294691e-05,0
2,ISIC_0058510,IP_7960270,female,55.0,torso,1.331852e-03,0
3,ISIC_0073313,IP_6375035,female,50.0,torso,3.192749e-06,0
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity,6.397128e-03,0
...,...,...,...,...,...,...,...
10977,ISIC_9992485,IP_4152479,male,40.0,torso,1.016208e-02,0
10978,ISIC_9996992,IP_4890115,male,35.0,torso,2.937184e-02,0
10979,ISIC_9997917,IP_2852390,male,25.0,upper extremity,5.906133e-02,0
10980,ISIC_9998234,IP_8861963,male,65.0,lower extremity,7.759741e-06,0
