In [35]:
import sys
from os.path import join
import warnings
warnings.simplefilter('ignore')

import numpy as np
import torch
from tqdm import tqdm

from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from coreml.config import Config
from coreml.data.dataloader import get_dataloader
from coreml.utils.io import read_yml
from coreml.decomposition import PCA
from coreml.sklearn import SVM

In [2]:
config = Config('competitions/2020/melanoma-classification/configs/pca-svm.yml')

In [3]:
dataloader, dataset = get_dataloader(
        config.data, 'train',
        config.model['batch_size'],
        num_workers=10,
        shuffle=False,
        drop_last=False)

[33m=> Loading dataset version file: [siim-isic-melanoma, v1.0-200x200, train][0m


Loading items: 100%|██████████| 26377/26377 [00:00<00:00, 862663.60it/s]


In [4]:
iterator = tqdm(dataloader)
train_signals = []
train_labels = []

for batch in iterator:
    train_signals.append(batch['signals'])
    train_labels.append(batch['labels'])

100%|██████████| 207/207 [00:16<00:00, 12.91it/s]


In [5]:
train_signals = torch.cat(train_signals)

In [6]:
train_labels = torch.cat(train_labels)

In [7]:
train_signals.shape, train_labels.shape

(torch.Size([26377, 3, 50, 50]), torch.Size([26377]))

In [8]:
dataloader, dataset = get_dataloader(
        config.data, 'val',
        config.model['batch_size'],
        num_workers=10,
        shuffle=False,
        drop_last=False)

[33m=> Loading dataset version file: [siim-isic-melanoma, v1.0-200x200, val][0m


Loading items: 100%|██████████| 6749/6749 [00:00<00:00, 812752.53it/s]


In [9]:
iterator = tqdm(dataloader)
val_signals = []
val_labels = []

for batch in iterator:
    val_signals.append(batch['signals'])
    val_labels.append(batch['labels'])

100%|██████████| 53/53 [00:03<00:00, 16.94it/s]


In [10]:
val_signals = torch.cat(val_signals)

In [11]:
val_labels = torch.cat(val_labels)

In [12]:
val_signals.shape, val_labels.shape

(torch.Size([6749, 3, 50, 50]), torch.Size([6749]))

In [13]:
dataloader, dataset = get_dataloader(
        config.data, 'test',
        config.model['batch_size'],
        num_workers=10,
        shuffle=False,
        drop_last=False)

[33m=> Loading dataset version file: [siim-isic-melanoma, v1.0-200x200, test][0m


Loading items: 100%|██████████| 10982/10982 [00:00<00:00, 854548.00it/s]


In [14]:
iterator = tqdm(dataloader)
test_signals = []

for batch in iterator:
    test_signals.append(batch['signals'])

100%|██████████| 86/86 [00:04<00:00, 18.14it/s]


In [15]:
test_signals = torch.cat(test_signals)

In [16]:
test_signals.shape

torch.Size([10982, 3, 50, 50])

In [17]:
train_signals = train_signals.reshape(train_signals.shape[0], -1)
val_signals = val_signals.reshape(val_signals.shape[0], -1)
test_signals = test_signals.reshape(test_signals.shape[0], -1)

In [18]:
train_signals.shape, val_signals.shape, test_signals.shape, 

(torch.Size([26377, 7500]),
 torch.Size([6749, 7500]),
 torch.Size([10982, 7500]))

In [19]:
all_signals = torch.cat([train_signals, val_signals, test_signals], dim=0)

In [21]:
all_signals.shape

torch.Size([44108, 7500])

In [22]:
pca = PCA(2)

In [23]:
pca.fit(train_signals)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [24]:
all_data = pca.transform(all_signals)

In [25]:
all_data.shape

(44108, 2)

In [26]:
train_data = all_data[:len(train_signals)]
val_data = all_data[len(train_signals): len(train_signals) + len(val_signals)]
test_data = all_data[-len(test_signals):]

In [27]:
train_data.shape, val_data.shape, test_data.shape

((26377, 2), (6749, 2), (10982, 2))

In [37]:
svm = SVC(probability=True)

In [38]:
svm

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [39]:
svm.fit(train_data, train_labels)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [40]:
val_predictions = svm.predict_proba(val_data)

In [41]:
val_predictions.shape, val_labels.shape

((6749, 2), torch.Size([6749]))

In [42]:
roc_auc_score(val_labels, val_predictions[:, 1])

0.5921723965413079