In [ ]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from scipy.stats import uniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import auc, roc_curve, accuracy_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

# Target share of positive samples after downsampling. Despite the name this is
# a fraction (0.35 == 35 %): downsample_dataset divides the positive count by it.
GOAL_POSITIVE_SAMPLE_PERCENTAGE = 0.35

# Image directories, resolved relative to the parent of the working directory.
PATH_TO_TRAIN_IMAGES = os.path.join(os.pardir, 'ISIC2018_Task3_Training_Input')
PATH_TO_TEST_IMAGES = os.path.join(os.pardir, 'ISIC2018_Task3_Test_Input')

# Ground-truth CSV files; the code below reads their 'image' and 'MEL' columns.
PATH_TO_TRAIN_GROUND_TRUTH = os.path.join(os.pardir, 'ISIC2018_Task3_Training_GroundTruth.csv')
PATH_TO_TEST_GROUND_TRUTH = os.path.join(os.pardir, 'ISIC2018_Task3_Test_GroundTruth.csv')

# Size (in pixels) every image is resized to before being flattened.
OUTPUT_HEIGHT = 150
OUTPUT_WIDTH = 200

# Number of principal components kept by the PCA in reduce_dataset.
N_COMPS = 30

# Seed used for the random choice of rows removed in downsample_dataset.
RANDOM_STATE = 42


def get_flat_data(ground_truth, path_to_images, filename):
    """Load, resize and flatten every image listed in ``ground_truth``.

    Each ``<image>.jpg`` found in ``path_to_images`` is converted to RGB,
    resized to ``OUTPUT_WIDTH`` x ``OUTPUT_HEIGHT`` with Lanczos resampling,
    divided by 255 and flattened into a single row.

    Args:
        ground_truth: Table with an ``'image'`` column holding the image file
            names without the ``.jpg`` extension.
        path_to_images: Directory that contains the JPEG files.
        filename: Unused; kept so existing call sites keep working.

    Returns:
        ``np.float16`` array of shape
        ``(num_images, OUTPUT_HEIGHT * OUTPUT_WIDTH * 3)``, one row per image
        in the order of ``ground_truth['image']``.
    """
    image_names = ground_truth['image'].values
    num_images = len(image_names)

    # float16 keeps the full pixel matrix small enough to hold in memory.
    flat_data = np.empty((num_images, OUTPUT_HEIGHT * OUTPUT_WIDTH * 3), dtype=np.float16)

    for i, image_file in enumerate(image_names):
        with Image.open(os.path.join(path_to_images, image_file + '.jpg')) as img:
            # Force three channels: a grayscale, palette or RGBA file would
            # otherwise flatten to a length that does not fit the row.
            img_rgb = img.convert('RGB')
            img_resized = img_rgb.resize((OUTPUT_WIDTH, OUTPUT_HEIGHT), Image.LANCZOS)
            img_norm = np.array(img_resized) / 255
            flat_data[i] = img_norm.flatten()

        # '\r' rewrites the same console line to act as a progress counter.
        print(f'Images processed: {i+1}/{num_images}', end='\r')

    return flat_data


def reduce_dataset(dataset, filename, pca=None):
    """Project ``dataset`` onto its principal components.

    When no fitted model is supplied, a PCA with ``N_COMPS`` components is
    fitted on ``dataset`` first. The model that was used is then pickled to
    ``'pca.pkl'`` in the working directory.

    Args:
        dataset: 2-D array of samples to reduce.
        filename: Unused; kept so existing call sites keep working. The output
            path is always ``'pca.pkl'``.
        pca: Already fitted PCA to reuse (e.g. the one fitted on the training
            data), or ``None`` to fit a new one on ``dataset``.

    Returns:
        Tuple ``(dataset_reduced, pca)`` with the transformed data and the PCA
        model that produced it.
    """
    # Explicit None check: whether to fit must not depend on the truthiness of
    # whatever object the caller passes in.
    if pca is None:
        # Seeded so the fit is reproducible when scikit-learn selects a
        # randomized solver.
        pca = PCA(n_components=N_COMPS, random_state=RANDOM_STATE)
        pca.fit(dataset)

    dataset_reduced = pca.transform(dataset)

    with open('pca.pkl', 'wb') as file:
        pickle.dump(pca, file)

    return dataset_reduced, pca


def downsample_dataset(X, y, goal_positive_fraction=None, random_state=None):
    """Randomly drop non-positive rows until positives reach a target share.

    Args:
        X: Array of samples, one row per sample.
        y: 1-D array of labels aligned with ``X``; rows with ``y == 1`` are the
            positives and are never removed.
        goal_positive_fraction: Desired share of positives in the result, in
            ``(0, 1]``. Defaults to ``GOAL_POSITIVE_SAMPLE_PERCENTAGE``.
        random_state: Seed for the row selection. Defaults to ``RANDOM_STATE``.

    Returns:
        Tuple ``(X_downsampled, y_downsampled)``. If the data already contains
        at most the target number of rows, the inputs are returned unchanged.

    Raises:
        ValueError: If ``goal_positive_fraction`` is not in ``(0, 1]``.
    """
    if goal_positive_fraction is None:
        goal_positive_fraction = GOAL_POSITIVE_SAMPLE_PERCENTAGE
    if random_state is None:
        random_state = RANDOM_STATE
    if not 0 < goal_positive_fraction <= 1:
        raise ValueError('goal_positive_fraction must be in the interval (0, 1]')

    positive_count = y.sum()
    target_total = int(positive_count // goal_positive_fraction)
    surplus = X.shape[0] - target_total

    # Nothing to remove: the positives already make up at least the target
    # share. Without this guard a negative sample size reaches choice().
    if surplus <= 0:
        return X, y

    negative_indices = np.where(y != 1)[0]

    # A private RandomState yields the same draw as seeding the global
    # generator, but leaves numpy's global random state untouched.
    rng = np.random.RandomState(random_state)
    delete_indices = rng.choice(negative_indices, size=surplus, replace=False)

    X_downsampled = np.delete(X, delete_indices, axis=0)
    y_downsampled = np.delete(y, delete_indices, axis=0)

    return X_downsampled, y_downsampled

In [ ]:
# Load the training labels and build the flattened pixel matrix for the images
# they list.
train_ground_truth = pd.read_csv(PATH_TO_TRAIN_GROUND_TRUTH)
X_train = get_flat_data(train_ground_truth, PATH_TO_TRAIN_IMAGES, 'X_train.npy')
# Target vector: the 'MEL' column (presumably 0/1 - verify against the CSV).
y_train = train_ground_truth['MEL'].values

# Randomly remove rows whose label is not 1 so positives reach the target share.
X_train, y_train = downsample_dataset(X_train, y_train)

# Fit PCA on the downsampled training data; `pca` is reused for the test set.
X_train, pca = reduce_dataset(X_train, 'X_train_reduced.npy')

In [ ]:
# Build the test matrix with the same preprocessing as the training data.
test_ground_truth = pd.read_csv(PATH_TO_TEST_GROUND_TRUTH)
X_test = get_flat_data(test_ground_truth, PATH_TO_TEST_IMAGES, 'X_test.npy')
# Pass the PCA fitted on the training data so the test set is only transformed.
X_test, _ = reduce_dataset(X_test, 'X_test_reduced.npy', pca)
# The test set is not downsampled; labels come from the 'MEL' column.
y_test = test_ground_truth['MEL'].values

In [ ]:
def calc_roc_auc(model):
    """Return the ROC-AUC of ``model`` on the notebook-level test set.

    Scores are taken from the second column of ``model.predict_proba`` applied
    to the global ``X_test`` and compared against the global ``y_test``.
    """
    class_probabilities = model.predict_proba(X_test)
    positive_scores = class_probabilities[:, 1]
    return roc_auc_score(y_test, positive_scores)

In [ ]:
# Baseline: a random forest with scikit-learn's default hyper-parameters.
# Seeded with RANDOM_STATE so the reported score is reproducible between runs.
rf_default = RandomForestClassifier(random_state=RANDOM_STATE)
rf_default.fit(X_train, y_train)

print('ROC-AUC (default parameters):', calc_roc_auc(rf_default))

In [ ]:
# Accuracy of the baseline model's hard class predictions on the test set.
y_pred = rf_default.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy (default parameters): {accuracy:.2f}')

In [ ]:
# Search space for the randomized hyper-parameter search.
params = {
    'n_estimators': range(100, 401, 50),
    'max_depth': range(2, 21, 3),
    # scipy's uniform(loc, scale) samples from [loc, loc + scale] = [0.1, 1.0].
    'max_features': uniform(0.1, 0.9),
    'min_samples_split': range(2, 10),
    'min_samples_leaf': range(1, 10),
    'bootstrap': [True, False],
}

# Seed both the candidate sampling and the forests themselves so the search
# returns the same best parameters on every run.
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=RANDOM_STATE),
    params,
    n_iter=30,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
random_search.fit(X_train, y_train)

rf_best = random_search.best_estimator_

print('Best parameters:', random_search.best_params_)
print('ROC-AUC (best parameters):', calc_roc_auc(rf_best))

In [ ]:
def plot_curve(x, y, auc, ax):
    """Draw ``y`` against ``x`` as a blue line on ``ax``.

    The line is labelled with ``auc`` formatted to two decimals, and a legend
    is placed in the lower-right corner of the axes.
    """
    legend_text = 'AUC = %0.2f' % auc
    ax.plot(x, y, 'b', label=legend_text)
    ax.legend(loc='lower right')


def plot_roc_curve(model, y_true, ax):
    """Plot the ROC curve of ``model`` on ``ax``.

    Scores come from the second column of ``model.predict_proba`` applied to
    the notebook-level ``X_test``; ``y_true`` supplies the matching labels.
    A red dashed diagonal from (0, 0) to (1, 1) is drawn alongside the curve.
    """
    class_probabilities = model.predict_proba(X_test)
    fpr, tpr, _ = roc_curve(y_true, class_probabilities[:, 1])

    # Model curve first (it also creates the legend), then the diagonal.
    plot_curve(fpr, tpr, auc(fpr, tpr), ax)
    ax.plot([0, 1], [0, 1], 'r--')

    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (best model)')

In [ ]:
# ROC curve of the tuned model on the test set.
fig, ax = plt.subplots()
plot_roc_curve(rf_best, y_test, ax)

# Accuracy of the tuned model's hard class predictions on the test set.
y_pred = rf_best.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy (best model): {accuracy:.2f}')

In [ ]:
# Persist the tuned model to 'rf_model.pkl' in the working directory.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
with open('rf_model.pkl', 'wb') as file:
    pickle.dump(rf_best, file)