In [1]:
from pathlib import Path
import skimage
from sklearn.model_selection import GridSearchCV
from sklearn import svm
import numpy as np
import tensorflow as tf
import cv2
import matplotlib.pyplot as plt
from joblib import load
import pandas as pd

In [18]:
from numpy.random import default_rng

# Single seeded generator shared by the notebook so random draws are repeatable.
rng = default_rng(seed=1337)

# Root folder of the train/test image sets and folder of the processed crops.
data_dir = Path("data/sets/6_999x999/")
samples_path = Path("data/processed/crop_6_1000x1000/")

# Side length (in pixels) every image is resized to before flattening.
img_width, img_height = 256, 256

In [19]:
def normalize_img(img_np):
    """Min-max rescale an OpenCV image to 32-bit floats between 0 and 1.

    The input (typically uint8, as returned by ``cv2.imread``) is left
    untouched; a new array is returned.
    """
    min_max_options = dict(
        alpha=0,                      # lower bound of the output range
        beta=1,                       # upper bound of the output range
        norm_type=cv2.NORM_MINMAX,
        dtype=cv2.CV_32F,
    )
    return cv2.normalize(src=img_np, dst=None, **min_max_options)

def load_img(img_path, color_mode, shape, img_filter):
    """Read one image from disk, resize it, scale it to [0, 1] and optionally filter it.

    Parameters
    ----------
    img_path : path-like
        Location of the image; converted with ``str()`` for ``cv2.imread``.
    color_mode : str
        ``"rgb"`` loads a 3-channel color image; any other value loads grayscale.
    shape : tuple
        Target size, passed unchanged as ``dsize`` to ``cv2.resize``.
    img_filter : callable or None
        If given, called with the normalized image and its result is used.
        For color images it runs before the BGR -> RGB channel flip.

    Returns
    -------
    The processed image array (channels in RGB order when ``color_mode == "rgb"``).

    Raises
    ------
    OSError
        If OpenCV cannot read ``img_path`` (missing file, unreadable file or
        not an image).
    """
    read_flag = cv2.IMREAD_COLOR if color_mode == "rgb" else cv2.IMREAD_GRAYSCALE
    raw_img = cv2.imread(str(img_path), read_flag)
    if raw_img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than deep inside cv2.resize.
        raise OSError(f"Could not read image: {img_path}")

    img = normalize_img(cv2.resize(raw_img, dsize=shape))
    if img_filter is not None:
        img = img_filter(img)
    if color_mode == "rgb":
        return img[:, :, ::-1]  # Converting to RGB from BGR
    return img

def load_dataset(data_dir, num_samples=50, color="rgb", shape=(128,128), sets=["train"], categories=["dc", "marvel"], img_filter=None):
    """Load images from ``data_dir/<set>/<category>/`` as flattened feature vectors.

    Parameters
    ----------
    data_dir : pathlib.Path
        Root folder containing one sub-folder per data set.
    num_samples : int or None
        Maximum number of images to load per category (and per set).
        ``None`` loads every file in the category folder.
    color : str
        Forwarded to ``load_img``: ``"rgb"`` for color, anything else for grayscale.
    shape : tuple
        Target image size, forwarded to ``load_img``.
    sets : list of str
        Names of the set sub-folders to read (e.g. ``"train"``, ``"test"``).
    categories : list of str
        Names of the category sub-folders; a category's position in this list
        is used as its integer label.
    img_filter : callable or None
        Optional per-image filter, forwarded to ``load_img``.

    Returns
    -------
    (X_flattened, y) : tuple of numpy arrays
        ``X_flattened`` holds one flattened image per row, ``y`` the matching
        integer labels.
    """
    target_arr, flat_data_arr = [], []
    for set_name in sets:  # not named `set`, which would shadow the builtin
        print("Loading data set:", set_name)
        set_path = data_dir / set_name
        for idx, category in enumerate(categories):
            print(f"Loading category : {category}")
            dir_path = set_path / category
            # iterdir() yields entries in arbitrary, OS-dependent order; sort so
            # the seeded permutation below picks the same files on every machine.
            all_files = sorted(dir_path.iterdir())
            print("Number of files:", len(all_files))
            # Always bind cat_samples; previously it was only set when the
            # request exceeded the folder size, so e.g. the default
            # num_samples=50 on a larger folder raised UnboundLocalError.
            if num_samples is None or num_samples > len(all_files):
                cat_samples = len(all_files)
            else:
                cat_samples = num_samples

            files = rng.permutation(all_files)[:cat_samples]
            print("Number of files to load:", len(files))

            category_imgs = [load_img(img_path, color, shape, img_filter).flatten() for img_path in files]
            flat_data_arr.extend(category_imgs)
            target_arr.extend([idx] * len(category_imgs))
        print("---------------------------")

    X_flattened = np.array(flat_data_arr)
    y = np.array(target_arr)
    return X_flattened, y

In [None]:
# Visualize the effects of the different preprocessing methods
test_img = cv2.resize(cv2.imread(str(data_dir / "train" / "marvel" / "marvel_134.png"), cv2.IMREAD_GRAYSCALE), dsize=(512,512))
normalized_img = normalize_img(test_img)
normalized_hist_shifted = skimage.exposure.equalize_hist(test_img)
norm_adapt_hist = skimage.exposure.equalize_adapthist(test_img)
test_imgs = [test_img, normalized_img, normalized_hist_shifted, norm_adapt_hist]
# One title per panel, in the same order as test_imgs, so each image can be
# matched to the method that produced it.
panel_titles = [
    "Original (resized)",
    "Min-max normalized",
    "Histogram equalization",
    "Adaptive histogram equalization",
]
row, col = 2, 2
fig = plt.figure(figsize=(20,20))
for i, (panel_img, panel_title) in enumerate(zip(test_imgs, panel_titles), start=1):
    ax = fig.add_subplot(row, col, i)
    ax.imshow(panel_img, cmap="gray")
    ax.set_title(panel_title)
plt.show()

In [None]:
# Load every available training image as a flattened grayscale vector
samples_train, labels_train = load_dataset(
    data_dir,
    num_samples=None,
    color="gray",
    shape=(img_width, img_height),
    img_filter=None,
)
# Shuffle samples and labels with one shared index so the categories are
# interleaved (matters when a subset is picked later)
random_idx_train = rng.choice(np.arange(labels_train.size), size=labels_train.size, replace=False)
X_train = samples_train[random_idx_train]
y_train = labels_train[random_idx_train]

In [None]:
# Loading test samples
# color="gray" matches the training cell; without it load_dataset defaults to
# RGB and the test feature vectors would be three times longer than the
# training ones.
samples_test, labels_test = load_dataset(data_dir, num_samples=None, sets=["test"], shape=(img_width, img_height), color="gray", img_filter=None)
# Shuffle samples and labels with one shared index
random_idx_test = rng.choice(np.arange(labels_test.size), labels_test.size, replace=False)
X_test, y_test = samples_test[random_idx_test], labels_test[random_idx_test]
y_ttest = y_test  # alias for the original misspelled name, kept in case other cells use it

In [24]:
# Find the best hyperparameters
# Each kernel must be its own list entry: the single string "rbf, poly" is not
# a valid SVC kernel name. "degree" only affects the "poly" kernel.
param_grid = {
    "C": [0.1, 1, 10, 100],
    "degree": [3],
    "gamma": [0.0001, 0.001, 0.01, 0.1, 1, 10],
    "kernel": ["rbf", "poly"],
}

In [19]:
# Sanity check: feature-matrix shape, label-vector shape and number of samples per class label
X_train.shape, y_train.shape, np.bincount(y_train)

((3716, 49152), (3716,))

In [32]:
default_set_sizes = [10, 25, 50, 100, 250, 500, 1000, 4000]
def parameter_evaluation(samples, labels, set_sizes=default_set_sizes):
    """Run an RBF-SVM grid search on random subsets of increasing size.

    For every entry in ``set_sizes`` a random subset of that many samples is
    drawn (without replacement, across all categories together) and a
    ``GridSearchCV`` over ``C`` and ``gamma`` is fitted on it using
    GridSearchCV's default cross-validation.

    Parameters
    ----------
    samples : numpy array
        Feature matrix, one sample per row.
    labels : numpy array
        Non-negative integer class labels aligned with ``samples``.
    set_sizes : sequence of int
        Subset sizes to evaluate. A size larger than the number of available
        samples is capped to that number instead of raising.

    Returns
    -------
    list of float
        The best cross-validation score for each entry of ``set_sizes``,
        in the same order (so it can be plotted against ``set_sizes``).
    """
    accuracies = []
    n_available = labels.size
    # The grid is the same for every subset size, so build it once.
    param_grid = {"C": [0.1, 1, 10], "gamma": [0.0001, 0.001, 0.01, 0.1, 1, 10], "kernel":["rbf"]}
    for sample_set in set_sizes:
        print("Test:", sample_set)
        # Sampling without replacement cannot draw more items than exist;
        # cap the request so an oversized entry does not raise ValueError.
        n_draw = min(sample_set, n_available)
        if n_draw < sample_set:
            print(f"Only {n_available} samples available, using all of them")
        # Use the notebook's seeded generator (not the unseeded global
        # np.random state) so the drawn subsets are reproducible.
        idx = rng.choice(np.arange(n_available), n_draw, replace=False)
        X_subset, y_subset = samples[idx], labels[idx]
        print("Label split:", np.bincount(y_subset))

        svc = svm.SVC(probability=False)
        # error_score="raise" surfaces fit failures instead of recording NaN scores.
        model = GridSearchCV(svc, param_grid, verbose=3, n_jobs=-1, error_score="raise")
        model.fit(X_subset, y_subset)
        score = model.best_score_

        print("Best score:", score)
        print("Best params", model.best_params_)
        accuracies.append(score)
    return accuracies

In [None]:
# Grid-search on the raw (flattened pixel) training data; one best score per entry of default_set_sizes
accuracies = parameter_evaluation(X_train, y_train) 

In [None]:
# Plot the best achieved accuracies as a function of the training data size
# The values are GridSearchCV best_score_ results, i.e. cross-validation
# accuracies, not accuracies on the held-out test set.
# A label is required, otherwise plt.legend() has nothing to show and warns.
plt.plot(np.array(default_set_sizes), accuracies, label="RBF SVM (best CV score)")
plt.title("Cross-validation accuracies by training data set size")
plt.xlabel('Training set size')
plt.ylabel('Accuracy')
plt.legend(loc='upper right')
# plt.savefig("svm_cross-valid_rbf_accuracy_by_size")

## Kernel SVM with PCA

In [None]:
# Assumes a saved PCA model based on 256x256 images 
# `load` is joblib.load, which unpickles the file: only load model files from a trusted source.
# NOTE(review): the file name suggests a model fitted on grayscale images with 2086 components - confirm it matches how samples_train was loaded.
# NOTE(review): the variable holds a fitted model instance, although its name looks like a class name.
PCA = load("pca_model_256x256_grayscale_2086_components.joblib")

In [None]:
# Transform the samples according to the PCA generated components
samples_train_PCA, labels_train_pca = PCA.transform(samples_train), labels_train
# Shuffle samples and labels with one shared index
random_idx_pca_train = rng.choice(np.arange(labels_train_pca.size), labels_train_pca.size, replace=False)
# Index the PCA-projected samples; indexing the raw samples_train here would
# silently run the "PCA" experiment on untransformed pixels.
X_train_pca, y_train_pca = samples_train_PCA[random_idx_pca_train], labels_train_pca[random_idx_pca_train]

In [None]:
# Same grid-search evaluation as for the raw data, now on the PCA training arrays
accuracies_PCA = parameter_evaluation(X_train_pca, y_train_pca) 

In [None]:
# Plot the best achieved PCA accuracies as a function of the training data size
# Plot accuracies_PCA (the PCA run), not `accuracies` from the raw-pixel run.
# The values are cross-validation scores; the label gives plt.legend() an entry to show.
plt.plot(np.array(default_set_sizes), accuracies_PCA, label="RBF SVM on PCA features (best CV score)")
plt.title("Cross-validation accuracies by training data set size (PCA)")
plt.xlabel('Training set size')
plt.ylabel('Accuracy')
plt.legend(loc='upper right')
# plt.savefig("svm_cross-valid_rbf_pca_accuracy_by_size")