In [1]:
from pathlib import Path, PurePath
from tensorflow.keras.preprocessing import image_dataset_from_directory
from skimage.transform import resize
from skimage.io import imread
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
from numpy.random import default_rng
rng = default_rng(1337)
samples_path = Path("data/processed/crop_6_1000x1000/")

In [8]:
categories = ["dc", "marvel"]
target_arr, flat_data_arr = [], []
number_of_samples = 100
for idx, category in enumerate(categories):
    print(f'Loading category : {category}')
    
    dir_path = samples_path / category
    # reproducable with seed set
    files = rng.permutation(list(dir_path.iterdir()))[:number_of_samples]
    
     # Anti Aliasing recomended for down scaling
    category_imgs = [resize(imread(img_path),(256,256,3), anti_aliasing=True).flatten() for img_path in files]
    flat_data_arr.extend(category_imgs)
    target_arr.extend([idx]*len(category_imgs))
    
flattened_X = np.array(flat_data_arr)
y = np.array(target_arr)

Loading category : dc
Loading category : marvel


In [9]:
X_train, X_test, y_train, y_test = train_test_split(flattened_X, y, test_size=0.2, random_state=7)

In [11]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Find the best hyper parameters -- Could also try sigmoid/linear, also could try different degrees of poly
param_grid = {'C':[0.1,1,10,100],'gamma':[0.0001,0.001,0.1,1],'kernel':['rbf','poly']}

# uses 5-fold-validation, but not necessary (there wasn't much variance in first run)
# Second without 5-fold runs better, but might be because of split
svc = svm.SVC(probability=False) 
model = GridSearchCV(svc, param_grid, verbose=3, n_jobs=-2)
model.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


GridSearchCV(estimator=SVC(), n_jobs=-2,
             param_grid={'C': [0.1, 1, 10, 100],
                         'gamma': [0.0001, 0.001, 0.1, 1],
                         'kernel': ['rbf', 'poly']},
             verbose=3)

In [12]:
import pandas as pd
results_df = pd.DataFrame(model.cv_results_)
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,23.99375,3.862507,23.018929,1.336222,0.1,0.0001,rbf,"{'C': 0.1, 'gamma': 0.0001, 'kernel': 'rbf'}",0.5,0.5,0.53125,0.53125,0.53125,0.51875,0.015309,20
1,22.94221,2.58291,4.206371,1.427359,0.1,0.0001,poly,"{'C': 0.1, 'gamma': 0.0001, 'kernel': 'poly'}",0.59375,0.59375,0.59375,0.625,0.5,0.58125,0.04239,4
2,20.940199,0.196059,21.31226,0.894461,0.1,0.001,rbf,"{'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf'}",0.5,0.5,0.53125,0.53125,0.53125,0.51875,0.015309,20
3,20.547232,0.714048,4.590426,0.388352,0.1,0.001,poly,"{'C': 0.1, 'gamma': 0.001, 'kernel': 'poly'}",0.53125,0.625,0.59375,0.59375,0.40625,0.55,0.078062,5
4,21.550017,0.513518,19.562404,0.329324,0.1,0.1,rbf,"{'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}",0.5,0.5,0.53125,0.53125,0.53125,0.51875,0.015309,20
5,21.116367,0.90879,4.724201,0.570733,0.1,0.1,poly,"{'C': 0.1, 'gamma': 0.1, 'kernel': 'poly'}",0.53125,0.625,0.59375,0.59375,0.40625,0.55,0.078062,5
6,22.604183,1.235051,20.984211,0.526172,0.1,1.0,rbf,"{'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}",0.5,0.5,0.53125,0.53125,0.53125,0.51875,0.015309,20
7,20.779512,0.512679,5.195282,1.139855,0.1,1.0,poly,"{'C': 0.1, 'gamma': 1, 'kernel': 'poly'}",0.53125,0.625,0.59375,0.59375,0.40625,0.55,0.078062,5
8,22.737138,1.244707,20.831655,1.049457,1.0,0.0001,rbf,"{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}",0.5,0.625,0.625,0.65625,0.5625,0.59375,0.055902,1
9,20.475662,0.382211,4.677055,0.150289,1.0,0.0001,poly,"{'C': 1, 'gamma': 0.0001, 'kernel': 'poly'}",0.53125,0.53125,0.625,0.53125,0.40625,0.525,0.069597,19


In [13]:
model.best_params_ 

{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}

In [15]:
model.best_score_

0.59375