In [1]:
!git clone https://github.com/anminhhung/small_dog_cat_dataset

Cloning into 'small_dog_cat_dataset'...
remote: Enumerating objects: 2608, done.[K
remote: Total 2608 (delta 0), reused 0 (delta 0), pack-reused 2608[K
Receiving objects: 100% (2608/2608), 55.84 MiB | 21.41 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [None]:
!pip install ipython-autotime
%load_ext autotime

In [3]:
import glob 
import os 
import cv2 
import numpy as np 

from skimage.feature import hog

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
from sklearn.model_selection import KFold

time: 2.39 s (started: 2023-06-08 03:45:30 +00:00)


Trong một số trường hợp, quá trình đánh giá của chúng ta sẽ dựa vào ROC-AUC score, vì vậy nếu chỉ đơn giản là sử dụng Averaging thì vẫn chưa đủ. Lý do là vì các model khác nhau có thể sẽ sử dụng các chiến lược optimize khác nhau vì vậy output cũng sẽ cho ra kết quả khác nhau. Nên chúng ta cần chuyển xác suất output thì rank và tính trung bình trên rank đó, tại đây chúng ta sử dụng min-max scaler, để convert output model về 0-1, và sau đó sử dụng averaging. 

In [4]:
def read_file(path, target_size=(64, 64)):
    datas = []
    label = []

    for category in os.listdir(path):
     category_dir = os.path.join(path, category)
     for image_name in os.listdir(category_dir):
        image_path = os.path.join(category_dir, image_name)
        image = cv2.imread(image_path)
        image = cv2.resize(image, target_size)
        datas.append(image)
        label.append(category)

    return np.array(datas), np.array(label)

time: 6.79 ms (started: 2023-06-08 03:51:56 +00:00)


In [5]:
train_dir = 'small_dog_cat_dataset/train/'
test_dir = 'small_dog_cat_dataset/test/'
target_size = (64,64)

train_data, train_label = read_file(train_dir, target_size)
test_data, test_label = read_file(test_dir, target_size)

time: 5.79 s (started: 2023-06-08 03:52:00 +00:00)


In [6]:
len(train_data), len(train_label), len(test_data), len(test_label)

(2000, 2000, 600, 600)

time: 9.61 ms (started: 2023-06-08 03:52:06 +00:00)


In [7]:
# feature selection 
def hog_feature(data):
   hog_gray_features = []

   for image in data:
      gray_image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
      hog_features, _ = hog(gray_image, visualize=True,
                               block_norm='L2-Hys',
                               pixels_per_cell=(16, 16),
                                cells_per_block=(2, 2))
      hog_gray_features.append(hog_features)

   return np.array(hog_gray_features)

time: 984 µs (started: 2023-06-08 03:52:11 +00:00)


In [8]:
train_data = hog_feature(train_data)
test_data = hog_feature(test_data)

time: 12.5 s (started: 2023-06-08 03:52:15 +00:00)


In [9]:
# K-fold validation 
kf = KFold(n_splits=5, shuffle=True, random_state=0)

time: 534 µs (started: 2023-06-08 03:52:28 +00:00)


In [10]:
model_1 = SVC(probability=True, random_state=0)
model_2 = RandomForestClassifier(random_state=0)
model_3 = KNeighborsClassifier()

time: 788 µs (started: 2023-06-08 03:52:28 +00:00)


In [11]:
scores_val = []
scores_test = []

for k, (train_index, test_index) in enumerate(kf.split(train_data)):
  model_1.fit(train_data[train_index, :], train_label[train_index])
  model_2.fit(train_data[train_index, :], train_label[train_index])
  model_3.fit(train_data[train_index, :], train_label[train_index])

  proba_val = np.stack(
          [model_1.predict_proba(train_data[test_index, :])[:, 1],
          model_2.predict_proba(train_data[test_index, :])[:, 1],
          model_3.predict_proba(train_data[test_index, :])[:, 1]]
        ).T

  arithmetic_val = MinMaxScaler().fit_transform(proba_val).mean(axis=1)
  ras_val = roc_auc_score(y_true=train_label[test_index], y_score=arithmetic_val)

  scores_val.append(ras_val)

  # evaluation 
  proba_test = np.stack(
          [model_1.predict_proba(test_data)[:, 1],
          model_2.predict_proba(test_data)[:, 1],
          model_3.predict_proba(test_data)[:, 1]]
        ).T
  
  arithmetic_test = MinMaxScaler().fit_transform(proba_test).mean(axis=1)
  ras_test = roc_auc_score(y_true=test_label, y_score=arithmetic_test)
  scores_test.append(ras_test)

  print("fold: ", k)
  print(f"Mean averaging ROC-AUC in val is: {ras_val:0.5f}")
  print(f"Mean averaging ROC-AUC in test is: {ras_test:0.5f}")
  print("\n")

print(f"CV Mean averaging ROC-AUC in val is: {np.mean(scores_val):0.5f}")
print(f"CV Mean averaging ROC-AUC in test is: {np.mean(scores_test):0.5f}")

fold:  0
Mean averaging ROC-AUC in val is: 0.79280
Mean averaging ROC-AUC in test is: 0.79418


fold:  1
Mean averaging ROC-AUC in val is: 0.78156
Mean averaging ROC-AUC in test is: 0.79582


fold:  2
Mean averaging ROC-AUC in val is: 0.81826
Mean averaging ROC-AUC in test is: 0.79792


fold:  3
Mean averaging ROC-AUC in val is: 0.80043
Mean averaging ROC-AUC in test is: 0.78932


fold:  4
Mean averaging ROC-AUC in val is: 0.79341
Mean averaging ROC-AUC in test is: 0.77935


CV Mean averaging ROC-AUC in val is: 0.79729
CV Mean averaging ROC-AUC in test is: 0.79132
time: 26.9 s (started: 2023-06-08 03:53:47 +00:00)
