In [None]:
!git clone https://github.com/anminhhung/small_dog_cat_dataset

Cloning into 'small_dog_cat_dataset'...
remote: Enumerating objects: 2608, done.[K
remote: Total 2608 (delta 0), reused 0 (delta 0), pack-reused 2608[K
Receiving objects: 100% (2608/2608), 55.84 MiB | 35.32 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [None]:
!pip install ipython-autotime
%load_ext autotime

Đối với kỹ thuật Averaging, sẽ không yêu cầu chúng ta phải xây dựng một pipeline quá phức tạp, chúng ta chỉ cần một số lượng dữ liệu nhất định sau đó tạo ra nhiều model để có thể áp dụng Averaging. Nhiều model ở đây chúng ta có thể lựa chọn train nhiều model khác nhau trên cùng bộ dataset có sẵn như Logistic Regression, Softmax Regression, SVM, RandomForest, KNN trên bài toán classification. Ở trong file này, mình sẽ hướng dẫn các bạn sử dụng kỹ thuật đã học ở buổi trước là k-fold validation để training trên nhiều model sau đó áp dụng kỹ thuật Averaging để tạo cơ hội tìm ra được lời giải tốt nhất trên bộ dữ liệu của chúng ta.

In [1]:
import glob 
import os 
import cv2 
import numpy as np 

from skimage.feature import hog

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
from sklearn.model_selection import KFold

In [None]:
def read_file(path, target_size=(64, 64)):
    datas = []
    label = []

    for category in os.listdir(path):
     category_dir = os.path.join(path, category)
     for image_name in os.listdir(category_dir):
        image_path = os.path.join(category_dir, image_name)
        image = cv2.imread(image_path)
        image = cv2.resize(image, target_size)
        datas.append(image)
        label.append(category)

    return np.array(datas), np.array(label)

time: 864 µs (started: 2023-06-08 03:20:26 +00:00)


In [None]:
train_dir = 'small_dog_cat_dataset/train/'
test_dir = 'small_dog_cat_dataset/test/'
target_size = (64,64)

train_data, train_label = read_file(train_dir, target_size)
test_data, test_label = read_file(test_dir, target_size)

time: 6.37 s (started: 2023-06-08 03:21:03 +00:00)


In [None]:
len(train_data), len(train_label), len(test_data), len(test_label)

(2000, 2000, 600, 600)

time: 3.05 ms (started: 2023-06-08 03:21:35 +00:00)


In [None]:
# feature selection 
def hog_feature(data):
   hog_gray_features = []

   for image in data:
      gray_image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
      hog_features, _ = hog(gray_image, visualize=True,
                               block_norm='L2-Hys',
                               pixels_per_cell=(16, 16),
                                cells_per_block=(2, 2))
      hog_gray_features.append(hog_features)

   return np.array(hog_gray_features)

time: 674 µs (started: 2023-06-08 03:22:06 +00:00)


In [None]:
train_data = hog_feature(train_data)
test_data = hog_feature(test_data)

time: 11.9 s (started: 2023-06-08 03:22:06 +00:00)


In [None]:
# K-fold validation 
kf = KFold(n_splits=5, shuffle=True, random_state=0)

time: 534 µs (started: 2023-06-08 03:27:42 +00:00)


In [None]:
model_1 = SVC(probability=True, random_state=0)
model_2 = RandomForestClassifier(random_state=0)
model_3 = KNeighborsClassifier()

time: 1.32 ms (started: 2023-06-08 03:27:46 +00:00)


In [None]:
scores_val = []
scores_test = []

for k, (train_index, test_index) in enumerate(kf.split(train_data)):
  model_1.fit(train_data[train_index, :], train_label[train_index])
  model_2.fit(train_data[train_index, :], train_label[train_index])
  model_3.fit(train_data[train_index, :], train_label[train_index])

  proba_val = np.stack(
          [model_1.predict_proba(train_data[test_index, :])[:, 1],
          model_2.predict_proba(train_data[test_index, :])[:, 1],
          model_3.predict_proba(train_data[test_index, :])[:, 1]]
        ).T

  arithmetic_val = proba_val.mean(axis=1)
  ras_val = roc_auc_score(y_true=train_label[test_index], y_score=arithmetic_val)

  scores_val.append(ras_val)

  # evaluation 
  proba_test = np.stack(
          [model_1.predict_proba(test_data)[:, 1],
          model_2.predict_proba(test_data)[:, 1],
          model_3.predict_proba(test_data)[:, 1]]
        ).T
  
  arithmetic_test = proba_test.mean(axis=1)
  ras_test = roc_auc_score(y_true=test_label, y_score=arithmetic_test)
  scores_test.append(ras_test)

  print("fold: ", k)
  print(f"Mean averaging ROC-AUC in val is: {ras_val:0.5f}")
  print(f"Mean averaging ROC-AUC in test is: {ras_test:0.5f}")
  print("\n")

print(f"CV Mean averaging ROC-AUC in val is: {np.mean(scores_val):0.5f}")
print(f"CV Mean averaging ROC-AUC in test is: {np.mean(scores_test):0.5f}")

fold:  0
Mean averaging ROC-AUC in val is: 0.79185
Mean averaging ROC-AUC in test is: 0.79324


fold:  1
Mean averaging ROC-AUC in val is: 0.77943
Mean averaging ROC-AUC in test is: 0.79470


fold:  2
Mean averaging ROC-AUC in val is: 0.81804
Mean averaging ROC-AUC in test is: 0.79597


fold:  3
Mean averaging ROC-AUC in val is: 0.79968
Mean averaging ROC-AUC in test is: 0.78720


fold:  4
Mean averaging ROC-AUC in val is: 0.79238
Mean averaging ROC-AUC in test is: 0.77806


CV Mean averaging ROC-AUC in val is: 0.79627
CV Mean averaging ROC-AUC in test is: 0.78983
time: 24.3 s (started: 2023-06-08 03:43:03 +00:00)
