In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Image Size Check

In [None]:
from PIL import Image
import os

def calculate_image_size_statistics(dataset_folders):
    min_width = float('inf')
    min_height = float('inf')
    max_width = 0
    max_height = 0
    total_width = 0
    total_height = 0
    total_images = 0

    for folder in dataset_folders:
        for filename in os.listdir(folder):
            image_path = os.path.join(folder, filename)

            try:
                # 이미지 열기
                img = Image.open(image_path)

                # 이미지 크기 측정
                width, height = img.size
                total_width += width
                total_height += height
                total_images += 1

                # 최소 및 최대 크기 업데이트
                min_width = min(min_width, width)
                min_height = min(min_height, height)
                max_width = max(max_width, width)
                max_height = max(max_height, height)

            except Exception as e:
                print(f"Error processing {image_path}: {e}")

    # 이미지 평균 크기 계산
    average_width = total_width / total_images
    average_height = total_height / total_images

    return {
        'average_width': average_width,
        'average_height': average_height,
        'min_width': min_width,
        'min_height': min_height,
        'max_width': max_width,
        'max_height': max_height
    }

# 데이터셋 폴더 경로
dataset_folders = [
    "/content/drive/MyDrive/Colab Notebooks/finalproject_dataset/disease",
    "/content/drive/MyDrive/Colab Notebooks/finalproject_dataset/healthy"
]

# 이미지 크기 통계 계산
image_size_statistics = calculate_image_size_statistics(dataset_folders)

# 결과 출력
print(f"전체 데이터셋의 평균 이미지 폭: {image_size_statistics['average_width']}")
print(f"전체 데이터셋의 평균 이미지 높이: {image_size_statistics['average_height']}")
print(f"전체 데이터셋의 최소 이미지 폭: {image_size_statistics['min_width']}")
print(f"전체 데이터셋의 최소 이미지 높이: {image_size_statistics['min_height']}")
print(f"전체 데이터셋의 최대 이미지 폭: {image_size_statistics['max_width']}")
print(f"전체 데이터셋의 최대 이미지 높이: {image_size_statistics['max_height']}")




전체 데이터셋의 평균 이미지 폭: 1391.0697674418604
전체 데이터셋의 평균 이미지 높이: 1531.1834625322997
전체 데이터셋의 최소 이미지 폭: 152
전체 데이터셋의 최소 이미지 높이: 152
전체 데이터셋의 최대 이미지 폭: 13824
전체 데이터셋의 최대 이미지 높이: 10260


# Main

In [None]:
import os
import cv2
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import random
import numpy as np

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from keras.optimizers import AdamW
from keras.regularizers import l2

from keras.callbacks import EarlyStopping


In [None]:
# 증강x, baseline
def load_images_and_labels(data_dir):
    images = []
    labels = []

    for folder_name in os.listdir(data_dir):
        folder_path = os.path.join(data_dir, folder_name)
        if os.path.isdir(folder_path):
            for filename in os.listdir(folder_path):
                img_path = os.path.join(folder_path, filename)
                img = cv2.imread(img_path)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # 이미지를 RGB 형식으로 변환
                img = cv2.resize(img, (128, 128), interpolation=cv2.INTER_LINEAR)  # 이미지 크기 조절 / 보간법 사용

                # 이미지 데이터를 리스트에 추가
                images.append(img)
                # 레이블을 리스트에 추가
                labels.append(folder_name)

    return shuffle(images, labels, random_state=42)


In [None]:
# 랜덤 증강o

def load_images_and_labels(data_dir):
    images = []
    labels = []

    # 랜덤으로 선택할 증강 리스트
    augmentation_options = [
        {'name': 'rotation', 'param': 40},
        {'name': 'width_shift', 'param': 0.2},
        {'name': 'height_shift', 'param': 0.2},
        {'name': 'shear', 'param': 0.2},
        {'name': 'zoom', 'param': 0.2},
        {'name': 'horizontal_flip', 'param': True},
    ]

    # 데이터 증강을 위한 ImageDataGenerator 정의
    datagen = ImageDataGenerator()

    for folder_name in os.listdir(data_dir):
        folder_path = os.path.join(data_dir, folder_name)
        if os.path.isdir(folder_path):
            for filename in os.listdir(folder_path):
                img_path = os.path.join(folder_path, filename)
                img = cv2.imread(img_path)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img = cv2.resize(img, (128, 128), interpolation=cv2.INTER_LINEAR)

                # 원본 이미지를 리스트에 추가
                images.append(img)
                labels.append(folder_name)

                # 랜덤으로 선택한 증강 옵션을 적용
                selected_augmentations = random.sample(augmentation_options, k=random.randint(0, len(augmentation_options)))
                for aug_option in selected_augmentations:
                    if aug_option['name'] == 'rotation':
                        img = ImageDataGenerator(rotation_range=aug_option['param']).random_transform(img)
                    elif aug_option['name'] == 'width_shift':
                        img = ImageDataGenerator(width_shift_range=aug_option['param']).random_transform(img)
                    elif aug_option['name'] == 'height_shift':
                        img = ImageDataGenerator(height_shift_range=aug_option['param']).random_transform(img)
                    elif aug_option['name'] == 'shear':
                        img = ImageDataGenerator(shear_range=aug_option['param']).random_transform(img)
                    elif aug_option['name'] == 'zoom':
                        img = ImageDataGenerator(zoom_range=aug_option['param']).random_transform(img)
                    elif aug_option['name'] == 'horizontal_flip':
                        img = ImageDataGenerator(horizontal_flip=aug_option['param']).random_transform(img)

                    # 증강된 이미지를 리스트에 추가
                    images.append(img)
                    labels.append(folder_name)
    return shuffle(images, labels, random_state=42)

# Class Weight



In [None]:
# 데이터 불러오기
data_dir = "/content/drive/MyDrive/Colab Notebooks/finalproject_dataset"
X, y = load_images_and_labels(data_dir)

# 레이블 인코딩
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = to_categorical(y, num_classes=2)  # 클래스 개수에 따라 수정

# 이미지 데이터를 전처리
X = np.array(X)
y = np.array(y)

# K-fold 교차 검증을 위한 K 값 설정
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# 모델 생성 함수
def create_vgg16_model():
    model = Sequential()

    # Block 1
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(128, 128, 3)))
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 2
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 3
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 4
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 5
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))
    model.add(Dropout(0.5))

    # Classification block
    model.add(Flatten())
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer=AdamW(learning_rate=0.0001), metrics=['accuracy'])

    return model


# 전체 k-fold에 대한 accuracy를 저장할 리스트
all_accuracies = []

# K-fold 교차 검증 수행
for train_index, test_index in k_fold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = create_vgg16_model()

    class_weight = {0: 0.15, 1: 0.85}

    # 모델 훈련
    model.fit(X_train, y_train, epochs=50, batch_size=128, validation_data=(X_test, y_test), class_weight=class_weight)

    # 모델 평가
    eval_result = model.evaluate(X_test, y_test)
    print(f"Test Loss: {eval_result[0]}, Test Accuracy: {eval_result[1]}")

    # 평가 결과를 리스트에 추가
    all_accuracies.append(eval_result[1])

# 전체 k-fold에 대한 평균 accuracy 계산
average_accuracy = np.mean(all_accuracies)
print(f"\nAverage Accuracy Across All Folds: {average_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 0.14566974341869354, Test Accuracy: 0.9675324559211731
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoc

# UpSampling by SMOTE



undersampling 하지 않은 이유 -> 안 그래도 적은 데이터에 대해 데이터 수를 더 줄이는 것은 옳지 않다고 판단

overfitting 가능성 증가 -> epoch 줄임

In [None]:
import imblearn
import matplotlib.pyplot as plt
from numpy import where
from imblearn.over_sampling import SMOTE
from collections import Counter

Epoch 50 -> 30

In [None]:
# 데이터 불러오기
data_dir = "/content/drive/MyDrive/Colab Notebooks/finalproject_dataset"
X, y = load_images_and_labels(data_dir)

# 레이블 인코딩
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = to_categorical(y, num_classes=2)  # 클래스 개수에 따라 수정

# 이미지 데이터를 전처리
# Flatten each image if needed
X = np.array(X)
y = np.array(y)

X_flatten = X.reshape(X.shape[0], -1)

# Apply SMOTE
smt = SMOTE()
X_new, y_new = smt.fit_resample(X_flatten, y)

X_new_reshaped = X_new.reshape(X_new.shape[0], 128, 128, 3)

# K-fold 교차 검증을 위한 K 값 설정
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# 모델 생성 함수
def create_vgg16_model():
    model = Sequential()

    # Block 1
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(128, 128, 3)))
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 2
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 3
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 4
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 5
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))
    model.add(Dropout(0.5))

    # Classification block
    model.add(Flatten())
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer=AdamW(learning_rate=0.0001), metrics=['accuracy'])

    return model


# 전체 k-fold에 대한 accuracy를 저장할 리스트
all_accuracies = []

# K-fold 교차 검증 수행
for train_index, test_index in k_fold.split(X_new_reshaped):
    X_train, X_test = X_new_reshaped[train_index], X_new_reshaped[test_index]
    y_train, y_test = y_new[train_index], y_new[test_index]

    model = create_vgg16_model()


    # 모델 훈련
    model.fit(X_train, y_train, epochs=30, batch_size=128, validation_data=(X_test, y_test)) # 오버피팅 가능성이 있기 때문에 epoch 줄임

    # 모델 평가
    eval_result = model.evaluate(X_test, y_test)
    print(f"Test Loss: {eval_result[0]}, Test Accuracy: {eval_result[1]}")

    # 평가 결과를 리스트에 추가
    all_accuracies.append(eval_result[1])

# 전체 k-fold에 대한 평균 accuracy 계산
average_accuracy = np.mean(all_accuracies)
print(f"\nAverage Accuracy Across All Folds: {average_accuracy}")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Loss: 0.09855705499649048, Test Accuracy: 0.9750000238418579
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Loss: 0.05254218727350235, Test Accuracy: 0.9818181991577148
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 1

BatchNormalization O / Epoch 30

In [None]:
# 데이터 불러오기
data_dir = "/content/drive/MyDrive/Colab Notebooks/finalproject_dataset"
X, y = load_images_and_labels(data_dir)

# 레이블 인코딩
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = to_categorical(y, num_classes=2)  # 클래스 개수에 따라 수정

# 이미지 데이터를 전처리
# Flatten each image if needed
X = np.array(X)
y = np.array(y)

X_flatten = X.reshape(X.shape[0], -1)

# Apply SMOTE
smt = SMOTE()
X_new, y_new = smt.fit_resample(X_flatten, y)

X_new_reshaped = X_new.reshape(X_new.shape[0], 128, 128, 3)

# K-fold 교차 검증을 위한 K 값 설정
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)


# 모델 생성 함수
def create_vgg16_model_with_upsampling():
    model = Sequential()

    # Block 1
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(128, 128, 3)))
    model.add(BatchNormalization())
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 2
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 3
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 4
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 5
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))
    model.add(Dropout(0.5))  # Adding Dropout with a dropout rate of 0.5

    # Classification block
    model.add(Flatten())
    model.add(Dense(4096, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))  # Adding Dropout with a dropout rate of 0.5
    model.add(Dense(4096, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))  # Adding Dropout with a dropout rate of 0.5
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer=AdamW(learning_rate=0.0001), metrics=['accuracy'])

    return model

# 조기 종료 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 전체 k-fold에 대한 accuracy를 저장할 리스트
all_accuracies = []

# K-fold 교차 검증 수행
for train_index, test_index in k_fold.split(X_new_reshaped):
    X_train, X_test = X_new_reshaped[train_index], X_new_reshaped[test_index]
    y_train, y_test = y_new[train_index], y_new[test_index]

    model = create_vgg16_model_with_upsampling()


    # 모델 훈련
    model.fit(X_train, y_train, epochs=30, batch_size=64, validation_data=(X_test, y_test)) # 오버피팅 가능성이 있기 때문에 epoch 줄임

    # 모델 평가
    eval_result = model.evaluate(X_test, y_test)
    print(f"Test Loss: {eval_result[0]}, Test Accuracy: {eval_result[1]}")

    # 평가 결과를 리스트에 추가
    all_accuracies.append(eval_result[1])

# 전체 k-fold에 대한 평균 accuracy 계산
average_accuracy = np.mean(all_accuracies)
print(f"\nAverage Accuracy Across All Folds: {average_accuracy}")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Loss: 4.215697288513184, Test Accuracy: 0.9929906725883484
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Loss: 4.250791549682617, Test Accuracy: 0.9976635575294495
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30

Epoch 40

In [None]:
# 데이터 불러오기
data_dir = "/content/drive/MyDrive/Colab Notebooks/finalproject_dataset"
X, y = load_images_and_labels(data_dir)

# 레이블 인코딩
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = to_categorical(y, num_classes=2)  # 클래스 개수에 따라 수정

# 이미지 데이터를 전처리
# Flatten each image if needed
X = np.array(X)
y = np.array(y)

X_flatten = X.reshape(X.shape[0], -1)

# Apply SMOTE
smt = SMOTE()
X_new, y_new = smt.fit_resample(X_flatten, y)

X_new_reshaped = X_new.reshape(X_new.shape[0], 128, 128, 3)

# K-fold 교차 검증을 위한 K 값 설정
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# 모델 생성 함수
def create_vgg16_model():
    model = Sequential()

    # Block 1
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(128, 128, 3)))
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 2
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 3
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 4
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 5
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))
    model.add(Dropout(0.5))

    # Classification block
    model.add(Flatten())
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer=AdamW(learning_rate=0.0001), metrics=['accuracy'])

    return model


# 전체 k-fold에 대한 accuracy를 저장할 리스트
all_accuracies = []

# K-fold 교차 검증 수행
for train_index, test_index in k_fold.split(X_new_reshaped):
    X_train, X_test = X_new_reshaped[train_index], X_new_reshaped[test_index]
    y_train, y_test = y_new[train_index], y_new[test_index]

    model = create_vgg16_model()


    # 모델 훈련
    model.fit(X_train, y_train, epochs=40, batch_size=128, validation_data=(X_test, y_test)) # 오버피팅 가능성이 있기 때문에 epoch 줄임

    # 모델 평가
    eval_result = model.evaluate(X_test, y_test)
    print(f"Test Loss: {eval_result[0]}, Test Accuracy: {eval_result[1]}")

    # 평가 결과를 리스트에 추가
    all_accuracies.append(eval_result[1])

# 전체 k-fold에 대한 평균 accuracy 계산
average_accuracy = np.mean(all_accuracies)
print(f"\nAverage Accuracy Across All Folds: {average_accuracy}")

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test Loss: 0.0417899452149868, Test Accuracy: 0.9907407164573669
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch

# Data Augmentation for Minor class

class 1이 minor class

Random으로 여러 개의 augmentation을 minor class에 적용

In [None]:
import os
import cv2
import random
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def load_images_and_labels(data_dir, minority_class):
    images = []
    labels = []

    # 랜덤으로 선택할 증강 리스트
    augmentation_options = [
        {'name': 'rotation', 'param': 40},
        {'name': 'width_shift', 'param': 0.2},
        {'name': 'height_shift', 'param': 0.2},
        {'name': 'shear', 'param': 0.2},
        {'name': 'zoom', 'param': 0.2},
        {'name': 'horizontal_flip', 'param': True},
    ]

    # 데이터 증강을 위한 ImageDataGenerator 정의
    datagen = ImageDataGenerator()

    for folder_name in os.listdir(data_dir):
        folder_path = os.path.join(data_dir, folder_name)
        if os.path.isdir(folder_path):
            is_minority_class = (folder_name == minority_class)

            for filename in os.listdir(folder_path):
                img_path = os.path.join(folder_path, filename)
                img = cv2.imread(img_path)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img = cv2.resize(img, (128, 128), interpolation=cv2.INTER_LINEAR)

                # 원본 이미지를 리스트에 추가
                images.append(img)
                labels.append(folder_name)

                # 랜덤으로 선택한 증강 옵션을 적용 (for the minority class)
                if is_minority_class:
                    selected_augmentations = random.sample(augmentation_options, k=random.randint(0, len(augmentation_options)))
                    for aug_option in selected_augmentations:
                        if aug_option['name'] == 'rotation':
                            img = ImageDataGenerator(rotation_range=aug_option['param']).random_transform(img)
                        elif aug_option['name'] == 'width_shift':
                            img = ImageDataGenerator(width_shift_range=aug_option['param']).random_transform(img)
                        elif aug_option['name'] == 'height_shift':
                            img = ImageDataGenerator(height_shift_range=aug_option['param']).random_transform(img)
                        elif aug_option['name'] == 'shear':
                            img = ImageDataGenerator(shear_range=aug_option['param']).random_transform(img)
                        elif aug_option['name'] == 'zoom':
                            img = ImageDataGenerator(zoom_range=aug_option['param']).random_transform(img)
                        elif aug_option['name'] == 'horizontal_flip':
                            img = ImageDataGenerator(horizontal_flip=aug_option['param']).random_transform(img)

                        # 증강된 이미지를 리스트에 추가
                        images.append(img)
                        labels.append(folder_name)
                else:
                    # For majority class, you can optionally choose not to apply augmentation
                    pass

    return shuffle(images, labels, random_state=42)



In [None]:
# 데이터 불러오기
data_dir = "/content/drive/MyDrive/Colab Notebooks/finalproject_dataset"
X, y = load_images_and_labels(data_dir, 1)

# 레이블 인코딩
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = to_categorical(y, num_classes=2)  # 클래스 개수에 따라 수정

# 이미지 데이터를 전처리
X = np.array(X)
y = np.array(y)

# K-fold 교차 검증을 위한 K 값 설정
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# 모델 생성 함수
def create_vgg16_model():
    model = Sequential()

    # Block 1
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(128, 128, 3)))
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 2
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 3
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 4
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 5
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))
    model.add(Dropout(0.5))

    # Classification block
    model.add(Flatten())
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer=AdamW(learning_rate=0.0001), metrics=['accuracy'])

    return model


# 전체 k-fold에 대한 accuracy를 저장할 리스트
all_accuracies = []

# K-fold 교차 검증 수행
for train_index, test_index in k_fold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = create_vgg16_model()


    # 모델 훈련
    model.fit(X_train, y_train, epochs=50, batch_size=128, validation_data=(X_test, y_test))

    # 모델 평가
    eval_result = model.evaluate(X_test, y_test)
    print(f"Test Loss: {eval_result[0]}, Test Accuracy: {eval_result[1]}")

    # 평가 결과를 리스트에 추가
    all_accuracies.append(eval_result[1])

# 전체 k-fold에 대한 평균 accuracy 계산
average_accuracy = np.mean(all_accuracies)
print(f"\nAverage Accuracy Across All Folds: {average_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 0.36083418130874634, Test Accuracy: 0.8846153616905212
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoc

Random augmentation 1개 적용

데이터 양이 2배 가까이 차이나기에, 이를 균형 맞추기 위해 augmentation 1개만 적용해서 데이터를 2배로 만듦

In [None]:
import os
import cv2
import random
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def load_images_and_labels(data_dir, minority_class):
    images = []
    labels = []

    # 랜덤으로 선택할 증강 리스트
    augmentation_options = [
        {'name': 'rotation', 'param': 40},
        {'name': 'width_shift', 'param': 0.2},
        {'name': 'height_shift', 'param': 0.2},
        {'name': 'shear', 'param': 0.2},
        {'name': 'zoom', 'param': 0.2},
        {'name': 'horizontal_flip', 'param': True},
    ]

    # 데이터 증강을 위한 ImageDataGenerator 정의
    datagen = ImageDataGenerator()

    for folder_name in os.listdir(data_dir):
        folder_path = os.path.join(data_dir, folder_name)
        if os.path.isdir(folder_path):
            is_minority_class = (folder_name == minority_class)

            for filename in os.listdir(folder_path):
                img_path = os.path.join(folder_path, filename)
                img = cv2.imread(img_path)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img = cv2.resize(img, (128, 128), interpolation=cv2.INTER_LINEAR)

                # 원본 이미지를 리스트에 추가
                images.append(img)
                labels.append(folder_name)

                # 랜덤으로 선택한 증강 옵션을 적용 (for the minority class)
                if is_minority_class:
                    random_augmentation = random.choice(augmentation_options)
                    aug_name, aug_param = random_augmentation['name'], random_augmentation['param']
                    img = apply_augmentation(img, aug_name, aug_param)

                    # 증강된 이미지를 리스트에 추가
                    images.append(img)
                    labels.append(folder_name)

    return shuffle(images, labels, random_state=42)

def apply_augmentation(img, aug_name, aug_param):
    datagen = ImageDataGenerator()
    if aug_name == 'rotation':
        img = datagen.apply_transform(img, {'theta': aug_param})
    elif aug_name == 'width_shift':
        img = datagen.apply_transform(img, {'tx': aug_param * img.shape[1]})
    elif aug_name == 'height_shift':
        img = datagen.apply_transform(img, {'ty': aug_param * img.shape[0]})
    elif aug_name == 'shear':
        img = datagen.apply_transform(img, {'shear': aug_param})
    elif aug_name == 'zoom':
        img = datagen.apply_transform(img, {'zx': 1 + aug_param, 'zy': 1 + aug_param})
    elif aug_name == 'horizontal_flip':
        img = datagen.apply_transform(img, {'flip_horizontal': aug_param})

    return img

In [None]:
# 데이터 불러오기
data_dir = "/content/drive/MyDrive/Colab Notebooks/finalproject_dataset"
X, y = load_images_and_labels(data_dir, 1)

# 레이블 인코딩
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = to_categorical(y, num_classes=2)  # 클래스 개수에 따라 수정

# 이미지 데이터를 전처리
X = np.array(X)
y = np.array(y)

# K-fold 교차 검증을 위한 K 값 설정
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# 모델 생성 함수
def create_vgg16_model():
    model = Sequential()

    # Block 1
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(128, 128, 3)))
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 2
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 3
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 4
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # Block 5
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.5))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))
    model.add(Dropout(0.5))

    # Classification block
    model.add(Flatten())
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer=AdamW(learning_rate=0.0001), metrics=['accuracy'])

    return model


# 전체 k-fold에 대한 accuracy를 저장할 리스트
all_accuracies = []

# K-fold 교차 검증 수행
for train_index, test_index in k_fold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = create_vgg16_model()


    # 모델 훈련
    model.fit(X_train, y_train, epochs=50, batch_size=128, validation_data=(X_test, y_test))

    # 모델 평가
    eval_result = model.evaluate(X_test, y_test)
    print(f"Test Loss: {eval_result[0]}, Test Accuracy: {eval_result[1]}")

    # 평가 결과를 리스트에 추가
    all_accuracies.append(eval_result[1])

# 전체 k-fold에 대한 평균 accuracy 계산
average_accuracy = np.mean(all_accuracies)
print(f"\nAverage Accuracy Across All Folds: {average_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 0.41423851251602173, Test Accuracy: 0.8461538553237915
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoc