<a href="https://colab.research.google.com/github/ekdls02/ekdls2025/blob/main/xai_%EA%B3%BC%EC%A0%9C4_%ED%9D%89%EB%B6%80_X_Ray_%EC%9D%B4%EB%AF%B8%EC%A7%80%EC%97%90%EC%84%9C_%ED%8F%90%EB%A0%B4_%EA%B2%80%EC%B6%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from sklearn.utils import class_weight # 클래스 가중치 계산을 위해 추가
import os


# 1. 데이터 로드 및 전처리
train_npz = np.load("/content/train.npz")
test_npz = np.load("/content/test.npz")

x_train_raw, y_train_raw = train_npz['x'], train_npz['y']
x_test_raw = test_npz['x']

def prep_images(x, target=(224,224)):
    x_out = []
    for im in x:
        if im.ndim == 2:  # grayscale
            im = im[..., np.newaxis]
        r = tf.image.resize(im, target).numpy()
        r = np.repeat(r, 3, axis=-1)
        x_out.append(r.astype('float32'))
    return np.array(x_out)

X = prep_images(x_train_raw)
X_test = prep_images(x_test_raw)

Y_str = y_train_raw.flatten()
Y = np.array([1 if v.lower()=='pneumonia' else 0 for v in Y_str])

X_train, X_val, Y_train, Y_val, Y_str_train, Y_str_val = train_test_split(
    X, Y, Y_str, test_size=0.2, stratify=Y, random_state=42
)


# 클래스 가중치 계산
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(Y_train),
    y=Y_train
)
class_weights = dict(enumerate(class_weights))
print(f"클래스 가중치 계산 완료: {class_weights}")


# 2.데이터 증강 및 정규화
BATCH_SIZE = 32

train_gen_raw = ImageDataGenerator(
    rotation_range=15, width_shift_range=0.15, height_shift_range=0.15,
    zoom_range=0.15, horizontal_flip=True,
    featurewise_center=True, featurewise_std_normalization=True
)

val_gen_raw = ImageDataGenerator(
    featurewise_center=True, featurewise_std_normalization=True
)

train_gen_raw.fit(X_train)
val_gen_raw.fit(X_train)

train_gen = train_gen_raw.flow(X_train, Y_train, batch_size=BATCH_SIZE)
val_gen = val_gen_raw.flow(X_val, Y_val, batch_size=BATCH_SIZE)


# 3. 모델 구축 및 Stage 1 학습 (정규화 미세 조정 적용)
checkpoint_path = "best_model_stage1.weights.h5"

base = ResNet50(weights='imagenet', include_top=False, input_shape=X_train.shape[1:])
base.trainable = False

x = GlobalAveragePooling2D()(base.output)
x = Dropout(0.5)(x) # Dropout 비율 감소 (0.6 -> 0.5)
x = Dense(256, activation='relu', kernel_regularizer=l2(1e-5))(x) # L2 강도 감소 (1e-4 -> 1e-5)
out = Dense(1, activation='sigmoid')(x)
model = Model(base.input, out)

model.compile(optimizer=Adam(3e-4), loss='binary_crossentropy', metrics=['accuracy'])

# Stage 1: 분류기 학습
es_stage1 = EarlyStopping(monitor='val_accuracy', patience=8, restore_best_weights=True)
mc_stage1 = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', save_best_only=True, save_weights_only=True, mode='max')

print("--- Stage 1: 분류기 레이어 학습 (base model frozen) ---")
model.fit(
    train_gen,
    epochs=20,
    validation_data=val_gen,
    steps_per_epoch=len(X_train)//BATCH_SIZE,
    callbacks=[es_stage1, mc_stage1],
    class_weight=class_weights # 클래스 가중치 적용
)

model.load_weights(checkpoint_path)


# 4. Stage 2: Fine-tuning (LR 스케줄러 적용)
base.trainable = True

print("--- Stage 2: Fine-tuning 범위 제한 ---")
trainable_layer_count = 0
for layer in base.layers:
    if 'conv5_block' in layer.name or 'res5' in layer.name:
        layer.trainable = True
        trainable_layer_count += 1
    else:
        layer.trainable = False

print(f"총 학습 가능 레이어 수 (base model 내): {trainable_layer_count}")

# Fine-tuning을 위한 낮은 학습률 (3e-6 유지)
model.compile(optimizer=Adam(3e-6), loss='binary_crossentropy', metrics=['accuracy'])

# Fine-tuning: 긴 patience
es_stage2 = EarlyStopping(monitor='val_accuracy', patience=20, restore_best_weights=True)
mc_stage2 = ModelCheckpoint("best_model_stage2.weights.h5", monitor='val_accuracy', save_best_only=True, save_weights_only=True, mode='max')
# 학습률 스케줄러 추가
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=5, min_lr=1e-7, verbose=1)


print("\n--- Stage 2: Fine-tuning (LR 스케줄러 적용) ---")
model.fit(
    train_gen,
    epochs=60,
    validation_data=val_gen,
    steps_per_epoch=len(X_train)//BATCH_SIZE,
    callbacks=[es_stage2, mc_stage2, reduce_lr], # LR 스케줄러 추가
    class_weight=class_weights # 클래스 가중치 적용
)

model.load_weights("best_model_stage2.weights.h5")


# 5. Threshold 기반 예측
X_test_norm = val_gen_raw.standardize(X_test.copy())

y_pred_test_proba = model.predict(X_test_norm).flatten()
y_pred_class = (y_pred_test_proba > 0.5).astype(int)

y_pred_str = np.array(['pneumonia' if x==1 else 'normal' for x in y_pred_class])


# 6. 제출 파일 생성
submission_path = "/content/submission.csv"
df_submission = pd.read_csv(submission_path)
df_submission['result'] = y_pred_str
output_file = "new_submission.csv"
df_submission.to_csv(output_file, index=False)

print(f"제출 파일 생성 완료: {output_file}")
print("Positive 개수:", int((df_submission['result']=='pneumonia').sum()))

클래스 가중치 계산 완료: {0: np.float64(1.0), 1: np.float64(1.0)}
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
--- Stage 1: 분류기 레이어 학습 (base model frozen) ---


  self._warn_if_super_not_called()


Epoch 1/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 6s/step - accuracy: 0.5032 - loss: 0.8151 - val_accuracy: 0.6667 - val_loss: 0.6122
Epoch 2/20
[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 98ms/step - accuracy: 0.5625 - loss: 0.7778



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 411ms/step - accuracy: 0.5625 - loss: 0.7778 - val_accuracy: 0.7000 - val_loss: 0.5644
Epoch 3/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 814ms/step - accuracy: 0.5764 - loss: 0.7434 - val_accuracy: 0.9667 - val_loss: 0.4286
Epoch 4/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.5625 - loss: 0.6305 - val_accuracy: 0.9667 - val_loss: 0.4047
Epoch 5/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 366ms/step - accuracy: 0.6708 - loss: 0.5386 - val_accuracy: 0.9667 - val_loss: 0.3470
Epoch 6/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.7500 - loss: 0.4627 - val_accuracy: 0.9667 - val_loss: 0.3244
Epoch 7/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 430ms/step - accuracy: 0.8341 - loss: 0.4717 - val_accuracy: 0.9667 