In [94]:
import numpy as np
import tensorflow

data = np.load("./train.npz")
test = np.load("./test.npz")
print(data.files)
print(test.files)

x_train = data['x']
y_train = data['y']

x_train = x_train.reshape(-1, 500, 500, 1) / 255.0
test_data = test['x'].reshape(-1, 500, 500, 1)/ 255.0

print(type(x_train))
print(y_train.shape)

['x', 'y']
['x']
<class 'numpy.ndarray'>
(150,)


In [96]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

print(y_train[:10])
le = LabelEncoder()
y_train = le.fit_transform(y_train)

print(y_train[:10])

['normal' 'normal' 'normal' 'pneumonia' 'normal' 'normal' 'pneumonia'
 'pneumonia' 'normal' 'normal']
[0 0 0 1 0 0 1 1 0 0]


In [63]:
from sklearn.model_selection import train_test_split

# 원본에서 직접 분리 (shuffle 자동)
# x_train, x_val, y_train, y_val = train_test_split(
#     x_train, y_train,
#     test_size=0.2,   # 20%를 validation
#     random_state=42,
#     shuffle=True,
#     stratify=y_train,
# )

In [64]:
from tensorflow import keras
from keras import Sequential, Input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, GlobalAveragePooling2D
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

In [65]:
from sklearn.utils.class_weight import compute_class_weight

# 자동으로 가중치 계산
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

print("가중치:", class_weight_dict)
# 예: {0: 0.56, 1: 4.5} ← class 1이 적으니까 4.5배 페널티

가중치: {0: 1.0, 1: 1.0}


In [97]:
from tensorflow.keras import backend as K
K.clear_session()

# 1. 모델 구조 단순화 + Regularization 강화
model = Sequential([
    Input(shape=(500,500,1)),

    Conv2D(128, 3, padding='same', activation='relu'),
    # BatchNormalization(),
    MaxPooling2D(),

    Conv2D(64, 3, padding='same', activation='relu'),
    # BatchNormalization(),
    MaxPooling2D(),


    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)


train_datagen = ImageDataGenerator(
#    height_shift_range=0.05,
    horizontal_flip=True,
    fill_mode='nearest'
)

val_datagen = ImageDataGenerator()

train_generator = train_datagen.flow(
    x_train, y_train,
    batch_size=32,
    shuffle=True
)

# val_generator = val_datagen.flow(
#     x_val, y_val,
#     batch_size=32,
#     shuffle=False
# )

# 4. Early Stopping 더 엄격하게
checkpoint_cb = ModelCheckpoint(
    "best-cnn-model.keras",
    monitor='accuracy',
    save_best_only=True
)

early_stopping_cb = EarlyStopping(
    patience=3,
    restore_best_weights=True,
    monitor='val_loss'
)

# 6. 학습
history = model.fit(
    x_train, y_train,
    epochs=20,
    validation_split=0.2,
    callbacks=[checkpoint_cb, early_stopping_cb],
    verbose=1
)

Epoch 1/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 9s/step - accuracy: 0.4115 - loss: 6.4329 - val_accuracy: 0.7667 - val_loss: 0.4062
Epoch 2/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 9s/step - accuracy: 0.7354 - loss: 0.6178 - val_accuracy: 0.8000 - val_loss: 0.4326
Epoch 3/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 9s/step - accuracy: 0.7923 - loss: 0.4231 - val_accuracy: 0.9333 - val_loss: 0.2117
Epoch 4/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 9s/step - accuracy: 0.8692 - loss: 0.2807 - val_accuracy: 0.9667 - val_loss: 0.1754
Epoch 5/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 9s/step - accuracy: 0.9598 - loss: 0.1522 - val_accuracy: 0.9667 - val_loss: 0.1641
Epoch 6/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 8s/step - accuracy: 0.9223 - loss: 0.1909 - val_accuracy: 0.9667 - val_loss: 0.2026
Epoch 7/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [99]:
print("Train:", np.unique(y_train, return_counts=True))
#print("Val:", np.unique(y_val, return_counts=True))

# 2. 예측 결과 확인
predictions = model.predict(test_data)
pred_classes = (predictions > 0.5).astype(int)
print("예측 분포:", np.unique(pred_classes, return_counts=True))
#print("실제 분포:", np.unique(y_val, return_counts=True))

Train: (array([0, 1]), array([75, 75]))
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 361ms/step
예측 분포: (array([0, 1]), array([18, 20]))


In [100]:
y_pred = model.predict(test_data)
print(y_pred)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 362ms/step
[[9.9459726e-01]
 [3.6882408e-04]
 [8.6356777e-01]
 [9.1503459e-01]
 [9.4002151e-01]
 [2.3479292e-07]
 [6.5308505e-01]
 [9.3701720e-01]
 [8.8756061e-01]
 [4.9942604e-01]
 [2.1679710e-01]
 [6.0874781e-06]
 [1.5336383e-04]
 [1.5633500e-06]
 [2.9220166e-06]
 [9.9701548e-01]
 [9.6270031e-01]
 [6.2788367e-02]
 [8.2238734e-01]
 [9.9143821e-01]
 [6.9090748e-01]
 [4.1497493e-05]
 [9.5236582e-01]
 [2.1554599e-05]
 [3.0305159e-06]
 [4.8772208e-02]
 [3.1819032e-04]
 [8.6031514e-01]
 [9.9227202e-01]
 [9.6991986e-01]
 [9.5290869e-01]
 [9.9445421e-01]
 [1.2622567e-01]
 [1.1149867e-06]
 [4.9343016e-06]
 [9.9163330e-01]
 [9.7536802e-01]
 [5.8369114e-06]]


In [110]:
import numpy as np

y_pred_classes = (y_pred > 0.67).astype(int)
#print(y_pred_classes)
print(len(y_pred_classes))
a, cnt = np.unique(y_pred_classes, return_counts=True)
print(a, cnt)

38
[0 1] [19 19]


In [111]:
y_pred_classes = y_pred_classes.flatten().tolist()
y_list = ['normal', 'pneumonia']
y = [y_list[i] for i in y_pred_classes]
print(y)

['pneumonia', 'normal', 'pneumonia', 'pneumonia', 'pneumonia', 'normal', 'normal', 'pneumonia', 'pneumonia', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'pneumonia', 'pneumonia', 'normal', 'pneumonia', 'pneumonia', 'pneumonia', 'normal', 'pneumonia', 'normal', 'normal', 'normal', 'normal', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'normal', 'normal', 'normal', 'pneumonia', 'pneumonia', 'normal']


In [112]:
import pandas as pd
import numpy as np

df = pd.read_csv("submission.csv")

df.dropna(axis=1, inplace=True)

# 아래 "np.arange(0, df.shape[0], 1)" 부분을, 솜솜이가 만든 모델이 예측한 값으로 대체!
#y_pred = np.arange(0, df.shape[0], 1) 

df["result"] = y
df.to_csv("new_submission2.csv", index=False)