<a href="https://colab.research.google.com/github/dhckdduf/first-repository/blob/main/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ Mount Google Drive at /content/drive (Colab runtime only)
from google.colab import drive
drive.mount('/content/drive')

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# ✅ Load the data (file paths adjusted; edit them if the CSVs live elsewhere)
train_file_path, test_file_path = (
    "/content/drive/MyDrive/train.csv",
    "/content/drive/MyDrive/test.csv",
)

# Read both CSV files into DataFrames: the training file first, then the test file.
df_train, df_test = map(pd.read_csv, (train_file_path, test_file_path))

# ✅ Label encoding (text labels -> integer class ids)
# The fitted encoder is kept so predictions can be mapped back to the
# original label values later on.
label_encoder = LabelEncoder().fit(df_train['label'])
df_train['label'] = label_encoder.transform(df_train['label'])

# ✅ Separate the columns (ID, label, pixel data)
# Training set: every column other than 'ID' and 'label' is used as a feature.
y_train = df_train['label'].to_numpy().astype('int')
X_train = df_train.drop(columns=['ID', 'label']).to_numpy().astype('float32')

# Test set: there is no 'label' column to drop; the IDs are kept aside.
test_ids = df_test['ID'].values
X_test = df_test.drop(columns=['ID']).to_numpy().astype('float32')

# ✅ Normalization (divide by 255 to scale into the 0-1 range), followed by
# ✅ the reshape into the CNN input layout: (samples, 32, 32, 1).
# The leading -1 lets NumPy infer the number of samples from the array size.
X_train = (X_train / 255.0).reshape(-1, 32, 32, 1)
X_test = (X_test / 255.0).reshape(-1, 32, 32, 1)

# Report the resulting shapes as a quick sanity check.
print(f"🔹 변환된 X_train shape: {X_train.shape}")
print(f"🔹 변환된 X_test shape: {X_test.shape}")

# ✅ Per-class sample counts (check for class imbalance)
print("클래스별 샘플 개수:", np.bincount(y_train))

# ✅ Split into training and validation sets (90:10)
# Stratifying on the labels keeps the class proportions the same in both
# parts; the fixed random_state makes the split repeatable.
(X_train_split, X_val_split,
 y_train_split, y_val_split) = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    test_size=0.1,
    random_state=42,
)

# ✅ Data augmentation (ranges kept moderate)
# Each generated batch is randomly rotated (rotation_range=20), shifted by up
# to 10% of the width/height, and flipped horizontally.
# NOTE(review): horizontal_flip assumes a mirrored image keeps its label;
# confirm that this holds for every class in the dataset.
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
)
# `datagen.fit(...)` is intentionally not called: it only computes dataset
# statistics for featurewise_center / featurewise_std_normalization /
# zca_whitening, none of which are enabled above, so the call would merely
# copy the whole training array without affecting the generated batches.

# ✅ Simplified CNN (reduced number of filters)
# One softmax output unit per class known to the fitted label encoder.
num_classes = len(label_encoder.classes_)

model = models.Sequential([
    # Declare the input shape with an explicit Input object. Passing
    # `input_shape=` to the first layer of a Sequential model is discouraged
    # by Keras 3 and triggers a UserWarning; the architecture is unchanged.
    layers.Input(shape=(32, 32, 1)),

    # Convolution block 1: 32 filters of 3x3, then 2x2 max pooling.
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    # Convolution block 2: 64 filters of 3x3, then 2x2 max pooling.
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    # Classifier head: flatten, small dense layer, light dropout, softmax.
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(num_classes, activation='softmax'),
])

# ✅ Early stopping
# Watches the validation loss, tolerates 10 epochs without improvement, and
# restores the weights from the best epoch when training stops.
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
    ),
]

# ✅ Learning rate (0.001) for the Adam optimizer
learning_rate = 0.001
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# ✅ Compile the model
# The sparse categorical cross-entropy loss takes integer class ids as
# targets, so the labels do not need to be one-hot encoded.
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=optimizer,
)

# ✅ Train the model
# Training batches (size 32) come from the augmenting generator, while the
# validation split is passed as plain arrays. Training runs for at most
# 60 epochs, subject to the configured callbacks.
history = model.fit(
    x=datagen.flow(x=X_train_split, y=y_train_split, batch_size=32),
    epochs=60,
    callbacks=callbacks,
    validation_data=(X_val_split, y_val_split),
)

# ✅ Visualize the training results (check for overfitting)
def _plot_history_metric(position, metric, display_name):
    """Plot the train and validation curves of one metric.

    Args:
        position: 1-based slot in the 1x2 subplot grid.
        metric: Key of the training curve in ``history.history``; the
            validation curve is read from the same key prefixed with ``val_``.
        display_name: Human-readable metric name used for labels and title.
    """
    plt.subplot(1, 2, position)
    for key_prefix, split_name in (('', 'Train'), ('val_', 'Validation')):
        plt.plot(
            history.history[key_prefix + metric],
            label=f'{split_name} {display_name}',
        )
    plt.xlabel('Epochs')
    plt.ylabel(display_name)
    plt.legend()
    plt.title(f'Training & Validation {display_name}')


plt.figure(figsize=(12, 4))
_plot_history_metric(1, 'accuracy', 'Accuracy')  # accuracy curves (left)
_plot_history_metric(2, 'loss', 'Loss')  # loss curves (right)
plt.show()

# ✅ Predict on the test data
# The predicted class of each row is the index of its highest probability.
pred_probs = model.predict(X_test)
pred_labels = pred_probs.argmax(axis=1)

# ✅ Build the submission file
# Integer class ids are mapped back to the original label values before the
# ID/label table is written to Drive without the index column.
df_submit = pd.DataFrame({
    'ID': test_ids,
    'label': label_encoder.inverse_transform(pred_labels),
})
df_submit.to_csv('/content/drive/MyDrive/submission.csv', index=False)

print("🎉 제출 파일 'submission.csv' 생성 완료! 🚀")
