In [None]:
import numpy as np 
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

### 하드웨어 가속기 확인하기

In [None]:
import math, re, os
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
from kaggle_datasets import KaggleDatasets
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
print(f"Tensorflow version {tf.__version__}")
AUTO = tf.data.experimental.AUTOTUNE

# Probe for a TPU. A ValueError raised while resolving (or reporting) the
# TPU is treated as "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

# Choose the distribution strategy: the current default strategy when no TPU
# was found, otherwise a TPUStrategy on the freshly initialised TPU system.
if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# import tensorflow as tf

# # GPU 확인
# gpus = tf.config.list_physical_devices('GPU')
# if gpus:
#     for gpu in gpus:
#         print(f"GPU 사용 가능: {gpu}")
# else:
#     print("GPU 사용 불가능")

# # TPU 확인
# try:
#     tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU 확인
#     print('TPU 사용 가능:', tpu.master())
# except ValueError:
#     print("TPU 사용 불가능")

# # CPU 확인
# cpus = tf.config.list_physical_devices('CPU')
# if cpus:
#     print(f"CPU 사용 가능: {cpus[0]}")

### 데이터 확인하기

In [None]:
# One training shard, used only to inspect the raw record layout.
tfrecord_file_path_train = '/kaggle/input/tpu-getting-started/tfrecords-jpeg-192x192/train/00-192x192-798.tfrec'

# Open the shard as a stream of serialized tf.train.Example records.
raw_dataset = tf.data.TFRecordDataset(tfrecord_file_path_train)

# Deserialize and print only the first record so its feature keys are visible.
for raw_record in raw_dataset.take(1):
    example = tf.train.Example.FromString(raw_record.numpy())
    print(example)

In [None]:
def _parse_function(example_proto):
    features = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'class': tf.io.FixedLenFeature([], tf.int64),
        'id': tf.io.FixedLenFeature([], tf.string),
    }
    return tf.io.parse_single_example(example_proto, features)

# Stream every 192x192 training shard and parse each record.
train_files = tf.io.gfile.glob('/kaggle/input/tpu-getting-started/tfrecords-jpeg-192x192/train/*.tfrec')
raw_train_dataset = tf.data.TFRecordDataset(train_files)
print(raw_train_dataset)
parsed_train_dataset = raw_train_dataset.map(_parse_function)
print(parsed_train_dataset)

# One full pass over the training records, collecting the distinct class ids.
unique_labels = {record['class'].numpy() for record in parsed_train_dataset}

num_labels = len(unique_labels)
print(f'Total unique labels in the train dataset: {num_labels}')

### 데이터 전처리

In [None]:
def parse_tfrecord_function(example_proto, include_label=True):
    """Parse one serialized example, with or without its class label.

    Args:
        example_proto: A serialized ``tf.train.Example``.
        include_label: When true the int64 ``class`` feature is parsed in
            addition to ``image`` and ``id``; when false only ``image`` and
            ``id`` are requested.

    Returns:
        The feature dict produced by ``tf.io.parse_single_example``.
    """
    bytes_scalar = tf.io.FixedLenFeature([], tf.string)

    # Build the schema incrementally; 'class' sits between 'image' and 'id'
    # so the key order is the same whether or not the label is requested.
    schema = {'image': bytes_scalar}
    if include_label:
        schema['class'] = tf.io.FixedLenFeature([], tf.int64)
    schema['id'] = bytes_scalar

    return tf.io.parse_single_example(example_proto, schema)

def load_dataset(filenames, include_label=True):
    """Build a dataset of parsed examples from TFRecord files.

    Args:
        filenames: File name(s) handed to ``tf.data.TFRecordDataset``.
        include_label: Forwarded to ``parse_tfrecord_function`` to control
            whether the ``class`` feature is parsed.

    Returns:
        The TFRecord dataset with ``parse_tfrecord_function`` mapped over
        every record.
    """
    def _parse(serialized):
        # Bind the label flag so the mapped function takes a single argument.
        return parse_tfrecord_function(serialized, include_label)

    return tf.data.TFRecordDataset(filenames).map(_parse)

# Root directories of the competition data, one per image resolution.
# NOTE(review): if these directories hold the same images at different
# resolutions, every example is loaded once per resolution - confirm intended.
file_paths = [
    "/kaggle/input/tpu-getting-started/tfrecords-jpeg-192x192/",
    "/kaggle/input/tpu-getting-started/tfrecords-jpeg-224x224/",
    "/kaggle/input/tpu-getting-started/tfrecords-jpeg-331x331/",
    "/kaggle/input/tpu-getting-started/tfrecords-jpeg-512x512/",
]

# Accumulators; each stays None until the first directory with matching files.
train_dataset = None
val_dataset = None
test_dataset = None

# Walk every root directory and append its train / test / val shards to the
# corresponding accumulated dataset.
for base_path in file_paths:
    train_files = tf.io.gfile.glob(f'{base_path}train/*.tfrec')
    test_files = tf.io.gfile.glob(f'{base_path}test/*.tfrec')
    val_files = tf.io.gfile.glob(f'{base_path}val/*.tfrec')

    if train_files:
        new_train_dataset = load_dataset(train_files, include_label=True)
        if train_dataset is None:
            train_dataset = new_train_dataset
        else:
            train_dataset = train_dataset.concatenate(new_train_dataset)

    if test_files:
        # Test records are parsed without a label.
        new_test_dataset = load_dataset(test_files, include_label=False)
        if test_dataset is None:
            test_dataset = new_test_dataset
        else:
            test_dataset = test_dataset.concatenate(new_test_dataset)

    if val_files:
        new_val_dataset = load_dataset(val_files, include_label=True)
        if val_dataset is None:
            val_dataset = new_val_dataset
        else:
            val_dataset = val_dataset.concatenate(new_val_dataset)

In [None]:
# Show the dataset objects built by the loading cell (print joins the label
# and the dataset's string form with a single space).
print('Train Dataset:', train_dataset)
print('Test Dataset:', test_dataset)
print('Validation Dataset:', val_dataset)

In [None]:
def preprocess(features):
    """Turn one parsed labelled record into an ``(image, label)`` pair.

    Args:
        features: Mapping with an ``image`` entry (encoded JPEG) and a
            ``class`` entry (class index).

    Returns:
        A tuple ``(image, label)``: the JPEG decoded to 3 channels, resized
        to 512x512, cast to float32 and divided by 255; and the class index
        one-hot encoded with depth 104.
    """
    # Label: one-hot encode the class index.
    label = tf.one_hot(features['class'], depth=104)

    # Image: decode -> resize -> scale.
    pixels = tf.image.decode_jpeg(features['image'], channels=3)
    pixels = tf.image.resize(pixels, [512, 512])
    pixels = tf.cast(pixels, tf.float32) / 255.0

    return pixels, label

# Peek at one parsed (still undecoded) record before preprocessing.
for features in train_dataset.take(1):
    print("전처리 전:")
    print("이미지 데이터:", features['image'])
    print("레이블 데이터:", features['class'])

# Apply preprocessing. JPEG decoding and resizing are the expensive steps of
# the input pipeline, so let tf.data run them in parallel; element order is
# unchanged.
train_dataset = train_dataset.map(preprocess, num_parallel_calls=AUTO)
val_dataset = val_dataset.map(preprocess, num_parallel_calls=AUTO)

# Peek at one preprocessed example. The dataset is not batched yet, so each
# element is a single image and a single one-hot vector; print them whole
# rather than indexing [0], which would show only the first pixel row and
# the first one-hot component.
for images, labels in train_dataset.take(1):
    print("전처리 후:")
    print("이미지 데이터:", images.numpy())
    print("레이블 데이터:", labels.numpy())

# Batch, and prefetch so preprocessing of the next batch overlaps with the
# training step on the current one.
train_dataset = train_dataset.batch(32).prefetch(AUTO)
val_dataset = val_dataset.batch(32).prefetch(AUTO)

### 모델링

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.applications import VGG16

tf.keras.backend.clear_session()

# Build and compile inside the strategy scope chosen by the hardware cell.
with strategy.scope():
    model = Sequential()

    # Convolutional feature extractor: four Conv2D + MaxPooling2D stages.
    model.add(Conv2D(32, (3, 3), activation='swish', input_shape=(512, 512, 3)))
    model.add(MaxPooling2D(2, 2))
    model.add(Conv2D(64, (3, 3), activation='swish'))
    model.add(MaxPooling2D(2, 2))
    model.add(Conv2D(128, (3, 3), activation='swish'))
    model.add(MaxPooling2D(2, 2))
    model.add(Conv2D(128, (3, 3), activation='swish'))
    model.add(MaxPooling2D(2, 2))

    # Classifier head: flatten, one hidden layer with dropout, and a
    # 104-way softmax matching the one-hot labels.
    model.add(Flatten())
    model.add(Dense(512, activation='swish'))
    model.add(Dropout(0.5))
    model.add(Dense(104, activation='softmax'))

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

model.summary()

In [None]:
# Stop once val_loss has not improved for 3 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights from the last epoch run
# (3 epochs past the best one), and those are what the submission would use.
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=3,
                               restore_best_weights=True)

# epochs=100 is only an upper bound; early stopping normally ends the run.
history = model.fit(train_dataset,
                    validation_data=val_dataset,
                    epochs=100,
                    callbacks=[early_stopping])

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: accuracy on the left, loss on the right.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(8, 8))

# Accuracy curves (training vs. validation).
acc_ax.plot(history.history['accuracy'], label='accuracy')
acc_ax.plot(history.history['val_accuracy'], label='val_accuracy')
acc_ax.legend(loc='lower right')

# Loss curves (training vs. validation).
loss_ax.plot(history.history['loss'], label='loss')
loss_ax.plot(history.history['val_loss'], label='val_loss')
loss_ax.legend(loc='upper right')

plt.show()

### csv 파일로 만들기

In [None]:
# Preprocessing for the unlabelled test records.
def preprocess_test(features):
    """Turn one parsed test record into an ``(image, id)`` pair.

    Args:
        features: Mapping with an ``image`` entry (encoded JPEG) and an
            ``id`` entry.

    Returns:
        A tuple ``(image, id)``: the JPEG decoded to 3 channels, resized to
        512x512, cast to float32 and divided by 255; and the record's ``id``
        passed through unchanged.
    """
    decoded = tf.image.decode_jpeg(features['image'], channels=3)
    resized = tf.image.resize(decoded, [512, 512])
    scaled = tf.cast(resized, tf.float32) / 255.0
    return scaled, features['id']

# Keep a handle on the parsed-but-undecoded test records so the ids can be
# read back without decoding every JPEG a second time.
test_records = test_dataset

# Apply the test preprocessing and batch.
test_dataset = test_records.map(preprocess_test).batch(128)

# Predict in ONE call over the whole dataset. Model.predict is designed for
# batch processing of a full dataset and is not meant to be called once per
# batch from a Python loop.
test_images = test_dataset.map(lambda image, image_id: image)
probabilities = model.predict(test_images)
predicted_classes = np.argmax(probabilities, axis=-1)  # predicted class per image

# Read the ids in a second, cheap pass over the same records. The pipeline
# has no shuffling, so this pass yields records in the same order as the
# prediction pass. Ids arrive as bytes and are decoded to str.
id_batches = test_records.map(lambda features: features['id']).batch(128)
image_ids = [
    raw_id.decode('utf-8')
    for id_batch in id_batches.as_numpy_iterator()
    for raw_id in id_batch
]

# Pair every image id with its predicted class ('label' is used as the
# column name rather than 'class').
results = pd.DataFrame({
    'id': image_ids,
    'label': predicted_classes
})

# Keep only the first row for each id and drop any repeats.
results = results.drop_duplicates(subset='id', keep='first')

# Write the submission file.
results.to_csv('submission.csv', index=False)