In [1]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns
import shutil
import glob
import seaborn as sns
from  tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [2]:
# Root directory of the competition data set.
base_path = '/kaggle/input/cnu-mlclass2'

# Training metadata table; later cells read its 'image' and 'label' columns.
train_csv_path = os.path.join(base_path, 'train.csv')
train_df = pd.read_csv(train_csv_path)

# EDA

In [3]:
# Display the training metadata table loaded from train.csv.
train_df

In [4]:
# Number of training rows per class label.
train_df['label'].value_counts()

In [5]:
# Class distribution of the full training table

figure, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=train_df, x='label', ax=ax)

# Write every bar's count at the centre of the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2,
            bar.get_y() + bar_height / 2,
            str(int(bar_height)),
            horizontalalignment='center',
            verticalalignment='center')

plt.show()

In [6]:
# Show a 5x5 grid of randomly chosen training images with their labels.
train_path = base_path +'/train'

plt.figure(figsize=(20,20))
# Row ids are drawn with replacement, so the same image can appear twice.
index = np.random.randint(len(train_df), size=25)
for i, row_id in enumerate(index):
    label = train_df.loc[row_id, 'label']
    # Images are stored as <train_path>/<label>/<file name>.
    path_img = os.path.join(train_path, label, train_df.loc[row_id, 'image'])

    # The PIL image is plotted directly; no tensor conversion is needed here.
    img = image.load_img(path_img,target_size=(224,224))
    plt.subplot(5,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(img, cmap=plt.cm.binary)
    plt.xlabel(label)
plt.show()

# 전처리

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled rows for validation. `stratify` keeps the class
# proportions equal in both parts, and the fixed `random_state` makes the split
# identical on every Restart & Run All.
train, valid = train_test_split(
    train_df,
    test_size=0.25,
    stratify=train_df['label'],
    random_state=42,
)
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

In [8]:
# Display the training part of the split.
train

In [9]:
# Turn each file name into a path relative to the image root by prefixing the
# token before its first underscore as a sub-directory (presumably the class
# name; verify against the data layout).
# Names that already contain '/' are left alone so re-running this cell does not
# prefix the directory a second time.
train['image'] = train['image'].map(
    lambda name: name if '/' in name else name.split('_')[0] + '/' + name
)
train.head(10)

In [10]:
# Turn each file name into a path relative to the image root by prefixing the
# token before its first underscore as a sub-directory (presumably the class
# name; verify against the data layout).
# Names that already contain '/' are left alone so re-running this cell does not
# prefix the directory a second time.
valid['image'] = valid['image'].map(
    lambda name: name if '/' in name else name.split('_')[0] + '/' + name
)
valid.head(10)

In [11]:
# Class distribution after the train/validation split

figure, ax = plt.subplots(1, 2, figsize=(13,5))

# One panel per split: draw the count plot, title it, then write each bar's
# count at the centre of the bar.
for axis, frame, title in zip(ax, (train, valid), ('train', 'valid')):
    sns.countplot(data=frame, x='label', ax=axis)
    axis.set_title(title)
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2,
                  bar.get_y() + bar_height / 2,
                  str(int(bar_height)),
                  horizontalalignment='center',
                  verticalalignment='center')

plt.show()

In [12]:
# Data augmentation

# Input image geometry and mini-batch size shared by the generators below.
height = 224
width = 224
channel = 3
batch_size = 16
labels = ['sunny', 'snow', 'rain', 'dust', 'fog']

# Training pipeline: pixel values are scaled by 1/255 and each image gets a
# random rotation, shift, zoom and horizontal flip.
augmentation_options = dict(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.05,
    height_shift_range=0.05,
    zoom_range=[0.7, 1],
    horizontal_flip=True,
    vertical_flip=False,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation_options)

# Batches are read from the paths in train['image'] (relative to train_path)
# with labels taken from train['label'].
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train,
    directory=train_path,
    x_col='image',
    y_col='label',
    target_size=(height, width),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=batch_size,
)





# Validation pipeline: only the 1/255 rescaling, no augmentation.
valid_datagen = ImageDataGenerator(rescale=1. / 255.)

# Same directory, columns and image geometry as the training generator, but
# reading the rows of the validation split.
valid_generator = valid_datagen.flow_from_dataframe(
    dataframe=valid,
    directory=train_path,
    x_col='image',
    y_col='label',
    target_size=(height, width),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=batch_size,
)

In [13]:
# Preview one augmented training batch

# Invert the generator's class-name -> index mapping so that an index (from a
# one-hot label here, from a prediction later) can be turned back into a name.
pred2label = {
    class_index: class_name
    for class_name, class_index in train_generator.class_indices.items()
}


plt.figure(figsize=(20,15))
# The builtin next() works on every Keras iterator version; the `.next()`
# method is not available on newer ones.
x_augs, y_augs = next(train_generator)
# The grid has 4x4 cells, so show at most 16 images even if the batch is larger
# and do not index past the end if it is smaller.
for i in range(min(len(x_augs), 16)):
    plt.subplot(4,4,i+1)
    plt.imshow(x_augs[i, :, :, :])
    plt.title(pred2label[np.argmax(y_augs[i, :])], fontsize=30)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
plt.tight_layout()
plt.show()

# 모델 학습

In [14]:
import tensorflow

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, InputLayer, Resizing
from tensorflow.keras.layers import Conv2D, BatchNormalization
from tensorflow.keras.layers import MaxPool2D, GlobalAveragePooling2D
from tensorflow.keras.layers import Flatten, RandomRotation, RandomTranslation, RandomFlip, RandomContrast
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.efficientnet import EfficientNetB5
from tensorflow.keras.applications.efficientnet import EfficientNetB7
from tensorflow.keras.applications.efficientnet import EfficientNetB6
from tensorflow.keras.applications.efficientnet import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.applications.efficientnet import decode_predictions
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import cohen_kappa_score
from keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau

In [15]:
# Echo the input geometry the model is about to be built with.
print(f"{height}   {width}   {channel}")

In [16]:
# # # # cnn_model = Sequential()

# # # # cnn_model.add(Conv2D(filters=32,kernel_size=3,activation="relu", input_shape=(height, width, channel)))
# # # # cnn_model.add(MaxPool2D(pool_size=2, strides=2))
# # # # cnn_model.add(Conv2D(filters=64, kernel_size=3,activation="relu"))
# # # # cnn_model.add(MaxPool2D(pool_size=2, strides=2))
# # # # cnn_model.add(Conv2D(filters=64, kernel_size=3,activation="relu"))
# # # # cnn_model.add(MaxPool2D(pool_size=2, strides=2))
# # # # cnn_model.add(Dropout(0.15))
# # # # cnn_model.add(Conv2D(filters=128, kernel_size=3, activation="relu"))
# # # # cnn_model.add(MaxPool2D(pool_size=2, strides=2))
# # # # cnn_model.add(Conv2D(filters=128, kernel_size=3, activation="relu"))
# # # # cnn_model.add(MaxPool2D(pool_size=2, strides=2))
# # # # cnn_model.add(Dropout(0.15))
# # # # cnn_model.add(Flatten())
# # # # cnn_model.add(Dense(units=512, activation="relu"))
# # # # cnn_model.add(Dropout(0.15))

# EfficientNetB0 backbone (ImageNet weights, global average pooling) followed by
# a 5-way softmax classifier.
inputs = Input(shape=(height,width,channel))
# The data generators and the test loader in this notebook feed pixels already
# scaled to [0, 1]. Keras' EfficientNet contains its own rescaling/normalization
# and expects raw [0, 255] pixels, so restore that range before the backbone;
# otherwise the pretrained weights see inputs 255x smaller than intended.
backbone_inputs = tensorflow.keras.layers.Rescaling(255.0)(inputs)
base_model = EfficientNetB0(include_top=False, weights='imagenet',pooling='avg')(backbone_inputs)
outputs = Dense(5, activation='softmax')(base_model)
model_eff = Model(inputs, outputs)

In [17]:
# Print the layer-by-layer summary of the EfficientNet-based classifier.
model_eff.summary()

In [18]:
# Compile with categorical cross-entropy: the generators were created with
# class_mode='categorical', which matches the 5-way softmax output.
first_phase_optimizer = Adam(learning_rate=0.001)
model_eff.compile(
    optimizer=first_phase_optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Early stopping with a patience of 5 epochs (monitored quantity left at the
# Keras default).
stop = EarlyStopping(patience=5)

# Keep only the weights of the epoch with the highest validation accuracy.
checkpoint = ModelCheckpoint(
    "./best_model",
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)
callbacks = [stop, checkpoint]

# First training phase: up to 20 epochs.
save = model_eff.fit(
    train_generator,
    validation_data=valid_generator,
    epochs=20,
    callbacks=callbacks,
)

In [19]:
# Recompile with a much smaller learning rate (1e-5) for the second training
# phase; loss and metric are unchanged.
fine_tune_optimizer = Adam(learning_rate=0.00001)
model_eff.compile(
    optimizer=fine_tune_optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fresh early-stopping callback with a patience of 5 epochs.
stop = EarlyStopping(patience=5)

# Fresh checkpoint callback writing to the same "./best_model" path.
# NOTE(review): a new ModelCheckpoint presumably has no memory of the best
# val_accuracy from the first phase, so its first save may replace better
# weights - confirm whether that is intended.
checkpoint = ModelCheckpoint(
    "./best_model",
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)
callbacks = [stop, checkpoint]

model_eff.summary()

In [20]:
# Second training phase with the recompiled (low learning rate) model.
# Keras treats `epochs` as the index of the FINAL epoch, not as a count, so the
# target is the number of epochs already run plus the fine-tuning budget. With a
# plain `epochs=20`, a first phase that ran all 20 epochs would leave only one
# epoch of fine-tuning.
initial_epoch = save.epoch[-1] + 1  # epoch indices are 0-based
fine_tune_epochs = 20
history = model_eff.fit(
    train_generator,
    validation_data=valid_generator,
    initial_epoch=initial_epoch,
    epochs=initial_epoch + fine_tune_epochs,
    callbacks=callbacks,
)

In [21]:
# Accuracy and loss curves of the second training phase

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# x axis: one point per recorded epoch, counted from 0.
epochs = range(len(acc))

# Figure 1: accuracy.
_, acc_ax = plt.subplots()
acc_ax.plot(epochs, acc, 'r', label='Training accuracy')
acc_ax.plot(epochs, val_acc, 'b', label='Validation accuracy')
acc_ax.set_title('Training and validation accuracy')
acc_ax.legend()

# Figure 2: loss.
_, loss_ax = plt.subplots()
loss_ax.plot(epochs, loss, 'r', label='Training Loss')
loss_ax.plot(epochs, val_loss, 'b', label='Validation Loss')
loss_ax.set_title('Training and validation loss')
loss_ax.legend()

plt.show()

# 테스트

In [22]:
# Restore the weights written by the ModelCheckpoint callback (saved under
# "./best_model") before predicting on the test set.
model_eff.load_weights('best_model')

In [23]:
# Test metadata table; its 'image' column holds the file names to predict.
test_csv_path = base_path + "/test.csv"
test_df = pd.read_csv(test_csv_path)
test_df.head()

In [24]:
# NOTE: this rebinds batch_size from the training mini-batch size to the number
# of test images, so that all test images fit in a single array.
batch_size = test_df["image"].shape[0]
height = 224
width = 224
channel = 3

In [25]:
# Load every test image into one in-memory array.
test_path = base_path + '/test'

# float32 matches what img_to_array produces; a float64 buffer would double the
# memory use for no extra precision.
test_image = np.zeros((len(test_df), height, width, channel), dtype="float32")
print(test_image.shape)
for i, file_name in enumerate(test_df['image']):
    img = image.load_img(os.path.join(test_path, file_name), target_size=(height, width))
    # Same 1/255 scaling as the training and validation generators.
    test_image[i] = image.img_to_array(img) / 255

In [26]:
# Run the classifier on the in-memory test images, then show the shape of the
# resulting score array.
predictions = model_eff.predict(test_image)
predictions.shape

In [27]:
# Pick the highest-scoring class index for every test image and map it back to
# its class name.
predicted_class_ids = np.argmax(predictions, axis=1)
test_df['label'] = [pred2label[class_id] for class_id in predicted_class_ids]
test_df

In [28]:
# Write the test table with its predicted labels to submission.csv, without the
# index column.
test_df.to_csv('./submission.csv', index=False)