# Basic CNN with tensorflow and keras over the Plant Pathology Dataset



In [None]:
# Import NumPy
import numpy as np
# Import pandas
import pandas as pd
# Import the os library
import os
# Import TensorFlow
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import keras
from keras.preprocessing import image
from keras.models import Sequential
from keras.layers import Conv2D, MaxPool2D, Flatten,Dense,Dropout,BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16, InceptionResNetV2, ResNet50, Xception
import cv2
from PIL import Image

In [None]:
# Input locations: the competition data plus a set of pre-resized images.
path = '../input/plant-pathology-2021-fgvc8/'
train_dir = f'{path}train_images/'
test_dir = f'{path}test_images/'
# Directory with the already-resized training images used below.
train_paths = '../input/resized-plant2021/img_sz_256/'

In [None]:
# Load the training annotations; later cells read its 'image' and 'labels' columns.
df = pd.read_csv(filepath_or_buffer='../input/plant-pathology-2021-fgvc8/train.csv')

In [None]:
# Preview the first five rows of the annotation table.
df.head(n=5)

In [None]:
# Number of rows for each distinct label string.
df['labels'].value_counts()

In [None]:
# Inspect the distinct values of the target (y) column.
df.labels.unique()

In [None]:
# Make sure the target column holds strings.
df['labels'] = df.labels.astype('str')

In [None]:
# Horizontal bar chart of how many rows carry each label string.
plt.figure(figsize=(8, 5))
sns.countplot(y='labels', data=df)

In [None]:
# Plotting helper: show a row of example images for one label.
def plot_examples(label, n_examples=5):
    """Display up to ``n_examples`` training images whose label equals ``label``.

    Args:
        label: Value compared with ``df['labels']`` using ``==`` to pick rows.
        n_examples: Number of subplot columns to draw. Defaults to 5, the
            count the original implementation hard-coded.

    Returns:
        None. The figure is created through ``plt.subplots`` as a side effect.

    Notes:
        If fewer than ``n_examples`` rows match, the unused axes are hidden
        instead of raising ``IndexError``. An image that OpenCV cannot read
        leaves its axis hidden as well, rather than failing in ``cvtColor``.
    """
    # squeeze=False always yields a 2-D array of Axes, even for one column.
    fig, ax = plt.subplots(1, n_examples, figsize=(25, 15), squeeze=False)
    ax = ax.ravel()

    # Filter the dataframe once instead of once per plotted image.
    matching_idx = df[df['labels'] == label].index[:n_examples]

    for axis, idx in zip(ax, matching_idx):
        # Named `img` so the imported keras `image` module is not shadowed.
        img = cv2.imread(train_paths + df.loc[idx, 'image'])
        if img is None:
            # cv2.imread reports a missing/unreadable file by returning None.
            axis.axis('off')
            continue

        # OpenCV loads BGR; matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        axis.imshow(img)
        axis.set_title(label)
        axis.set_xticklabels([])
        axis.set_yticklabels([])

    # Hide any axes that had no matching row.
    for axis in ax[len(matching_idx):]:
        axis.axis('off')

In [None]:
# Draw one row of example images per distinct label string.
for label in df['labels'].unique():
    plot_examples(label)

In [None]:
# Handle rows that carry several labels by converting to a multi-label problem.
# Approach taken from:
# https://www.kaggle.com/shanmukh05/plant-pathology-2k21-baseline-tpu-training

# Row count per original (space-separated) label string.
count_dict = df['labels'].value_counts()

# Integer id for every disease name; "healthy" deliberately has no id.
label2id = {
    'scab': 0,
    'frog_eye_leaf_spot' : 1,
    'rust' : 2,
    'complex' : 3,
    'powdery_mildew' : 4,
}
# Number of disease classes.
NUM_CLASS = len(label2id)
# Reverse lookup: id -> disease name.
id2label = {idx: name for name, idx in label2id.items()}

# Split each label string on spaces, drop "healthy", and map the remaining
# names to their ids. A healthy-only row therefore becomes an empty list.
df["labels"] = df["labels"].map(
    lambda text: [label2id[name] for name in text.split(" ") if name != "healthy"]
)
df.head()

In [None]:
# Image generators. The training one augments; both scale pixels by 1/255
# and reserve 20% of the rows as the validation subset.
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255,        # multiply pixel values by 1/255
    validation_split=0.2,     # fraction of rows held out for validation
    rotation_range=40,        # random rotation range, in degrees
    width_shift_range=0.2,    # random horizontal shift
    height_shift_range=0.2,   # random vertical shift
    shear_range=0.2,          # random shear
    zoom_range=0.2,           # random zoom
    horizontal_flip=True,     # random left-right flip
)
# No augmentation here: only rescaling, with the same split fraction.
test_datagen = ImageDataGenerator(
    validation_split=0.2,
    rescale=1.0 / 255,
)

In [None]:
# Feed the dataframe to the generators defined above. Both read the same
# dataframe and directory; only the subset (and the augmentation) differs.
_flow_options = dict(
    dataframe=df,                # table holding file names and labels
    directory=train_paths,       # folder containing the image files
    x_col='image',               # column with the image file name
    y_col='labels',              # column with the target labels
    target_size=(256, 256),      # size images are loaded at
    color_mode='rgb',
    class_mode='categorical',
    batch_size=128,
)

train_generator = train_datagen.flow_from_dataframe(subset='training', **_flow_options)

test_generator = test_datagen.flow_from_dataframe(subset='validation', **_flow_options)

In [None]:
# Build the model by stacking layers one at a time.
model = tf.keras.Sequential()

# First convolution finds features: 32 filters, 3x3 kernel, ReLU activation.
# Input shape (excluding the sample dimension) is 256x256 with 3 channels.
model.add(tf.keras.layers.Conv2D(32, (3,3), input_shape=(256,256,3), activation='relu'))
# 2x2 max pooling shrinks the feature map, keeping the largest value per window.
model.add(tf.keras.layers.MaxPooling2D(2,2))

# Four more conv + pool stages with 64, 128, 128 and 128 filters.
for filters in (64, 128, 128, 128):
    model.add(tf.keras.layers.Conv2D(filters, (3,3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D(2,2))

# Classifier head: flatten, one hidden dense layer, then one sigmoid unit per class.
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dense(NUM_CLASS, activation='sigmoid'))

model.summary()

In [None]:
# Quantity watched by every callback below.
METRIC = "val_f1_score"


def create_callbacks(metric=METRIC):
    """Build the list of training callbacks, all monitoring ``metric``.

    Args:
        metric: Name of the logged quantity to monitor; every callback
            treats larger values as better (``mode='max'``).

    Returns:
        A list ``[ModelCheckpoint, ReduceLROnPlateau, EarlyStopping]``.
    """
    return [
        # Keep only the best model seen so far on disk.
        tf.keras.callbacks.ModelCheckpoint(
            filepath='./best_model.h5',
            monitor=metric,
            mode='max',
            save_best_only=True,
            verbose=1,
        ),
        # Multiply the learning rate by 0.2 after 3 epochs without improvement.
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor=metric,
            mode='max',
            factor=0.2,
            patience=3,
            verbose=1,
        ),
        # Stop training after 10 epochs without improvement.
        tf.keras.callbacks.EarlyStopping(
            monitor=metric,
            mode='max',
            patience=10,
            verbose=1,
        ),
    ]

In [None]:
from tensorflow.keras.optimizers import RMSprop,Adam
import tensorflow_addons as tfa

# Training hyper-parameters.
epochs = 40
# NOTE(review): batch_size is not passed to model.fit in this file; the
# generators were created with their own batch_size=128.
batch_size = 256
# `learning_rate` is the current argument name; `lr` is a deprecated alias.
optimizer = Adam(learning_rate=0.001)

# Multi-label setup: sigmoid outputs trained with binary cross-entropy.
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        # threshold=0.5 scores every class independently. Without a threshold
        # F1Score marks only the argmax class as positive, which cannot
        # represent multi-label or all-negative (healthy) predictions.
        tfa.metrics.F1Score(
            num_classes=NUM_CLASS,
            average="macro",
            threshold=0.5,
            name="f1_score",
        ),
    ],
)

In [None]:
# Train on the augmented generator and validate on the held-out subset,
# using the checkpoint / LR-reduction / early-stopping callbacks.
callbacks = create_callbacks()
history = model.fit(
    train_generator,
    validation_data=test_generator,
    epochs=epochs,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Plot training vs. validation curves: accuracy on top, loss below.
# plt.subplots(2, 1) already returns a 1-D array of Axes, so no ravel is needed.
figure, axis = plt.subplots(2, 1, figsize=(15,15))

axis[0].plot(history.history['accuracy'],label='Training Data')
axis[0].plot(history.history['val_accuracy'], label='Validation Data')
axis[0].set(xlabel='Epochs',ylabel='Accuracy', title='Accuracy vs Epochs')
axis[0].legend(loc="upper left")

axis[1].plot(history.history['loss'], label='Training Data')
axis[1].plot(history.history['val_loss'], label='Validation Data')
# The model is compiled with binary_crossentropy, so the title says so.
axis[1].set(xlabel='Epochs',ylabel='Loss', title='Binary Crossentropy Loss vs Epochs')
axis[1].legend(loc="upper left")

plt.show()

# Acknowledgements

Starter code adapted from this notebook by Ayush:
https://www.kaggle.com/aayushmishra1512/plant-pathology-starter

Resized dataset taken from:
https://www.kaggle.com/ankursingh12/resized-plant2021