# 개요

# 데이터 준비

## Library, Module import

In [None]:
pip install split-folders

In [None]:
import splitfolders

import os

import cv2
import numpy as np

from matplotlib import pyplot as plt
import matplotlib.cm as cm

import tensorflow as tf

import pickle

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

from tensorflow.keras.applications.resnet50 import ResNet50

from tensorflow.keras import datasets, layers, models

from tensorflow.keras.layers import Dense, Flatten, MaxPooling2D
from tensorflow.keras import Input
from tensorflow.keras.layers import Dropout, BatchNormalization

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

## 데이터 마운트 
(용량 조절 / 구글드라이브에 있는 자료 접근)

In [None]:
# Mount Google Drive at /content/drive so the cells below can read the
# data stored under /content/drive/MyDrive (Colab-only module).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import sys
from PIL import Image

def change_img_qualty(original_path, change_path, qualty=85):
    """Re-save every image of a directory with a given encoder quality.

    Each entry found directly inside ``original_path`` is opened with PIL
    and written under the same file name into ``change_path``. A line is
    printed per file, followed by a success/failure summary.

    :param original_path: directory containing the source images
    :param change_path: directory the re-saved images are written to; it is
        created (including missing parents) once the source directory has
        been listed successfully
    :param qualty: quality percentage passed to PIL's ``quality`` save
        option (default: 85)
    :return: never returns normally; the function always ends the run
        through ``sys.exit(0)``
    :raises SystemExit: always, with exit code 0 (also when
        ``original_path`` does not exist)
    """
    # List the source first so that no output directory is created for a
    # source that does not exist.
    try:
        ims_list = sorted(os.listdir(original_path))
    except FileNotFoundError:
        print("이미지 원본 디렉터리가 존재하지 않습니다...")
        sys.exit(0)

    # makedirs also creates missing parent directories (os.mkdir does not).
    os.makedirs(change_path, exist_ok=True)

    success_cnt = 0
    fail_cnt = 0
    for filename in ims_list:
        file = os.path.join(original_path, filename)
        target = os.path.join(change_path, filename)
        try:
            # The context manager closes the underlying file handle.
            with Image.open(file) as im:
                # The keyword must be spelled ``quality``; an unknown
                # keyword is silently ignored by the encoder.
                im.save(target, quality=qualty)
            print("+ 성공 : {success}\n  "
                  "- {success_path}"
                  .format(success=file, success_path=target)
                  )
            success_cnt += 1
        except Exception as e:
            # Best effort per file: report the reason and keep going.
            print("+ 실패 : {fail}".format(fail=file))
            print("  - {reason!r}".format(reason=e))
            fail_cnt += 1
    print("\n성공 : {success_cnt} 건 / 실패 : {fail_cnt} 건".format(success_cnt=success_cnt, fail_cnt=fail_cnt))
    sys.exit(0)


if __name__ == '__main__':
    # Entry point when executed as the main module: re-save the images of
    # the first directory into the second one with the default quality.
    original_path, change_path = '/Desktop/tmp/0/', '/Desktop/tmp/1/'
    change_img_qualty(original_path=original_path, change_path=change_path)

In [None]:
# Directory whose entries are listed below; the entry names are displayed
# as the cell output.
folder_path = '/content/drive/MyDrive/imageRec_data/Training'
label_names = os.listdir(folder_path)
label_names

## Train 데이터 불러오기 

In [None]:
path = '/content/drive/MyDrive/imageRec_data'

# First pass: print every sub-directory found below `path`.
for root, subdirs, files in os.walk(path):
    for d in subdirs:
        fullpath = '/'.join((root, d))
        print(fullpath)

# Second pass: print the file count of each directory that holds files.
for root, subdirs, files in os.walk(path):
    if files:
        print(root, len(files))

In [None]:
# Map every label (sub-directory name of `folder_path`) to the list of
# file paths found inside that sub-directory.
dataset = {}

for label in os.listdir(folder_path):
    sub_path = '{0}/{1}/'.format(folder_path, label)
    dataset[label] = [sub_path + filename for filename in os.listdir(sub_path)]

dataset

# 이미지 전처리

## resize with padding

In [None]:
# Create the output root for the resized images (relative to the current
# working directory).
!mkdir resized

In [None]:
# One output sub-directory per label below resized/.
!mkdir resized/Pig_seg_1+
!mkdir resized/Pig_seg_1
!mkdir resized/Pig_seg_2

In [None]:
import time
# Display the (label, file list) pairs currently held in `dataset`.
dataset.items()

In [None]:
# Resize every image so that its longer side becomes `target_size` pixels,
# centre it on a target_size x target_size canvas and write the result to
# /content/resized/<label>/<original file name>.
target_size = 128

for label, filenames in dataset.items():
    out_dir = '/content/resized/{0}'.format(label)
    # Make the cell independent of a previous `mkdir` step.
    os.makedirs(out_dir, exist_ok=True)

    for filename in filenames:
        img = cv2.imread(filename)
        if img is None:
            # cv2.imread signals an unreadable / non-image file with None
            # instead of raising; skip it rather than crash on `.shape`.
            print('skip (unreadable image): {0}'.format(filename))
            continue

        # Scale factor that maps the longer side onto target_size.
        percent = target_size / max(img.shape[0], img.shape[1])
        img = cv2.resize(img, dsize=(0, 0), fx=percent, fy=percent, interpolation=cv2.INTER_LINEAR)
        h, w = img.shape[:2]

        # Offsets that centre the resized image on the canvas. Both are
        # clamped independently (an `elif` would skip the second clamp
        # whenever the first one fires).
        w_x = max((target_size - w) / 2, 0)
        h_y = max((target_size - h) / 2, 0)

        # Pure translation matrix: shift by (w_x, h_y) onto the canvas.
        M = np.float32([[1, 0, w_x], [0, 1, h_y]])
        img_re = cv2.warpAffine(img, M, (target_size, target_size))

        # NOTE(review): throttle kept from the original code; presumably it
        # eases load on the mounted Drive - confirm whether it is needed.
        time.sleep(0.35)

        out_path = os.path.join(out_dir, os.path.basename(filename))
        if not cv2.imwrite(out_path, img_re):
            print('write failed: {0}'.format(out_path))

In [None]:
path = '/content/resized'

# First pass: print every sub-directory found below `path`.
for root, subdirs, files in os.walk(path):
    for d in subdirs:
        fullpath = '/'.join((root, d))
        print(fullpath)

# Second pass: print the file count of each directory that holds files.
for root, subdirs, files in os.walk(path):
    if files:
        print(root, len(files))

## Data Split

In [None]:
# Split the contents of 'resized' into the 'dataset' output folder using a
# 0.7 / 0.15 / 0.15 ratio and a fixed seed (77) for reproducibility.
splitfolders.ratio('resized', output='dataset', seed=77, ratio=(0.7, 0.15, 0.15))

## Trainset Augmentation (선택사항)

In [None]:
# Augmentation settings: flips, zoom, shear, height/width shift fractions
# and rotation angle (degrees).
datagen = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    zoom_range=0.2,
    shear_range=0.2,
    height_shift_range=0.2,
    width_shift_range=0.2,
    rotation_range=40,
)
# cutmix, mixup and similar techniques need a different function (generator).
# Generated files are given names that do not exist yet.

In [None]:
# Display which directory `folder_path` currently points to.
folder_path

In [None]:
# Augment the training split only. The directory is named explicitly
# instead of relying on whatever `folder_path` currently holds, so the
# generated images never end up in the raw source data or in the
# validation / test splits.
train_dir = '/content/dataset/train'
augment_per_image = 3  # number of augmented copies written per source image

for label in os.listdir(train_dir):
    label_path = train_dir + '/' + label + '/'
    # os.listdir returns a list up front, so the files written below are
    # not picked up again by this loop.
    for filename in os.listdir(label_path):
        filepath = label_path + filename

        img = load_img(filepath)
        x = img_to_array(img)
        # flow() expects a batch axis, even for a single image: (1,) + shape
        x = x.reshape((1,) + x.shape)

        # flow() is an endless generator that writes one augmented image to
        # `save_to_dir` per batch it yields. zip() with a range pulls exactly
        # `augment_per_image` batches and then stops.
        flow = datagen.flow(x, batch_size=1,
                            save_to_dir=label_path, save_prefix=label, save_format='jpg')
        for _ in zip(range(augment_per_image), flow):
            pass

In [None]:
## Collect the preprocessed data into a dictionary of lists

folder_path = '/content/dataset/train'

# Map every label (sub-directory name) to the file paths inside it.
dataset = {}
for label in os.listdir(folder_path):
    sub_path = '{0}/{1}/'.format(folder_path, label)
    dataset[label] = [sub_path + filename for filename in os.listdir(sub_path)]

dataset

In [None]:
# Class name -> integer class index, numbered in the order listed.
label2index = {
    name: index
    for index, name in enumerate(('Pig_seg_1+', 'Pig_seg_1', 'Pig_seg_2'))
}
#labels = list(label2index.keys())
#labels

In [None]:
# Read every training image into memory together with its class index.
x_train, y_train = [], []

for label, filenames in dataset.items():
    for filename in filenames:
        image = cv2.imread(filename)  # decoded image as an array
        if image is None:
            # cv2.imread returns None for unreadable / non-image files;
            # appending it would break the later conversion to one array.
            print('skip (unreadable image): {0}'.format(filename))
            continue

        x_train.append(image)
        y_train.append(label2index[label])  # label name -> class index

In [None]:
# Turn the collected images and labels into NumPy arrays.
x_train = np.array(x_train)
y_train = np.array(y_train)

In [None]:
# Pixel values read by cv2.imread are unsigned 8-bit (0..255). Casting them
# to int8 wraps every value above 127 to a negative number, and an integer
# array cannot take an in-place subtraction of a floating-point mean image.
# float32 keeps the values intact at the cost of 4 bytes per value.
x_train = x_train.astype('float32')

In [None]:
# Display the shapes of the training images and labels as a sanity check.
x_train.shape, y_train.shape

## Zero Centering

In [None]:
def zero_mean(image):
    """Return the mean of ``image`` taken along its first axis.

    The result is the value that gets subtracted from the data for
    zero-centering; this function itself does not modify ``image``.
    """
    mean_over_first_axis = np.mean(image, axis=0)
    return mean_over_first_axis

In [None]:
# Mean over the first axis of the training data; the same array is later
# subtracted from the train, test and validation sets.
zero_mean_img = zero_mean(x_train)

In [None]:
# Display the shape of the mean array as a sanity check.
zero_mean_img.shape

In [None]:
# Zero-centre the training data. NumPy refuses to write the floating-point
# result of an in-place subtraction back into an integer array, so the
# data is converted to float32 first (no copy if it already is float32).
x_train = x_train.astype('float32', copy=False)
x_train -= zero_mean_img.astype('float32')

## 검증/시험 데이터 준비 및 pickle 파일 저장

### Test data

In [None]:
folder_path = '/content/dataset/test'

# Map every label (sub-directory name) to the file paths inside it.
dataset = {}
for label in os.listdir(folder_path):
    sub_path = '{0}/{1}/'.format(folder_path, label)
    dataset[label] = [sub_path + filename for filename in os.listdir(sub_path)]

dataset

In [None]:
# Read every test image into memory together with its class index.
x_test, y_test = [], []

for label, filenames in dataset.items():
    for filename in filenames:
        image = cv2.imread(filename)  # decoded image as an array
        if image is None:
            # cv2.imread returns None for unreadable / non-image files;
            # appending it would break the later conversion to one array.
            print('skip (unreadable image): {0}'.format(filename))
            continue

        x_test.append(image)
        y_test.append(label2index[label])  # label name -> class index

In [None]:
# Turn the collected images and labels into NumPy arrays.
x_test = np.array(x_test)
y_test = np.array(y_test)

In [None]:
# int8 would wrap the unsigned 8-bit pixel values above 127 to negative
# numbers and cannot take the in-place subtraction of a float mean image;
# float32 keeps the values intact.
x_test = x_test.astype('float32')

In [None]:
# Display the shapes of the test images and labels as a sanity check.
x_test.shape, y_test.shape

In [None]:
# Zero-centre the test data with the mean computed earlier. NumPy refuses
# to write the floating-point result of an in-place subtraction back into
# an integer array, so the data is converted to float32 first.
x_test = x_test.astype('float32', copy=False)
x_test -= zero_mean_img.astype('float32')

In [None]:
path = '/content/dataset/test'

# First pass: print every sub-directory found below `path`.
for root, subdirs, files in os.walk(path):
    for d in subdirs:
        fullpath = '/'.join((root, d))
        print(fullpath)

# Second pass: print the file count of each directory that holds files.
for root, subdirs, files in os.walk(path):
    if files:
        print(root, len(files))

### Validation data

In [None]:
folder_path = '/content/dataset/val'

# Map every label (sub-directory name) to the file paths inside it.
dataset = {}
for label in os.listdir(folder_path):
    sub_path = '{0}/{1}/'.format(folder_path, label)
    dataset[label] = [sub_path + filename for filename in os.listdir(sub_path)]

dataset

In [None]:
# Display the (label, file list) pairs currently held in `dataset`.
dataset.items()

In [None]:
# Read every validation image into memory together with its class index.
x_val, y_val = [], []

for label, filenames in dataset.items():
    for filename in filenames:
        image = cv2.imread(filename)  # decoded image as an array
        if image is None:
            # cv2.imread returns None for unreadable / non-image files;
            # appending it would break the later conversion to one array.
            print('skip (unreadable image): {0}'.format(filename))
            continue

        x_val.append(image)
        y_val.append(label2index[label])  # label name -> class index

In [None]:
# Turn the collected images and labels into NumPy arrays.
x_val = np.array(x_val)
y_val = np.array(y_val)

In [None]:
# int8 would wrap the unsigned 8-bit pixel values above 127 to negative
# numbers and cannot take the in-place subtraction of a float mean image;
# float32 keeps the values intact.
x_val = x_val.astype('float32')

In [None]:
# Display the shapes of the validation images and labels as a sanity check.
x_val.shape, y_val.shape

In [None]:
# Zero-centre the validation data with the mean computed earlier. NumPy
# refuses to write the floating-point result of an in-place subtraction
# back into an integer array, so the data is converted to float32 first.
x_val = x_val.astype('float32', copy=False)
x_val -= zero_mean_img.astype('float32')

## 결과데이터 pickle로 저장하기

In [None]:
# Serialise the training arrays, one pickle file per array, using the
# highest pickle protocol available.
for pickle_name, payload in (('x_train.pickle', x_train), ('y_train.pickle', y_train)):
    with open(pickle_name, 'wb') as f:
        pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Make sure the destination is a directory: without it, the first `mv`
# would rename the file to "pickle" and the second would overwrite it.
!mkdir -p /content/drive/MyDrive/imageRec_data/pickle
!mv x_train.pickle /content/drive/MyDrive/imageRec_data/pickle/
!mv y_train.pickle /content/drive/MyDrive/imageRec_data/pickle/

In [None]:
# Serialise the validation arrays, one pickle file per array, using the
# highest pickle protocol available.
for pickle_name, payload in (('x_val.pickle', x_val), ('y_val.pickle', y_val)):
    with open(pickle_name, 'wb') as f:
        pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Make sure the destination is a directory: without it, the first `mv`
# would rename the file to "pickle" and the second would overwrite it.
!mkdir -p /content/drive/MyDrive/imageRec_data/pickle
!mv x_val.pickle /content/drive/MyDrive/imageRec_data/pickle/
!mv y_val.pickle /content/drive/MyDrive/imageRec_data/pickle/

In [None]:
# Serialise the test arrays, one pickle file per array, using the highest
# pickle protocol available.
for pickle_name, payload in (('x_test.pickle', x_test), ('y_test.pickle', y_test)):
    with open(pickle_name, 'wb') as f:
        pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Make sure the destination is a directory: without it, the first `mv`
# would rename the file to "pickle" and the second would overwrite it.
!mkdir -p /content/drive/MyDrive/imageRec_data/pickle
!mv x_test.pickle /content/drive/MyDrive/imageRec_data/pickle/
!mv y_test.pickle /content/drive/MyDrive/imageRec_data/pickle/

## pickle 파일 불러오기 
(재사용)

In [None]:

# Bring the saved pickles back from Drive into /content for reuse.
# `cp` (not `mv`) keeps the Drive copies available for later sessions.
# NOTE(review): assumes the pickles live in the Drive folder the earlier
# cells moved them to - confirm the location.
!cp /content/drive/MyDrive/imageRec_data/pickle/x_train.pickle /content/
!cp /content/drive/MyDrive/imageRec_data/pickle/y_train.pickle /content/
!cp /content/drive/MyDrive/imageRec_data/pickle/x_test.pickle /content/
!cp /content/drive/MyDrive/imageRec_data/pickle/y_test.pickle /content/
!cp /content/drive/MyDrive/imageRec_data/pickle/x_val.pickle /content/
!cp /content/drive/MyDrive/imageRec_data/pickle/y_val.pickle /content/


In [None]:
# Restore the arrays from their pickle files. The result of pickle.load
# must be bound to a name, otherwise the loaded data is discarded.
# NOTE: pickle can execute arbitrary code while loading; only load files
# that were produced by this notebook.
with open('x_train.pickle', 'rb') as x_train_pk:
    x_train = pickle.load(x_train_pk)

with open('y_train.pickle', 'rb') as y_train_pk:
    y_train = pickle.load(y_train_pk)

with open('x_val.pickle', 'rb') as x_val_pk:
    x_val = pickle.load(x_val_pk)

with open('y_val.pickle', 'rb') as y_val_pk:
    y_val = pickle.load(y_val_pk)

with open('x_test.pickle', 'rb') as x_test_pk:
    x_test = pickle.load(x_test_pk)

with open('y_test.pickle', 'rb') as y_test_pk:
    y_test = pickle.load(y_test_pk)